]> git.ipfire.org Git - thirdparty/gcc.git/blobdiff - gcc/config/i386/i386.md
i386: Fix split condition of *<insn>qi_ext<mode>_1_slp patterns
[thirdparty/gcc.git] / gcc / config / i386 / i386.md
index 01d5199bbc202681d4ca278472664c4054c7d246..32535621db48f0718c969ea9080d64881d8d9ccb 100644 (file)
   UNSPEC_INSN_FALSE_DEP
   UNSPEC_SBB
   UNSPEC_CC_NE
+  UNSPEC_STC
+  UNSPEC_PUSHFL
+  UNSPEC_POPFL
 
   ;; For SSE/MMX support:
   UNSPEC_FIX_NOTRUNC
   UNSPEC_RSQRT
   UNSPEC_PSADBW
 
+  ;; Different from generic us_truncate RTX
+  ;; as it does unsigned saturation of signed source.
+  UNSPEC_US_TRUNCATE
+
   ;; For AVX/AVX512F support
   UNSPEC_SCALEF
   UNSPEC_PCMP
   ;; For insn_callee_abi:
   UNSPEC_CALLEE_ABI
 
+  ;; For PUSH2/POP2 support
+  UNSPEC_APXPUSH2
+  UNSPEC_APXPOP2_LOW
+  UNSPEC_APXPOP2_HIGH
 ])
 
 (define_c_enum "unspecv" [
 
   ;; For PREFETCHI support
   UNSPECV_PREFETCHI
+
+  ;; For USER_MSR support
+  UNSPECV_URDMSR
+  UNSPECV_UWRMSR
 ])
 
 ;; Constants to represent rounding modes in the ROUND instruction
    (MASK5_REG                  73)
    (MASK6_REG                  74)
    (MASK7_REG                  75)
-   (FIRST_PSEUDO_REG           76)
+   (R16_REG                    76)
+   (R17_REG                    77)
+   (R18_REG                    78)
+   (R19_REG                    79)
+   (R20_REG                    80)
+   (R21_REG                    81)
+   (R22_REG                    82)
+   (R23_REG                    83)
+   (R24_REG                    84)
+   (R25_REG                    85)
+   (R26_REG                    86)
+   (R27_REG                    87)
+   (R28_REG                    88)
+   (R29_REG                    89)
+   (R30_REG                    90)
+   (R31_REG                    91)
+   (FIRST_PSEUDO_REG           92)
   ])
 
 ;; Insn callee abi index.
 \f
 ;; Processor type.
 (define_attr "cpu" "none,pentium,pentiumpro,geode,k6,athlon,k8,core2,nehalem,
-                   atom,slm,glm,haswell,generic,lujiazui,amdfam10,bdver1,
+                   atom,slm,glm,haswell,generic,lujiazui,yongfeng,amdfam10,bdver1,
                    bdver2,bdver3,bdver4,btver2,znver1,znver2,znver3,znver4"
   (const (symbol_ref "ix86_schedule")))
 
           (const_string "unknown")]
         (const_string "integer")))
 
+;; Used to control the "enabled" attribute on a per-instruction basis.
+;; The value defaults to "base".  Besides "enabled", the "length_immediate"
+;; attribute also tests this attribute (for the "fma4" value).
+(define_attr "isa" "base,x64,nox64,x64_sse2,x64_sse4,x64_sse4_noavx,
+                   x64_avx,x64_avx512bw,x64_avx512dq,aes,
+                   sse_noavx,sse2,sse2_noavx,sse3,sse3_noavx,sse4,sse4_noavx,
+                   avx,noavx,avx2,noavx2,bmi,bmi2,fma4,fma,avx512f,avx512f_512,
+                   noavx512f,avx512bw,avx512bw_512,noavx512bw,avx512dq,
+                   noavx512dq,fma_or_avx512vl,avx512vl,noavx512vl,avxvnni,
+                   avx512vnnivl,avx512fp16,avxifma,avx512ifmavl,avxneconvert,
+                   avx512bf16vl,vpclmulqdqvl,avx_noavx512f,avx_noavx512vl"
+  (const_string "base"))
+
 ;; The (bounding maximum) length of an instruction immediate.
 (define_attr "length_immediate" ""
   (cond [(eq_attr "type" "incdec,setcc,icmov,str,lea,other,multi,idiv,leave,
                          bitmanip,imulx,msklog,mskmov")
           (const_int 0)
+        (ior (eq_attr "type" "sse4arg")
+             (eq_attr "isa" "fma4"))
+          (const_int 1)
         (eq_attr "unit" "i387,sse,mmx")
           (const_int 0)
         (eq_attr "type" "alu,alu1,negnot,imovx,ishift,ishiftx,ishift1,
        (const_int 0)))
 
 ;; There are also additional prefixes in 3DNOW, SSSE3.
-;; ssemuladd,sse4arg default to 0f24/0f25 and DREX byte,
-;; sseiadd1,ssecvt1 to 0f7a with no DREX byte.
 ;; 3DNOW has 0f0f prefix, SSSE3 and SSE4_{1,2} 0f38/0f3a.
+;; While generally inapplicable to VEX/XOP/EVEX encodings, "length_vex" uses
+;; the attribute evaluating to zero to know that VEX2 encoding may be usable.
 (define_attr "prefix_extra" ""
-  (cond [(eq_attr "type" "ssemuladd,sse4arg")
-          (const_int 2)
-        (eq_attr "type" "sseiadd1,ssecvt1")
+  (cond [(eq_attr "type" "ssemuladd,sse4arg,sseiadd1,ssecvt1")
           (const_int 1)
        ]
        (const_int 0)))
            (const_string "vex")
          (eq_attr "mode" "XI,V16SF,V8DF")
            (const_string "evex")
+        (eq_attr "type" "ssemuladd")
+          (if_then_else (eq_attr "isa" "fma4")
+            (const_string "vex")
+            (const_string "maybe_evex"))
+        (eq_attr "type" "sse4arg")
+          (const_string "vex")
         ]
         (const_string "orig")))
 
 ;; Define attribute to indicate unaligned ssemov insns
 (define_attr "movu" "0,1" (const_string "0"))
 
-;; Used to control the "enabled" attribute on a per-instruction basis.
-(define_attr "isa" "base,x64,nox64,x64_sse2,x64_sse4,x64_sse4_noavx,
-                   x64_avx,x64_avx512bw,x64_avx512dq,
-                   sse_noavx,sse2,sse2_noavx,sse3,sse3_noavx,sse4,sse4_noavx,
-                   avx,noavx,avx2,noavx2,bmi,bmi2,fma4,fma,avx512f,noavx512f,
-                   avx512bw,noavx512bw,avx512dq,noavx512dq,fma_or_avx512vl,
-                   avx512vl,noavx512vl,avxvnni,avx512vnnivl,avx512fp16,avxifma,
-                   avx512ifmavl,avxneconvert,avx512bf16vl"
-  (const_string "base"))
+;; Define attribute to limit memory address register set.
+;; The value defaults to "gpr32"; insn patterns override it with "gpr8" or
+;; "gpr16", either for the whole insn or for individual alternatives.
+(define_attr "addr" "gpr8,gpr16,gpr32" (const_string "gpr32"))
 
 ;; Define instruction set of MMX instructions
 (define_attr "mmx_isa" "base,native,sse,sse_noavx,avx"
           (symbol_ref "TARGET_64BIT && TARGET_AVX512BW")
         (eq_attr "isa" "x64_avx512dq")
           (symbol_ref "TARGET_64BIT && TARGET_AVX512DQ")
+        (eq_attr "isa" "aes") (symbol_ref "TARGET_AES")
         (eq_attr "isa" "sse_noavx")
           (symbol_ref "TARGET_SSE && !TARGET_AVX")
         (eq_attr "isa" "sse2") (symbol_ref "TARGET_SSE2")
         (eq_attr "isa" "sse4_noavx")
           (symbol_ref "TARGET_SSE4_1 && !TARGET_AVX")
         (eq_attr "isa" "avx") (symbol_ref "TARGET_AVX")
+        (eq_attr "isa" "avx_noavx512f")
+          (symbol_ref "TARGET_AVX && !TARGET_AVX512F")
+        (eq_attr "isa" "avx_noavx512vl")
+          (symbol_ref "TARGET_AVX && !TARGET_AVX512VL")
         (eq_attr "isa" "noavx") (symbol_ref "!TARGET_AVX")
         (eq_attr "isa" "avx2") (symbol_ref "TARGET_AVX2")
         (eq_attr "isa" "noavx2") (symbol_ref "!TARGET_AVX2")
         (eq_attr "isa" "fma_or_avx512vl")
           (symbol_ref "TARGET_FMA || TARGET_AVX512VL")
         (eq_attr "isa" "avx512f") (symbol_ref "TARGET_AVX512F")
+        (eq_attr "isa" "avx512f_512")
+          (symbol_ref "TARGET_AVX512F && TARGET_EVEX512")
         (eq_attr "isa" "noavx512f") (symbol_ref "!TARGET_AVX512F")
         (eq_attr "isa" "avx512bw") (symbol_ref "TARGET_AVX512BW")
+        (eq_attr "isa" "avx512bw_512")
+          (symbol_ref "TARGET_AVX512BW && TARGET_EVEX512")
         (eq_attr "isa" "noavx512bw") (symbol_ref "!TARGET_AVX512BW")
         (eq_attr "isa" "avx512dq") (symbol_ref "TARGET_AVX512DQ")
         (eq_attr "isa" "noavx512dq") (symbol_ref "!TARGET_AVX512DQ")
         (eq_attr "isa" "avxneconvert") (symbol_ref "TARGET_AVXNECONVERT")
         (eq_attr "isa" "avx512bf16vl")
           (symbol_ref "TARGET_AVX512BF16 && TARGET_AVX512VL")
+        (eq_attr "isa" "vpclmulqdqvl")
+          (symbol_ref "TARGET_VPCLMULQDQ && TARGET_AVX512VL")
 
         (eq_attr "mmx_isa" "native")
           (symbol_ref "!TARGET_MMX_WITH_SSE")
    (set_attr "type" "multi")])
 
 (define_code_iterator plusminus [plus minus])
+(define_code_iterator plusminusmult [plus minus mult])
 (define_code_iterator plusminusmultdiv [plus minus mult div])
 
 (define_code_iterator sat_plusminus [ss_plus us_plus ss_minus us_minus])
 (define_code_iterator any_shift [ashift lshiftrt ashiftrt])
 
 ;; Base name for insn mnemonic.
-(define_code_attr shift [(ashift "sll") (lshiftrt "shr") (ashiftrt "sar")])
+(define_code_attr shift [(ashift "sal") (lshiftrt "shr") (ashiftrt "sar")])
 (define_code_attr vshift [(ashift "sll") (lshiftrt "srl") (ashiftrt "sra")])
 
 ;; Mapping of rotate operators
 ;; Mapping of extend operators
 (define_code_iterator any_extend [sign_extend zero_extend])
 
-;; Mapping of extract operators
-(define_code_iterator any_extract [sign_extract zero_extract])
-
 ;; Mapping of highpart multiply operators
 (define_code_iterator any_mul_highpart [smul_highpart umul_highpart])
 
 (include "core2.md")
 (include "haswell.md")
 (include "lujiazui.md")
+(include "yongfeng.md")
 
 \f
 ;; Operand and operator predicates and constraints
 
 (define_expand "cbranch<mode>4"
   [(set (reg:CC FLAGS_REG)
-       (compare:CC (match_operand:SDWIM 1 "nonimmediate_operand")
-                   (match_operand:SDWIM 2 "<general_operand>")))
+       (compare:CC (match_operand:SWIM1248x 1 "nonimmediate_operand")
+                   (match_operand:SWIM1248x 2 "<general_operand>")))
    (set (pc) (if_then_else
               (match_operator 0 "ordered_comparison_operator"
                [(reg:CC FLAGS_REG) (const_int 0)])
   DONE;
 })
 
+;; Conditional branch on a TImode comparison.  Operand 0 is the comparison
+;; operator (restricted by ix86_timode_comparison_operator), operands 1 and 2
+;; are the values being compared and operand 3 is the branch target label.
+;; Available when TARGET_64BIT or TARGET_SSE4_1 holds.  The RTL template is
+;; not emitted as written: ix86_expand_branch generates the code and the
+;; expander finishes with DONE.
+(define_expand "cbranchti4"
+  [(set (reg:CC FLAGS_REG)
+       (compare:CC (match_operand:TI 1 "nonimmediate_operand")
+                   (match_operand:TI 2 "ix86_timode_comparison_operand")))
+   (set (pc) (if_then_else
+              (match_operator 0 "ix86_timode_comparison_operator"
+               [(reg:CC FLAGS_REG) (const_int 0)])
+              (label_ref (match_operand 3))
+              (pc)))]
+  "TARGET_64BIT || TARGET_SSE4_1"
+{
+  ix86_expand_branch (GET_CODE (operands[0]),
+                     operands[1], operands[2], operands[3]);
+  DONE;
+})
+
 (define_expand "cbranchoi4"
   [(set (reg:CC FLAGS_REG)
        (compare:CC (match_operand:OI 1 "nonimmediate_operand")
   DONE;
 })
 
+;; Conditional branch on an XImode comparison.  Operand 0 is the comparison
+;; operator (restricted by bt_comparison_operator), operands 1 and 2 are the
+;; values being compared and operand 3 is the branch target label.
+;; Only available with TARGET_AVX512F and TARGET_EVEX512 when
+;; TARGET_PREFER_AVX256 is not set.  The RTL template is not emitted as
+;; written: ix86_expand_branch generates the code and the expander finishes
+;; with DONE.
+(define_expand "cbranchxi4"
+  [(set (reg:CC FLAGS_REG)
+       (compare:CC (match_operand:XI 1 "nonimmediate_operand")
+                   (match_operand:XI 2 "nonimmediate_operand")))
+   (set (pc) (if_then_else
+              (match_operator 0 "bt_comparison_operator"
+               [(reg:CC FLAGS_REG) (const_int 0)])
+              (label_ref (match_operand 3))
+              (pc)))]
+  "TARGET_AVX512F && TARGET_EVEX512 && !TARGET_PREFER_AVX256"
+{
+  ix86_expand_branch (GET_CODE (operands[0]),
+                     operands[1], operands[2], operands[3]);
+  DONE;
+})
+
 (define_expand "cstore<mode>4"
   [(set (reg:CC FLAGS_REG)
        (compare:CC (match_operand:SDWIM 2 "nonimmediate_operand")
 
 (define_mode_iterator SWI1248_AVX512BWDQ_64
   [(QI "TARGET_AVX512DQ") HI
-   (SI "TARGET_AVX512BW") (DI "TARGET_AVX512BW && TARGET_64BIT")])
+   (SI "TARGET_AVX512BW")
+   (DI "TARGET_AVX512BW && TARGET_EVEX512 && TARGET_64BIT")])
 
 (define_insn "*cmp<mode>_ccz_1"
   [(set (reg FLAGS_REG)
   [(set_attr "type" "icmp")
    (set_attr "mode" "<MODE>")])
 
-(define_insn "*cmpqi_ext<mode>_1_mem_rex64"
-  [(set (reg FLAGS_REG)
-       (compare
-         (match_operand:QI 0 "norex_memory_operand" "Bn")
-         (subreg:QI
-           (any_extract:SWI248
-             (match_operand 1 "int248_register_operand" "Q")
-             (const_int 8)
-             (const_int 8)) 0)))]
-  "TARGET_64BIT && reload_completed
-   && ix86_match_ccmode (insn, CCmode)"
-  "cmp{b}\t{%h1, %0|%0, %h1}"
-  [(set_attr "type" "icmp")
-   (set_attr "mode" "QI")])
-
 (define_insn "*cmpqi_ext<mode>_1"
   [(set (reg FLAGS_REG)
        (compare
-         (match_operand:QI 0 "nonimmediate_operand" "QBc,m")
+         (match_operand:QI 0 "nonimmediate_operand" "QBn")
          (subreg:QI
-           (any_extract:SWI248
-             (match_operand 1 "int248_register_operand" "Q,Q")
-             (const_int 8)
-             (const_int 8)) 0)))]
+           (match_operator:SWI248 2 "extract_operator"
+             [(match_operand 1 "int248_register_operand" "Q")
+              (const_int 8)
+              (const_int 8)]) 0)))]
   "ix86_match_ccmode (insn, CCmode)"
   "cmp{b}\t{%h1, %0|%0, %h1}"
-  [(set_attr "isa" "*,nox64")
+  [(set_attr "addr" "gpr8")
    (set_attr "type" "icmp")
    (set_attr "mode" "QI")])
 
-(define_peephole2
-  [(set (match_operand:QI 0 "register_operand")
-       (match_operand:QI 1 "norex_memory_operand"))
-   (set (match_operand 3 "flags_reg_operand")
-       (match_operator 4 "compare_operator"
-         [(match_dup 0)
-          (subreg:QI
-            (any_extract:SWI248
-              (match_operand 2 "int248_register_operand")
-              (const_int 8)
-              (const_int 8)) 0)]))]
-  "TARGET_64BIT
-   && peep2_reg_dead_p (2, operands[0])"
-  [(set (match_dup 3)
-       (match_op_dup 4
-         [(match_dup 1)
-          (subreg:QI
-            (any_extract:SWI248
-              (match_dup 2)
-              (const_int 8)
-              (const_int 8)) 0)]))])
-
 (define_insn "*cmpqi_ext<mode>_2"
   [(set (reg FLAGS_REG)
        (compare
          (subreg:QI
-           (any_extract:SWI248
-             (match_operand 0 "int248_register_operand" "Q")
-             (const_int 8)
-             (const_int 8)) 0)
+           (match_operator:SWI248 2 "extract_operator"
+             [(match_operand 0 "int248_register_operand" "Q")
+              (const_int 8)
+              (const_int 8)]) 0)
          (match_operand:QI 1 "const0_operand")))]
   "ix86_match_ccmode (insn, CCNOmode)"
   "test{b}\t%h0, %h0"
              (const_int 8)) 0)
          (match_operand:QI 1 "const_int_operand")))])
 
-(define_insn "*cmpqi_ext<mode>_3_mem_rex64"
-  [(set (reg FLAGS_REG)
-       (compare
-         (subreg:QI
-           (any_extract:SWI248
-             (match_operand 0 "int248_register_operand" "Q")
-             (const_int 8)
-             (const_int 8)) 0)
-         (match_operand:QI 1 "norex_memory_operand" "Bn")))]
-  "TARGET_64BIT && reload_completed
-   && ix86_match_ccmode (insn, CCmode)"
-  "cmp{b}\t{%1, %h0|%h0, %1}"
-  [(set_attr "type" "icmp")
-   (set_attr "mode" "QI")])
-
 (define_insn "*cmpqi_ext<mode>_3"
   [(set (reg FLAGS_REG)
        (compare
          (subreg:QI
-           (any_extract:SWI248
-             (match_operand 0 "int248_register_operand" "Q,Q")
-             (const_int 8)
-             (const_int 8)) 0)
-         (match_operand:QI 1 "general_operand" "QnBc,m")))]
+           (match_operator:SWI248 2 "extract_operator"
+             [(match_operand 0 "int248_register_operand" "Q")
+              (const_int 8)
+              (const_int 8)]) 0)
+         (match_operand:QI 1 "general_operand" "QnBn")))]
   "ix86_match_ccmode (insn, CCmode)"
   "cmp{b}\t{%1, %h0|%h0, %1}"
-  [(set_attr "isa" "*,nox64")
+  [(set_attr "addr" "gpr8")
    (set_attr "type" "icmp")
    (set_attr "mode" "QI")])
 
-(define_peephole2
-  [(set (match_operand:QI 0 "register_operand")
-       (match_operand:QI 1 "norex_memory_operand"))
-   (set (match_operand 3 "flags_reg_operand")
-       (match_operator 4 "compare_operator"
-         [(subreg:QI
-            (any_extract:SWI248
-              (match_operand 2 "int248_register_operand")
-              (const_int 8)
-              (const_int 8)) 0)
-          (match_dup 0)]))]
-  "TARGET_64BIT
-   && peep2_reg_dead_p (2, operands[0])"
-  [(set (match_dup 3)
-       (match_op_dup 4
-         [(subreg:QI
-            (any_extract:SWI248
-              (match_dup 2)
-              (const_int 8)
-              (const_int 8)) 0)
-          (match_dup 1)]))])
-
 (define_insn "*cmpqi_ext<mode>_4"
   [(set (reg FLAGS_REG)
        (compare
          (subreg:QI
-           (any_extract:SWI248
-             (match_operand 0 "int248_register_operand" "Q")
-             (const_int 8)
-             (const_int 8)) 0)
+           (match_operator:SWI248 2 "extract_operator"
+             [(match_operand 0 "int248_register_operand" "Q")
+              (const_int 8)
+              (const_int 8)]) 0)
          (subreg:QI
-           (any_extract:SWI248
-             (match_operand 1 "int248_register_operand" "Q")
-             (const_int 8)
-             (const_int 8)) 0)))]
+           (match_operator:SWI248 3 "extract_operator"
+             [(match_operand 1 "int248_register_operand" "Q")
+              (const_int 8)
+              (const_int 8)]) 0)))]
   "ix86_match_ccmode (insn, CCmode)"
   "cmp{b}\t{%h1, %h0|%h0, %h1}"
   [(set_attr "type" "icmp")
   [(set_attr "type" "ssecomi")
    (set_attr "prefix" "evex")
    (set_attr "mode" "HF")])
+
+;; Set carry flag.
+;; FLAGS_REG is written in CCCmode from an UNSPEC_STC unspec, so the insn has
+;; no register or memory operands.  The attributes describe the encoding:
+;; length 1, no immediate and no modrm byte.
+(define_insn "x86_stc"
+  [(set (reg:CCC FLAGS_REG) (unspec:CCC [(const_int 0)] UNSPEC_STC))]
+  ""
+  "stc"
+  [(set_attr "length" "1")
+   (set_attr "length_immediate" "0")
+   (set_attr "modrm" "0")])
+
+;; On Pentium 4, set the carry flag using mov $1,%al;addb $-1,%al.
+;; Applies only when TARGET_SLOW_STC is set and the insn is not being
+;; optimized for size.  Operand 0 is a QImode scratch register that must be
+;; available at this point.  It is loaded with 1 and then has -1 added to it;
+;; that byte addition carries out, which is what leaves the carry flag set.
+;; The replacement writes FLAGS_REG in CCCmode, like the stc pattern it
+;; replaces.
+(define_peephole2
+  [(match_scratch:QI 0 "r")
+   (set (reg:CCC FLAGS_REG) (unspec:CCC [(const_int 0)] UNSPEC_STC))]
+  "TARGET_SLOW_STC && !optimize_insn_for_size_p ()"
+  [(set (match_dup 0) (const_int 1))
+   (parallel
+     [(set (reg:CCC FLAGS_REG)
+          (compare:CCC (plus:QI (match_dup 0) (const_int -1))
+                       (match_dup 0)))
+      (set (match_dup 0) (plus:QI (match_dup 0) (const_int -1)))])])
+
+;; Complement carry flag.
+;; The new CCCmode value of FLAGS_REG is expressed in terms of the incoming
+;; flags: the negated ltu (carry set) test is compared against the geu
+;; (carry clear) test, so the pattern both reads and writes FLAGS_REG.
+;; The attributes describe the encoding: length 1, no immediate and no
+;; modrm byte.
+(define_insn "*x86_cmc"
+  [(set (reg:CCC FLAGS_REG)
+       (compare:CCC (neg:QI (ltu:QI (reg:CCC FLAGS_REG) (const_int 0)))
+                    (geu:QI (reg:CCC FLAGS_REG) (const_int 0))))]
+  ""
+  "cmc"
+  [(set_attr "length" "1")
+   (set_attr "length_immediate" "0")
+   (set_attr "use_carry" "1")
+   (set_attr "modrm" "0")])
+
+;; On Pentium 4, cmc is replaced with setnc %al;addb $-1,%al.
+;; Applies only when TARGET_SLOW_STC is set and the insn is not being
+;; optimized for size.  Operand 0 is a QImode scratch register that must be
+;; available at this point.  It first receives the ne test of the incoming
+;; CCCmode flags (the setnc above) and then has -1 added to it; that addition
+;; carries out exactly when the scratch was nonzero, which yields the
+;; complemented carry.
+(define_peephole2
+  [(match_scratch:QI 0 "r")
+   (set (reg:CCC FLAGS_REG)
+       (compare:CCC (neg:QI (ltu:QI (reg:CCC FLAGS_REG) (const_int 0)))
+                    (geu:QI (reg:CCC FLAGS_REG) (const_int 0))))]
+  "TARGET_SLOW_STC && !optimize_insn_for_size_p ()"
+  [(set (match_dup 0) (ne:QI (reg:CCC FLAGS_REG) (const_int 0)))
+   (parallel
+     [(set (reg:CCC FLAGS_REG)
+          (compare:CCC (plus:QI (match_dup 0) (const_int -1))
+                       (match_dup 0)))
+      (set (match_dup 0) (plus:QI (match_dup 0) (const_int -1)))])])
 \f
 ;; Push/pop instructions.
 
   [(set_attr "type" "pop")
    (set_attr "mode" "<MODE>")])
 
-(define_insn "*pushfl<mode>2"
+(define_insn "@pushfl<mode>2"
   [(set (match_operand:W 0 "push_operand" "=<")
-       (match_operand:W 1 "flags_reg_operand"))]
+       (unspec:W [(match_operand:CC 1 "flags_reg_operand")]
+                 UNSPEC_PUSHFL))]
   ""
   "pushf{<imodesuffix>}"
   [(set_attr "type" "push")
    (set_attr "mode" "<MODE>")])
 
-(define_insn "*popfl<mode>1"
-  [(set (match_operand:W 0 "flags_reg_operand")
-       (match_operand:W 1 "pop_operand" ">"))]
+(define_insn "@popfl<mode>1"
+  [(set (match_operand:CC 0 "flags_reg_operand")
+       (unspec:CC [(match_operand:W 1 "pop_operand" ">")]
+                  UNSPEC_POPFL))]
   ""
   "popf{<imodesuffix>}"
   [(set_attr "type" "pop")
 (define_expand "movxi"
   [(set (match_operand:XI 0 "nonimmediate_operand")
        (match_operand:XI 1 "general_operand"))]
-  "TARGET_AVX512F"
+  "TARGET_AVX512F && TARGET_EVEX512"
   "ix86_expand_vector_move (XImode, operands); DONE;")
 
 (define_expand "movoi"
 (define_insn "*movxi_internal_avx512f"
   [(set (match_operand:XI 0 "nonimmediate_operand"             "=v,v ,v ,m")
        (match_operand:XI 1 "nonimmediate_or_sse_const_operand" " C,BC,vm,v"))]
-  "TARGET_AVX512F
+  "TARGET_AVX512F && TARGET_EVEX512
    && (register_operand (operands[0], XImode)
        || register_operand (operands[1], XImode))"
 {
    (set_attr "mode" "OI")])
 
 (define_insn "*movti_internal"
-  [(set (match_operand:TI 0 "nonimmediate_operand" "=!r ,o ,v,v ,v ,m,?r,?Yd")
-       (match_operand:TI 1 "general_operand"      "riFo,re,C,BC,vm,v,Yd,r"))]
+  [(set (match_operand:TI 0 "nonimmediate_operand" "=!r ,o ,v,v ,v ,m,?jc,?Yd")
+       (match_operand:TI 1 "general_operand"      "riFo,re,C,BC,vm,v,Yd,jc"))]
   "(TARGET_64BIT
     && !(MEM_P (operands[0]) && MEM_P (operands[1])))
    || (TARGET_SSE
 
 (define_insn "*movdi_internal"
   [(set (match_operand:DI 0 "nonimmediate_operand"
-    "=r  ,o  ,r,r  ,r,m ,*y,*y,?*y,?m,?r,?*y,?v,?v,?v,m ,m,?r ,?*Yd,?r,?v,?*y,?*x,*k,*k  ,*r,*m,*k")
+    "=r  ,o  ,r,r  ,r,m ,*y,*y,?*y,?m,?r,?*y,?Yv,?v,?v,m ,m,?jc,?*Yd,?r,?v,?*y,?*x,*k,*k  ,*r,*m,*k")
        (match_operand:DI 1 "general_operand"
-    "riFo,riF,Z,rem,i,re,C ,*y,Bk ,*y,*y,r  ,C ,?v,Bk,?v,v,*Yd,r   ,?v,r  ,*x ,*y ,*r,*kBk,*k,*k,CBC"))]
+    "riFo,riF,Z,rem,i,re,C ,*y,Bk ,*y,*y,r  ,C  ,?v,Bk,?v,v,*Yd,jc  ,?v,r  ,*x ,*y ,*r,*kBk,*k,*k,CBC"))]
   "!(MEM_P (operands[0]) && MEM_P (operands[1]))
    && ix86_hardreg_mov_ok (operands[0], operands[1])"
 {
    (set (attr "mode")
      (cond [(eq_attr "alternative" "2")
              (const_string "SI")
-           (eq_attr "alternative" "12,13")
+           (eq_attr "alternative" "12")
              (cond [(match_test "TARGET_AVX")
                       (const_string "TI")
                     (ior (not (match_test "TARGET_SSE2"))
                       (const_string "V4SF")
                    ]
                    (const_string "TI"))
+           (eq_attr "alternative" "13")
+             (cond [(match_test "TARGET_AVX512VL")
+                      (const_string "TI")
+                    (match_test "TARGET_AVX512F")
+                      (const_string "DF")
+                    (match_test "TARGET_AVX")
+                      (const_string "TI")
+                    (ior (not (match_test "TARGET_SSE2"))
+                         (match_test "optimize_function_for_size_p (cfun)"))
+                      (const_string "V4SF")
+                   ]
+                   (const_string "TI"))
 
            (and (eq_attr "alternative" "14,15,16")
                 (not (match_test "TARGET_SSE2")))
 
 (define_insn "*movsi_internal"
   [(set (match_operand:SI 0 "nonimmediate_operand"
-    "=r,m ,*y,*y,?*y,?m,?r,?*y,?v,?v,?v,m ,?r,?v,*k,*k  ,*rm,*k")
+    "=r,m ,*y,*y,?*y,?m,?r,?*y,?Yv,?v,?v,m ,?r,?v,*k,*k  ,*rm,*k")
        (match_operand:SI 1 "general_operand"
-    "g ,re,C ,*y,Bk ,*y,*y,r  ,C ,?v,Bk,?v,?v,r  ,*r,*kBk,*k ,CBC"))]
+    "g ,re,C ,*y,Bk ,*y,*y,r  ,C  ,?v,Bk,?v,?v,r  ,*r,*kBk,*k ,CBC"))]
   "!(MEM_P (operands[0]) && MEM_P (operands[1]))
    && ix86_hardreg_mov_ok (operands[0], operands[1])"
 {
    (set (attr "mode")
      (cond [(eq_attr "alternative" "2,3")
              (const_string "DI")
-           (eq_attr "alternative" "8,9")
+           (eq_attr "alternative" "8")
              (cond [(match_test "TARGET_AVX")
                       (const_string "TI")
                     (ior (not (match_test "TARGET_SSE2"))
                       (const_string "V4SF")
                    ]
                    (const_string "TI"))
+           (eq_attr "alternative" "9")
+             (cond [(match_test "TARGET_AVX512VL")
+                      (const_string "TI")
+                    (match_test "TARGET_AVX512F")
+                      (const_string "SF")
+                    (match_test "TARGET_AVX")
+                      (const_string "TI")
+                    (ior (not (match_test "TARGET_SSE2"))
+                         (match_test "optimize_function_for_size_p (cfun)"))
+                      (const_string "V4SF")
+                   ]
+                   (const_string "TI"))
 
            (and (eq_attr "alternative" "10,11")
                 (not (match_test "TARGET_SSE2")))
 
 (define_insn "*movhi_internal"
   [(set (match_operand:HI 0 "nonimmediate_operand"
-    "=r,r,r,m ,*k,*k ,r ,m ,*k ,?r,?*v,*v,*v,*v,m")
+    "=r,r,r,m ,*k,*k ,r ,m ,*k ,?r,?*v,*Yv,*v,*v,jm,m")
        (match_operand:HI 1 "general_operand"
-    "r ,n,m,rn,r ,*km,*k,*k,CBC,*v,r  ,C ,*v,m ,*v"))]
+    "r ,n,m,rn,r ,*km,*k,*k,CBC,*v,r  ,C  ,*v,m ,*x,*v"))]
   "!(MEM_P (operands[0]) && MEM_P (operands[1]))
    && ix86_hardreg_mov_ok (operands[0], operands[1])"
 {
        (cond [(eq_attr "alternative" "9,10,11,12,13")
                  (const_string "sse2")
               (eq_attr "alternative" "14")
-                 (const_string "sse4")
+                 (const_string "sse4_noavx")
+              (eq_attr "alternative" "15")
+                 (const_string "avx")
               ]
               (const_string "*")))
+   (set (attr "addr")
+       (if_then_else (eq_attr "alternative" "14")
+                     (const_string "gpr16")
+                     (const_string "*")))
    (set (attr "type")
      (cond [(eq_attr "alternative" "4,5,6,7")
              (const_string "mskmov")
            (eq_attr "alternative" "8")
              (const_string "msklog")
-           (eq_attr "alternative" "13,14")
+           (eq_attr "alternative" "13,14,15")
              (if_then_else (match_test "TARGET_AVX512FP16")
                (const_string "ssemov")
                (const_string "sselog1"))
    (set (attr "prefix")
        (cond [(eq_attr "alternative" "4,5,6,7,8")
                 (const_string "vex")
-              (eq_attr "alternative" "9,10,11,12,13,14")
+              (eq_attr "alternative" "9,10,11,12,13,14,15")
                 (const_string "maybe_evex")
              ]
              (const_string "orig")))
              (if_then_else (match_test "TARGET_AVX512FP16")
                (const_string "HI")
                (const_string "SI"))
-           (eq_attr "alternative" "13,14")
+           (eq_attr "alternative" "13,14,15")
              (if_then_else (match_test "TARGET_AVX512FP16")
                (const_string "HI")
                (const_string "TI"))
                    ]
                    (const_string "TI"))
            (eq_attr "alternative" "12")
-             (cond [(match_test "TARGET_AVX512FP16")
+             (cond [(match_test "TARGET_AVX512VL")
+                      (const_string "TI")
+                    (match_test "TARGET_AVX512FP16")
                       (const_string "HF")
+                    (match_test "TARGET_AVX512F")
+                      (const_string "SF")
                     (match_test "TARGET_AVX")
                       (const_string "TI")
                     (ior (not (match_test "TARGET_SSE2"))
   [(parallel [(set (match_dup 1) (match_dup 2))
              (set (match_dup 2) (match_dup 1))])])
 
+;; Convert xchg with a REG_UNUSED note to a mov (variant #1).
+;; Matches a register-register exchange (two parallel sets swapping operands
+;; 0 and 1).  When operand 0 is dead after the exchange, only the copy of
+;; operand 0 into operand 1 is kept.
+;; The exchange is left alone only if one of the registers is AX_REG,
+;; optimize_size is at least 2 and the insn is being optimized for size
+;; (presumably because xchg with the accumulator is the shorter encoding
+;; -- TODO confirm).
+(define_peephole2
+  [(parallel [(set (match_operand:SWI 0 "general_reg_operand")
+                  (match_operand:SWI 1 "general_reg_operand"))
+             (set (match_dup 1) (match_dup 0))])]
+  "((REGNO (operands[0]) != AX_REG
+     && REGNO (operands[1]) != AX_REG)
+    || optimize_size < 2
+    || !optimize_insn_for_size_p ())
+   && peep2_reg_dead_p (1, operands[0])"
+  [(set (match_dup 1) (match_dup 0))])
+
+;; Convert xchg with a REG_UNUSED note to a mov (variant #2).
+;; Matches a register-register exchange (two parallel sets swapping operands
+;; 0 and 1).  When operand 1 is dead after the exchange, only the copy of
+;; operand 1 into operand 0 is kept.
+;; The exchange is left alone only if one of the registers is AX_REG,
+;; optimize_size is at least 2 and the insn is being optimized for size
+;; (presumably because xchg with the accumulator is the shorter encoding
+;; -- TODO confirm).
+(define_peephole2
+  [(parallel [(set (match_operand:SWI 0 "general_reg_operand")
+                  (match_operand:SWI 1 "general_reg_operand"))
+             (set (match_dup 1) (match_dup 0))])]
+  "((REGNO (operands[0]) != AX_REG
+     && REGNO (operands[1]) != AX_REG)
+    || optimize_size < 2
+    || !optimize_insn_for_size_p ())
+   && peep2_reg_dead_p (1, operands[1])"
+  [(set (match_dup 0) (match_dup 1))])
+
 ;; Convert moves to/from AX_REG into xchg with -Oz.
 (define_peephole2
   [(set (match_operand:SWI48 0 "general_reg_operand")
 (define_expand "extv<mode>"
   [(set (match_operand:SWI24 0 "register_operand")
        (sign_extract:SWI24 (match_operand:SWI24 1 "register_operand")
-                           (match_operand:SI 2 "const_int_operand")
-                           (match_operand:SI 3 "const_int_operand")))]
+                           (match_operand:QI 2 "const_int_operand")
+                           (match_operand:QI 3 "const_int_operand")))]
   ""
 {
   /* Handle extractions from %ah et al.  */
   [(set_attr "type" "imovx")
    (set_attr "mode" "SI")])
 
+;; Split sign-extension of single least significant bit as and x,$1;neg x
+;; The pattern matches a sign_extract of width 1 at bit position 0 in SWI48
+;; modes, with the source tied to the destination register ("0" constraint)
+;; and FLAGS_REG clobbered.  It is never output directly (template "#"); the
+;; split condition is empty, and the replacement is two flag-clobbering
+;; insns: the value ANDed with 1, then the negation of that result.
+(define_insn_and_split "*extv<mode>_1_0"
+  [(set (match_operand:SWI48 0 "register_operand" "=r")
+       (sign_extract:SWI48 (match_operand:SWI48 1 "register_operand" "0")
+                           (const_int 1)
+                           (const_int 0)))
+   (clobber (reg:CC FLAGS_REG))]
+  ""
+  "#"
+  ""
+  [(parallel [(set (match_dup 0) (and:SWI48 (match_dup 1) (const_int 1)))
+             (clobber (reg:CC FLAGS_REG))])
+   (parallel [(set (match_dup 0) (neg:SWI48 (match_dup 0)))
+             (clobber (reg:CC FLAGS_REG))])])
+
 (define_expand "extzv<mode>"
   [(set (match_operand:SWI248 0 "register_operand")
        (zero_extract:SWI248 (match_operand:SWI248 1 "register_operand")
-                            (match_operand:SI 2 "const_int_operand")
-                            (match_operand:SI 3 "const_int_operand")))]
+                            (match_operand:QI 2 "const_int_operand")
+                            (match_operand:QI 3 "const_int_operand")))]
   ""
 {
   if (ix86_expand_pextr (operands))
     operands[1] = copy_to_reg (operands[1]);
 })
 
-(define_insn "*extzvqi_mem_rex64"
-  [(set (match_operand:QI 0 "norex_memory_operand" "=Bn")
-       (subreg:QI
-         (zero_extract:SWI248
-           (match_operand 1 "int248_register_operand" "Q")
-           (const_int 8)
-           (const_int 8)) 0))]
-  "TARGET_64BIT && reload_completed"
-  "mov{b}\t{%h1, %0|%0, %h1}"
-  [(set_attr "type" "imov")
-   (set_attr "mode" "QI")])
-
 (define_insn "*extzv<mode>"
   [(set (match_operand:SWI248 0 "register_operand" "=R")
        (zero_extract:SWI248 (match_operand 1 "int248_register_operand" "Q")
    (set_attr "mode" "SI")])
 
 (define_insn "*extzvqi"
-  [(set (match_operand:QI 0 "nonimmediate_operand" "=QBc,?R,m")
+  [(set (match_operand:QI 0 "nonimmediate_operand" "=QBn,?R")
        (subreg:QI
-         (zero_extract:SWI248
-           (match_operand 1 "int248_register_operand" "Q,Q,Q")
-           (const_int 8)
-           (const_int 8)) 0))]
+         (match_operator:SWI248 2 "extract_operator"
+           [(match_operand 1 "int248_register_operand" "Q,Q")
+            (const_int 8)
+            (const_int 8)]) 0))]
   ""
 {
   switch (get_attr_type (insn))
       return "mov{b}\t{%h1, %0|%0, %h1}";
     }
 }
-  [(set_attr "isa" "*,*,nox64")
+  [(set_attr "addr" "gpr8,*")
    (set (attr "type")
      (if_then_else (and (match_operand:QI 0 "register_operand")
                        (ior (not (match_operand:QI 0 "QIreg_operand"))
        (const_string "SI")
        (const_string "QI")))])
 
-(define_peephole2
-  [(set (match_operand:QI 0 "register_operand")
-       (subreg:QI
-         (zero_extract:SWI248 (match_operand 1 "int248_register_operand")
-                              (const_int 8)
-                              (const_int 8)) 0))
-   (set (match_operand:QI 2 "norex_memory_operand") (match_dup 0))]
-  "TARGET_64BIT
-   && peep2_reg_dead_p (2, operands[0])"
-  [(set (match_dup 2)
-       (subreg:QI
-         (zero_extract:SWI248 (match_dup 1)
-                              (const_int 8)
-                              (const_int 8)) 0))])
-
 (define_expand "insv<mode>"
   [(set (zero_extract:SWI248 (match_operand:SWI248 0 "register_operand")
-                            (match_operand:SI 1 "const_int_operand")
-                            (match_operand:SI 2 "const_int_operand"))
+                            (match_operand:QI 1 "const_int_operand")
+                            (match_operand:QI 2 "const_int_operand"))
         (match_operand:SWI248 3 "register_operand"))]
   ""
 {
   DONE;
 })
 
-(define_insn "*insvqi_1_mem_rex64"
-  [(set (zero_extract:SWI248
-         (match_operand 0 "int248_register_operand" "+Q")
-         (const_int 8)
-         (const_int 8))
-       (subreg:SWI248
-         (match_operand:QI 1 "norex_memory_operand" "Bn") 0))]
-  "TARGET_64BIT && reload_completed"
-  "mov{b}\t{%1, %h0|%h0, %1}"
-  [(set_attr "type" "imov")
-   (set_attr "mode" "QI")])
-
 (define_insn "@insv<mode>_1"
   [(set (zero_extract:SWI248
-         (match_operand 0 "int248_register_operand" "+Q,Q")
+         (match_operand 0 "int248_register_operand" "+Q")
          (const_int 8)
          (const_int 8))
-       (match_operand:SWI248 1 "general_operand" "QnBc,m"))]
+       (match_operand:SWI248 1 "general_operand" "QnBn"))]
   ""
 {
   if (CONST_INT_P (operands[1]))
     operands[1] = gen_int_mode (INTVAL (operands[1]), QImode);
   return "mov{b}\t{%b1, %h0|%h0, %b1}";
 }
-  [(set_attr "isa" "*,nox64")
+  [(set_attr "addr" "gpr8")
    (set_attr "type" "imov")
    (set_attr "mode" "QI")])
 
 (define_insn "*insvqi_1"
   [(set (zero_extract:SWI248
-         (match_operand 0 "int248_register_operand" "+Q,Q")
+         (match_operand 0 "int248_register_operand" "+Q")
          (const_int 8)
          (const_int 8))
        (subreg:SWI248
-         (match_operand:QI 1 "general_operand" "QnBc,m") 0))]
+         (match_operand:QI 1 "general_operand" "QnBn") 0))]
   ""
   "mov{b}\t{%1, %h0|%h0, %1}"
-  [(set_attr "isa" "*,nox64")
+  [(set_attr "addr" "gpr8")
    (set_attr "type" "imov")
    (set_attr "mode" "QI")])
 
-(define_peephole2
-  [(set (match_operand:QI 0 "register_operand")
-       (match_operand:QI 1 "norex_memory_operand"))
-   (set (zero_extract:SWI248 (match_operand 2 "int248_register_operand")
-                            (const_int 8)
-                            (const_int 8))
-       (subreg:SWI248 (match_dup 0) 0))]
-  "TARGET_64BIT
-   && peep2_reg_dead_p (2, operands[0])"
-  [(set (zero_extract:SWI248 (match_dup 2)
-                            (const_int 8)
-                            (const_int 8))
-          (subreg:SWI248 (match_dup 1) 0))])
-
 ;; Eliminate redundant insv, e.g. xorl %eax,%eax; movb $0, %ah
 (define_peephole2
   [(parallel [(set (match_operand:SWI48 0 "general_reg_operand")
          (match_operand 0 "int248_register_operand" "+Q")
          (const_int 8)
          (const_int 8))
-       (any_extract:SWI248
-         (match_operand 1 "int248_register_operand" "Q")
-         (const_int 8)
-         (const_int 8)))]
+       (match_operator:SWI248 2 "extract_operator"
+         [(match_operand 1 "int248_register_operand" "Q")
+          (const_int 8)
+          (const_int 8)]))]
   ""
   "mov{b}\t{%h1, %h0|%h0, %h1}"
   [(set_attr "type" "imov")
   "mov{b}\t{%h1, %h0|%h0, %h1}"
   [(set_attr "type" "imov")
    (set_attr "mode" "QI")])
+
+(define_code_iterator any_or_plus [plus ior xor])
+
+(define_insn_and_split "*insvti_highpart_1"
+  [(set (match_operand:TI 0 "nonimmediate_operand" "=ro,r,r,&r")
+       (any_or_plus:TI
+         (and:TI
+           (match_operand:TI 1 "nonimmediate_operand" "r,m,r,m")
+           (match_operand:TI 3 "const_scalar_int_operand" "n,n,n,n"))
+         (ashift:TI
+           (zero_extend:TI
+             (match_operand:DI 2 "nonimmediate_operand" "r,r,m,m"))
+           (const_int 64))))]
+  "TARGET_64BIT
+   && CONST_WIDE_INT_P (operands[3])
+   && CONST_WIDE_INT_NUNITS (operands[3]) == 2
+   && CONST_WIDE_INT_ELT (operands[3], 0) == -1
+   && CONST_WIDE_INT_ELT (operands[3], 1) == 0"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  operands[4] = gen_lowpart (DImode, operands[1]);
+  split_double_concat (TImode, operands[0], operands[4], operands[2]);
+  DONE;
+})
+
+(define_insn_and_split "*insvti_lowpart_1"
+  [(set (match_operand:TI 0 "nonimmediate_operand" "=ro,r,r,&r")
+       (any_or_plus:TI
+         (and:TI
+           (match_operand:TI 1 "nonimmediate_operand" "r,m,r,m")
+           (match_operand:TI 3 "const_scalar_int_operand" "n,n,n,n"))
+         (zero_extend:TI
+           (match_operand:DI 2 "nonimmediate_operand" "r,r,m,m"))))]
+  "TARGET_64BIT
+   && CONST_WIDE_INT_P (operands[3])
+   && CONST_WIDE_INT_NUNITS (operands[3]) == 2
+   && CONST_WIDE_INT_ELT (operands[3], 0) == 0
+   && CONST_WIDE_INT_ELT (operands[3], 1) == -1"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  operands[4] = gen_highpart (DImode, operands[1]);
+  split_double_concat (TImode, operands[0], operands[2], operands[4]);
+  DONE;
+})
+
+(define_insn_and_split "*insvdi_lowpart_1"
+  [(set (match_operand:DI 0 "nonimmediate_operand" "=ro,r,r,&r")
+       (any_or_plus:DI
+         (and:DI
+           (match_operand:DI 1 "nonimmediate_operand" "r,m,r,m")
+           (match_operand:DI 3 "const_int_operand" "n,n,n,n"))
+         (zero_extend:DI
+           (match_operand:SI 2 "nonimmediate_operand" "r,r,m,m"))))]
+  "!TARGET_64BIT
+   && CONST_INT_P (operands[3])
+   && UINTVAL (operands[3]) == 0xffffffff00000000ll"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  operands[4] = gen_highpart (SImode, operands[1]);
+  split_double_concat (DImode, operands[0], operands[2], operands[4]);
+  DONE;
+})
 \f
 ;; Floating point push instructions.
 
    (set_attr "type" "push,multi")
    (set_attr "mode" "SI,TI")])
 
+(define_insn "push2_di"
+  [(set (match_operand:TI 0 "push_operand" "=<")
+       (unspec:TI [(match_operand:DI 1 "register_operand" "r")
+                   (match_operand:DI 2 "register_operand" "r")]
+                   UNSPEC_APXPUSH2))]
+  "TARGET_APX_PUSH2POP2"
+  "push2\t%1, %2"
+  [(set_attr "mode" "TI")
+   (set_attr "type" "multi")
+   (set_attr "prefix" "evex")])
+
+(define_insn "pop2_di"
+  [(parallel [(set (match_operand:DI 0 "register_operand" "=r")
+                  (unspec:DI [(match_operand:TI 1 "pop_operand" ">")]
+                             UNSPEC_APXPOP2_LOW))
+             (set (match_operand:DI 2 "register_operand" "=r")
+                  (unspec:DI [(const_int 0)] UNSPEC_APXPOP2_HIGH))])]
+  "TARGET_APX_PUSH2POP2"
+  "pop2\t%0, %2"
+  [(set_attr "mode" "TI")
+   (set_attr "prefix" "evex")])
+
 (define_insn "*pushsf_rex64"
   [(set (match_operand:SF 0 "push_operand" "=X,X,X")
        (match_operand:SF 1 "nonmemory_no_elim_operand" "f,rF,v"))]
 ;; Possible store forwarding (partial memory) stall in alternatives 4, 6 and 7.
 (define_insn "*movdf_internal"
   [(set (match_operand:DF 0 "nonimmediate_operand"
-    "=Yf*f,m   ,Yf*f,?r ,!o,?*r ,!o,!o,?r,?m,?r,?r,v,v,v,m,*x,*x,*x,m ,r ,v,r  ,o ,r  ,m")
+    "=Yf*f,m   ,Yf*f,?r ,!o,?*r ,!o,!o,?r,?m,?r,?r,Yv,v,v,m,*x,*x,*x,m ,?r,?v,r  ,o ,r  ,m")
        (match_operand:DF 1 "general_operand"
-    "Yf*fm,Yf*f,G   ,roF,r ,*roF,*r,F ,rm,rC,C ,F ,C,v,m,v,C ,*x,m ,*x,v,r ,roF,rF,rmF,rC"))]
+    "Yf*fm,Yf*f,G   ,roF,r ,*roF,*r,F ,rm,rC,C ,F ,C ,v,m,v,C ,*x,m ,*x, v, r,roF,rF,rmF,rC"))]
   "!(MEM_P (operands[0]) && MEM_P (operands[1]))
    && (lra_in_progress || reload_completed
        || !CONST_DOUBLE_P (operands[1])
 
               /* movaps is one byte shorter for non-AVX targets.  */
               (eq_attr "alternative" "13,17")
-                (cond [(match_test "TARGET_AVX")
+                (cond [(match_test "TARGET_AVX512VL")
+                         (const_string "V2DF")
+                       (match_test "TARGET_AVX512F")
                          (const_string "DF")
+                       (match_test "TARGET_AVX")
+                         (const_string "V2DF")
                        (ior (not (match_test "TARGET_SSE2"))
                             (match_test "optimize_function_for_size_p (cfun)"))
                          (const_string "V4SF")
 
 (define_insn "*movsf_internal"
   [(set (match_operand:SF 0 "nonimmediate_operand"
-         "=Yf*f,m   ,Yf*f,?r ,?m,v,v,v,m,?r,?v,!*y,!*y,!m,!r,!*y,r  ,m")
+         "=Yf*f,m   ,Yf*f,?r ,?m,Yv,v,v,m,?r,?v,!*y,!*y,!m,!r,!*y,r  ,m")
        (match_operand:SF 1 "general_operand"
-         "Yf*fm,Yf*f,G   ,rmF,rF,C,v,m,v,v ,r ,*y ,m  ,*y,*y,r  ,rmF,rF"))]
+         "Yf*fm,Yf*f,G   ,rmF,rF,C ,v,m,v,v ,r ,*y ,m  ,*y,*y,r  ,rmF,rF"))]
   "!(MEM_P (operands[0]) && MEM_P (operands[1]))
    && (lra_in_progress || reload_completed
        || !CONST_DOUBLE_P (operands[1])
               (eq_attr "alternative" "11")
                 (const_string "DI")
               (eq_attr "alternative" "5")
-                (cond [(and (match_test "TARGET_AVX512F")
+                (cond [(and (match_test "TARGET_AVX512F && TARGET_EVEX512")
                             (not (match_test "TARGET_PREFER_AVX256")))
                          (const_string "V16SF")
                        (match_test "TARGET_AVX")
                  better to maintain the whole registers in single format
                  to avoid problems on using packed logical operations.  */
               (eq_attr "alternative" "6")
-                (cond [(ior (match_test "TARGET_SSE_PARTIAL_REG_DEPENDENCY")
+                (cond [(match_test "TARGET_AVX512VL")
+                         (const_string "V4SF")
+                       (match_test "TARGET_AVX512F")
+                         (const_string "SF")
+                       (ior (match_test "TARGET_SSE_PARTIAL_REG_DEPENDENCY")
                             (match_test "TARGET_SSE_SPLIT_REGS"))
                          (const_string "V4SF")
                       ]
 
 (define_insn "*mov<mode>_internal"
  [(set (match_operand:HFBF 0 "nonimmediate_operand"
-        "=?r,?r,?r,?m,v,v,?r,m,?v,v")
+        "=?r,?r,?r,?m           ,Yv,v,?r,jm,m,?v,v")
        (match_operand:HFBF 1 "general_operand"
-        "r  ,F ,m ,r<hfbfconstf>,C,v, v,v,r ,m"))]
+        "r  ,F ,m ,r<hfbfconstf>,C ,v, v,v ,v,r ,m"))]
  "!(MEM_P (operands[0]) && MEM_P (operands[1]))
   && (lra_in_progress
       || reload_completed
     }
 }
   [(set (attr "isa")
-       (cond [(eq_attr "alternative" "4,5,6,8,9")
+       (cond [(eq_attr "alternative" "4,5,6,9,10")
                 (const_string "sse2")
               (eq_attr "alternative" "7")
-                (const_string "sse4")
+                (const_string "sse4_noavx")
+              (eq_attr "alternative" "8")
+                (const_string "avx")
              ]
              (const_string "*")))
+   (set (attr "addr")
+       (if_then_else (eq_attr "alternative" "7")
+                     (const_string "gpr16")
+                     (const_string "*")))
    (set (attr "type")
        (cond [(eq_attr "alternative" "4")
                 (const_string "sselog1")
-              (eq_attr "alternative" "5,6,8")
+              (eq_attr "alternative" "5,6,9")
                 (const_string "ssemov")
-              (eq_attr "alternative" "7,9")
+              (eq_attr "alternative" "7,8,10")
                 (if_then_else
                   (match_test ("TARGET_AVX512FP16"))
                   (const_string "ssemov")
                 ]
              (const_string "imov")))
    (set (attr "prefix")
-       (cond [(eq_attr "alternative" "4,5,6,7,8,9")
+       (cond [(eq_attr "alternative" "4,5,6,7,8,9,10")
                 (const_string "maybe_vex")
              ]
              (const_string "orig")))
    (set (attr "mode")
        (cond [(eq_attr "alternative" "4")
                 (const_string "V4SF")
-              (eq_attr "alternative" "6,8")
+              (eq_attr "alternative" "6,9")
                 (if_then_else
                   (match_test "TARGET_AVX512FP16")
                   (const_string "HI")
                   (const_string "SI"))
-              (eq_attr "alternative" "7,9")
+              (eq_attr "alternative" "7,8,10")
                 (if_then_else
                   (match_test "TARGET_AVX512FP16")
                   (const_string "HI")
                   (const_string "TI"))
               (eq_attr "alternative" "5")
-                (cond [(match_test "TARGET_AVX512FP16")
+                (cond [(match_test "TARGET_AVX512VL")
+                       (const_string "V4SF")
+                       (match_test "TARGET_AVX512FP16")
                          (const_string "HF")
+                       (match_test "TARGET_AVX512F")
+                         (const_string "SF")
+                       (match_test "TARGET_AVX")
+                         (const_string "V4SF")
                        (ior (match_test "TARGET_SSE_PARTIAL_REG_DEPENDENCY")
                             (match_test "TARGET_SSE_SPLIT_REGS"))
                          (const_string "V4SF")
            (eq_attr "alternative" "12")
              (const_string "x64_avx512bw")
            (eq_attr "alternative" "13")
-             (const_string "avx512bw")
+             (const_string "avx512bw_512")
           ]
           (const_string "*")))
    (set (attr "mmx_isa")
   "split_double_mode (DImode, &operands[0], 1, &operands[3], &operands[4]);")
 
 (define_mode_attr kmov_isa
-  [(QI "avx512dq") (HI "avx512f") (SI "avx512bw") (DI "avx512bw")])
+  [(QI "avx512dq") (HI "avx512f") (SI "avx512bw") (DI "avx512bw_512")])
 
 (define_insn "zero_extend<mode>di2"
   [(set (match_operand:DI 0 "register_operand" "=r,*r,*k")
   [(set (match_operand:SWI24 0 "register_operand" "=R")
        (sign_extend:SWI24
          (subreg:QI
-           (zero_extract:SWI248
-             (match_operand 1 "int248_register_operand" "Q")
-             (const_int 8)
-             (const_int 8)) 0)))]
+           (match_operator:SWI248 2 "extract_operator"
+             [(match_operand 1 "int248_register_operand" "Q")
+              (const_int 8)
+              (const_int 8)]) 0)))]
   ""
   "movs{b<SWI24:imodesuffix>|x}\t{%h1, %0|%0, %h1}"
    [(set_attr "type" "imovx")
    && optimize_insn_for_speed_p ()
    && reload_completed
    && (!EXT_REX_SSE_REG_P (operands[0])
-       || TARGET_AVX512VL)"
+       || TARGET_AVX512VL || TARGET_EVEX512)"
    [(set (match_dup 2)
         (float_extend:V2DF
           (vec_select:V2SF
 ;; Don't use float_extend since psrlld doesn't raise
 ;; exceptions and turn a sNaN into a qNaN.
 (define_insn "extendbfsf2_1"
-  [(set (match_operand:SF 0 "register_operand"   "=x,Yw")
+  [(set (match_operand:SF 0 "register_operand"   "=x,Yv,v")
        (unspec:SF
-         [(match_operand:BF 1 "register_operand" " 0,Yw")]
+         [(match_operand:BF 1 "register_operand" " 0,Yv,v")]
          UNSPEC_CVTBFSF))]
  "TARGET_SSE2"
  "@
   pslld\t{$16, %0|%0, 16}
-  vpslld\t{$16, %1, %0|%0, %1, 16}"
-  [(set_attr "isa" "noavx,avx")
+  vpslld\t{$16, %1, %0|%0, %1, 16}
+  vpslld\t{$16, %g1, %g0|%g0, %g1, 16}"
+  [(set_attr "isa" "noavx,avx,*")
    (set_attr "type" "sseishft1")
    (set_attr "length_immediate" "1")
-   (set_attr "prefix_data16" "1,*")
-   (set_attr "prefix" "orig,vex")
-   (set_attr "mode" "TI")
-   (set_attr "memory" "none")])
+   (set_attr "prefix_data16" "1,*,*")
+   (set_attr "prefix" "orig,maybe_evex,evex")
+   (set_attr "mode" "TI,TI,XI")
+   (set_attr "memory" "none")
+   (set (attr "enabled")
+     (if_then_else (eq_attr "alternative" "2")
+       (symbol_ref "TARGET_AVX512F && TARGET_EVEX512
+                   && !TARGET_AVX512VL && !TARGET_PREFER_AVX256")
+       (const_string "*")))])
 
 (define_expand "extend<mode>xf2"
   [(set (match_operand:XF 0 "nonimmediate_operand")
       gcc_assert (TARGET_64BIT);
       return "lea{l}\t{%E1, %k0|%k0, %E1}";
     }
-  else 
+  else
     return "lea{<imodesuffix>}\t{%E1, %0|%0, %E1}";
 }
   [(set_attr "type" "lea")
              (clobber (reg:CC FLAGS_REG))])]
  "split_double_mode (<DWI>mode, &operands[0], 2, &operands[0], &operands[3]);")
 
-(define_insn "*add<mode>_1"
-  [(set (match_operand:SWI48 0 "nonimmediate_operand" "=rm,r,r,r")
-       (plus:SWI48
-         (match_operand:SWI48 1 "nonimmediate_operand" "%0,0,r,r")
-         (match_operand:SWI48 2 "x86_64_general_operand" "re,BM,0,le")))
+(define_insn_and_split "*add<dwi>3_doubleword_concat"
+  [(set (match_operand:<DWI> 0 "register_operand" "=&r")
+       (plus:<DWI>
+         (any_or_plus:<DWI>
+           (ashift:<DWI>
+             (zero_extend:<DWI>
+               (match_operand:DWIH 2 "nonimmediate_operand" "rm"))
+             (match_operand:QI 3 "const_int_operand"))
+           (zero_extend:<DWI>
+             (match_operand:DWIH 4 "nonimmediate_operand" "rm")))
+         (match_operand:<DWI> 1 "register_operand" "0")))
    (clobber (reg:CC FLAGS_REG))]
-  "ix86_binary_operator_ok (PLUS, <MODE>mode, operands)"
-{
-  switch (get_attr_type (insn))
-    {
-    case TYPE_LEA:
-      return "#";
+  "INTVAL (operands[3]) == <MODE_SIZE> * BITS_PER_UNIT"
+  "#"
+  "&& reload_completed"
+  [(parallel [(set (reg:CCC FLAGS_REG)
+                  (compare:CCC
+                    (plus:DWIH (match_dup 1) (match_dup 4))
+                    (match_dup 1)))
+             (set (match_dup 0)
+                  (plus:DWIH (match_dup 1) (match_dup 4)))])
+   (parallel [(set (match_dup 5)
+                  (plus:DWIH
+                    (plus:DWIH
+                      (ltu:DWIH (reg:CC FLAGS_REG) (const_int 0))
+                      (match_dup 6))
+                    (match_dup 2)))
+             (clobber (reg:CC FLAGS_REG))])]
+ "split_double_mode (<DWI>mode, &operands[0], 2, &operands[0], &operands[5]);")
+
+(define_insn_and_split "*add<dwi>3_doubleword_concat_zext"
+  [(set (match_operand:<DWI> 0 "register_operand" "=&r")
+       (plus:<DWI>
+         (any_or_plus:<DWI>
+           (ashift:<DWI>
+             (zero_extend:<DWI>
+               (match_operand:DWIH 2 "nonimmediate_operand" "rm"))
+             (match_operand:QI 3 "const_int_operand"))
+           (zero_extend:<DWI>
+             (match_operand:DWIH 4 "nonimmediate_operand" "rm")))
+         (zero_extend:<DWI>
+           (match_operand:DWIH 1 "nonimmediate_operand" "rm"))))
+   (clobber (reg:CC FLAGS_REG))]
+  "INTVAL (operands[3]) == <MODE_SIZE> * BITS_PER_UNIT"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 0) (match_dup 4))
+   (set (match_dup 5) (match_dup 2))
+   (parallel [(set (reg:CCC FLAGS_REG)
+                  (compare:CCC
+                    (plus:DWIH (match_dup 0) (match_dup 1))
+                    (match_dup 0)))
+             (set (match_dup 0)
+                  (plus:DWIH (match_dup 0) (match_dup 1)))])
+   (parallel [(set (match_dup 5)
+                  (plus:DWIH
+                    (plus:DWIH
+                      (ltu:DWIH (reg:CC FLAGS_REG) (const_int 0))
+                      (match_dup 5))
+                    (const_int 0)))
+             (clobber (reg:CC FLAGS_REG))])]
+ "split_double_mode (<DWI>mode, &operands[0], 1, &operands[0], &operands[5]);")
+
+(define_insn "*add<mode>_1"
+  [(set (match_operand:SWI48 0 "nonimmediate_operand" "=rm,r,r,r")
+       (plus:SWI48
+         (match_operand:SWI48 1 "nonimmediate_operand" "%0,0,r,r")
+         (match_operand:SWI48 2 "x86_64_general_operand" "re,BM,0,le")))
+   (clobber (reg:CC FLAGS_REG))]
+  "ix86_binary_operator_ok (PLUS, <MODE>mode, operands)"
+{
+  switch (get_attr_type (insn))
+    {
+    case TYPE_LEA:
+      return "#";
 
     case TYPE_INCDEC:
       gcc_assert (rtx_equal_p (operands[0], operands[1]));
       return "add{<imodesuffix>}\t{%2, %0|%0, %2}";
     }
 }
-  "&& reload_completed"
+  "&& reload_completed
+   && !(rtx_equal_p (operands[0], operands[1])
+       || rtx_equal_p (operands[0], operands[2]))"
   [(set (strict_low_part (match_dup 0)) (match_dup 1))
    (parallel
      [(set (strict_low_part (match_dup 0))
        (const_string "alu")))
    (set_attr "mode" "<MODE>")])
 
+;; Alternative 1 is needed to work around LRA limitation, see PR82524.
+(define_insn_and_split "*addqi_ext<mode>_1_slp"
+  [(set (strict_low_part (match_operand:QI 0 "register_operand" "+Q,&Q"))
+       (plus:QI
+         (subreg:QI
+           (match_operator:SWI248 3 "extract_operator"
+             [(match_operand 2 "int248_register_operand" "Q,Q")
+              (const_int 8)
+              (const_int 8)]) 0)
+         (match_operand:QI 1 "nonimmediate_operand" "0,!qm")))
+   (clobber (reg:CC FLAGS_REG))]
+  "!TARGET_PARTIAL_REG_STALL || optimize_function_for_size_p (cfun)"
+  "@
+   add{b}\t{%h2, %0|%0, %h2}
+   #"
+  "&& reload_completed
+   && !rtx_equal_p (operands[0], operands[1])"
+  [(set (strict_low_part (match_dup 0)) (match_dup 1))
+   (parallel
+     [(set (strict_low_part (match_dup 0))
+          (plus:QI
+            (subreg:QI
+              (match_op_dup 3
+                [(match_dup 2) (const_int 8) (const_int 8)]) 0)
+          (match_dup 0)))
+      (clobber (reg:CC FLAGS_REG))])]
+  ""
+  [(set_attr "type" "alu")
+   (set_attr "mode" "QI")])
+
 ;; Split non destructive adds if we cannot use lea.
 (define_split
   [(set (match_operand:SWI48 0 "register_operand")
    (set_attr "mode" "<MODE>")])
 
 (define_insn "*addqi_ext<mode>_0"
-  [(set (match_operand:QI 0 "nonimm_x64constmem_operand" "=QBc,m")
+  [(set (match_operand:QI 0 "nonimmediate_operand" "=QBn")
        (plus:QI
          (subreg:QI
-           (zero_extract:SWI248
-             (match_operand 2 "int248_register_operand" "Q,Q")
-             (const_int 8)
-             (const_int 8)) 0)
-         (match_operand:QI 1 "nonimm_x64constmem_operand" "0,0")))
+           (match_operator:SWI248 3 "extract_operator"
+             [(match_operand 2 "int248_register_operand" "Q")
+              (const_int 8)
+              (const_int 8)]) 0)
+         (match_operand:QI 1 "nonimmediate_operand" "0")))
    (clobber (reg:CC FLAGS_REG))]
   ""
   "add{b}\t{%h2, %0|%0, %h2}"
-  [(set_attr "isa" "*,nox64")
+  [(set_attr "addr" "gpr8")
    (set_attr "type" "alu")
    (set_attr "mode" "QI")])
 
               (match_operand:QI 2 "const_int_operand")) 0))
       (clobber (reg:CC FLAGS_REG))])])
 
-(define_insn "*addqi_ext<mode>_1"
+;; Alternative 1 is needed to work around LRA limitation, see PR82524.
+(define_insn_and_split "*addqi_ext<mode>_1"
   [(set (zero_extract:SWI248
-         (match_operand 0 "int248_register_operand" "+Q,Q")
+         (match_operand 0 "int248_register_operand" "+Q,&Q")
          (const_int 8)
          (const_int 8))
        (subreg:SWI248
          (plus:QI
            (subreg:QI
-             (zero_extract:SWI248
-               (match_operand 1 "int248_register_operand" "0,0")
-               (const_int 8)
-               (const_int 8)) 0)
-           (match_operand:QI 2 "general_x64constmem_operand" "QnBc,m")) 0))
+             (match_operator:SWI248 3 "extract_operator"
+               [(match_operand 1 "int248_register_operand" "0,!Q")
+                (const_int 8)
+                (const_int 8)]) 0)
+           (match_operand:QI 2 "general_operand" "QnBn,QnBn")) 0))
    (clobber (reg:CC FLAGS_REG))]
-  "/* FIXME: without this LRA can't reload this pattern, see PR82524.  */
-   rtx_equal_p (operands[0], operands[1])"
+  ""
 {
+  if (which_alternative)
+    return "#";
+
   switch (get_attr_type (insn))
     {
     case TYPE_INCDEC:
       if (operands[2] == const1_rtx)
        return "inc{b}\t%h0";
       else
-        {
+       {
          gcc_assert (operands[2] == constm1_rtx);
-          return "dec{b}\t%h0";
-        }
+         return "dec{b}\t%h0";
+       }
 
     default:
       return "add{b}\t{%2, %h0|%h0, %2}";
     }
 }
-  [(set_attr "isa" "*,nox64")
+  "reload_completed
+   && !rtx_equal_p (operands[0], operands[1])"
+  [(set (zero_extract:SWI248
+         (match_dup 0) (const_int 8) (const_int 8))
+       (zero_extract:SWI248
+         (match_dup 1) (const_int 8) (const_int 8)))
+   (parallel
+     [(set (zero_extract:SWI248
+            (match_dup 0) (const_int 8) (const_int 8))
+          (subreg:SWI248
+            (plus:QI
+              (subreg:QI
+                (match_op_dup 3
+                  [(match_dup 0) (const_int 8) (const_int 8)]) 0)
+              (match_dup 2)) 0))
+      (clobber (reg:CC FLAGS_REG))])]
+  ""
+  [(set_attr "addr" "gpr8")
    (set (attr "type")
      (if_then_else (match_operand:QI 2 "incdec_operand")
        (const_string "incdec")
        (const_string "alu")))
    (set_attr "mode" "QI")])
 
-(define_insn "*addqi_ext<mode>_2"
+;; Alternative 1 is needed to work around LRA limitation, see PR82524.
+(define_insn_and_split "*<insn>qi_ext<mode>_2"
   [(set (zero_extract:SWI248
-         (match_operand 0 "int248_register_operand" "+Q")
+         (match_operand 0 "int248_register_operand" "+Q,&Q")
          (const_int 8)
          (const_int 8))
        (subreg:SWI248
-         (plus:QI
+         (plusminus:QI
            (subreg:QI
-             (zero_extract:SWI248
-               (match_operand 1 "int248_register_operand" "%0")
-               (const_int 8)
-               (const_int 8)) 0)
+             (match_operator:SWI248 3 "extract_operator"
+               [(match_operand 1 "int248_register_operand" "<comm>0,!Q")
+                (const_int 8)
+                (const_int 8)]) 0)
            (subreg:QI
-             (zero_extract:SWI248
-               (match_operand 2 "int248_register_operand" "Q")
-               (const_int 8)
-               (const_int 8)) 0)) 0))
-  (clobber (reg:CC FLAGS_REG))]
-  "/* FIXME: without this LRA can't reload this pattern, see PR82524.  */
-   rtx_equal_p (operands[0], operands[1])
-   || rtx_equal_p (operands[0], operands[2])"
-  "add{b}\t{%h2, %h0|%h0, %h2}"
+             (match_operator:SWI248 4 "extract_operator"
+               [(match_operand 2 "int248_register_operand" "Q,Q")
+                (const_int 8)
+                (const_int 8)]) 0)) 0))
+   (clobber (reg:CC FLAGS_REG))]
+  ""
+  "@
+   <insn>{b}\t{%h2, %h0|%h0, %h2}
+   #"
+  "reload_completed
+   && !(rtx_equal_p (operands[0], operands[1])
+       || (<CODE> == PLUS && rtx_equal_p (operands[0], operands[2])))"
+  [(set (zero_extract:SWI248
+         (match_dup 0) (const_int 8) (const_int 8))
+       (zero_extract:SWI248
+         (match_dup 1) (const_int 8) (const_int 8)))
+   (parallel
+     [(set (zero_extract:SWI248
+            (match_dup 0) (const_int 8) (const_int 8))
+          (subreg:SWI248
+            (plusminus:QI
+              (subreg:QI
+                (match_op_dup 3
+                  [(match_dup 0) (const_int 8) (const_int 8)]) 0)
+              (subreg:QI
+                (match_op_dup 4
+                  [(match_dup 2) (const_int 8) (const_int 8)]) 0)) 0))
+      (clobber (reg:CC FLAGS_REG))])]
+  ""
   [(set_attr "type" "alu")
    (set_attr "mode" "QI")])
 
 (define_insn_and_split "*lea<mode>_general_1"
   [(set (match_operand:SWI12 0 "register_operand" "=r")
        (plus:SWI12
-         (plus:SWI12 (match_operand:SWI12 1 "index_register_operand" "l")
+         (plus:SWI12 (match_operand:SWI12 1 "register_no_SP_operand" "l")
                      (match_operand:SWI12 2 "register_operand" "r"))
          (match_operand:SWI12 3 "immediate_operand" "i")))]
   "!TARGET_PARTIAL_REG_STALL || optimize_function_for_size_p (cfun)"
 (define_insn_and_split "*lea<mode>_general_2"
   [(set (match_operand:SWI12 0 "register_operand" "=r")
        (plus:SWI12
-         (mult:SWI12 (match_operand:SWI12 1 "index_register_operand" "l")
+         (mult:SWI12 (match_operand:SWI12 1 "register_no_SP_operand" "l")
                      (match_operand 2 "const248_operand" "n"))
          (match_operand:SWI12 3 "nonmemory_operand" "ri")))]
   "!TARGET_PARTIAL_REG_STALL || optimize_function_for_size_p (cfun)"
 (define_insn_and_split "*lea<mode>_general_2b"
   [(set (match_operand:SWI12 0 "register_operand" "=r")
        (plus:SWI12
-         (ashift:SWI12 (match_operand:SWI12 1 "index_register_operand" "l")
+         (ashift:SWI12 (match_operand:SWI12 1 "register_no_SP_operand" "l")
                        (match_operand 2 "const123_operand" "n"))
          (match_operand:SWI12 3 "nonmemory_operand" "ri")))]
   "!TARGET_PARTIAL_REG_STALL || optimize_function_for_size_p (cfun)"
   [(set (match_operand:SWI12 0 "register_operand" "=r")
        (plus:SWI12
          (plus:SWI12
-           (mult:SWI12 (match_operand:SWI12 1 "index_register_operand" "l")
+           (mult:SWI12 (match_operand:SWI12 1 "register_no_SP_operand" "l")
                        (match_operand 2 "const248_operand" "n"))
            (match_operand:SWI12 3 "register_operand" "r"))
          (match_operand:SWI12 4 "immediate_operand" "i")))]
   [(set (match_operand:SWI12 0 "register_operand" "=r")
        (plus:SWI12
          (plus:SWI12
-           (ashift:SWI12 (match_operand:SWI12 1 "index_register_operand" "l")
+           (ashift:SWI12 (match_operand:SWI12 1 "register_no_SP_operand" "l")
                          (match_operand 2 "const123_operand" "n"))
            (match_operand:SWI12 3 "register_operand" "r"))
          (match_operand:SWI12 4 "immediate_operand" "i")))]
   [(set (match_operand:SWI12 0 "register_operand" "=r")
        (any_or:SWI12
          (ashift:SWI12
-           (match_operand:SWI12 1 "index_register_operand" "l")
+           (match_operand:SWI12 1 "register_no_SP_operand" "l")
            (match_operand 2 "const_0_to_3_operand"))
          (match_operand 3 "const_int_operand")))]
   "(!TARGET_PARTIAL_REG_STALL || optimize_function_for_size_p (cfun))
   [(set (match_operand:SWI48 0 "register_operand" "=r")
        (any_or:SWI48
          (ashift:SWI48
-           (match_operand:SWI48 1 "index_register_operand" "l")
+           (match_operand:SWI48 1 "register_no_SP_operand" "l")
            (match_operand 2 "const_0_to_3_operand"))
          (match_operand 3 "const_int_operand")))]
   "(unsigned HOST_WIDE_INT) INTVAL (operands[3])
   "@
    sub{<imodesuffix>}\t{%2, %0|%0, %2}
    #"
-  "&& reload_completed"
+  "&& reload_completed
+   && !(rtx_equal_p (operands[0], operands[1]))"
   [(set (strict_low_part (match_dup 0)) (match_dup 1))
    (parallel
      [(set (strict_low_part (match_dup 0))
   [(set_attr "type" "alu")
    (set_attr "mode" "<MODE>")])
 
+;; Alternative 1 is needed to work around LRA limitation, see PR82524.
+(define_insn_and_split "*subqi_ext<mode>_1_slp"
+  [(set (strict_low_part (match_operand:QI 0 "register_operand" "+Q,&Q"))
+       (minus:QI
+         (match_operand:QI 1 "nonimmediate_operand" "0,!qm")
+         (subreg:QI
+           (match_operator:SWI248 3 "extract_operator"
+             [(match_operand 2 "int248_register_operand" "Q,Q")
+              (const_int 8)
+              (const_int 8)]) 0)))
+   (clobber (reg:CC FLAGS_REG))]
+  "!TARGET_PARTIAL_REG_STALL || optimize_function_for_size_p (cfun)"
+  "@
+   sub{b}\t{%h2, %0|%0, %h2}
+   #"
+  "&& reload_completed
+   && !rtx_equal_p (operands[0], operands[1])"
+  [(set (strict_low_part (match_dup 0)) (match_dup 1))
+   (parallel
+     [(set (strict_low_part (match_dup 0))
+          (minus:QI
+            (match_dup 0)
+            (subreg:QI
+              (match_op_dup 3
+                [(match_dup 2) (const_int 8) (const_int 8)]) 0)))
+      (clobber (reg:CC FLAGS_REG))])]
+  ""
+  [(set_attr "type" "alu")
+   (set_attr "mode" "QI")])
+
 (define_insn "*sub<mode>_2"
   [(set (reg FLAGS_REG)
        (compare
    (set_attr "mode" "SI")])
 
 (define_insn "*subqi_ext<mode>_0"
-  [(set (match_operand:QI 0 "nonimm_x64constmem_operand" "=QBc,m")
+  [(set (match_operand:QI 0 "nonimmediate_operand" "=QBn")
        (minus:QI
-         (match_operand:QI 1 "nonimm_x64constmem_operand" "0,0")
+         (match_operand:QI 1 "nonimmediate_operand" "0")
          (subreg:QI
-           (zero_extract:SWI248
-             (match_operand 2 "int248_register_operand" "Q,Q")
-             (const_int 8)
-             (const_int 8)) 0)))
+           (match_operator:SWI248 3 "extract_operator"
+             [(match_operand 2 "int248_register_operand" "Q")
+              (const_int 8)
+              (const_int 8)]) 0)))
    (clobber (reg:CC FLAGS_REG))]
   ""
   "sub{b}\t{%h2, %0|%0, %h2}"
-  [(set_attr "isa" "*,nox64")
+  [(set_attr "addr" "gpr8")
    (set_attr "type" "alu")
    (set_attr "mode" "QI")])
 
-(define_insn "*subqi_ext<mode>_2"
+;; Alternative 1 is needed to work around LRA limitation, see PR82524.
+;; Subtract QImode operand 2 from the 8-bit field at bit position 8 of
+;; operand 1 and store the result into the same field of operand 0.
+;; Alternative 0 ties operand 1 to operand 0 and emits the sub directly.
+;; Alternative 1 emits "#" and is split once reload has completed and
+;; operands 0 and 1 differ: the field is first copied from operand 1 to
+;; operand 0, then the subtraction is performed on operand 0 alone.
+(define_insn_and_split "*subqi_ext<mode>_1"
   [(set (zero_extract:SWI248
-         (match_operand 0 "int248_register_operand" "+Q")
+         (match_operand 0 "int248_register_operand" "+Q,&Q")
          (const_int 8)
          (const_int 8))
        (subreg:SWI248
          (minus:QI
            (subreg:QI
-             (zero_extract:SWI248
-               (match_operand 1 "int248_register_operand" "0")
-               (const_int 8)
-               (const_int 8)) 0)
-           (subreg:QI
-             (zero_extract:SWI248
-               (match_operand 2 "int248_register_operand" "Q")
-               (const_int 8)
-               (const_int 8)) 0)) 0))
-  (clobber (reg:CC FLAGS_REG))]
-  "/* FIXME: without this LRA can't reload this pattern, see PR82524.  */
-   rtx_equal_p (operands[0], operands[1])"
-  "sub{b}\t{%h2, %h0|%h0, %h2}"
-  [(set_attr "type" "alu")
+             (match_operator:SWI248 3 "extract_operator"
+               [(match_operand 1 "int248_register_operand" "0,!Q")
+                (const_int 8)
+                (const_int 8)]) 0)
+           (match_operand:QI 2 "general_operand" "QnBn,QnBn")) 0))
+   (clobber (reg:CC FLAGS_REG))]
+  ""
+  "@
+   sub{b}\t{%2, %h0|%h0, %2}
+   #"
+  "reload_completed
+   && !(rtx_equal_p (operands[0], operands[1]))"
+  ;; First insn: copy the 8-bit field from operand 1 into operand 0.
+  [(set (zero_extract:SWI248
+         (match_dup 0) (const_int 8) (const_int 8))
+       (zero_extract:SWI248
+         (match_dup 1) (const_int 8) (const_int 8)))
+   ;; Second insn: the same subtraction, now reading the field of operand 0.
+   (parallel
+     [(set (zero_extract:SWI248
+            (match_dup 0) (const_int 8) (const_int 8))
+          (subreg:SWI248
+            (minus:QI
+              (subreg:QI
+                (match_op_dup 3
+                  [(match_dup 0) (const_int 8) (const_int 8)]) 0)
+              (match_dup 2)) 0))
+      (clobber (reg:CC FLAGS_REG))])]
+  ""
+  [(set_attr "addr" "gpr8")
+   (set_attr "type" "alu")
    (set_attr "mode" "QI")])
 
 ;; Subtract with jump on overflow.
   [(set (reg:CC FLAGS_REG)
        (compare:CC (match_dup 0) (match_dup 1)))])
 
+;; Matches: load memory operand 1 into register 0; subtract memory
+;; operand 2 from register 0 while setting the flags from the comparison
+;; of the two; store register 0 back to memory operand 1.
+;; Replaces it with: load memory operand 2 into register 0, then perform
+;; the compare and subtraction directly on memory operand 1.
+;; Only applies when TARGET_READ_MODIFY_WRITE holds or the insn is optimized
+;; for size, register 0 is dead after the third insn, and register 0 is
+;; not mentioned in either memory operand.
+(define_peephole2
+  [(set (match_operand:SWI 0 "general_reg_operand")
+       (match_operand:SWI 1 "memory_operand"))
+   (parallel [(set (reg:CC FLAGS_REG)
+                  (compare:CC (match_dup 0)
+                              (match_operand:SWI 2 "memory_operand")))
+             (set (match_dup 0)
+                  (minus:SWI (match_dup 0) (match_dup 2)))])
+   (set (match_dup 1) (match_dup 0))]
+  "(TARGET_READ_MODIFY_WRITE || optimize_insn_for_size_p ())
+   && peep2_reg_dead_p (3, operands[0])
+   && !reg_overlap_mentioned_p (operands[0], operands[1])
+   && !reg_overlap_mentioned_p (operands[0], operands[2])"
+  [(set (match_dup 0) (match_dup 2))
+   (parallel [(set (reg:CC FLAGS_REG)
+                  (compare:CC (match_dup 1) (match_dup 0)))
+             (set (match_dup 1)
+                  (minus:SWI (match_dup 1) (match_dup 0)))])])
+
 ;; decl %eax; cmpl $-1, %eax; jne .Lxx; can be optimized into
 ;; subl $1, %eax; jnc .Lxx;
 (define_peephole2
    (set_attr "pent_pair" "pu")
    (set_attr "mode" "<MODE>")])
 
+;; Matches: load memory operand 1 into register 0; add the carry-flag
+;; operator (operand 4 on flags operand 3) and memory operand 2 to
+;; register 0, clobbering the flags; store register 0 back to memory
+;; operand 1.
+;; Replaces it with: load memory operand 2 into register 0, then do the
+;; carry addition with memory operand 1 as both source and destination.
+;; Register 0 must be dead after the third insn and must not be mentioned
+;; in either memory operand.
+(define_peephole2
+  [(set (match_operand:SWI 0 "general_reg_operand")
+       (match_operand:SWI 1 "memory_operand"))
+   (parallel [(set (match_dup 0)
+                  (plus:SWI
+                    (plus:SWI
+                      (match_operator:SWI 4 "ix86_carry_flag_operator"
+                        [(match_operand 3 "flags_reg_operand")
+                         (const_int 0)])
+                      (match_dup 0))
+                    (match_operand:SWI 2 "memory_operand")))
+             (clobber (reg:CC FLAGS_REG))])
+   (set (match_dup 1) (match_dup 0))]
+  "(TARGET_READ_MODIFY_WRITE || optimize_insn_for_size_p ())
+   && peep2_reg_dead_p (3, operands[0])
+   && !reg_overlap_mentioned_p (operands[0], operands[1])
+   && !reg_overlap_mentioned_p (operands[0], operands[2])"
+  [(set (match_dup 0) (match_dup 2))
+   (parallel [(set (match_dup 1)
+                  (plus:SWI (plus:SWI (match_op_dup 4
+                                        [(match_dup 3) (const_int 0)])
+                                      (match_dup 1))
+                            (match_dup 0)))
+             (clobber (reg:CC FLAGS_REG))])])
+
+;; Four-insn form of the load / add-with-carry / store-back peephole: the
+;; result in register 0 is first copied to register 5, and register 5 is
+;; what gets stored to memory operand 1.
+;; The replacement is the same two insns (load memory operand 2 into
+;; register 0, add with carry into memory operand 1); the copy to
+;; register 5 is dropped, so register 5 must be dead after the fourth insn
+;; and must not be mentioned in memory operand 1.
+(define_peephole2
+  [(set (match_operand:SWI 0 "general_reg_operand")
+       (match_operand:SWI 1 "memory_operand"))
+   (parallel [(set (match_dup 0)
+                  (plus:SWI
+                    (plus:SWI
+                      (match_operator:SWI 4 "ix86_carry_flag_operator"
+                        [(match_operand 3 "flags_reg_operand")
+                         (const_int 0)])
+                      (match_dup 0))
+                    (match_operand:SWI 2 "memory_operand")))
+             (clobber (reg:CC FLAGS_REG))])
+   (set (match_operand:SWI 5 "general_reg_operand") (match_dup 0))
+   (set (match_dup 1) (match_dup 5))]
+  "(TARGET_READ_MODIFY_WRITE || optimize_insn_for_size_p ())
+   && peep2_reg_dead_p (3, operands[0])
+   && peep2_reg_dead_p (4, operands[5])
+   && !reg_overlap_mentioned_p (operands[0], operands[1])
+   && !reg_overlap_mentioned_p (operands[0], operands[2])
+   && !reg_overlap_mentioned_p (operands[5], operands[1])"
+  [(set (match_dup 0) (match_dup 2))
+   (parallel [(set (match_dup 1)
+                  (plus:SWI (plus:SWI (match_op_dup 4
+                                        [(match_dup 3) (const_int 0)])
+                                      (match_dup 1))
+                            (match_dup 0)))
+             (clobber (reg:CC FLAGS_REG))])])
+
 (define_insn "*add<mode>3_carry_0"
   [(set (match_operand:SWI 0 "nonimmediate_operand" "=<r>m")
        (plus:SWI
    (set_attr "pent_pair" "pu")
    (set_attr "mode" "<MODE>")])
 
+;; Two-insn form: a carry-in addition of memory operand 1 into register 0
+;; that also sets the flags in CCC mode, followed by a store of register 0
+;; back to memory operand 1.
+;; The replacement is a single parallel in which memory operand 1 and
+;; register 0 exchange roles, so memory operand 1 becomes the destination
+;; and register 0 the addend.  Register 0 must be dead after the second
+;; insn and must not be mentioned in memory operand 1.
+(define_peephole2
+  [(parallel [(set (reg:CCC FLAGS_REG)
+                  (compare:CCC
+                    (zero_extend:<DWI>
+                      (plus:SWI48
+                        (plus:SWI48
+                          (match_operator:SWI48 4 "ix86_carry_flag_operator"
+                            [(match_operand 2 "flags_reg_operand")
+                             (const_int 0)])
+                          (match_operand:SWI48 0 "general_reg_operand"))
+                        (match_operand:SWI48 1 "memory_operand")))
+                    (plus:<DWI>
+                      (zero_extend:<DWI> (match_dup 1))
+                      (match_operator:<DWI> 3 "ix86_carry_flag_operator"
+                        [(match_dup 2) (const_int 0)]))))
+             (set (match_dup 0)
+                  (plus:SWI48 (plus:SWI48 (match_op_dup 4
+                                            [(match_dup 2) (const_int 0)])
+                                          (match_dup 0))
+                              (match_dup 1)))])
+   (set (match_dup 1) (match_dup 0))]
+  "(TARGET_READ_MODIFY_WRITE || optimize_insn_for_size_p ())
+   && peep2_reg_dead_p (2, operands[0])
+   && !reg_overlap_mentioned_p (operands[0], operands[1])"
+  [(parallel [(set (reg:CCC FLAGS_REG)
+                  (compare:CCC
+                    (zero_extend:<DWI>
+                      (plus:SWI48
+                        (plus:SWI48
+                          (match_op_dup 4
+                            [(match_dup 2) (const_int 0)])
+                          (match_dup 1))
+                        (match_dup 0)))
+                    (plus:<DWI>
+                      (zero_extend:<DWI> (match_dup 0))
+                      (match_op_dup 3
+                        [(match_dup 2) (const_int 0)]))))
+             (set (match_dup 1)
+                  (plus:SWI48 (plus:SWI48 (match_op_dup 4
+                                            [(match_dup 2) (const_int 0)])
+                                          (match_dup 1))
+                              (match_dup 0)))])])
+
+;; Three-insn form: load memory operand 1 into register 0; carry-in
+;; addition of memory operand 2 into register 0 that also sets the flags in
+;; CCC mode; store register 0 back to memory operand 1.
+;; Replaces it with: load memory operand 2 into register 0, then the same
+;; flag-setting carry addition with memory operand 1 as destination and
+;; register 0 as addend.  Register 0 must be dead after the third insn and
+;; must not be mentioned in either memory operand.
+(define_peephole2
+  [(set (match_operand:SWI48 0 "general_reg_operand")
+       (match_operand:SWI48 1 "memory_operand"))
+   (parallel [(set (reg:CCC FLAGS_REG)
+                  (compare:CCC
+                    (zero_extend:<DWI>
+                      (plus:SWI48
+                        (plus:SWI48
+                          (match_operator:SWI48 5 "ix86_carry_flag_operator"
+                            [(match_operand 3 "flags_reg_operand")
+                             (const_int 0)])
+                          (match_dup 0))
+                        (match_operand:SWI48 2 "memory_operand")))
+                    (plus:<DWI>
+                      (zero_extend:<DWI> (match_dup 2))
+                      (match_operator:<DWI> 4 "ix86_carry_flag_operator"
+                        [(match_dup 3) (const_int 0)]))))
+             (set (match_dup 0)
+                  (plus:SWI48 (plus:SWI48 (match_op_dup 5
+                                            [(match_dup 3) (const_int 0)])
+                                          (match_dup 0))
+                              (match_dup 2)))])
+   (set (match_dup 1) (match_dup 0))]
+  "(TARGET_READ_MODIFY_WRITE || optimize_insn_for_size_p ())
+   && peep2_reg_dead_p (3, operands[0])
+   && !reg_overlap_mentioned_p (operands[0], operands[1])
+   && !reg_overlap_mentioned_p (operands[0], operands[2])"
+  [(set (match_dup 0) (match_dup 2))
+   (parallel [(set (reg:CCC FLAGS_REG)
+                  (compare:CCC
+                    (zero_extend:<DWI>
+                      (plus:SWI48
+                        (plus:SWI48
+                          (match_op_dup 5
+                            [(match_dup 3) (const_int 0)])
+                          (match_dup 1))
+                        (match_dup 0)))
+                    (plus:<DWI>
+                      (zero_extend:<DWI> (match_dup 0))
+                      (match_op_dup 4
+                        [(match_dup 3) (const_int 0)]))))
+             (set (match_dup 1)
+                  (plus:SWI48 (plus:SWI48 (match_op_dup 5
+                                            [(match_dup 3) (const_int 0)])
+                                          (match_dup 1))
+                              (match_dup 0)))])])
+
+;; Four-insn form: the flag-setting carry addition of memory operand 1
+;; into register 0 is followed by a read of the carry into QImode
+;; register 5, a zero-extension of register 5 into register 6, and only
+;; then the store of register 0 back to memory operand 1.
+;; The replacement performs the addition directly on memory operand 1 and
+;; keeps the carry read and zero-extension after it.  Registers 5 and 6
+;; must overlap neither register 0 nor memory operand 1.
+(define_peephole2
+  [(parallel [(set (reg:CCC FLAGS_REG)
+                  (compare:CCC
+                    (zero_extend:<DWI>
+                      (plus:SWI48
+                        (plus:SWI48
+                          (match_operator:SWI48 4 "ix86_carry_flag_operator"
+                            [(match_operand 2 "flags_reg_operand")
+                             (const_int 0)])
+                          (match_operand:SWI48 0 "general_reg_operand"))
+                        (match_operand:SWI48 1 "memory_operand")))
+                    (plus:<DWI>
+                      (zero_extend:<DWI> (match_dup 1))
+                      (match_operator:<DWI> 3 "ix86_carry_flag_operator"
+                        [(match_dup 2) (const_int 0)]))))
+             (set (match_dup 0)
+                  (plus:SWI48 (plus:SWI48 (match_op_dup 4
+                                            [(match_dup 2) (const_int 0)])
+                                          (match_dup 0))
+                              (match_dup 1)))])
+   (set (match_operand:QI 5 "general_reg_operand")
+       (ltu:QI (reg:CCC FLAGS_REG) (const_int 0)))
+   (set (match_operand:SWI48 6 "general_reg_operand")
+       (zero_extend:SWI48 (match_dup 5)))
+   (set (match_dup 1) (match_dup 0))]
+  "(TARGET_READ_MODIFY_WRITE || optimize_insn_for_size_p ())
+   && peep2_reg_dead_p (4, operands[0])
+   && !reg_overlap_mentioned_p (operands[0], operands[1])
+   && !reg_overlap_mentioned_p (operands[0], operands[5])
+   && !reg_overlap_mentioned_p (operands[5], operands[1])
+   && !reg_overlap_mentioned_p (operands[0], operands[6])
+   && !reg_overlap_mentioned_p (operands[6], operands[1])"
+  [(parallel [(set (reg:CCC FLAGS_REG)
+                  (compare:CCC
+                    (zero_extend:<DWI>
+                      (plus:SWI48
+                        (plus:SWI48
+                          (match_op_dup 4
+                            [(match_dup 2) (const_int 0)])
+                          (match_dup 1))
+                        (match_dup 0)))
+                    (plus:<DWI>
+                      (zero_extend:<DWI> (match_dup 0))
+                      (match_op_dup 3
+                        [(match_dup 2) (const_int 0)]))))
+             (set (match_dup 1)
+                  (plus:SWI48 (plus:SWI48 (match_op_dup 4
+                                            [(match_dup 2) (const_int 0)])
+                                          (match_dup 1))
+                              (match_dup 0)))])
+   (set (match_dup 5) (ltu:QI (reg:CCC FLAGS_REG) (const_int 0)))
+   (set (match_dup 6) (zero_extend:SWI48 (match_dup 5)))])
+
 (define_expand "addcarry<mode>_0"
   [(parallel
      [(set (reg:CCC FLAGS_REG)
    (set_attr "pent_pair" "pu")
    (set_attr "mode" "<MODE>")])
 
+;; Matches: load memory operand 1 into register 0; subtract the
+;; carry-flag operator (operand 4 on flags operand 3) and memory operand 2
+;; from register 0, clobbering the flags; store register 0 back to memory
+;; operand 1.
+;; Replaces it with: load memory operand 2 into register 0, then do the
+;; borrow subtraction with memory operand 1 as minuend and destination and
+;; register 0 as subtrahend.  Register 0 must be dead after the third insn
+;; and must not be mentioned in either memory operand.
+(define_peephole2
+  [(set (match_operand:SWI 0 "general_reg_operand")
+       (match_operand:SWI 1 "memory_operand"))
+   (parallel [(set (match_dup 0)
+                  (minus:SWI
+                    (minus:SWI
+                      (match_dup 0)
+                      (match_operator:SWI 4 "ix86_carry_flag_operator"
+                        [(match_operand 3 "flags_reg_operand")
+                         (const_int 0)]))
+                    (match_operand:SWI 2 "memory_operand")))
+             (clobber (reg:CC FLAGS_REG))])
+   (set (match_dup 1) (match_dup 0))]
+  "(TARGET_READ_MODIFY_WRITE || optimize_insn_for_size_p ())
+   && peep2_reg_dead_p (3, operands[0])
+   && !reg_overlap_mentioned_p (operands[0], operands[1])
+   && !reg_overlap_mentioned_p (operands[0], operands[2])"
+  [(set (match_dup 0) (match_dup 2))
+   (parallel [(set (match_dup 1)
+                  (minus:SWI (minus:SWI (match_dup 1)
+                                        (match_op_dup 4
+                                          [(match_dup 3) (const_int 0)]))
+                             (match_dup 0)))
+             (clobber (reg:CC FLAGS_REG))])])
+
+;; Four-insn form of the load / subtract-with-borrow / store-back
+;; peephole: the result in register 0 is first copied to register 5, and
+;; register 5 is what gets stored to memory operand 1.
+;; The replacement is the same two insns (load memory operand 2 into
+;; register 0, subtract with borrow from memory operand 1); the copy to
+;; register 5 is dropped, so register 5 must be dead after the fourth insn
+;; and must not be mentioned in memory operand 1.
+(define_peephole2
+  [(set (match_operand:SWI 0 "general_reg_operand")
+       (match_operand:SWI 1 "memory_operand"))
+   (parallel [(set (match_dup 0)
+                  (minus:SWI
+                    (minus:SWI
+                      (match_dup 0)
+                      (match_operator:SWI 4 "ix86_carry_flag_operator"
+                        [(match_operand 3 "flags_reg_operand")
+                         (const_int 0)]))
+                    (match_operand:SWI 2 "memory_operand")))
+             (clobber (reg:CC FLAGS_REG))])
+   (set (match_operand:SWI 5 "general_reg_operand") (match_dup 0))
+   (set (match_dup 1) (match_dup 5))]
+  "(TARGET_READ_MODIFY_WRITE || optimize_insn_for_size_p ())
+   && peep2_reg_dead_p (3, operands[0])
+   && peep2_reg_dead_p (4, operands[5])
+   && !reg_overlap_mentioned_p (operands[0], operands[1])
+   && !reg_overlap_mentioned_p (operands[0], operands[2])
+   && !reg_overlap_mentioned_p (operands[5], operands[1])"
+  [(set (match_dup 0) (match_dup 2))
+   (parallel [(set (match_dup 1)
+                  (minus:SWI (minus:SWI (match_dup 1)
+                                        (match_op_dup 4
+                                          [(match_dup 3) (const_int 0)]))
+                             (match_dup 0)))
+             (clobber (reg:CC FLAGS_REG))])])
+
 (define_insn "*sub<mode>3_carry_0"
   [(set (match_operand:SWI 0 "nonimmediate_operand" "=<r>m")
        (minus:SWI
   [(set (reg:CCC FLAGS_REG)
        (compare:CCC
          (zero_extend:<DWI>
-           (match_operand:SWI48 1 "nonimmediate_operand" "0"))
+           (match_operand:SWI48 1 "nonimmediate_operand" "0,0"))
          (plus:<DWI>
            (match_operator:<DWI> 4 "ix86_carry_flag_operator"
              [(match_operand 3 "flags_reg_operand") (const_int 0)])
            (zero_extend:<DWI>
-             (match_operand:SWI48 2 "nonimmediate_operand" "rm")))))
-   (set (match_operand:SWI48 0 "register_operand" "=r")
+             (match_operand:SWI48 2 "nonimmediate_operand" "r,rm")))))
+   (set (match_operand:SWI48 0 "nonimmediate_operand" "=rm,r")
        (minus:SWI48 (minus:SWI48
                       (match_dup 1)
                       (match_operator:SWI48 5 "ix86_carry_flag_operator"
    (set_attr "pent_pair" "pu")
    (set_attr "mode" "<MODE>")])
 
+;; Three-insn form: load memory operand 1 into register 0; subtract the
+;; carry and memory operand 2 from register 0 while setting the flags in
+;; CCC mode; store register 0 back to memory operand 1.
+;; Replaces it with: load memory operand 2 into register 0, then the same
+;; flag-setting borrow subtraction with memory operand 1 as minuend and
+;; destination and register 0 as subtrahend.  Register 0 must be dead after
+;; the third insn and must not be mentioned in either memory operand.
+(define_peephole2
+  [(set (match_operand:SWI48 0 "general_reg_operand")
+       (match_operand:SWI48 1 "memory_operand"))
+   (parallel [(set (reg:CCC FLAGS_REG)
+                  (compare:CCC
+                    (zero_extend:<DWI> (match_dup 0))
+                    (plus:<DWI>
+                      (match_operator:<DWI> 4 "ix86_carry_flag_operator"
+                        [(match_operand 3 "flags_reg_operand") (const_int 0)])
+                      (zero_extend:<DWI>
+                        (match_operand:SWI48 2 "memory_operand")))))
+             (set (match_dup 0)
+                  (minus:SWI48
+                    (minus:SWI48
+                      (match_dup 0)
+                      (match_operator:SWI48 5 "ix86_carry_flag_operator"
+                        [(match_dup 3) (const_int 0)]))
+                    (match_dup 2)))])
+   (set (match_dup 1) (match_dup 0))]
+  "(TARGET_READ_MODIFY_WRITE || optimize_insn_for_size_p ())
+   && peep2_reg_dead_p (3, operands[0])
+   && !reg_overlap_mentioned_p (operands[0], operands[1])
+   && !reg_overlap_mentioned_p (operands[0], operands[2])"
+  [(set (match_dup 0) (match_dup 2))
+   (parallel [(set (reg:CCC FLAGS_REG)
+                  (compare:CCC
+                    (zero_extend:<DWI> (match_dup 1))
+                    (plus:<DWI> (match_op_dup 4
+                                  [(match_dup 3) (const_int 0)])
+                                (zero_extend:<DWI> (match_dup 0)))))
+             (set (match_dup 1)
+                  (minus:SWI48 (minus:SWI48 (match_dup 1)
+                                            (match_op_dup 5
+                                              [(match_dup 3) (const_int 0)]))
+                               (match_dup 0)))])])
+
+;; Four-insn form with both inputs loaded from memory: two loads
+;; (register 6 from memory 7, register 8 from memory 9), a flag-setting
+;; borrow subtraction of register 2 from register 0, and a store of
+;; register 0 to memory operand 1.
+;; The condition accepts the loads in either order: either the first load
+;; is the minuend (6 == 0, 7 == 1, 8 == 2) or the second one is
+;; (8 == 0, 9 == 1, 6 == 2).
+;; The replacement loads the subtrahend's memory into register 0 and
+;; subtracts it, with borrow, directly from memory operand 1.
+(define_peephole2
+  [(set (match_operand:SWI48 6 "general_reg_operand")
+       (match_operand:SWI48 7 "memory_operand"))
+   (set (match_operand:SWI48 8 "general_reg_operand")
+       (match_operand:SWI48 9 "memory_operand"))
+   (parallel [(set (reg:CCC FLAGS_REG)
+                  (compare:CCC
+                    (zero_extend:<DWI>
+                      (match_operand:SWI48 0 "general_reg_operand"))
+                    (plus:<DWI>
+                      (match_operator:<DWI> 4 "ix86_carry_flag_operator"
+                        [(match_operand 3 "flags_reg_operand") (const_int 0)])
+                      (zero_extend:<DWI>
+                        (match_operand:SWI48 2 "general_reg_operand")))))
+             (set (match_dup 0)
+                  (minus:SWI48
+                    (minus:SWI48
+                      (match_dup 0)
+                      (match_operator:SWI48 5 "ix86_carry_flag_operator"
+                        [(match_dup 3) (const_int 0)]))
+                    (match_dup 2)))])
+   (set (match_operand:SWI48 1 "memory_operand") (match_dup 0))]
+  "(TARGET_READ_MODIFY_WRITE || optimize_insn_for_size_p ())
+   && peep2_reg_dead_p (4, operands[0])
+   && peep2_reg_dead_p (3, operands[2])
+   && !reg_overlap_mentioned_p (operands[0], operands[1])
+   && !reg_overlap_mentioned_p (operands[2], operands[1])
+   && !reg_overlap_mentioned_p (operands[6], operands[9])
+   && (rtx_equal_p (operands[6], operands[0])
+       ? (rtx_equal_p (operands[7], operands[1])
+         && rtx_equal_p (operands[8], operands[2]))
+       : (rtx_equal_p (operands[8], operands[0])
+         && rtx_equal_p (operands[9], operands[1])
+         && rtx_equal_p (operands[6], operands[2])))"
+  [(set (match_dup 0) (match_dup 9))
+   (parallel [(set (reg:CCC FLAGS_REG)
+                  (compare:CCC
+                    (zero_extend:<DWI> (match_dup 1))
+                    (plus:<DWI> (match_op_dup 4
+                                  [(match_dup 3) (const_int 0)])
+                                (zero_extend:<DWI> (match_dup 0)))))
+             (set (match_dup 1)
+                  (minus:SWI48 (minus:SWI48 (match_dup 1)
+                                            (match_op_dup 5
+                                              [(match_dup 3) (const_int 0)]))
+                               (match_dup 0)))])]
+{
+  /* The replacement loads from operands[9].  When the first load was not
+     the minuend load, the subtrahend came from operands[7] instead.  */
+  if (!rtx_equal_p (operands[6], operands[0]))
+    operands[9] = operands[7];
+})
+
+;; Six-insn form of the two-load borrow-subtraction peephole: between the
+;; subtraction and the store to memory operand 1, the carry is read into
+;; QImode register 10 and zero-extended into register 11.
+;; As in the four-insn form, the loads may come in either order.  The
+;; replacement loads the subtrahend's memory into register 0, subtracts it
+;; with borrow directly from memory operand 1, and keeps the carry read and
+;; zero-extension afterwards.  Registers 10 and 11 must overlap neither
+;; register 0 nor memory operand 1.
+(define_peephole2
+  [(set (match_operand:SWI48 6 "general_reg_operand")
+       (match_operand:SWI48 7 "memory_operand"))
+   (set (match_operand:SWI48 8 "general_reg_operand")
+       (match_operand:SWI48 9 "memory_operand"))
+   (parallel [(set (reg:CCC FLAGS_REG)
+                  (compare:CCC
+                    (zero_extend:<DWI>
+                      (match_operand:SWI48 0 "general_reg_operand"))
+                    (plus:<DWI>
+                      (match_operator:<DWI> 4 "ix86_carry_flag_operator"
+                        [(match_operand 3 "flags_reg_operand") (const_int 0)])
+                      (zero_extend:<DWI>
+                        (match_operand:SWI48 2 "general_reg_operand")))))
+             (set (match_dup 0)
+                  (minus:SWI48
+                    (minus:SWI48
+                      (match_dup 0)
+                      (match_operator:SWI48 5 "ix86_carry_flag_operator"
+                        [(match_dup 3) (const_int 0)]))
+                    (match_dup 2)))])
+   (set (match_operand:QI 10 "general_reg_operand")
+       (ltu:QI (reg:CCC FLAGS_REG) (const_int 0)))
+   (set (match_operand:SWI48 11 "general_reg_operand")
+       (zero_extend:SWI48 (match_dup 10)))
+   (set (match_operand:SWI48 1 "memory_operand") (match_dup 0))]
+  "(TARGET_READ_MODIFY_WRITE || optimize_insn_for_size_p ())
+   && peep2_reg_dead_p (6, operands[0])
+   && peep2_reg_dead_p (3, operands[2])
+   && !reg_overlap_mentioned_p (operands[0], operands[1])
+   && !reg_overlap_mentioned_p (operands[2], operands[1])
+   && !reg_overlap_mentioned_p (operands[6], operands[9])
+   && !reg_overlap_mentioned_p (operands[0], operands[10])
+   && !reg_overlap_mentioned_p (operands[10], operands[1])
+   && !reg_overlap_mentioned_p (operands[0], operands[11])
+   && !reg_overlap_mentioned_p (operands[11], operands[1])
+   && (rtx_equal_p (operands[6], operands[0])
+       ? (rtx_equal_p (operands[7], operands[1])
+         && rtx_equal_p (operands[8], operands[2]))
+       : (rtx_equal_p (operands[8], operands[0])
+         && rtx_equal_p (operands[9], operands[1])
+         && rtx_equal_p (operands[6], operands[2])))"
+  [(set (match_dup 0) (match_dup 9))
+   (parallel [(set (reg:CCC FLAGS_REG)
+                  (compare:CCC
+                    (zero_extend:<DWI> (match_dup 1))
+                    (plus:<DWI> (match_op_dup 4
+                                  [(match_dup 3) (const_int 0)])
+                                (zero_extend:<DWI> (match_dup 0)))))
+             (set (match_dup 1)
+                  (minus:SWI48 (minus:SWI48 (match_dup 1)
+                                            (match_op_dup 5
+                                              [(match_dup 3) (const_int 0)]))
+                               (match_dup 0)))])
+   (set (match_dup 10) (ltu:QI (reg:CCC FLAGS_REG) (const_int 0)))
+   (set (match_dup 11) (zero_extend:SWI48 (match_dup 10)))]
+{
+  /* The replacement loads from operands[9].  When the first load was not
+     the minuend load, the subtrahend came from operands[7] instead.  */
+  if (!rtx_equal_p (operands[6], operands[0]))
+    operands[9] = operands[7];
+})
+
 (define_expand "subborrow<mode>_0"
   [(parallel
      [(set (reg:CC FLAGS_REG)
           (minus:SWI48 (match_dup 1) (match_dup 2)))])]
   "ix86_binary_operator_ok (MINUS, <MODE>mode, operands)")
 
+;; Expander for the uaddc<mode>5 pattern name.  Operands 2 and 3 are passed
+;; to the addcarry generators with operand 0 as the result; operand 4 is
+;; the incoming carry and operand 1 receives the outgoing carry, read from
+;; the flags register and zero-extended.
+(define_expand "uaddc<mode>5"
+  [(match_operand:SWI48 0 "register_operand")
+   (match_operand:SWI48 1 "register_operand")
+   (match_operand:SWI48 2 "register_operand")
+   (match_operand:SWI48 3 "register_operand")
+   (match_operand:SWI48 4 "nonmemory_operand")]
+  ""
+{
+  rtx cf = gen_rtx_REG (CCCmode, FLAGS_REG), pat, pat2;
+  /* A literal zero carry-in uses the _0 generator, which takes no
+     carry operands.  */
+  if (operands[4] == const0_rtx)
+    emit_insn (gen_addcarry<mode>_0 (operands[0], operands[2], operands[3]));
+  else
+    {
+      /* NOTE(review): ix86_expand_carry presumably loads operands[4] into
+	 the carry flag -- confirm against its definition.  */
+      ix86_expand_carry (operands[4]);
+      pat = gen_rtx_LTU (<DWI>mode, cf, const0_rtx);
+      pat2 = gen_rtx_LTU (<MODE>mode, cf, const0_rtx);
+      emit_insn (gen_addcarry<mode> (operands[0], operands[2], operands[3],
+                                    cf, pat, pat2));
+    }
+  /* Read the resulting carry into a fresh QImode pseudo, then zero-extend
+     it into operand 1.  */
+  rtx cc = gen_reg_rtx (QImode);
+  pat = gen_rtx_LTU (QImode, cf, const0_rtx);
+  emit_insn (gen_rtx_SET (cc, pat));
+  emit_insn (gen_zero_extendqi<mode>2 (operands[1], cc));
+  DONE;
+})
+
+;; Expander for the usubc<mode>5 pattern name.  Operands 2 and 3 are passed
+;; to the subborrow generators with operand 0 as the result; operand 4 is
+;; the incoming borrow and operand 1 receives the outgoing borrow, read
+;; from the flags register and zero-extended.
+(define_expand "usubc<mode>5"
+  [(match_operand:SWI48 0 "register_operand")
+   (match_operand:SWI48 1 "register_operand")
+   (match_operand:SWI48 2 "register_operand")
+   (match_operand:SWI48 3 "register_operand")
+   (match_operand:SWI48 4 "nonmemory_operand")]
+  ""
+{
+  rtx cf, pat, pat2;
+  /* The flags register is referenced in CCmode on the zero-borrow path
+     and in CCCmode otherwise.  */
+  if (operands[4] == const0_rtx)
+    {
+      cf = gen_rtx_REG (CCmode, FLAGS_REG);
+      emit_insn (gen_subborrow<mode>_0 (operands[0], operands[2],
+                                       operands[3]));
+    }
+  else
+    {
+      cf = gen_rtx_REG (CCCmode, FLAGS_REG);
+      /* NOTE(review): ix86_expand_carry presumably loads operands[4] into
+	 the carry flag -- confirm against its definition.  */
+      ix86_expand_carry (operands[4]);
+      pat = gen_rtx_LTU (<DWI>mode, cf, const0_rtx);
+      pat2 = gen_rtx_LTU (<MODE>mode, cf, const0_rtx);
+      emit_insn (gen_subborrow<mode> (operands[0], operands[2], operands[3],
+                                     cf, pat, pat2));
+    }
+  /* Read the resulting borrow into a fresh QImode pseudo, then zero-extend
+     it into operand 1.  */
+  rtx cc = gen_reg_rtx (QImode);
+  pat = gen_rtx_LTU (QImode, cf, const0_rtx);
+  emit_insn (gen_rtx_SET (cc, pat));
+  emit_insn (gen_zero_extendqi<mode>2 (operands[1], cc));
+  DONE;
+})
+
+;; Mode iterator over the two flag modes CC and CCC.
 (define_mode_iterator CC_CCC [CC CCC])
 
 ;; Pre-reload splitter to optimize
 ;; operand and no intervening flags modifications into nothing.
 (define_insn_and_split "*setcc_qi_addqi3_cconly_overflow_1_<mode>"
   [(set (reg:CCC FLAGS_REG)
-       (compare:CCC (neg:QI (geu:QI (reg:CC_CCC FLAGS_REG) (const_int 0)))
-                    (ltu:QI (reg:CC_CCC FLAGS_REG) (const_int 0))))]
+       (compare:CCC (neg:QI (geu:QI (reg:CC_CCC FLAGS_REG) (const_int 0)))
+                    (ltu:QI (reg:CC_CCC FLAGS_REG) (const_int 0))))]
+  "ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  ;; The split emits no replacement insn: the preparation statement below
+  ;; only emits a NOTE_INSN_DELETED note and finishes with DONE.
+  [(const_int 0)]
+  "emit_note (NOTE_INSN_DELETED); DONE;")
+
+;; Set the carry flag from the carry flag.
+;; The pattern is a copy of FLAGS_REG onto itself in CCC mode.  It is
+;; valid only while ix86_pre_reload_split () holds and is always split;
+;; the split emits just a NOTE_INSN_DELETED note, so no insn remains.
+(define_insn_and_split "*setccc"
+  [(set (reg:CCC FLAGS_REG)
+       (reg:CCC FLAGS_REG))]
+  "ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(const_int 0)]
+  "emit_note (NOTE_INSN_DELETED); DONE;")
+
+;; Set the carry flag from the carry flag.
+;; Here the source is an LTU test of FLAGS_REG (in CC or CCC mode) against
+;; zero, producing CCC.  It is valid only while ix86_pre_reload_split ()
+;; holds and is always split; the split emits just a NOTE_INSN_DELETED
+;; note, so no insn remains.
+(define_insn_and_split "*setcc_qi_negqi_ccc_1_<mode>"
+  [(set (reg:CCC FLAGS_REG)
+       (ltu:CCC (reg:CC_CCC FLAGS_REG) (const_int 0)))]
+  "ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(const_int 0)]
+  "emit_note (NOTE_INSN_DELETED); DONE;")
+
+;; Set the carry flag from the carry flag.
+;; Here the source is an UNSPEC_CC_NE of a QImode LTU test of FLAGS_REG
+;; (in CC or CCC mode) and zero.  It is valid only while
+;; ix86_pre_reload_split () holds and is always split; the split emits just
+;; a NOTE_INSN_DELETED note, so no insn remains.
+(define_insn_and_split "*setcc_qi_negqi_ccc_2_<mode>"
+  [(set (reg:CCC FLAGS_REG)
+       (unspec:CCC [(ltu:QI (reg:CC_CCC FLAGS_REG) (const_int 0))
+                    (const_int 0)] UNSPEC_CC_NE))]
   "ix86_pre_reload_split ()"
   "#"
   "&& 1"
-  [(const_int 0)])
+  [(const_int 0)]
+  "emit_note (NOTE_INSN_DELETED); DONE;")
 \f
 ;; Overflow setting add instructions
 
   [(set_attr "type" "alu")
    (set_attr "mode" "<MODE>")])
 
-(define_insn "*add<mode>3_cc_overflow_1"
+(define_insn "@add<mode>3_cc_overflow_1"
   [(set (reg:CCC FLAGS_REG)
        (compare:CCC
            (plus:SWI
                     (match_dup 1)))
              (set (match_dup 1) (plus:SWI (match_dup 1) (match_dup 0)))])])
 
+;; Matches: load memory operand 1 into register 0; add memory operand 2
+;; to register 0 while setting the flags in CCC mode from the comparison
+;; of the sum with register 0; store register 0 back to memory operand 1.
+;; Replaces it with: load memory operand 2 into register 0, then perform
+;; the flag-setting addition directly on memory operand 1.  Register 0 must
+;; be dead after the third insn and must not be mentioned in either memory
+;; operand.
+(define_peephole2
+  [(set (match_operand:SWI 0 "general_reg_operand")
+       (match_operand:SWI 1 "memory_operand"))
+   (parallel [(set (reg:CCC FLAGS_REG)
+                  (compare:CCC
+                    (plus:SWI (match_dup 0)
+                              (match_operand:SWI 2 "memory_operand"))
+                    (match_dup 0)))
+             (set (match_dup 0) (plus:SWI (match_dup 0) (match_dup 2)))])
+   (set (match_dup 1) (match_dup 0))]
+  "(TARGET_READ_MODIFY_WRITE || optimize_insn_for_size_p ())
+   && peep2_reg_dead_p (3, operands[0])
+   && !reg_overlap_mentioned_p (operands[0], operands[1])
+   && !reg_overlap_mentioned_p (operands[0], operands[2])"
+  [(set (match_dup 0) (match_dup 2))
+   (parallel [(set (reg:CCC FLAGS_REG)
+                  (compare:CCC
+                    (plus:SWI (match_dup 1) (match_dup 0))
+                    (match_dup 1)))
+             (set (match_dup 1) (plus:SWI (match_dup 1) (match_dup 0)))])])
+
 (define_insn "*addsi3_zext_cc_overflow_1"
   [(set (reg:CCC FLAGS_REG)
        (compare:CCC
   [(parallel [(set (match_operand:<DWI> 0 "register_operand")
                   (mult:<DWI>
                     (any_extend:<DWI>
-                      (match_operand:DWIH 1 "nonimmediate_operand"))
+                      (match_operand:DWIH 1 "register_operand"))
                     (any_extend:<DWI>
-                      (match_operand:DWIH 2 "register_operand"))))
+                      (match_operand:DWIH 2 "nonimmediate_operand"))))
              (clobber (reg:CC FLAGS_REG))])])
 
+;; Widening QImode-to-HImode multiply for either extension kind
+;; (any_extend), enabled only under TARGET_QIMODE_MATH.  After this change
+;; operand 1 must be a register and operand 2 may be a register or memory.
 (define_expand "<u>mulqihi3"
   [(parallel [(set (match_operand:HI 0 "register_operand")
                   (mult:HI
                     (any_extend:HI
-                      (match_operand:QI 1 "nonimmediate_operand"))
+                      (match_operand:QI 1 "register_operand"))
                     (any_extend:HI
-                      (match_operand:QI 2 "register_operand"))))
+                      (match_operand:QI 2 "nonimmediate_operand"))))
              (clobber (reg:CC FLAGS_REG))])]
   "TARGET_QIMODE_MATH")
 
+;; mulx under TARGET_BMI2: operand 0 receives the product of operands 2
+;; and 3, and operand 1 receives their umul_highpart.  The high part was
+;; previously spelled as a truncated right shift of the double-width
+;; product, which needed a shift-count operand and an extra condition.
 (define_insn "*bmi2_umul<mode><dwi>3_1"
   [(set (match_operand:DWIH 0 "register_operand" "=r")
        (mult:DWIH
-         (match_operand:DWIH 2 "nonimmediate_operand" "%d")
+         (match_operand:DWIH 2 "register_operand" "%d")
          (match_operand:DWIH 3 "nonimmediate_operand" "rm")))
    (set (match_operand:DWIH 1 "register_operand" "=r")
-       (truncate:DWIH
-         (lshiftrt:<DWI>
-           (mult:<DWI> (zero_extend:<DWI> (match_dup 2))
-                       (zero_extend:<DWI> (match_dup 3)))
-           (match_operand:QI 4 "const_int_operand"))))]
-  "TARGET_BMI2 && INTVAL (operands[4]) == <MODE_SIZE> * BITS_PER_UNIT
-   && !(MEM_P (operands[2]) && MEM_P (operands[3]))"
+       (umul_highpart:DWIH (match_dup 2) (match_dup 3)))]
+  "TARGET_BMI2"
   "mulx\t{%3, %0, %1|%1, %0, %3}"
   [(set_attr "type" "imulx")
    (set_attr "prefix" "vex")
    (set_attr "mode" "<MODE>")])
 
+;; Tweak *bmi2_umul<mode><dwi>3_1 to eliminate following mov.
+;; Matches the two-output multiply followed by a register move from
+;; operand 5 to operand 4, where operand 5 is one of the multiply's two
+;; outputs (operand 0 or operand 1), operand 4 is not the other output, and
+;; operand 5 is dead after the move.  The multiply is re-emitted with that
+;; output written straight to operand 4.
+(define_peephole2
+  [(parallel [(set (match_operand:DWIH 0 "general_reg_operand")
+                  (mult:DWIH (match_operand:DWIH 2 "register_operand")
+                             (match_operand:DWIH 3 "nonimmediate_operand")))
+             (set (match_operand:DWIH 1 "general_reg_operand")
+                  (umul_highpart:DWIH (match_dup 2) (match_dup 3)))])
+   (set (match_operand:DWIH 4 "general_reg_operand")
+       (match_operand:DWIH 5 "general_reg_operand"))]
+  "TARGET_BMI2
+   && ((REGNO (operands[5]) == REGNO (operands[0])
+        && REGNO (operands[1]) != REGNO (operands[4]))
+       || (REGNO (operands[5]) == REGNO (operands[1])
+          && REGNO (operands[0]) != REGNO (operands[4])))
+   && peep2_reg_dead_p (2, operands[5])"
+  [(parallel [(set (match_dup 0) (mult:DWIH (match_dup 2) (match_dup 3)))
+             (set (match_dup 1)
+                  (umul_highpart:DWIH (match_dup 2) (match_dup 3)))])]
+{
+  /* Redirect whichever multiply output fed the move to the move's
+     destination.  */
+  if (REGNO (operands[5]) == REGNO (operands[0]))
+    operands[0] = operands[4];
+  else
+    operands[1] = operands[4];
+})
+
 (define_insn "*umul<mode><dwi>3_1"
   [(set (match_operand:<DWI> 0 "register_operand" "=r,A")
        (mult:<DWI>
          (zero_extend:<DWI>
-           (match_operand:DWIH 1 "nonimmediate_operand" "%d,0"))
+           (match_operand:DWIH 1 "register_operand" "%d,a"))
          (zero_extend:<DWI>
            (match_operand:DWIH 2 "nonimmediate_operand" "rm,rm"))))
    (clobber (reg:CC FLAGS_REG))]
   [(parallel [(set (match_dup 3)
                   (mult:DWIH (match_dup 1) (match_dup 2)))
              (set (match_dup 4)
-                  (truncate:DWIH
-                    (lshiftrt:<DWI>
-                      (mult:<DWI> (zero_extend:<DWI> (match_dup 1))
-                                  (zero_extend:<DWI> (match_dup 2)))
-                      (match_dup 5))))])]
+                  (umul_highpart:DWIH (match_dup 1) (match_dup 2)))])]
 {
   split_double_mode (<DWI>mode, &operands[0], 1, &operands[3], &operands[4]);
 
   [(set (match_operand:<DWI> 0 "register_operand" "=A")
        (mult:<DWI>
          (sign_extend:<DWI>
-           (match_operand:DWIH 1 "nonimmediate_operand" "%0"))
+           (match_operand:DWIH 1 "register_operand" "%a"))
          (sign_extend:<DWI>
            (match_operand:DWIH 2 "nonimmediate_operand" "rm"))))
    (clobber (reg:CC FLAGS_REG))]
   [(set (match_operand:HI 0 "register_operand" "=a")
        (mult:HI
          (any_extend:HI
-           (match_operand:QI 1 "nonimmediate_operand" "%0"))
+           (match_operand:QI 1 "register_operand" "%0"))
          (any_extend:HI
            (match_operand:QI 2 "nonimmediate_operand" "qm"))))
    (clobber (reg:CC FLAGS_REG))]
    (set_attr "bdver1_decode" "direct")
    (set_attr "mode" "QI")])
 
+;; Widening multiplication peephole2s to tweak register allocation.
+;; mov imm,%rdx; mov %rdi,%rax; mulq %rdx  ->  mov imm,%rax; mulq %rdi
+;; The replacement loads the immediate (operand 1) directly into operand 2
+;; and multiplies operand 2 by operand 3, so the register copy disappears.
+;; It is applied only when operand 3 is not AX, operand 0 differs from
+;; operands 2 and 3, and operand 0 either has the register number of the
+;; result (operand 4), is DX, or is reported dead by peep2_reg_dead_p.
+(define_peephole2
+  [(set (match_operand:DWIH 0 "general_reg_operand")
+       (match_operand:DWIH 1 "immediate_operand"))
+   (set (match_operand:DWIH 2 "general_reg_operand")
+       (match_operand:DWIH 3 "general_reg_operand"))
+   (parallel [(set (match_operand:<DWI> 4 "general_reg_operand")
+                  (mult:<DWI> (zero_extend:<DWI> (match_dup 2))
+                              (zero_extend:<DWI> (match_dup 0))))
+             (clobber (reg:CC FLAGS_REG))])]
+  "REGNO (operands[3]) != AX_REG
+   && REGNO (operands[0]) != REGNO (operands[2])
+   && REGNO (operands[0]) != REGNO (operands[3])
+   && (REGNO (operands[0]) == REGNO (operands[4])
+       || REGNO (operands[0]) == DX_REG
+       || peep2_reg_dead_p (3, operands[0]))"
+  [(set (match_dup 2) (match_dup 1))
+   (parallel [(set (match_dup 4)
+                  (mult:<DWI> (zero_extend:<DWI> (match_dup 2))
+                              (zero_extend:<DWI> (match_dup 3))))
+             (clobber (reg:CC FLAGS_REG))])])
+
+;; mov imm,%rax; mov %rdi,%rdx; mulx %rax  ->  mov imm,%rdx; mulx %rdi
+;; Same idea for the two-output (low part / umul_highpart) form: the
+;; immediate is loaded into operand 2 and operand 3 becomes the second
+;; multiplicand.  Applied only when operand 3 is not DX, operand 0 differs
+;; from operands 2 and 3, and operand 0 is one of the two results
+;; (operand 4 or 5) or is reported dead by peep2_reg_dead_p.
+(define_peephole2
+  [(set (match_operand:DWIH 0 "general_reg_operand")
+       (match_operand:DWIH 1 "immediate_operand"))
+   (set (match_operand:DWIH 2 "general_reg_operand")
+       (match_operand:DWIH 3 "general_reg_operand"))
+   (parallel [(set (match_operand:DWIH 4 "general_reg_operand")
+                  (mult:DWIH (match_dup 2) (match_dup 0)))
+             (set (match_operand:DWIH 5 "general_reg_operand")
+                  (umul_highpart:DWIH (match_dup 2) (match_dup 0)))])]
+  "REGNO (operands[3]) != DX_REG
+   && REGNO (operands[0]) != REGNO (operands[2])
+   && REGNO (operands[0]) != REGNO (operands[3])
+   && (REGNO (operands[0]) == REGNO (operands[4])
+       || REGNO (operands[0]) == REGNO (operands[5])
+       || peep2_reg_dead_p (3, operands[0]))"
+  [(set (match_dup 2) (match_dup 1))
+   (parallel [(set (match_dup 4)
+                  (mult:DWIH (match_dup 2) (match_dup 3)))
+             (set (match_dup 5)
+                  (umul_highpart:DWIH (match_dup 2) (match_dup 3)))])])
+
 ;; Highpart multiplication patterns
 (define_insn "<s>mul<mode>3_highpart"
   [(set (match_operand:DWIH 0 "register_operand" "=d")
        (compare
          (and:QI
            (subreg:QI
-             (zero_extract:SWI248
-               (match_operand 0 "int248_register_operand" "Q,Q")
-               (const_int 8)
-               (const_int 8)) 0)
-           (match_operand:QI 1 "general_x64constmem_operand" "QnBc,m"))
+             (match_operator:SWI248 2 "extract_operator"
+               [(match_operand 0 "int248_register_operand" "Q")
+                (const_int 8)
+                (const_int 8)]) 0)
+           (match_operand:QI 1 "general_operand" "QnBn"))
          (const_int 0)))]
   "ix86_match_ccmode (insn, CCNOmode)"
   "test{b}\t{%1, %h0|%h0, %1}"
-  [(set_attr "isa" "*,nox64")
+  [(set_attr "addr" "gpr8")
    (set_attr "type" "test")
    (set_attr "mode" "QI")])
 
        (compare
          (and:QI
            (subreg:QI
-             (zero_extract:SWI248
-               (match_operand 0 "int248_register_operand" "Q")
-               (const_int 8)
-               (const_int 8)) 0)
+             (match_operator:SWI248 2 "extract_operator"
+               [(match_operand 0 "int248_register_operand" "Q")
+                (const_int 8)
+                (const_int 8)]) 0)
            (subreg:QI
-             (zero_extract:SWI248
-               (match_operand 1 "int248_register_operand" "Q")
-               (const_int 8)
-               (const_int 8)) 0))
+             (match_operator:SWI248 3 "extract_operator"
+               [(match_operand 1 "int248_register_operand" "Q")
+                (const_int 8)
+                (const_int 8)]) 0))
          (const_int 0)))]
   "ix86_match_ccmode (insn, CCNOmode)"
   "test{b}\t{%h1, %h0|%h0, %h1}"
         (match_operator 1 "compare_operator"
          [(zero_extract:SWI248
             (match_operand 2 "int_nonimmediate_operand" "rm")
-            (match_operand 3 "const_int_operand")
-            (match_operand 4 "const_int_operand"))
+            (match_operand:QI 3 "const_int_operand")
+            (match_operand:QI 4 "const_int_operand"))
           (const_int 0)]))]
   "/* Ensure that resulting mask is zero or sign extended operand.  */
    INTVAL (operands[4]) >= 0
   operands[3] = gen_int_mode (INTVAL (operands[3]), QImode);
 })
 
+;; Narrow test instructions with immediate operands that test
+;; memory locations for zero.  E.g. testl $0x00aa0000, mem can be
+;; converted to testb $0xaa, mem+2.  Reject volatile locations and
+;; targets where reading (possibly unaligned) part of memory
+;; location after a large write to the same address causes
+;; store-to-load forwarding stall.
+(define_peephole2
+  [(set (reg:CCZ FLAGS_REG)
+       (compare:CCZ
+         (and:SWI248 (match_operand:SWI248 0 "memory_operand")
+                     (match_operand 1 "const_int_operand"))
+         (const_int 0)))]
+  "!TARGET_PARTIAL_MEMORY_READ_STALL && !MEM_VOLATILE_P (operands[0])"
+  [(set (reg:CCZ FLAGS_REG)
+       (compare:CCZ (match_dup 2) (const_int 0)))]
+{
+  unsigned HOST_WIDE_INT ival = UINTVAL (operands[1]);
+  int first_nonzero_byte, bitsize;
+  rtx new_addr, new_const;
+  machine_mode new_mode;
+
+  /* A zero mask has no lowest set bit to anchor the narrowed access.  */
+  if (ival == 0)
+    FAIL;
+
+  /* Clear bits outside mode width.  */
+  ival &= GET_MODE_MASK (<MODE>mode);
+
+  /* Byte index of the lowest set bit; it is used below as the byte
+     offset of the narrowed memory reference.  */
+  first_nonzero_byte = ctz_hwi (ival) / BITS_PER_UNIT;
+
+  /* Drop the all-zero low bytes from the mask.  */
+  ival >>= first_nonzero_byte * BITS_PER_UNIT;
+
+  /* Number of significant bits remaining in the shifted mask.  */
+  bitsize = sizeof (ival) * BITS_PER_UNIT - clz_hwi (ival);
+
+  /* Pick the smallest of QI/HI/SI/DImode that holds BITSIZE bits.  */
+  if (bitsize <= GET_MODE_BITSIZE (QImode))
+    new_mode = QImode;
+  else if (bitsize <= GET_MODE_BITSIZE (HImode))
+    new_mode = HImode;
+  else if (bitsize <= GET_MODE_BITSIZE (SImode))
+    new_mode = SImode;
+  else
+    new_mode = DImode;
+
+  /* Give up unless the new access is strictly narrower than the
+     original one.  */
+  if (GET_MODE_SIZE (new_mode) >= GET_MODE_SIZE (<MODE>mode))
+    FAIL;
+
+  new_addr = adjust_address (operands[0], new_mode, first_nonzero_byte);
+  new_const = gen_int_mode (ival, new_mode);
+
+  /* Operand 2 is the narrowed AND that replaces the original compare
+     argument.  */
+  operands[2] = gen_rtx_AND (new_mode, new_addr, new_const);
+})
+
 ;; %%% This used to optimize known byte-wide and operations to memory,
 ;; and sometimes to QImode registers.  If this is considered useful,
 ;; it should be done with splitters.
    and{q}\t{%2, %0|%0, %2}
    #
    #"
-  [(set_attr "isa" "x64,x64,x64,x64,avx512bw")
+  [(set_attr "isa" "x64,x64,x64,x64,avx512bw_512")
    (set_attr "type" "alu,alu,alu,imovx,msklog")
    (set_attr "length_immediate" "*,*,*,0,*")
    (set (attr "prefix_rex")
           (symbol_ref "true")))])
 
 ;; Alternative 1 is needed to work around LRA limitation, see PR82524.
+;; Alternative 0 ties operand 1 to operand 0 and emits the instruction
+;; directly.  Alternative 1 outputs "#" and is split after reload, but
+;; only when operand 0 differs from both inputs: operand 1 is first
+;; copied into the low part of operand 0, then the operation is applied
+;; to operand 0 and operand 2.
-(define_insn_and_split "*and<mode>_1_slp"
+(define_insn_and_split "*<code><mode>_1_slp"
   [(set (strict_low_part (match_operand:SWI12 0 "register_operand" "+<r>,&<r>"))
-       (and:SWI12 (match_operand:SWI12 1 "nonimmediate_operand" "%0,!<r>")
-                  (match_operand:SWI12 2 "general_operand" "<r>mn,<r>mn")))
+       (any_logic:SWI12 (match_operand:SWI12 1 "nonimmediate_operand" "%0,!<r>")
+                     (match_operand:SWI12 2 "general_operand" "<r>mn,<r>mn")))
    (clobber (reg:CC FLAGS_REG))]
   "!TARGET_PARTIAL_REG_STALL || optimize_function_for_size_p (cfun)"
   "@
-   and{<imodesuffix>}\t{%2, %0|%0, %2}
+   <logic>{<imodesuffix>}\t{%2, %0|%0, %2}
    #"
-  "&& reload_completed"
+  "&& reload_completed
+   && !(rtx_equal_p (operands[0], operands[1])
+       || rtx_equal_p (operands[0], operands[2]))"
   [(set (strict_low_part (match_dup 0)) (match_dup 1))
    (parallel
      [(set (strict_low_part (match_dup 0))
-          (and:SWI12 (match_dup 0) (match_dup 2)))
+          (any_logic:SWI12 (match_dup 0) (match_dup 2)))
       (clobber (reg:CC FLAGS_REG))])]
   ""
   [(set_attr "type" "alu")
    (set_attr "mode" "<MODE>")])
 
+;; Alternative 1 is needed to work around LRA limitation, see PR82524.
+;; Byte-wide logic operation into the low part of operand 0, combining
+;; the (const_int 8) (const_int 8) extract of operand 2 (printed as %h2)
+;; with QImode operand 1.  Alternative 0 ties operand 1 to operand 0.
+;; Alternative 1 outputs "#" and is split after reload when operand 0
+;; differs from operand 1: operand 1 is first copied into the low part
+;; of operand 0, then the operation is done between the extract of
+;; operand 2 and operand 0 itself.  The second insn of the split must
+;; keep both inputs inside any_logic so that it matches alternative 0
+;; of this pattern.
+(define_insn_and_split "*<code>qi_ext<mode>_1_slp"
+  [(set (strict_low_part (match_operand:QI 0 "register_operand" "+Q,&Q"))
+       (any_logic:QI
+         (subreg:QI
+           (match_operator:SWI248 3 "extract_operator"
+             [(match_operand 2 "int248_register_operand" "Q,Q")
+              (const_int 8)
+              (const_int 8)]) 0)
+         (match_operand:QI 1 "nonimmediate_operand" "0,!qm")))
+   (clobber (reg:CC FLAGS_REG))]
+  "!TARGET_PARTIAL_REG_STALL || optimize_function_for_size_p (cfun)"
+  "@
+   <logic>{b}\t{%h2, %0|%0, %h2}
+   #"
+  "&& reload_completed
+   && !rtx_equal_p (operands[0], operands[1])"
+  [(set (strict_low_part (match_dup 0)) (match_dup 1))
+   (parallel
+     [(set (strict_low_part (match_dup 0))
+          (any_logic:QI
+            (subreg:QI
+              (match_op_dup 3
+                [(match_dup 2) (const_int 8) (const_int 8)]) 0)
+            (match_dup 0)))
+      (clobber (reg:CC FLAGS_REG))])]
+  ""
+  [(set_attr "type" "alu")
+   (set_attr "mode" "QI")])
+
 (define_split
   [(set (match_operand:SWI248 0 "register_operand")
        (and:SWI248 (match_operand:SWI248 1 "nonimmediate_operand")
   [(set_attr "type" "alu")
    (set_attr "mode" "<MODE>")])
 
-(define_insn "*andqi_ext<mode>_0"
-  [(set (match_operand:QI 0 "nonimm_x64constmem_operand" "=QBc,m")
-       (and:QI
+;; Byte-wide logic operation whose first input is the
+;; (const_int 8) (const_int 8) extract of operand 2 (printed as %h2);
+;; the other input, operand 1, is tied to the destination operand 0.
+(define_insn "*<code>qi_ext<mode>_0"
+  [(set (match_operand:QI 0 "nonimmediate_operand" "=QBn")
+       (any_logic:QI
          (subreg:QI
-           (zero_extract:SWI248
-             (match_operand 2 "int248_register_operand" "Q,Q")
-             (const_int 8)
-             (const_int 8)) 0)
-         (match_operand:QI 1 "nonimm_x64constmem_operand" "0,0")))
+           (match_operator:SWI248 3 "extract_operator"
+             [(match_operand 2 "int248_register_operand" "Q")
+              (const_int 8)
+              (const_int 8)]) 0)
+         (match_operand:QI 1 "nonimmediate_operand" "0")))
    (clobber (reg:CC FLAGS_REG))]
   ""
-  "and{b}\t{%h2, %0|%0, %h2}"
-  [(set_attr "isa" "*,nox64")
+  "<logic>{b}\t{%h2, %0|%0, %h2}"
+  [(set_attr "addr" "gpr8")
    (set_attr "type" "alu")
    (set_attr "mode" "QI")])
 
               (match_operand:QI 2 "const_int_operand")) 0))
       (clobber (reg:CC FLAGS_REG))])])
 
-(define_insn "*andqi_ext<mode>_1"
+;; Alternative 1 is needed to work around LRA limitation, see PR82524.
+;; Alternative 0 ties operand 1 to operand 0.  Alternative 1 outputs "#"
+;; and is split after reload when operands 0 and 1 differ: the extracted
+;; field of operand 1 is first copied into the same field of operand 0,
+;; then the operation is done in place on operand 0 with operand 2.
+(define_insn_and_split "*<code>qi_ext<mode>_1"
   [(set (zero_extract:SWI248
-         (match_operand 0 "int248_register_operand" "+Q,Q")
+         (match_operand 0 "int248_register_operand" "+Q,&Q")
          (const_int 8)
          (const_int 8))
        (subreg:SWI248
-         (and:QI
+         (any_logic:QI
            (subreg:QI
-             (zero_extract:SWI248
-               (match_operand 1 "int248_register_operand" "0,0")
-               (const_int 8)
-               (const_int 8)) 0)
-           (match_operand:QI 2 "general_x64constmem_operand" "QnBc,m")) 0))
+             (match_operator:SWI248 3 "extract_operator"
+               [(match_operand 1 "int248_register_operand" "0,!Q")
+                (const_int 8)
+                (const_int 8)]) 0)
+           (match_operand:QI 2 "general_operand" "QnBn,QnBn")) 0))
    (clobber (reg:CC FLAGS_REG))]
-  "/* FIXME: without this LRA can't reload this pattern, see PR82524.  */
-   rtx_equal_p (operands[0], operands[1])"
-  "and{b}\t{%2, %h0|%h0, %2}"
-  [(set_attr "isa" "*,nox64")
+  ""
+  "@
+   <logic>{b}\t{%2, %h0|%h0, %2}
+   #"
+  "reload_completed
+   && !(rtx_equal_p (operands[0], operands[1]))"
+  [(set (zero_extract:SWI248
+         (match_dup 0) (const_int 8) (const_int 8))
+       (zero_extract:SWI248
+         (match_dup 1) (const_int 8) (const_int 8)))
+   (parallel
+     [(set (zero_extract:SWI248
+            (match_dup 0) (const_int 8) (const_int 8))
+          (subreg:SWI248
+            (any_logic:QI
+              (subreg:QI
+                (match_op_dup 3
+                  [(match_dup 0) (const_int 8) (const_int 8)]) 0)
+              (match_dup 2)) 0))
+      (clobber (reg:CC FLAGS_REG))])]
+  ""
+  [(set_attr "addr" "gpr8")
    (set_attr "type" "alu")
    (set_attr "mode" "QI")])
 
-;; Generated by peephole translating test to and.  This shows up
-;; often in fp comparisons.
-(define_insn "*andqi_ext<mode>_1_cc"
-  [(set (reg FLAGS_REG)
-       (compare
-         (and:QI
-           (subreg:QI
-             (zero_extract:SWI248
-               (match_operand 1 "int248_register_operand" "0,0")
-               (const_int 8)
-               (const_int 8)) 0)
-           (match_operand:QI 2 "general_x64constmem_operand" "QnBc,m"))
-         (const_int 0)))
+;; Alternative 1 is needed to work around LRA limitation, see PR82524.
+;; Flag-setting variant: the first set compares the logic result with
+;; zero, the second stores it into the extracted field of operand 0.
+;; Alternative 1 outputs "#" and is split after reload when operands 0
+;; and 1 differ, copying the field of operand 1 into operand 0 first.
+;; NOTE(review): the last set of the split reads the field of operand 1,
+;; whereas the insn pattern's second set reads operand 0 (match_dup 0);
+;; confirm this difference is intended.
+(define_insn_and_split "*<code>qi_ext<mode>_1_cc"
+  [(set (match_operand 4 "flags_reg_operand")
+       (match_operator 5 "compare_operator"
+         [(any_logic:QI
+            (subreg:QI
+              (match_operator:SWI248 3 "extract_operator"
+                [(match_operand 1 "int248_register_operand" "0,!Q")
+                 (const_int 8)
+                 (const_int 8)]) 0)
+            (match_operand:QI 2 "general_operand" "QnBn,QnBn"))
+         (const_int 0)]))
    (set (zero_extract:SWI248
-         (match_operand 0 "int248_register_operand" "+Q,Q")
+         (match_operand 0 "int248_register_operand" "+Q,&Q")
          (const_int 8)
          (const_int 8))
        (subreg:SWI248
-         (and:QI
+         (any_logic:QI
            (subreg:QI
-             (zero_extract:SWI248
-               (match_dup 1)
-               (const_int 8)
-               (const_int 8)) 0)
+             (match_op_dup 3
+               [(match_dup 0) (const_int 8) (const_int 8)]) 0)
            (match_dup 2)) 0))]
-  "ix86_match_ccmode (insn, CCNOmode)
-   /* FIXME: without this LRA can't reload this pattern, see PR82524.  */
-   && rtx_equal_p (operands[0], operands[1])"
-  "and{b}\t{%2, %h0|%h0, %2}"
-  [(set_attr "isa" "*,nox64")
+  "ix86_match_ccmode (insn, CCNOmode)"
+  "@
+   <logic>{b}\t{%2, %h0|%h0, %2}
+   #"
+  "&& reload_completed
+   && !(rtx_equal_p (operands[0], operands[1]))"
+  [(set (zero_extract:SWI248
+         (match_dup 0) (const_int 8) (const_int 8))
+       (zero_extract:SWI248
+         (match_dup 1) (const_int 8) (const_int 8)))
+   (parallel
+     [(set (match_dup 4)
+          (match_op_dup 5
+            [(any_logic:QI
+               (subreg:QI
+                 (match_op_dup 3
+                   [(match_dup 0) (const_int 8) (const_int 8)]) 0)
+               (match_dup 2))
+             (const_int 0)]))
+      (set (zero_extract:SWI248
+            (match_dup 0) (const_int 8) (const_int 8))
+          (subreg:SWI248
+            (any_logic:QI
+              (subreg:QI
+                (match_op_dup 3
+                  [(match_dup 1) (const_int 8) (const_int 8)]) 0)
+              (match_dup 2)) 0))])]
+  ""
+  [(set_attr "addr" "gpr8")
    (set_attr "type" "alu")
    (set_attr "mode" "QI")])
 
-(define_insn "*andqi_ext<mode>_2"
+;; Alternative 1 is needed to work around LRA limitation, see PR82524.
+;; Both inputs are (const_int 8) (const_int 8) extracts (printed as %h0
+;; and %h2).  Alternative 1 outputs "#" and is split after reload when
+;; operand 0 equals neither operand 1 nor operand 2: the field of
+;; operand 1 is copied into operand 0, then the operation is done in
+;; place on operand 0 with the field of operand 2.
+(define_insn_and_split "*<code>qi_ext<mode>_2"
   [(set (zero_extract:SWI248
-         (match_operand 0 "int248_register_operand" "+Q")
+         (match_operand 0 "int248_register_operand" "+Q,&Q")
          (const_int 8)
          (const_int 8))
        (subreg:SWI248
-         (and:QI
+         (any_logic:QI
            (subreg:QI
-             (zero_extract:SWI248
-               (match_operand 1 "int248_register_operand" "%0")
-               (const_int 8)
-               (const_int 8)) 0)
+             (match_operator:SWI248 3 "extract_operator"
+               [(match_operand 1 "int248_register_operand" "%0,!Q")
+                (const_int 8)
+                (const_int 8)]) 0)
            (subreg:QI
-             (zero_extract:SWI248
-               (match_operand 2 "int248_register_operand" "Q")
-               (const_int 8)
-               (const_int 8)) 0)) 0))
+             (match_operator:SWI248 4 "extract_operator"
+               [(match_operand 2 "int248_register_operand" "Q,Q")
+                (const_int 8)
+                (const_int 8)]) 0)) 0))
    (clobber (reg:CC FLAGS_REG))]
-  "/* FIXME: without this LRA can't reload this pattern, see PR82524.  */
-   rtx_equal_p (operands[0], operands[1])
-   || rtx_equal_p (operands[0], operands[2])"
-  "and{b}\t{%h2, %h0|%h0, %h2}"
+  ""
+  "@
+   <logic>{b}\t{%h2, %h0|%h0, %h2}
+   #"
+  "reload_completed
+   && !(rtx_equal_p (operands[0], operands[1])
+       || rtx_equal_p (operands[0], operands[2]))"
+  [(set (zero_extract:SWI248
+         (match_dup 0) (const_int 8) (const_int 8))
+       (zero_extract:SWI248
+         (match_dup 1) (const_int 8) (const_int 8)))
+   (parallel
+     [(set (zero_extract:SWI248
+            (match_dup 0) (const_int 8) (const_int 8))
+          (subreg:SWI248
+            (any_logic:QI
+              (subreg:QI
+                (match_op_dup 3
+                  [(match_dup 0) (const_int 8) (const_int 8)]) 0)
+              (subreg:QI
+                (match_op_dup 4
+                  [(match_dup 2) (const_int 8) (const_int 8)]) 0)) 0))
+      (clobber (reg:CC FLAGS_REG))])]
+  ""
+  [(set_attr "type" "alu")
+   (set_attr "mode" "QI")])
+
+;; Alternative 1 is needed to work around LRA limitation, see PR82524.
+;; Here the logic operation is applied to the full registers and the
+;; (const_int 8) (const_int 8) extract is taken from its result; the
+;; insn condition requires operands 1 and 2 to have the same mode.
+;; Alternative 1 outputs "#" and is split after reload when operand 0
+;; equals neither input: the field of operand 1 is copied into
+;; operand 0, then the operation uses operand 4, which the preparation
+;; statement sets to operand 0 in the mode of operand 1.
+(define_insn_and_split "*<code>qi_ext<mode>_3"
+  [(set (zero_extract:SWI248
+         (match_operand 0 "int248_register_operand" "+Q,&Q")
+         (const_int 8)
+         (const_int 8))
+       (match_operator:SWI248 3 "extract_operator"
+         [(any_logic
+            (match_operand 1 "int248_register_operand" "%0,!Q")
+            (match_operand 2 "int248_register_operand" "Q,Q"))
+          (const_int 8)
+          (const_int 8)]))
+   (clobber (reg:CC FLAGS_REG))]
+  "GET_MODE (operands[1]) == GET_MODE (operands[2])"
+  "@
+   <logic>{b}\t{%h2, %h0|%h0, %h2}
+   #"
+  "&& reload_completed
+   && !(rtx_equal_p (operands[0], operands[1])
+       || rtx_equal_p (operands[0], operands[2]))"
+  [(set (zero_extract:SWI248
+         (match_dup 0) (const_int 8) (const_int 8))
+       (zero_extract:SWI248
+         (match_dup 1) (const_int 8) (const_int 8)))
+   (parallel
+     [(set (zero_extract:SWI248
+            (match_dup 0) (const_int 8) (const_int 8))
+          (match_op_dup 3
+            [(any_logic (match_dup 4) (match_dup 2))
+             (const_int 8) (const_int 8)]))
+      (clobber (reg:CC FLAGS_REG))])]
+  "operands[4] = gen_lowpart (GET_MODE (operands[1]), operands[0]);"
   [(set_attr "type" "alu")
    (set_attr "mode" "QI")])
 
          (not:SWI48 (match_operand:SWI48 1 "register_operand" "r,r,k"))
          (match_operand:SWI48 2 "nonimmediate_operand" "r,m,k")))
    (clobber (reg:CC FLAGS_REG))]
-  "TARGET_BMI || TARGET_AVX512BW"
+  "TARGET_BMI
+   || (TARGET_AVX512BW && (<MODE>mode == SImode || TARGET_EVEX512))"
   "@
    andn\t{%2, %1, %0|%0, %1, %2}
    andn\t{%2, %1, %0|%0, %1, %2}
    #"
-  [(set_attr "isa" "bmi,bmi,avx512bw")
+  [(set_attr "isa" "bmi,bmi,<kmov_isa>")
    (set_attr "type" "bitmanip,bitmanip,msklog")
    (set_attr "btver2_decode" "direct, double,*")
    (set_attr "mode" "<MODE>")])
    <logic>{<imodesuffix>}\t{%2, %0|%0, %2}
    <logic>{<imodesuffix>}\t{%2, %0|%0, %2}
    #"
-  [(set (attr "isa")
-       (cond [(eq_attr "alternative" "2")
-                (if_then_else (eq_attr "mode" "SI,DI")
-                  (const_string "avx512bw")
-                  (const_string "avx512f"))
-             ]
-             (const_string "*")))
+  [(set_attr "isa" "*,*,<kmov_isa>")
    (set_attr "type" "alu, alu, msklog")
    (set_attr "mode" "<MODE>")])
 
       DONE;
     }
 }
-  [(set (attr "isa")
-       (cond [(eq_attr "alternative" "2")
-                (if_then_else (eq_attr "mode" "SI,DI")
-                  (const_string "avx512bw")
-                  (const_string "avx512f"))
-             ]
-             (const_string "*")))
+  [(set_attr "isa" "*,*,<kmov_isa>")
    (set_attr "type" "alu, alu, msklog")
    (set_attr "mode" "<MODE>")])
 
              (symbol_ref "!TARGET_PARTIAL_REG_STALL")]
           (symbol_ref "true")))])
 
-;; Alternative 1 is needed to work around LRA limitation, see PR82524.
-(define_insn_and_split "*<code><mode>_1_slp"
-  [(set (strict_low_part (match_operand:SWI12 0 "register_operand" "+<r>,&<r>"))
-       (any_or:SWI12 (match_operand:SWI12 1 "nonimmediate_operand" "%0,!<r>")
-                     (match_operand:SWI12 2 "general_operand" "<r>mn,<r>mn")))
-   (clobber (reg:CC FLAGS_REG))]
-  "!TARGET_PARTIAL_REG_STALL || optimize_function_for_size_p (cfun)"
-  "@
-   <logic>{<imodesuffix>}\t{%2, %0|%0, %2}
-   #"
-  "&& reload_completed"
-  [(set (strict_low_part (match_dup 0)) (match_dup 1))
-   (parallel
-     [(set (strict_low_part (match_dup 0))
-          (any_or:SWI12 (match_dup 0) (match_dup 2)))
-      (clobber (reg:CC FLAGS_REG))])]
-  ""
-  [(set_attr "type" "alu")
-   (set_attr "mode" "<MODE>")])
-
 ;; convert (sign_extend:WIDE (any_logic:NARROW (memory, immediate)))
 ;; to (any_logic:WIDE (sign_extend (memory)), (sign_extend (immediate))).
 ;; This eliminates sign extension after logic operation.
   [(set_attr "type" "alu")
    (set_attr "mode" "<MODE>")])
 
-(define_insn "*<code>qi_ext<mode>_0"
-  [(set (match_operand:QI 0 "nonimm_x64constmem_operand" "=QBc,m")
-       (any_or:QI
-         (subreg:QI
-           (zero_extract:SWI248
-             (match_operand 2 "int248_register_operand" "Q,Q")
-             (const_int 8)
-             (const_int 8)) 0)
-         (match_operand:QI 1 "nonimm_x64constmem_operand" "0,0")))
-   (clobber (reg:CC FLAGS_REG))]
-  ""
-  "<logic>{b}\t{%h2, %0|%0, %h2}"
-  [(set_attr "isa" "*,nox64")
-   (set_attr "type" "alu")
-   (set_attr "mode" "QI")])
-
-(define_insn "*<code>qi_ext<mode>_1"
-  [(set (zero_extract:SWI248
-         (match_operand 0 "int248_register_operand" "+Q,Q")
-         (const_int 8)
-         (const_int 8))
-       (subreg:SWI248
-         (any_or:QI
-           (subreg:QI
-             (zero_extract:SWI248
-               (match_operand 1 "int248_register_operand" "0,0")
-               (const_int 8)
-               (const_int 8)) 0)
-           (match_operand:QI 2 "general_x64constmem_operand" "QnBc,m")) 0))
-   (clobber (reg:CC FLAGS_REG))]
-  "(!TARGET_PARTIAL_REG_STALL || optimize_function_for_size_p (cfun))
-   /* FIXME: without this LRA can't reload this pattern, see PR82524.  */
-   && rtx_equal_p (operands[0], operands[1])"
-  "<logic>{b}\t{%2, %h0|%h0, %2}"
-  [(set_attr "isa" "*,nox64")
-   (set_attr "type" "alu")
-   (set_attr "mode" "QI")])
-
-(define_insn "*<code>qi_ext<mode>_2"
-  [(set (zero_extract:SWI248
-         (match_operand 0 "int248_register_operand" "+Q")
-         (const_int 8)
-         (const_int 8))
-       (subreg:SWI248
-         (any_or:QI
-           (subreg:QI
-             (zero_extract:SWI248
-               (match_operand 1 "int248_register_operand" "%0")
-               (const_int 8)
-               (const_int 8)) 0)
-           (subreg:QI
-             (zero_extract:SWI248
-               (match_operand 2 "int248_register_operand" "Q")
-               (const_int 8)
-               (const_int 8)) 0)) 0))
-   (clobber (reg:CC FLAGS_REG))]
-  "(!TARGET_PARTIAL_REG_STALL || optimize_function_for_size_p (cfun))
-   /* FIXME: without this LRA can't reload this pattern, see PR82524.  */
-   && (rtx_equal_p (operands[0], operands[1])
-       || rtx_equal_p (operands[0], operands[2]))"
-  "<logic>{b}\t{%h2, %h0|%h0, %h2}"
-  [(set_attr "type" "alu")
-   (set_attr "mode" "QI")])
-
 ;; Convert wide OR instructions with immediate operand to shorter QImode
 ;; equivalents when possible.
 ;; Don't do the splitting with memory operands, since it introduces risk
                                  (const_int 8)) 0)
             (match_dup 2)) 0))])])
 
-(define_insn "*xorqi_ext<mode>_1_cc"
-  [(set (reg FLAGS_REG)
-       (compare
-         (xor:QI
-           (subreg:QI
-             (zero_extract:SWI248
-               (match_operand 1 "int248_register_operand" "0,0")
-               (const_int 8)
-               (const_int 8)) 0)
-           (match_operand:QI 2 "general_x64constmem_operand" "QnBc,m"))
-         (const_int 0)))
-   (set (zero_extract:SWI248
-         (match_operand 0 "int248_register_operand" "+Q,Q")
-         (const_int 8)
-         (const_int 8))
-       (subreg:SWI248
-         (xor:QI
-           (subreg:QI
-             (zero_extract:SWI248
-               (match_dup 1)
-               (const_int 8)
-               (const_int 8)) 0)
-         (match_dup 2)) 0))]
-  "ix86_match_ccmode (insn, CCNOmode)
-   /* FIXME: without this LRA can't reload this pattern, see PR82524.  */
-   && rtx_equal_p (operands[0], operands[1])"
-  "xor{b}\t{%2, %h0|%h0, %2}"
-  [(set_attr "isa" "*,nox64")
-   (set_attr "type" "alu")
-   (set_attr "mode" "QI")])
+;; Peephole2 rega = 0; rega op= regb into rega = regb.
+;; "op" is any code of the any_or_plus iterator.  The condition rejects
+;; the case where operand 1 mentions the register that was just cleared.
+;; The replacement is a plain move without the FLAGS_REG clobber.
+(define_peephole2
+  [(parallel [(set (match_operand:SWI 0 "general_reg_operand")
+                  (const_int 0))
+             (clobber (reg:CC FLAGS_REG))])
+   (parallel [(set (match_dup 0)
+                  (any_or_plus:SWI (match_dup 0)
+                                   (match_operand:SWI 1 "<general_operand>")))
+             (clobber (reg:CC FLAGS_REG))])]
+  "!reg_mentioned_p (operands[0], operands[1])"
+  [(set (match_dup 0) (match_dup 1))])
+
+;; Peephole2 dead instruction in rega = 0; rega op= rega.
+;; Both inputs of the second operation are the register that was just
+;; cleared, so only the clearing instruction (with its FLAGS_REG
+;; clobber) is kept.
+(define_peephole2
+  [(parallel [(set (match_operand:SWI 0 "general_reg_operand")
+                  (const_int 0))
+             (clobber (reg:CC FLAGS_REG))])
+   (parallel [(set (match_dup 0)
+                  (any_or_plus:SWI (match_dup 0) (match_dup 0)))
+             (clobber (reg:CC FLAGS_REG))])]
+  ""
+  [(parallel [(set (match_dup 0) (const_int 0))
+             (clobber (reg:CC FLAGS_REG))])])
 
 ;; Split DST = (HI<<32)|LO early to minimize register usage.
-(define_code_iterator any_or_plus [plus ior xor])
 (define_insn_and_split "*concat<mode><dwi>3_1"
   [(set (match_operand:<DWI> 0 "nonimmediate_operand" "=ro,r")
        (any_or_plus:<DWI>
          (ashift:<DWI> (match_operand:<DWI> 1 "register_operand" "r,r")
-                       (match_operand:<DWI> 2 "const_int_operand"))
+                       (match_operand:QI 2 "const_int_operand"))
          (zero_extend:<DWI>
            (match_operand:DWIH 3 "nonimmediate_operand" "r,m"))))]
   "INTVAL (operands[2]) == <MODE_SIZE> * BITS_PER_UNIT"
   "#"
   "&& reload_completed"
-  [(clobber (const_int 0))]
+  [(const_int 0)]
 {
   split_double_concat (<DWI>mode, operands[0], operands[3],
                       gen_lowpart (<MODE>mode, operands[1]));
          (zero_extend:<DWI>
            (match_operand:DWIH 1 "nonimmediate_operand" "r,m"))
          (ashift:<DWI> (match_operand:<DWI> 2 "register_operand" "r,r")
-                       (match_operand:<DWI> 3 "const_int_operand"))))]
+                       (match_operand:QI 3 "const_int_operand"))))]
   "INTVAL (operands[3]) == <MODE_SIZE> * BITS_PER_UNIT"
   "#"
   "&& reload_completed"
-  [(clobber (const_int 0))]
+  [(const_int 0)]
 {
   split_double_concat (<DWI>mode, operands[0], operands[1],
                       gen_lowpart (<MODE>mode, operands[2]));
 })
 
 (define_insn_and_split "*concat<mode><dwi>3_3"
-  [(set (match_operand:<DWI> 0 "nonimmediate_operand" "=ro,r,r,&r")
+  [(set (match_operand:<DWI> 0 "nonimmediate_operand" "=ro,r,r,&r,x")
        (any_or_plus:<DWI>
          (ashift:<DWI>
            (zero_extend:<DWI>
-             (match_operand:DWIH 1 "nonimmediate_operand" "r,m,r,m"))
-           (match_operand:<DWI> 2 "const_int_operand"))
+             (match_operand:DWIH 1 "nonimmediate_operand" "r,m,r,m,x"))
+           (match_operand:QI 2 "const_int_operand"))
          (zero_extend:<DWI>
-           (match_operand:DWIH 3 "nonimmediate_operand" "r,r,m,m"))))]
+           (match_operand:DWIH 3 "nonimmediate_operand" "r,r,m,m,0"))))]
   "INTVAL (operands[2]) == <MODE_SIZE> * BITS_PER_UNIT"
   "#"
   "&& reload_completed"
-  [(clobber (const_int 0))]
+  [(const_int 0)]
 {
-  split_double_concat (<DWI>mode, operands[0], operands[3], operands[1]);
+  /* An SSE destination is filled by one vec_concatv2di of operands 3
+     and 1 into the same hard register viewed as V2DImode; any other
+     destination goes through split_double_concat.  */
+  if (SSE_REG_P (operands[0]))
+    {
+      rtx tmp = gen_rtx_REG (V2DImode, REGNO (operands[0]));
+      emit_insn (gen_vec_concatv2di (tmp, operands[3], operands[1]));
+    }
+  else
+    split_double_concat (<DWI>mode, operands[0], operands[3], operands[1]);
   DONE;
-})
+}
+  [(set_attr "isa" "*,*,*,x64,x64")])
 
 (define_insn_and_split "*concat<mode><dwi>3_4"
   [(set (match_operand:<DWI> 0 "nonimmediate_operand" "=ro,r,r,&r")
          (ashift:<DWI>
            (zero_extend:<DWI>
              (match_operand:DWIH 2 "nonimmediate_operand" "r,r,m,m"))
-           (match_operand:<DWI> 3 "const_int_operand"))))]
+           (match_operand:QI 3 "const_int_operand"))))]
   "INTVAL (operands[3]) == <MODE_SIZE> * BITS_PER_UNIT"
   "#"
   "&& reload_completed"
-  [(clobber (const_int 0))]
+  [(const_int 0)]
 {
   split_double_concat (<DWI>mode, operands[0], operands[1], operands[2]);
   DONE;
-})
+}
+  [(set_attr "isa" "*,*,*,x64")])
 
 (define_insn_and_split "*concat<half><mode>3_5"
   [(set (match_operand:DWI 0 "nonimmediate_operand" "=r,o,o")
        (any_or_plus:DWI
          (ashift:DWI (match_operand:DWI 1 "register_operand" "r,r,r")
-                     (match_operand:DWI 2 "const_int_operand"))
+                     (match_operand:QI 2 "const_int_operand"))
          (match_operand:DWI 3 "const_scalar_int_operand" "n,n,Wd")))]
   "INTVAL (operands[2]) == <MODE_SIZE> * BITS_PER_UNIT / 2
    && (<MODE>mode == DImode
                                        VOIDmode))"
   "#"
   "&& reload_completed"
-  [(clobber (const_int 0))]
+  [(const_int 0)]
 {
   rtx op3 = simplify_subreg (<HALF>mode, operands[3], <MODE>mode, 0);
   split_double_concat (<MODE>mode, operands[0], op3,
          (ashift:<DWI>
            (zero_extend:<DWI>
              (match_operand:DWIH 1 "nonimmediate_operand" "r,r,r,m"))
-           (match_operand:<DWI> 2 "const_int_operand"))
+           (match_operand:QI 2 "const_int_operand"))
          (match_operand:<DWI> 3 "const_scalar_int_operand" "n,n,Wd,n")))]
   "INTVAL (operands[2]) == <MODE_SIZE> * BITS_PER_UNIT
    && (<DWI>mode == DImode
                                        VOIDmode))"
   "#"
   "&& reload_completed"
-  [(clobber (const_int 0))]
+  [(const_int 0)]
 {
   rtx op3 = simplify_subreg (<MODE>mode, operands[3], <DWI>mode, 0);
   split_double_concat (<DWI>mode, operands[0], op3, operands[1]);
                                       VOIDmode)"
   "#"
   "&& reload_completed"
-  [(clobber (const_int 0))]
+  [(const_int 0)]
 {
   rtx op2;
   if (<DWI>mode == DImode)
   "@
    neg{<imodesuffix>}\t%0
    #"
-  "&& reload_completed"
+  "&& reload_completed
+   && !(rtx_equal_p (operands[0], operands[1]))"
   [(set (strict_low_part (match_dup 0)) (match_dup 1))
    (parallel
      [(set (strict_low_part (match_dup 0))
      (set (match_operand:SWI48 0 "register_operand")
          (neg:SWI48 (match_dup 1)))])])
 
-(define_insn "*negqi_ext<mode>_2"
+;; Alternative 1 is needed to work around LRA limitation, see PR82524.
+;; Negation of the (const_int 8) (const_int 8) field (printed as %h0).
+;; Alternative 0 ties operand 1 to operand 0.  Alternative 1 outputs "#"
+;; and is split after reload when operands 0 and 1 differ: the field of
+;; operand 1 is copied into operand 0, which is then negated in place.
+(define_insn_and_split "*negqi_ext<mode>_1"
   [(set (zero_extract:SWI248
-         (match_operand 0 "int248_register_operand" "+Q")
+         (match_operand 0 "int248_register_operand" "+Q,&Q")
          (const_int 8)
          (const_int 8))
        (subreg:SWI248
          (neg:QI
            (subreg:QI
-             (zero_extract:SWI248
-               (match_operand 1 "int248_register_operand" "0")
-               (const_int 8)
-               (const_int 8)) 0)) 0))
+             (match_operator:SWI248 2 "extract_operator"
+               [(match_operand 1 "int248_register_operand" "0,!Q")
+                (const_int 8)
+                (const_int 8)]) 0)) 0))
    (clobber (reg:CC FLAGS_REG))]
-  "/* FIXME: without this LRA can't reload this pattern, see PR82524.  */
-   rtx_equal_p (operands[0], operands[1])"
-  "neg{b}\t%h0"
+  ""
+  "@
+   neg{b}\t%h0
+   #"
+  "reload_completed
+   && !(rtx_equal_p (operands[0], operands[1]))"
+  [(set (zero_extract:SWI248
+         (match_dup 0) (const_int 8) (const_int 8))
+       (zero_extract:SWI248
+         (match_dup 1) (const_int 8) (const_int 8)))
+   (parallel
+     [(set (zero_extract:SWI248
+            (match_dup 0) (const_int 8) (const_int 8))
+          (subreg:SWI248
+            (neg:QI
+              (subreg:QI
+                (match_op_dup 2
+                  [(match_dup 0) (const_int 8) (const_int 8)]) 0)) 0))
+      (clobber (reg:CC FLAGS_REG))])]
+  ""
   [(set_attr "type" "negnot")
    (set_attr "mode" "QI")])
 
   "@
    not{<imodesuffix>}\t%0
    #"
-  [(set (attr "isa")
-       (cond [(eq_attr "alternative" "1")
-                (if_then_else (eq_attr "mode" "SI,DI")
-                  (const_string "avx512bw")
-                  (const_string "avx512f"))
-             ]
-             (const_string "*")))
+  [(set_attr "isa" "*,<kmov_isa>")
    (set_attr "type" "negnot,msklog")
    (set_attr "mode" "<MODE>")])
 
   "@
    not{l}\t%k0
    #"
-  [(set_attr "isa" "x64,avx512bw")
+  [(set_attr "isa" "x64,avx512bw_512")
    (set_attr "type" "negnot,msklog")
    (set_attr "mode" "SI,SI")])
 
   "@
    not{<imodesuffix>}\t%0
    #"
-  "&& reload_completed"
+  "&& reload_completed
+   && !(rtx_equal_p (operands[0], operands[1]))"
   [(set (strict_low_part (match_dup 0)) (match_dup 1))
    (set (strict_low_part (match_dup 0))
        (not:SWI12 (match_dup 0)))]
                                    (const_int 0)]))
              (set (match_dup 1)
                   (zero_extend:DI (xor:SI (match_dup 3) (const_int -1))))])])
+
+;; Alternative 1 is needed to work around LRA limitation, see PR82524.
+;; Bitwise-complement the 8-bit field at bit offset 8 of operand 1 and
+;; store the result into the same field of operand 0.  Alternative 0 ties
+;; operand 1 to operand 0 and emits "not{b} %h0" directly.  Alternative 1
+;; emits "#" and, once reload has completed and the two operands differ,
+;; is split into a copy of that field from operand 1 into operand 0
+;; followed by the in-place complement.  Unlike the neg variant, no flags
+;; clobber appears in this pattern.
+(define_insn_and_split "*one_cmplqi_ext<mode>_1"
+  [(set (zero_extract:SWI248
+         (match_operand 0 "int248_register_operand" "+Q,&Q")
+         (const_int 8)
+         (const_int 8))
+       (subreg:SWI248
+         (not:QI
+           (subreg:QI
+             (match_operator:SWI248 2 "extract_operator"
+               [(match_operand 1 "int248_register_operand" "0,!Q")
+                (const_int 8)
+                (const_int 8)]) 0)) 0))]
+  ""
+  "@
+   not{b}\t%h0
+   #"
+  "reload_completed
+   && !(rtx_equal_p (operands[0], operands[1]))"
+  [(set (zero_extract:SWI248
+         (match_dup 0) (const_int 8) (const_int 8))
+       (zero_extract:SWI248
+         (match_dup 1) (const_int 8) (const_int 8)))
+   (set (zero_extract:SWI248
+         (match_dup 0) (const_int 8) (const_int 8))
+       (subreg:SWI248
+         (not:QI
+           (subreg:QI
+             (match_op_dup 2
+               [(match_dup 0) (const_int 8) (const_int 8)]) 0)) 0))]
+  ""
+  [(set_attr "type" "negnot")
+   (set_attr "mode" "QI")])
 \f
 ;; Shift instructions
 
   [(const_int 0)]
   "ix86_split_ashl (operands, operands[3], <DWI>mode); DONE;")
 
+;; Left shift of an extended (any_extend) half-word operand 1 by a
+;; constant count in the range [W, 2*W), where W is
+;; <MODE_SIZE> * BITS_PER_UNIT.  Always emitted as "#" and split after
+;; reload into: an optional move of operand 1 into one half of the
+;; destination, an optional shift of that half by (count - W), and a
+;; clear of the other half.
+(define_insn_and_split "*ashl<dwi>3_doubleword_highpart"
+  [(set (match_operand:<DWI> 0 "register_operand" "=r")
+       (ashift:<DWI>
+         (any_extend:<DWI> (match_operand:DWIH 1 "nonimmediate_operand" "rm"))
+         (match_operand:QI 2 "const_int_operand")))
+   (clobber (reg:CC FLAGS_REG))]
+  "INTVAL (operands[2]) >= <MODE_SIZE> * BITS_PER_UNIT
+   && INTVAL (operands[2]) < <MODE_SIZE> * BITS_PER_UNIT * 2"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  /* Split the double-word destination in place: operands[0] is replaced
+     by one half and operands[3] receives the other (presumably the low
+     and high halves respectively, given the pattern name -- confirm
+     against split_double_mode).  */
+  split_double_mode (<DWI>mode, &operands[0], 1, &operands[0], &operands[3]);
+  /* Shift count that remains after subtracting W from the original count.  */
+  int bits = INTVAL (operands[2]) - (<MODE_SIZE> * BITS_PER_UNIT);
+  /* Skip the copy when the source already lives in the target half.  */
+  if (!rtx_equal_p (operands[3], operands[1]))
+    emit_move_insn (operands[3], operands[1]);
+  /* A count of exactly W needs no further shift.  */
+  if (bits > 0)
+    emit_insn (gen_ashl<mode>3 (operands[3], operands[3], GEN_INT (bits)));
+  ix86_expand_clear (operands[0]);
+  DONE;
+})
+
 (define_insn "x86_64_shld"
   [(set (match_operand:DI 0 "nonimmediate_operand" "+r*m")
         (ior:DI (ashift:DI (match_dup 0)
        return "sal{<imodesuffix>}\t{%2, %0|%0, %2}";
     }
 }
-  [(set_attr "isa" "*,*,bmi2,avx512bw")
+  [(set_attr "isa" "*,*,bmi2,<kmov_isa>")
    (set (attr "type")
      (cond [(eq_attr "alternative" "1")
              (const_string "lea")
        return "sal{<imodesuffix>}\t{%2, %0|%0, %2}";
     }
 }
-  "&& reload_completed"
+  "&& reload_completed
+   && !(rtx_equal_p (operands[0], operands[1]))"
   [(set (strict_low_part (match_dup 0)) (match_dup 1))
    (parallel
      [(set (strict_low_part (match_dup 0))
 
 ;; Convert ashift to the lea pattern to avoid flags dependency.
 (define_split
-  [(set (match_operand:SWI 0 "register_operand")
-       (ashift:SWI (match_operand:SWI 1 "index_register_operand")
+  [(set (match_operand:SWI 0 "general_reg_operand")
+       (ashift:SWI (match_operand:SWI 1 "index_reg_operand")
                    (match_operand 2 "const_0_to_3_operand")))
    (clobber (reg:CC FLAGS_REG))]
   "reload_completed
 
 ;; Convert ashift to the lea pattern to avoid flags dependency.
 (define_split
-  [(set (match_operand:DI 0 "register_operand")
+  [(set (match_operand:DI 0 "general_reg_operand")
        (zero_extend:DI
-         (ashift:SI (match_operand:SI 1 "index_register_operand")
+         (ashift:SI (match_operand:SI 1 "index_reg_operand")
                     (match_operand 2 "const_0_to_3_operand"))))
    (clobber (reg:CC FLAGS_REG))]
   "TARGET_64BIT && reload_completed
        (const_string "*")))
    (set_attr "mode" "<MODE>")])
 
-(define_insn "*ashlqi_ext<mode>_2"
+;; Alternative 1 is needed to work around LRA limitation, see PR82524.
+(define_insn_and_split "*ashlqi_ext<mode>_1"
   [(set (zero_extract:SWI248
-         (match_operand 0 "int248_register_operand" "+Q")
+         (match_operand 0 "int248_register_operand" "+Q,&Q")
          (const_int 8)
          (const_int 8))
        (subreg:SWI248
          (ashift:QI
            (subreg:QI
-             (zero_extract:SWI248
-               (match_operand 1 "int248_register_operand" "0")
-               (const_int 8)
-               (const_int 8)) 0)
-           (match_operand:QI 2 "nonmemory_operand" "cI")) 0))
-  (clobber (reg:CC FLAGS_REG))]
-  "/* FIXME: without this LRA can't reload this pattern, see PR82524.  */
-   rtx_equal_p (operands[0], operands[1])"
+             (match_operator:SWI248 3 "extract_operator"
+               [(match_operand 1 "int248_register_operand" "0,!Q")
+                (const_int 8)
+                (const_int 8)]) 0)
+           (match_operand:QI 2 "nonmemory_operand" "cI,cI")) 0))
+   (clobber (reg:CC FLAGS_REG))]
+  ""
 {
+  if (which_alternative)
+    return "#";
+
   switch (get_attr_type (insn))
     {
     case TYPE_ALU:
        return "sal{b}\t{%2, %h0|%h0, %2}";
     }
 }
+  "reload_completed
+   && !(rtx_equal_p (operands[0], operands[1]))"
+  [(set (zero_extract:SWI248
+         (match_dup 0) (const_int 8) (const_int 8))
+       (match_dup 1))
+   (parallel
+     [(set (zero_extract:SWI248
+            (match_dup 0) (const_int 8) (const_int 8))
+          (subreg:SWI248
+            (ashift:QI
+              (subreg:QI
+                (match_op_dup 3
+                  [(match_dup 0) (const_int 8) (const_int 8)]) 0)
+              (match_dup 2)) 0))
+      (clobber (reg:CC FLAGS_REG))])]
+  ""
   [(set (attr "type")
      (cond [(and (match_test "TARGET_DOUBLE_WITH_ADD")
                 (match_operand 2 "const1_operand"))
   [(const_int 0)]
   "ix86_split_<insn> (operands, operands[3], <DWI>mode); DONE;")
 
+;; Split truncations of double word right shifts into x86_shrd_1.
+;; Matches the half-word subreg (at offset 0) of a double-word right
+;; shift of register operand 1 by a constant count smaller than
+;; <MODE_SIZE> * BITS_PER_UNIT.  After reload the replacement is a single
+;; parallel that ORs (operand 0 logically shifted right by the count)
+;; with the half-word subreg of (zero-extended operands[3] shifted left
+;; by operands[4]), clobbering the flags register.  The replacement uses
+;; lshiftrt for both codes covered by any_shiftrt.
+(define_insn_and_split "<insn><dwi>3_doubleword_lowpart"
+  [(set (match_operand:DWIH 0 "register_operand" "=&r")
+       (subreg:DWIH
+         (any_shiftrt:<DWI> (match_operand:<DWI> 1 "register_operand" "r")
+                            (match_operand:QI 2 "const_int_operand")) 0))
+   (clobber (reg:CC FLAGS_REG))]
+  "UINTVAL (operands[2]) < <MODE_SIZE> * BITS_PER_UNIT"
+  "#"
+  "&& reload_completed"
+  [(parallel
+      [(set (match_dup 0)
+           (ior:DWIH (lshiftrt:DWIH (match_dup 0) (match_dup 2))
+                     (subreg:DWIH
+                       (ashift:<DWI> (zero_extend:<DWI> (match_dup 3))
+                                     (match_dup 4)) 0)))
+       (clobber (reg:CC FLAGS_REG))])]
+{
+  /* Split the double-word source in place: operands[1] is replaced by
+     one half and operands[3] receives the other (presumably low and
+     high respectively -- confirm against split_double_mode).  */
+  split_double_mode (<DWI>mode, &operands[1], 1, &operands[1], &operands[3]);
+  /* Complementary count used for the left-shift term of the template.  */
+  operands[4] = GEN_INT ((<MODE_SIZE> * BITS_PER_UNIT) - INTVAL (operands[2]));
+  /* The template shifts operand 0 in place, so seed it with the half now
+     held in operands[1] unless it is already there.  */
+  if (!rtx_equal_p (operands[0], operands[1]))
+    emit_move_insn (operands[0], operands[1]);
+})
+
 (define_insn "x86_64_shrd"
   [(set (match_operand:DI 0 "nonimmediate_operand" "+r*m")
         (ior:DI (lshiftrt:DI (match_dup 0)
        return "shr{<imodesuffix>}\t{%2, %0|%0, %2}";
     }
 }
-  [(set_attr "isa" "*,bmi2,avx512bw")
+  [(set_attr "isa" "*,bmi2,<kmov_isa>")
    (set_attr "type" "ishift,ishiftx,msklog")
    (set (attr "length_immediate")
      (if_then_else
   else
     return "<shift>{<imodesuffix>}\t{%2, %0|%0, %2}";
 }
-  "&& reload_completed"
+  "&& reload_completed
+   && !(rtx_equal_p (operands[0], operands[1]))"
   [(set (strict_low_part (match_dup 0)) (match_dup 1))
    (parallel
      [(set (strict_low_part (match_dup 0))
        (const_string "*")))
    (set_attr "mode" "<MODE>")])
 
-(define_insn "*<insn>qi_ext<mode>_2"
+;; Alternative 1 is needed to work around LRA limitation, see PR82524.
+(define_insn_and_split "*<insn>qi_ext<mode>_1"
   [(set (zero_extract:SWI248
-         (match_operand 0 "int248_register_operand" "+Q")
+         (match_operand 0 "int248_register_operand" "+Q,&Q")
          (const_int 8)
          (const_int 8))
        (subreg:SWI248
          (any_shiftrt:QI
            (subreg:QI
-             (zero_extract:SWI248
-               (match_operand 1 "int248_register_operand" "0")
-               (const_int 8)
-               (const_int 8)) 0)
-           (match_operand:QI 2 "nonmemory_operand" "cI")) 0))
-  (clobber (reg:CC FLAGS_REG))]
-  "/* FIXME: without this LRA can't reload this pattern, see PR82524.  */
-   rtx_equal_p (operands[0], operands[1])"
+             (match_operator:SWI248 3 "extract_operator"
+               [(match_operand 1 "int248_register_operand" "0,!Q")
+                (const_int 8)
+                (const_int 8)]) 0)
+           (match_operand:QI 2 "nonmemory_operand" "cI,cI")) 0))
+   (clobber (reg:CC FLAGS_REG))]
+  ""
 {
+  if (which_alternative)
+    return "#";
+
   if (operands[2] == const1_rtx
       && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun)))
     return "<shift>{b}\t%h0";
   else
     return "<shift>{b}\t{%2, %h0|%h0, %2}";
 }
+  "reload_completed
+   && !(rtx_equal_p (operands[0], operands[1]))"
+  [(set (zero_extract:SWI248
+         (match_dup 0) (const_int 8) (const_int 8))
+       (match_dup 1))
+   (parallel
+     [(set (zero_extract:SWI248
+            (match_dup 0) (const_int 8) (const_int 8))
+          (subreg:SWI248
+            (any_shiftrt:QI
+              (subreg:QI
+                (match_op_dup 3
+                  [(match_dup 0) (const_int 8) (const_int 8)]) 0)
+              (match_dup 2)) 0))
+      (clobber (reg:CC FLAGS_REG))])]
+  ""
   [(set_attr "type" "ishift")
    (set (attr "length_immediate")
      (if_then_else
        (const_string "0")
        (const_string "*")))
    (set_attr "mode" "QI")])
+
+;; Double-word (x << N) >> N (arithmetic right shift) where both constant
+;; counts are equal and N is smaller than <MODE_SIZE> * BITS_PER_UNIT.
+;; Operand 1 is tied to operand 0.  After reload the destination is split
+;; with split_double_mode into operands[0] and operands[4], and only
+;; operands[4] (presumably the high half, per the pattern name -- confirm)
+;; is shifted left and then arithmetically right by N; each of the two
+;; emitted shifts clobbers the flags register.
+(define_insn_and_split "*extend<dwi>2_doubleword_highpart"
+  [(set (match_operand:<DWI> 0 "register_operand" "=r")
+       (ashiftrt:<DWI>
+         (ashift:<DWI> (match_operand:<DWI> 1 "nonimmediate_operand" "0")
+                       (match_operand:QI 2 "const_int_operand"))
+         (match_operand:QI 3 "const_int_operand")))
+   (clobber (reg:CC FLAGS_REG))]
+  "INTVAL (operands[2]) == INTVAL (operands[3])
+   && UINTVAL (operands[2]) < <MODE_SIZE> * BITS_PER_UNIT"
+  "#"
+  "&& reload_completed"
+  [(parallel [(set (match_dup 4)
+                  (ashift:DWIH (match_dup 4) (match_dup 2)))
+             (clobber (reg:CC FLAGS_REG))])
+   (parallel [(set (match_dup 4)
+                  (ashiftrt:DWIH (match_dup 4) (match_dup 2)))
+             (clobber (reg:CC FLAGS_REG))])]
+  "split_double_mode (<DWI>mode, &operands[0], 1, &operands[0], &operands[4]);")
+
+;; V2DI counterpart of the (x << N) >> N (arithmetic) pattern.  Enabled
+;; only for !TARGET_64BIT with TARGET_STV and TARGET_AVX512VL, and only
+;; when both constant counts are equal and below 32.  After reload it is
+;; split into a V2DI left shift of operand 1 into operand 0 followed by
+;; an arithmetic right shift of operand 0 in place, both by operand 2.
+(define_insn_and_split "*extendv2di2_highpart_stv"
+  [(set (match_operand:V2DI 0 "register_operand" "=v")
+       (ashiftrt:V2DI
+         (ashift:V2DI (match_operand:V2DI 1 "nonimmediate_operand" "vm")
+                      (match_operand:QI 2 "const_int_operand"))
+         (match_operand:QI 3 "const_int_operand")))]
+  "!TARGET_64BIT && TARGET_STV && TARGET_AVX512VL
+   && INTVAL (operands[2]) == INTVAL (operands[3])
+   && UINTVAL (operands[2]) < 32"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 0)
+       (ashift:V2DI (match_dup 1) (match_dup 2)))
+   (set (match_dup 0)
+       (ashiftrt:V2DI (match_dup 0) (match_dup 2)))])
 \f
 ;; Rotate instructions
 
     emit_insn (gen_ix86_<insn>ti3_doubleword
                (operands[0], operands[1], operands[2]));
   else if (CONST_INT_P (operands[2]) && INTVAL (operands[2]) == 64)
-    emit_insn (gen_<insn>64ti2_doubleword (operands[0], operands[1]));
+    {
+      operands[1] = force_reg (TImode, operands[1]);
+      emit_insn (gen_<insn>64ti2_doubleword (operands[0], operands[1]));
+    }
   else
     {
       rtx amount = force_reg (QImode, operands[2]);
     emit_insn (gen_ix86_<insn>di3_doubleword
                (operands[0], operands[1], operands[2]));
   else if (CONST_INT_P (operands[2]) && INTVAL (operands[2]) == 32)
-    emit_insn (gen_<insn>32di2_doubleword (operands[0], operands[1]));
+    {
+      operands[1] = force_reg (DImode, operands[1]);
+      emit_insn (gen_<insn>32di2_doubleword (operands[0], operands[1]));
+    }
   else
     FAIL;
 
 })
 
 (define_insn_and_split "<insn>32di2_doubleword"
- [(set (match_operand:DI 0 "register_operand" "=r,r,r")
-       (any_rotate:DI (match_operand:DI 1 "nonimmediate_operand" "0,r,o")
+ [(set (match_operand:DI 0 "register_operand" "=r,r")
+       (any_rotate:DI (match_operand:DI 1 "register_operand" "0,r")
                       (const_int 32)))]
  "!TARGET_64BIT"
  "#"
 })
 
 (define_insn_and_split "<insn>64ti2_doubleword"
- [(set (match_operand:TI 0 "register_operand" "=r,r,r")
-       (any_rotate:TI (match_operand:TI 1 "nonimmediate_operand" "0,r,o")
+ [(set (match_operand:TI 0 "register_operand" "=r,r")
+       (any_rotate:TI (match_operand:TI 1 "register_operand" "0,r")
                       (const_int 64)))]
  "TARGET_64BIT"
  "#"
   else
     return "<rotate>{<imodesuffix>}\t{%2, %0|%0, %2}";
 }
-  "&& reload_completed"
+  "&& reload_completed
+   && !(rtx_equal_p (operands[0], operands[1]))"
   [(set (strict_low_part (match_dup 0)) (match_dup 1))
    (parallel
      [(set (strict_low_part (match_dup 0))
  [(parallel [(set (strict_low_part (match_dup 0))
                  (bswap:HI (match_dup 0)))
             (clobber (reg:CC FLAGS_REG))])])
+
+;; Rotations through carry flag
+;; SImode rotate right through carry by one.  The RTL describes the
+;; result as (operand 1 >> 1) plus the (ltu (reg:CCC FLAGS_REG) 0) flag
+;; test shifted into bit 31.  Operand 1 is tied to operand 0, and the
+;; flags register is clobbered.
+(define_insn "rcrsi2"
+  [(set (match_operand:SI 0 "register_operand" "=r")
+       (plus:SI
+         (lshiftrt:SI (match_operand:SI 1 "register_operand" "0")
+                      (const_int 1))
+         (ashift:SI (ltu:SI (reg:CCC FLAGS_REG) (const_int 0))
+                    (const_int 31))))
+   (clobber (reg:CC FLAGS_REG))]
+  ""
+  "rcr{l}\t%0"
+  [(set_attr "type" "ishift1")
+   (set_attr "memory" "none")
+   (set_attr "length_immediate" "0")
+   (set_attr "mode" "SI")])
+
+;; DImode rotate right through carry by one, available only for
+;; TARGET_64BIT.  The RTL describes the result as (operand 1 >> 1) plus
+;; the (ltu (reg:CCC FLAGS_REG) 0) flag test shifted into bit 63.
+;; Operand 1 is tied to operand 0, and the flags register is clobbered.
+;; NOTE(review): unlike rcrsi2, no explicit "memory" attribute is set
+;; here -- confirm whether that difference is intentional.
+(define_insn "rcrdi2"
+  [(set (match_operand:DI 0 "register_operand" "=r")
+       (plus:DI
+         (lshiftrt:DI (match_operand:DI 1 "register_operand" "0")
+                      (const_int 1))
+         (ashift:DI (ltu:DI (reg:CCC FLAGS_REG) (const_int 0))
+                    (const_int 63))))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_64BIT"
+  "rcr{q}\t%0"
+  [(set_attr "type" "ishift1")
+   (set_attr "length_immediate" "0")
+   (set_attr "mode" "DI")])
+
+;; Versions of sar and shr that set the carry flag.
+;; Operand 1 (tied to operand 0) is shifted right by one.  The CCC-mode
+;; flags result is expressed as UNSPEC_CC_NE of (operand 1 & 1) against
+;; zero, i.e. of the bit that the shift moves out.  The one-operand form
+;; of the instruction is emitted when TARGET_SHIFT1 is set or the
+;; function is optimized for size; otherwise the count 1 is written out
+;; explicitly.  The length_immediate attribute is "0" under exactly the
+;; same condition.
+(define_insn "<insn><mode>3_carry"
+  [(set (reg:CCC FLAGS_REG)
+       (unspec:CCC [(and:SWI48 (match_operand:SWI48 1 "register_operand" "0")
+                               (const_int 1))
+                    (const_int 0)] UNSPEC_CC_NE))
+   (set (match_operand:SWI48 0 "register_operand" "=r")
+       (any_shiftrt:SWI48 (match_dup 1) (const_int 1)))]
+  ""
+{
+  if (TARGET_SHIFT1 || optimize_function_for_size_p (cfun))
+    return "<shift>{<imodesuffix>}\t%0";
+  return "<shift>{<imodesuffix>}\t{1, %0|%0, 1}";
+}
+  [(set_attr "type" "ishift1")
+   (set (attr "length_immediate")
+     (if_then_else
+       (ior (match_test "TARGET_SHIFT1")
+           (match_test "optimize_function_for_size_p (cfun)"))
+       (const_string "0")
+       (const_string "*")))
+   (set_attr "mode" "<MODE>")])
 \f
 ;; Bit set / bit test instructions
 
   [(set (zero_extract:HI
          (match_operand:SWI12 0 "nonimmediate_operand")
          (const_int 1)
-         (zero_extend:SI (match_operand:QI 1 "register_operand")))
+         (match_operand:QI 1 "register_operand"))
        (const_int 0))
    (clobber (reg:CC FLAGS_REG))]
   "TARGET_USE_BT && ix86_pre_reload_split ()"
   [(set (zero_extract:HI
          (match_operand:SWI12 0 "register_operand")
          (const_int 1)
-         (zero_extend:SI (match_operand:QI 1 "register_operand")))
+         (match_operand:QI 1 "register_operand"))
        (const_int 0))
    (clobber (reg:CC FLAGS_REG))]
   "TARGET_USE_BT && ix86_pre_reload_split ()"
 (define_insn "*btsq_imm"
   [(set (zero_extract:DI (match_operand:DI 0 "nonimmediate_operand" "+rm")
                         (const_int 1)
-                        (match_operand 1 "const_0_to_63_operand"))
+                        (match_operand:QI 1 "const_0_to_63_operand"))
        (const_int 1))
    (clobber (reg:CC FLAGS_REG))]
   "TARGET_64BIT && (TARGET_USE_BT || reload_completed)"
 (define_insn "*btrq_imm"
   [(set (zero_extract:DI (match_operand:DI 0 "nonimmediate_operand" "+rm")
                         (const_int 1)
-                        (match_operand 1 "const_0_to_63_operand"))
+                        (match_operand:QI 1 "const_0_to_63_operand"))
        (const_int 0))
    (clobber (reg:CC FLAGS_REG))]
   "TARGET_64BIT && (TARGET_USE_BT || reload_completed)"
 (define_insn "*btcq_imm"
   [(set (zero_extract:DI (match_operand:DI 0 "nonimmediate_operand" "+rm")
                         (const_int 1)
-                        (match_operand 1 "const_0_to_63_operand"))
+                        (match_operand:QI 1 "const_0_to_63_operand"))
        (not:DI (zero_extract:DI (match_dup 0) (const_int 1) (match_dup 1))))
    (clobber (reg:CC FLAGS_REG))]
   "TARGET_64BIT && (TARGET_USE_BT || reload_completed)"
    (parallel [(set (zero_extract:DI
                     (match_operand:DI 0 "nonimmediate_operand")
                     (const_int 1)
-                    (match_operand 1 "const_0_to_63_operand"))
+                    (match_operand:QI 1 "const_0_to_63_operand"))
                   (const_int 1))
              (clobber (reg:CC FLAGS_REG))])]
   "TARGET_64BIT && !TARGET_USE_BT"
    (parallel [(set (zero_extract:DI
                     (match_operand:DI 0 "nonimmediate_operand")
                     (const_int 1)
-                    (match_operand 1 "const_0_to_63_operand"))
+                    (match_operand:QI 1 "const_0_to_63_operand"))
                   (const_int 0))
              (clobber (reg:CC FLAGS_REG))])]
   "TARGET_64BIT && !TARGET_USE_BT"
    (parallel [(set (zero_extract:DI
                     (match_operand:DI 0 "nonimmediate_operand")
                     (const_int 1)
-                    (match_operand 1 "const_0_to_63_operand"))
+                    (match_operand:QI 1 "const_0_to_63_operand"))
              (not:DI (zero_extract:DI
                        (match_dup 0) (const_int 1) (match_dup 1))))
              (clobber (reg:CC FLAGS_REG))])]
          (zero_extract:SWI48
            (match_operand:SWI48 0 "nonimmediate_operand" "r,m")
            (const_int 1)
-           (match_operand:SI 1 "nonmemory_operand" "r<S>,<S>"))
+           (match_operand:QI 1 "nonmemory_operand" "q<S>,<S>"))
          (const_int 0)))]
   ""
 {
   switch (get_attr_mode (insn))
     {
     case MODE_SI:
-      return "bt{l}\t{%1, %k0|%k0, %1}";
+      return "bt{l}\t{%k1, %k0|%k0, %k1}";
 
     case MODE_DI:
       return "bt{q}\t{%q1, %0|%0, %q1}";
          (const_string "SI")
          (const_string "<MODE>")))])
 
+;; Bit test whose bit offset is the QImode subreg of (operand 1 AND
+;; constant operand 2).  Matched only with TARGET_USE_BT, while
+;; ix86_pre_reload_split () holds, and when the constant has every bit of
+;; GET_MODE_BITSIZE (<SWI48:MODE>mode) - 1 set.  The split drops the AND
+;; and uses the QImode low part of operand 1 as the bit offset directly
+;; (presumably valid because bt already reduces a register bit offset
+;; modulo the operand width -- confirm).
+(define_insn_and_split "*bt<SWI48:mode>_mask"
+  [(set (reg:CCC FLAGS_REG)
+        (compare:CCC
+          (zero_extract:SWI48
+            (match_operand:SWI48 0 "nonimmediate_operand" "r,m")
+            (const_int 1)
+           (subreg:QI
+             (and:SWI248
+               (match_operand:SWI248 1 "register_operand")
+               (match_operand 2 "const_int_operand")) 0))
+          (const_int 0)))]
+  "TARGET_USE_BT
+   && (INTVAL (operands[2]) & (GET_MODE_BITSIZE (<SWI48:MODE>mode)-1))
+      == GET_MODE_BITSIZE (<SWI48:MODE>mode)-1
+   && ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(set (reg:CCC FLAGS_REG)
+        (compare:CCC
+         (zero_extract:SWI48 (match_dup 0) (const_int 1) (match_dup 1))
+         (const_int 0)))]
+  "operands[1] = gen_lowpart (QImode, operands[1]);")
+
 (define_insn_and_split "*jcc_bt<mode>"
   [(set (pc)
        (if_then_else (match_operator 0 "bt_comparison_operator"
                        [(zero_extract:SWI48
                           (match_operand:SWI48 1 "nonimmediate_operand")
                           (const_int 1)
-                          (match_operand:SI 2 "nonmemory_operand"))
+                          (match_operand:QI 2 "nonmemory_operand"))
                         (const_int 0)])
                      (label_ref (match_operand 3))
                      (pc)))
   PUT_CODE (operands[0], reverse_condition (GET_CODE (operands[0])));
 })
 
-(define_insn_and_split "*jcc_bt<mode>_1"
-  [(set (pc)
-       (if_then_else (match_operator 0 "bt_comparison_operator"
-                       [(zero_extract:SWI48
-                          (match_operand:SWI48 1 "register_operand")
-                          (const_int 1)
-                          (zero_extend:SI
-                            (match_operand:QI 2 "register_operand")))
-                        (const_int 0)])
-                     (label_ref (match_operand 3))
-                     (pc)))
-   (clobber (reg:CC FLAGS_REG))]
-  "(TARGET_USE_BT || optimize_function_for_size_p (cfun))
-   && ix86_pre_reload_split ()"
-  "#"
-  "&& 1"
-  [(set (reg:CCC FLAGS_REG)
-       (compare:CCC
-         (zero_extract:SWI48
-           (match_dup 1)
-           (const_int 1)
-           (match_dup 2))
-         (const_int 0)))
-   (set (pc)
-       (if_then_else (match_op_dup 0 [(reg:CCC FLAGS_REG) (const_int 0)])
-                     (label_ref (match_dup 3))
-                     (pc)))]
-{
-  operands[2] = lowpart_subreg (SImode, operands[2], QImode);
-  operands[0] = shallow_copy_rtx (operands[0]);
-  PUT_CODE (operands[0], reverse_condition (GET_CODE (operands[0])));
-})
-
 ;; Avoid useless masking of bit offset operand.
 (define_insn_and_split "*jcc_bt<mode>_mask"
   [(set (pc)
                        [(zero_extract:SWI48
                           (match_operand:SWI48 1 "register_operand")
                           (const_int 1)
-                          (and:SI
-                            (match_operand:SI 2 "register_operand")
+                          (and:QI
+                            (match_operand:QI 2 "register_operand")
                             (match_operand 3 "const_int_operand")))])
                      (label_ref (match_operand 4))
                      (pc)))
   PUT_CODE (operands[0], reverse_condition (GET_CODE (operands[0])));
 })
 
-(define_insn_and_split "*jcc_bt<mode>_mask_1"
+;; Avoid useless masking of bit offset operand.
+(define_insn_and_split "*jcc_bt<SWI48:mode>_mask_1"
   [(set (pc)
-       (if_then_else (match_operator 0 "bt_comparison_operator"
+       (if_then_else (match_operator 0 "bt_comparison_operator"
                        [(zero_extract:SWI48
                           (match_operand:SWI48 1 "register_operand")
                           (const_int 1)
-                          (zero_extend:SI
-                            (subreg:QI
-                              (and
-                                (match_operand 2 "int248_register_operand")
-                                (match_operand 3 "const_int_operand")) 0)))])
+                          (subreg:QI
+                            (and:SWI248
+                              (match_operand:SWI248 2 "register_operand")
+                              (match_operand 3 "const_int_operand")) 0))])
                      (label_ref (match_operand 4))
                      (pc)))
    (clobber (reg:CC FLAGS_REG))]
   "(TARGET_USE_BT || optimize_function_for_size_p (cfun))
-   && (INTVAL (operands[3]) & (GET_MODE_BITSIZE (<MODE>mode)-1))
-      == GET_MODE_BITSIZE (<MODE>mode)-1
+   && (INTVAL (operands[3]) & (GET_MODE_BITSIZE (<SWI48:MODE>mode)-1))
+      == GET_MODE_BITSIZE (<SWI48:MODE>mode)-1
    && ix86_pre_reload_split ()"
   "#"
   "&& 1"
                      (label_ref (match_dup 4))
                      (pc)))]
 {
-  operands[2] = force_reg (GET_MODE (operands[2]), operands[2]);
-  operands[2] = gen_lowpart (SImode, operands[2]);
   operands[0] = shallow_copy_rtx (operands[0]);
   PUT_CODE (operands[0], reverse_condition (GET_CODE (operands[0])));
+  operands[2] = gen_lowpart (QImode, operands[2]);
 })
 
 ;; Help combine recognize bt followed by cmov
          [(zero_extract:SWI48
            (match_operand:SWI48 1 "register_operand")
            (const_int 1)
-           (zero_extend:SI (match_operand:QI 2 "register_operand")))
+           (match_operand:QI 2 "register_operand"))
           (const_int 0)])
         (match_operand:SWI248 3 "nonimmediate_operand")
         (match_operand:SWI248 4 "nonimmediate_operand")))]
 {
   if (GET_CODE (operands[5]) == EQ)
     std::swap (operands[3], operands[4]);
-  operands[2] = lowpart_subreg (SImode, operands[2], QImode);
 })
 
 ;; Help combine recognize bt followed by setc
         (zero_extract:SWI48
          (match_operand:SWI48 1 "register_operand")
          (const_int 1)
-         (zero_extend:SI (match_operand:QI 2 "register_operand"))))
+         (match_operand:QI 2 "register_operand")))
    (clobber (reg:CC FLAGS_REG))]
   "TARGET_USE_BT && ix86_pre_reload_split ()"
   "#"
          (zero_extract:SWI48 (match_dup 1) (const_int 1) (match_dup 2))
          (const_int 0)))
    (set (match_dup 0)
-        (eq:QI (reg:CCC FLAGS_REG) (const_int 0)))]
-{
-  operands[2] = lowpart_subreg (SImode, operands[2], QImode);
-})
+        (eq:QI (reg:CCC FLAGS_REG) (const_int 0)))])
 
 ;; Help combine recognize bt followed by setnc
 (define_insn_and_split "*bt<mode>_setncqi"
          (zero_extract:SWI48 (match_dup 1) (const_int 1) (match_dup 2))
          (const_int 0)))
    (set (match_dup 0)
-        (ne:QI (reg:CCC FLAGS_REG) (const_int 0)))]
-{
-  operands[2] = lowpart_subreg (SImode, operands[2], QImode);
-})
+        (ne:QI (reg:CCC FLAGS_REG) (const_int 0)))])
+
+;; Recognize ((~(operand 1 >> operand 2)) & 1) with a QImode register
+;; shift count and split it into three insns: a bit-test compare that
+;; sets the CCC-mode flags, a QImode store of (ne (reg:CCC) 0) into a
+;; fresh pseudo (operands[3]), and a zero-extension of that pseudo into
+;; operand 0.  Matched only with TARGET_USE_BT while
+;; ix86_pre_reload_split () holds.
+(define_insn_and_split "*bt<mode>_setnc<mode>"
+  [(set (match_operand:SWI48 0 "register_operand")
+       (and:SWI48
+        (not:SWI48
+         (lshiftrt:SWI48 (match_operand:SWI48 1 "register_operand")
+                         (match_operand:QI 2 "register_operand")))
+        (const_int 1)))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_USE_BT && ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(set (reg:CCC FLAGS_REG)
+        (compare:CCC
+         (zero_extract:SWI48 (match_dup 1) (const_int 1) (match_dup 2))
+         (const_int 0)))
+   (set (match_dup 3)
+        (ne:QI (reg:CCC FLAGS_REG) (const_int 0)))
+   (set (match_dup 0) (zero_extend:SWI48 (match_dup 3)))]
+  "operands[3] = gen_reg_rtx (QImode);")
+
+;; Help combine recognize bt followed by setnc (PR target/110588)
+;; Matches a QImode result of (single-bit zero_extract of operand 1 at
+;; QImode register offset operand 2) == 0 and splits it into a bit-test
+;; compare that sets the CCC-mode flags followed by a QImode store of
+;; (ne (reg:CCC) 0) into operand 0.  Matched only with TARGET_USE_BT
+;; while ix86_pre_reload_split () holds.
+(define_insn_and_split "*bt<mode>_setncqi_2"
+  [(set (match_operand:QI 0 "register_operand")
+       (eq:QI
+         (zero_extract:SWI48
+           (match_operand:SWI48 1 "register_operand")
+           (const_int 1)
+           (match_operand:QI 2 "register_operand"))
+         (const_int 0)))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_USE_BT && ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(set (reg:CCC FLAGS_REG)
+        (compare:CCC
+         (zero_extract:SWI48 (match_dup 1) (const_int 1) (match_dup 2))
+         (const_int 0)))
+   (set (match_dup 0)
+        (ne:QI (reg:CCC FLAGS_REG) (const_int 0)))])
 
-(define_insn_and_split "*bt<mode>_setnc<mode>"
+;; Help combine recognize bt followed by setc
+(define_insn_and_split "*bt<mode>_setc<mode>_mask"
   [(set (match_operand:SWI48 0 "register_operand")
-       (and:SWI48
-        (not:SWI48
-         (lshiftrt:SWI48 (match_operand:SWI48 1 "register_operand")
-                         (match_operand:QI 2 "register_operand")))
-        (const_int 1)))
+       (zero_extract:SWI48
+         (match_operand:SWI48 1 "register_operand")
+         (const_int 1)
+         (subreg:QI
+           (and:SWI48
+             (match_operand:SWI48 2 "register_operand")
+             (match_operand 3 "const_int_operand")) 0)))
    (clobber (reg:CC FLAGS_REG))]
-  "TARGET_USE_BT && ix86_pre_reload_split ()"
+  "TARGET_USE_BT
+   && (INTVAL (operands[3]) & (GET_MODE_BITSIZE (<MODE>mode)-1))
+      == GET_MODE_BITSIZE (<MODE>mode)-1
+   && ix86_pre_reload_split ()"
   "#"
   "&& 1"
   [(set (reg:CCC FLAGS_REG)
          (zero_extract:SWI48 (match_dup 1) (const_int 1) (match_dup 2))
          (const_int 0)))
    (set (match_dup 3)
-        (ne:QI (reg:CCC FLAGS_REG) (const_int 0)))
+        (eq:QI (reg:CCC FLAGS_REG) (const_int 0)))
    (set (match_dup 0) (zero_extend:SWI48 (match_dup 3)))]
 {
-  operands[2] = lowpart_subreg (SImode, operands[2], QImode);
+  operands[2] = gen_lowpart (QImode, operands[2]);
   operands[3] = gen_reg_rtx (QImode);
 })
 \f
   [(set (match_operand:MODEF 0 "register_operand" "=x,x")
        (match_operator:MODEF 3 "sse_comparison_operator"
          [(match_operand:MODEF 1 "register_operand" "0,x")
-          (match_operand:MODEF 2 "nonimmediate_operand" "xm,xm")]))]
+          (match_operand:MODEF 2 "nonimmediate_operand" "xm,xjm")]))]
   "SSE_FLOAT_MODE_P (<MODE>mode)"
   "@
    cmp%D3<ssemodesuffix>\t{%2, %0|%0, %2}
    vcmp%D3<ssemodesuffix>\t{%2, %1, %0|%0, %1, %2}"
   [(set_attr "isa" "noavx,avx")
+   (set_attr "addr" "*,gpr16")
    (set_attr "type" "ssecmp")
    (set_attr "length_immediate" "1")
    (set_attr "prefix" "orig,vex")
   [(parallel
     [(set (match_operand:SWI48 0 "register_operand")
          (if_then_else:SWI48
-           (ne:QI (and:SWI48 (match_operand:SWI48 2 "register_operand")
-                             (const_int 255))
+           (ne:QI (match_operand:QI 2 "register_operand")
                   (const_int 0))
            (zero_extract:SWI48
              (match_operand:SWI48 1 "nonimmediate_operand")
-             (umin:SWI48 (and:SWI48 (match_dup 2) (const_int 255))
-                         (match_dup 3))
+             (umin:QI (match_dup 2) (match_dup 3))
              (const_int 0))
            (const_int 0)))
      (clobber (reg:CC FLAGS_REG))])]
   "TARGET_BMI2"
-  "operands[3] = GEN_INT (<MODE_SIZE> * BITS_PER_UNIT);")
+{
+  operands[2] = gen_lowpart (QImode, operands[2]);
+  operands[3] = GEN_INT (<MODE_SIZE> * BITS_PER_UNIT);
+})
 
 (define_insn "*bmi2_bzhi_<mode>3"
   [(set (match_operand:SWI48 0 "register_operand" "=r")
        (if_then_else:SWI48
-         (ne:QI (and:SWI48 (match_operand:SWI48 2 "register_operand" "r")
-                           (const_int 255))
+         (ne:QI (match_operand:QI 2 "register_operand" "q")
                 (const_int 0))
          (zero_extract:SWI48
            (match_operand:SWI48 1 "nonimmediate_operand" "rm")
-           (umin:SWI48 (and:SWI48 (match_dup 2) (const_int 255))
-                       (match_operand:SWI48 3 "const_int_operand"))
-           (const_int 0))
-         (const_int 0)))
-   (clobber (reg:CC FLAGS_REG))]
-  "TARGET_BMI2 && INTVAL (operands[3]) == <MODE_SIZE> * BITS_PER_UNIT"
-  "bzhi\t{%2, %1, %0|%0, %1, %2}"
-  [(set_attr "type" "bitmanip")
-   (set_attr "prefix" "vex")
-   (set_attr "mode" "<MODE>")])
-
-(define_insn "*bmi2_bzhi_<mode>3_1"
-  [(set (match_operand:SWI48 0 "register_operand" "=r")
-       (if_then_else:SWI48
-         (ne:QI (match_operand:QI 2 "register_operand" "r") (const_int 0))
-         (zero_extract:SWI48
-           (match_operand:SWI48 1 "nonimmediate_operand" "rm")
-           (umin:SWI48 (zero_extend:SWI48 (match_dup 2))
-                       (match_operand:SWI48 3 "const_int_operand"))
+           (umin:QI (match_dup 2)
+                    (match_operand:QI 3 "const_int_operand"))
            (const_int 0))
          (const_int 0)))
    (clobber (reg:CC FLAGS_REG))]
            (ne:QI (match_operand:QI 2 "register_operand" "r") (const_int 0))
            (zero_extract:SWI48
              (match_operand:SWI48 1 "nonimmediate_operand" "rm")
-             (umin:SWI48 (zero_extend:SWI48 (match_dup 2))
-                         (match_operand:SWI48 3 "const_int_operand"))
+             (umin:QI (match_dup 2)
+                      (match_operand:QI 3 "const_int_operand"))
              (const_int 0))
            (const_int 0))
        (const_int 0)))
   [(set (match_operand:SWI48 0 "register_operand" "=r")
         (zero_extract:SWI48
           (match_operand:SWI48 1 "nonimmediate_operand" "rm")
-          (match_operand 2 "const_0_to_255_operand")
-          (match_operand 3 "const_0_to_255_operand")))
+          (match_operand:QI 2 "const_0_to_255_operand")
+          (match_operand:QI 3 "const_0_to_255_operand")))
    (clobber (reg:CC FLAGS_REG))]
    "TARGET_TBM"
 {
    (set_attr "mode" "HF")])
 
 (define_insn "*rcpsf2_sse"
-  [(set (match_operand:SF 0 "register_operand" "=x,x,x")
-       (unspec:SF [(match_operand:SF 1 "nonimmediate_operand" "0,x,m")]
+  [(set (match_operand:SF 0 "register_operand" "=x,x,x,x")
+       (unspec:SF [(match_operand:SF 1 "nonimmediate_operand" "0,x,m,ja")]
                   UNSPEC_RCP))]
   "TARGET_SSE && TARGET_SSE_MATH"
   "@
    %vrcpss\t{%d1, %0|%0, %d1}
    %vrcpss\t{%d1, %0|%0, %d1}
-   %vrcpss\t{%1, %d0|%d0, %1}"
-  [(set_attr "type" "sse")
+   rcpss\t{%1, %d0|%d0, %1}
+   vrcpss\t{%1, %d0|%d0, %1}"
+  [(set_attr "isa" "*,*,noavx,avx")
+   (set_attr "addr" "*,*,*,gpr16")
+   (set_attr "type" "sse")
    (set_attr "atom_sse_attr" "rcp")
    (set_attr "btver2_sse_attr" "rcp")
    (set_attr "prefix" "maybe_vex")
    (set_attr "mode" "SF")
-   (set_attr "avx_partial_xmm_update" "false,false,true")
+   (set_attr "avx_partial_xmm_update" "false,false,true,true")
    (set (attr "preferred_for_speed")
       (cond [(match_test "TARGET_AVX")
               (symbol_ref "true")
-            (eq_attr "alternative" "1,2")
+            (eq_attr "alternative" "1,2,3")
               (symbol_ref "!TARGET_SSE_PARTIAL_REG_DEPENDENCY")
            ]
            (symbol_ref "true")))])
    (set_attr "bdver1_decode" "direct")])
 
 (define_insn "*rsqrtsf2_sse"
-  [(set (match_operand:SF 0 "register_operand" "=x,x,x")
-       (unspec:SF [(match_operand:SF 1 "nonimmediate_operand" "0,x,m")]
+  [(set (match_operand:SF 0 "register_operand" "=x,x,x,x")
+       (unspec:SF [(match_operand:SF 1 "nonimmediate_operand" "0,x,m,ja")]
                   UNSPEC_RSQRT))]
   "TARGET_SSE && TARGET_SSE_MATH"
   "@
    %vrsqrtss\t{%d1, %0|%0, %d1}
    %vrsqrtss\t{%d1, %0|%0, %d1}
-   %vrsqrtss\t{%1, %d0|%d0, %1}"
-  [(set_attr "type" "sse")
+   rsqrtss\t{%1, %d0|%d0, %1}
+   vrsqrtss\t{%1, %d0|%d0, %1}"
+  [(set_attr "isa" "*,*,noavx,avx")
+   (set_attr "addr" "*,*,*,gpr16")
+   (set_attr "type" "sse")
    (set_attr "atom_sse_attr" "rcp")
    (set_attr "btver2_sse_attr" "rcp")
    (set_attr "prefix" "maybe_vex")
    (set_attr "mode" "SF")
-   (set_attr "avx_partial_xmm_update" "false,false,true")
+   (set_attr "avx_partial_xmm_update" "false,false,true,true")
    (set (attr "preferred_for_speed")
       (cond [(match_test "TARGET_AVX")
               (symbol_ref "true")
-            (eq_attr "alternative" "1,2")
+            (eq_attr "alternative" "1,2,3")
               (symbol_ref "!TARGET_SSE_PARTIAL_REG_DEPENDENCY")
            ]
            (symbol_ref "true")))])
 (define_insn "sse4_1_round<mode>2"
   [(set (match_operand:MODEFH 0 "register_operand" "=x,x,x,v,v")
        (unspec:MODEFH
-         [(match_operand:MODEFH 1 "nonimmediate_operand" "0,x,m,v,m")
+         [(match_operand:MODEFH 1 "nonimmediate_operand" "0,x,jm,v,m")
           (match_operand:SI 2 "const_0_to_15_operand")]
          UNSPEC_ROUND))]
   "TARGET_SSE4_1"
    vrndscale<ssemodesuffix>\t{%2, %1, %d0|%d0, %1, %2}"
   [(set_attr "type" "ssecvt")
    (set_attr "prefix_extra" "1,1,1,*,*")
-   (set_attr "length_immediate" "*,*,*,1,1")
+   (set_attr "length_immediate" "1")
+   (set_attr "addr" "*,*,gpr16,*,*")
    (set_attr "prefix" "maybe_vex,maybe_vex,maybe_vex,evex,evex")
    (set_attr "isa" "noavx512f,noavx512f,noavx512f,avx512f,avx512f")
    (set_attr "avx_partial_xmm_update" "false,false,true,false,true")
   DONE;
 })
 
+(define_expand "roundhf2"
+  [(match_operand:HF 0 "register_operand")
+   (match_operand:HF 1 "register_operand")]
+  "TARGET_AVX512FP16 && !flag_trapping_math && !flag_rounding_math"
+{
+  ix86_expand_round_sse4 (operands[0], operands[1]);
+  DONE;
+})
+
 (define_expand "round<mode>2"
   [(match_operand:X87MODEF 0 "register_operand")
    (match_operand:X87MODEF 1 "nonimmediate_operand")]
   [(set_attr "type" "fpspc")
    (set_attr "mode" "<MODE>")])
 
+(define_expand "lroundhf<mode>2"
+  [(set (match_operand:SWI248 0 "register_operand")
+     (unspec:SWI248 [(match_operand:HF 1 "nonimmediate_operand")]
+                  UNSPEC_FIX_NOTRUNC))]
+  "TARGET_AVX512FP16 && !flag_trapping_math && !flag_rounding_math"
+{
+  ix86_expand_lround (operands[0], operands[1]);
+  DONE;
+})
+
+(define_expand "lrinthf<mode>2"
+  [(set (match_operand:SWI48 0 "register_operand")
+     (unspec:SWI48 [(match_operand:HF 1 "nonimmediate_operand")]
+                  UNSPEC_FIX_NOTRUNC))]
+  "TARGET_AVX512FP16")
+
 (define_expand "lrint<MODEF:mode><SWI48:mode>2"
   [(set (match_operand:SWI48 0 "register_operand")
      (unspec:SWI48 [(match_operand:MODEF 1 "nonimmediate_operand")]
    && (!TARGET_SSE_MATH || TARGET_MIX_SSE_I387)
    && flag_unsafe_math_optimizations")
 
+(define_expand "l<rounding_insn>hf<mode>2"
+  [(set (match_operand:SWI48 0 "nonimmediate_operand")
+       (unspec:SWI48 [(match_operand:HF 1 "register_operand")]
+                   FIST_ROUNDING))]
+  "TARGET_AVX512FP16"
+{
+  rtx tmp = gen_reg_rtx (HFmode);
+  emit_insn (gen_sse4_1_roundhf2 (tmp, operands[1],
+                                GEN_INT (ROUND_<ROUNDING> | ROUND_NO_EXC)));
+  emit_insn (gen_fix_trunchf<mode>2 (operands[0], tmp));
+  DONE;
+})
+
 (define_expand "l<rounding_insn><MODEF:mode><SWI48:mode>2"
   [(parallel [(set (match_operand:SWI48 0 "nonimmediate_operand")
                   (unspec:SWI48 [(match_operand:MODEF 1 "register_operand")]
 })
 
 (define_insn "movmsk_df"
-  [(set (match_operand:SI 0 "register_operand" "=r")
+  [(set (match_operand:SI 0 "register_operand" "=r,jr")
        (unspec:SI
-         [(match_operand:DF 1 "register_operand" "x")]
+         [(match_operand:DF 1 "register_operand" "x,x")]
          UNSPEC_MOVMSK))]
   "SSE_FLOAT_MODE_P (DFmode) && TARGET_SSE_MATH"
   "%vmovmskpd\t{%1, %0|%0, %1}"
-  [(set_attr "type" "ssemov")
-   (set_attr "prefix" "maybe_vex")
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "ssemov")
+   (set_attr "prefix" "maybe_evex")
    (set_attr "mode" "DF")])
 
 ;; Use movmskpd in SSE mode to avoid store forwarding stall
          (match_operand:MODEF 3 "register_operand" "x")))]
   "TARGET_XOP"
   "vpcmov\t{%1, %3, %2, %0|%0, %2, %3, %1}"
-  [(set_attr "type" "sse4arg")])
+  [(set_attr "type" "sse4arg")
+   (set_attr "mode" "TI")])
 
 ;; These versions of the min/max patterns are intentionally ignorant of
 ;; their behavior wrt -0.0 and NaN (via the commutative operand mark).
    (set_attr "type" "sseadd")
    (set_attr "mode" "<MODE>")])
 
+;; Operand order in min/max instructions matters for signed zeros and NaNs.
+(define_insn_and_split "*ieee_max<mode>3_1"
+  [(set (match_operand:MODEF 0 "register_operand")
+       (unspec:MODEF
+         [(match_operand:MODEF 1 "register_operand")
+          (match_operand:MODEF 2 "register_operand")
+          (lt:MODEF
+            (match_operand:MODEF 3 "register_operand")
+            (match_operand:MODEF 4 "register_operand"))]
+         UNSPEC_BLENDV))]
+  "SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH
+  && (rtx_equal_p (operands[1], operands[3])
+      && rtx_equal_p (operands[2], operands[4]))
+  && ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+       (unspec:MODEF
+         [(match_dup 2)
+          (match_dup 1)]
+        UNSPEC_IEEE_MAX))])
+
+(define_insn_and_split "*ieee_min<mode>3_1"
+  [(set (match_operand:MODEF 0 "register_operand")
+       (unspec:MODEF
+         [(match_operand:MODEF 1 "register_operand")
+          (match_operand:MODEF 2 "register_operand")
+          (lt:MODEF
+            (match_operand:MODEF 3 "register_operand")
+            (match_operand:MODEF 4 "register_operand"))]
+         UNSPEC_BLENDV))]
+  "SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH
+  && (rtx_equal_p (operands[1], operands[4])
+      && rtx_equal_p (operands[2], operands[3]))
+  && ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+       (unspec:MODEF
+         [(match_dup 2)
+          (match_dup 1)]
+        UNSPEC_IEEE_MIN))])
+
 ;; Make two stack loads independent:
 ;;   fld aa              fld aa
 ;;   fld %st(0)     ->   fld bb
   DONE;
 })
 
-(define_expand "save_stack_nonlocal"
-  [(set (match_operand 0 "memory_operand")
-        (match_operand 1 "register_operand"))]
+(define_expand "save_stack_nonlocal"
+  [(set (match_operand 0 "memory_operand")
+        (match_operand 1 "register_operand"))]
+  ""
+{
+  rtx stack_slot;
+
+  if (flag_cf_protection & CF_RETURN)
+    {
+      /* Copy shadow stack pointer to the first slot
+        and stack pointer to the second slot.  */
+      rtx ssp_slot = adjust_address (operands[0], word_mode, 0);
+      stack_slot = adjust_address (operands[0], Pmode, UNITS_PER_WORD);
+
+      rtx reg_ssp = force_reg (word_mode, const0_rtx);
+      emit_insn (gen_rdssp (word_mode, reg_ssp, reg_ssp));
+      emit_move_insn (ssp_slot, reg_ssp);
+    }
+  else
+    stack_slot = adjust_address (operands[0], Pmode, 0);
+  emit_move_insn (stack_slot, operands[1]);
+  DONE;
+})
+
+(define_expand "restore_stack_nonlocal"
+  [(set (match_operand 0 "register_operand" "")
+       (match_operand 1 "memory_operand" ""))]
+  ""
+{
+  rtx stack_slot;
+
+  if (flag_cf_protection & CF_RETURN)
+    {
+      /* Restore shadow stack pointer from the first slot
+        and stack pointer from the second slot.  */
+      rtx ssp_slot = adjust_address (operands[1], word_mode, 0);
+      stack_slot = adjust_address (operands[1], Pmode, UNITS_PER_WORD);
+
+      /* Get the current shadow stack pointer.  The code below will check if
+        the SHSTK feature is enabled.  If it is not enabled, the RDSSP
+        instruction is a NOP.  */
+      rtx reg_ssp = force_reg (word_mode, const0_rtx);
+      emit_insn (gen_rdssp (word_mode, reg_ssp, reg_ssp));
+
+      /* Subtract the saved ssp from the current ssp to decide
+        whether the ssp has to be adjusted.  */
+      reg_ssp = expand_simple_binop (word_mode, MINUS,
+                                    reg_ssp, ssp_slot,
+                                    reg_ssp, 1, OPTAB_DIRECT);
+
+      /* Compare and jump over adjustment code.  */
+      rtx noadj_label = gen_label_rtx ();
+      emit_cmp_and_jump_insns (reg_ssp, const0_rtx, EQ, NULL_RTX,
+                              word_mode, 1, noadj_label);
+
+      /* Compute the number of frames to adjust.  */
+      rtx reg_adj = gen_lowpart (ptr_mode, reg_ssp);
+      rtx reg_adj_neg = expand_simple_unop (ptr_mode, NEG, reg_adj,
+                                           NULL_RTX, 1);
+
+      reg_adj = expand_simple_binop (ptr_mode, LSHIFTRT, reg_adj_neg,
+                                    GEN_INT (exact_log2 (UNITS_PER_WORD)),
+                                    reg_adj, 1, OPTAB_DIRECT);
+
+      /* Check if number of frames <= 255 so no loop is needed.  */
+      rtx inc_label = gen_label_rtx ();
+      emit_cmp_and_jump_insns (reg_adj, GEN_INT (255), LEU, NULL_RTX,
+                              ptr_mode, 1, inc_label);
+
+      /* Adjust the ssp in a loop.  */
+      rtx loop_label = gen_label_rtx ();
+      emit_label (loop_label);
+      LABEL_NUSES (loop_label) = 1;
+
+      rtx reg_255 = force_reg (word_mode, GEN_INT (255));
+      emit_insn (gen_incssp (word_mode, reg_255));
+
+      reg_adj = expand_simple_binop (ptr_mode, MINUS,
+                                    reg_adj, GEN_INT (255),
+                                    reg_adj, 1, OPTAB_DIRECT);
+
+      /* Compare and jump to the loop label.  */
+      emit_cmp_and_jump_insns (reg_adj, GEN_INT (255), GTU, NULL_RTX,
+                              ptr_mode, 1, loop_label);
+
+      emit_label (inc_label);
+      LABEL_NUSES (inc_label) = 1;
+
+      emit_insn (gen_incssp (word_mode, reg_ssp));
+
+      emit_label (noadj_label);
+      LABEL_NUSES (noadj_label) = 1;
+    }
+  else
+    stack_slot = adjust_address (operands[1], Pmode, 0);
+  emit_move_insn (operands[0], stack_slot);
+  DONE;
+})
+
+(define_expand "stack_protect_set"
+  [(match_operand 0 "memory_operand")
+   (match_operand 1 "memory_operand")]
+  ""
+{
+  rtx scratch = gen_reg_rtx (word_mode);
+
+  emit_insn (gen_stack_protect_set_1
+            (ptr_mode, word_mode, operands[0], operands[1], scratch));
+  DONE;
+})
+
+(define_insn "@stack_protect_set_1_<PTR:mode>_<W:mode>"
+  [(set (match_operand:PTR 0 "memory_operand" "=m")
+       (unspec:PTR [(match_operand:PTR 1 "memory_operand" "m")]
+                   UNSPEC_SP_SET))
+   (set (match_operand:W 2 "register_operand" "=&r") (const_int 0))
+   (clobber (reg:CC FLAGS_REG))]
+  ""
+{
+  output_asm_insn ("mov{<PTR:imodesuffix>}\t{%1, %<PTR:k>2|%<PTR:k>2, %1}",
+                  operands);
+  output_asm_insn ("mov{<PTR:imodesuffix>}\t{%<PTR:k>2, %0|%0, %<PTR:k>2}",
+                  operands);
+  if (!TARGET_USE_MOV0 || optimize_insn_for_size_p ())
+    return "xor{l}\t%k2, %k2";
+  else
+    return "mov{l}\t{$0, %k2|%k2, 0}";
+}
+  [(set_attr "type" "multi")])
+
+;; Patterns and peephole2s to optimize stack_protect_set_1_<PTR:mode>_<W:mode>
+;; immediately followed by *mov{s,d}i_internal, where we can avoid
+;; the xor{l} or mov{l} $0 emitted above.  We don't split this, so that
+;; scheduling or anything else doesn't separate the *stack_protect_set*
+;; pattern from the set that overwrites the register with a new value.
+
+(define_peephole2
+  [(parallel [(set (match_operand:PTR 0 "memory_operand")
+                  (unspec:PTR [(match_operand:PTR 1 "memory_operand")]
+                              UNSPEC_SP_SET))
+             (set (match_operand 2 "general_reg_operand") (const_int 0))
+             (clobber (reg:CC FLAGS_REG))])
+   (set (match_operand 3 "general_reg_operand")
+       (match_operand 4 "const0_operand"))]
+  "GET_MODE (operands[2]) == word_mode
+   && GET_MODE_SIZE (GET_MODE (operands[3])) <= UNITS_PER_WORD
+   && peep2_reg_dead_p (0, operands[3])
+   && peep2_reg_dead_p (1, operands[2])"
+  [(parallel [(set (match_dup 0)
+                  (unspec:PTR [(match_dup 1)] UNSPEC_SP_SET))
+             (set (match_dup 3) (const_int 0))
+             (clobber (reg:CC FLAGS_REG))])]
+  "operands[3] = gen_lowpart (word_mode, operands[3]);")
+
+(define_insn "*stack_protect_set_2_<mode>_si"
+  [(set (match_operand:PTR 0 "memory_operand" "=m")
+       (unspec:PTR [(match_operand:PTR 3 "memory_operand" "m")]
+                   UNSPEC_SP_SET))
+   (set (match_operand:SI 1 "register_operand" "=&r")
+       (match_operand:SI 2 "general_operand" "g"))]
+  "reload_completed"
+{
+  output_asm_insn ("mov{<imodesuffix>}\t{%3, %<k>1|%<k>1, %3}", operands);
+  output_asm_insn ("mov{<imodesuffix>}\t{%<k>1, %0|%0, %<k>1}", operands);
+  if (pic_32bit_operand (operands[2], SImode)
+      || ix86_use_lea_for_mov (insn, operands + 1))
+    return "lea{l}\t{%E2, %1|%1, %E2}";
+  else
+    return "mov{l}\t{%2, %1|%1, %2}";
+}
+  [(set_attr "type" "multi")
+   (set_attr "length" "24")])
+
+(define_insn "*stack_protect_set_2_<mode>_di"
+  [(set (match_operand:PTR 0 "memory_operand" "=m,m,m")
+       (unspec:PTR [(match_operand:PTR 3 "memory_operand" "m,m,m")]
+                   UNSPEC_SP_SET))
+   (set (match_operand:DI 1 "register_operand" "=&r,&r,&r")
+       (match_operand:DI 2 "general_operand" "Z,rem,i"))]
+  "TARGET_64BIT && reload_completed"
+{
+  output_asm_insn ("mov{<imodesuffix>}\t{%3, %<k>1|%<k>1, %3}", operands);
+  output_asm_insn ("mov{<imodesuffix>}\t{%<k>1, %0|%0, %<k>1}", operands);
+  if (pic_32bit_operand (operands[2], DImode))
+    return "lea{q}\t{%E2, %1|%1, %E2}";
+  else if (which_alternative == 0)
+    return "mov{l}\t{%k2, %k1|%k1, %k2}";
+  else if (which_alternative == 2)
+    return "movabs{q}\t{%2, %1|%1, %2}";
+  else if (ix86_use_lea_for_mov (insn, operands + 1))
+    return "lea{q}\t{%E2, %1|%1, %E2}";
+  else
+    return "mov{q}\t{%2, %1|%1, %2}";
+}
+  [(set_attr "type" "multi")
+   (set_attr "length" "24")])
+
+(define_peephole2
+  [(parallel [(set (match_operand:PTR 0 "memory_operand")
+                  (unspec:PTR [(match_operand:PTR 1 "memory_operand")]
+                              UNSPEC_SP_SET))
+             (set (match_operand 2 "general_reg_operand") (const_int 0))
+             (clobber (reg:CC FLAGS_REG))])
+   (set (match_operand:SWI48 3 "general_reg_operand")
+       (match_operand:SWI48 4 "general_gr_operand"))]
+  "GET_MODE (operands[2]) == word_mode
+   && peep2_reg_dead_p (0, operands[3])
+   && peep2_reg_dead_p (1, operands[2])"
+  [(parallel [(set (match_dup 0)
+                  (unspec:PTR [(match_dup 1)] UNSPEC_SP_SET))
+             (set (match_dup 3) (match_dup 4))])])
+
+(define_peephole2
+  [(set (match_operand:SWI48 3 "general_reg_operand")
+       (match_operand:SWI48 4 "general_gr_operand"))
+   (parallel [(set (match_operand:PTR 0 "memory_operand")
+                  (unspec:PTR [(match_operand:PTR 1 "memory_operand")]
+                              UNSPEC_SP_SET))
+             (set (match_operand 2 "general_reg_operand") (const_int 0))
+             (clobber (reg:CC FLAGS_REG))])]
+  "GET_MODE (operands[2]) == word_mode
+   && peep2_reg_dead_p (0, operands[3])
+   && peep2_reg_dead_p (2, operands[2])
+   && !reg_mentioned_p (operands[3], operands[0])
+   && !reg_mentioned_p (operands[3], operands[1])"
+  [(parallel [(set (match_dup 0)
+                  (unspec:PTR [(match_dup 1)] UNSPEC_SP_SET))
+             (set (match_dup 3) (match_dup 4))])])
+
+(define_insn "*stack_protect_set_3_<PTR:mode>_<SWI48:mode>"
+  [(set (match_operand:PTR 0 "memory_operand" "=m")
+       (unspec:PTR [(match_operand:PTR 3 "memory_operand" "m")]
+                   UNSPEC_SP_SET))
+   (set (match_operand:SWI48 1 "register_operand" "=&r")
+       (match_operand:SWI48 2 "address_no_seg_operand" "Ts"))]
   ""
 {
-  rtx stack_slot;
-
-  if (flag_cf_protection & CF_RETURN)
+  output_asm_insn ("mov{<PTR:imodesuffix>}\t{%3, %<PTR:k>1|%<PTR:k>1, %3}",
+                  operands);
+  output_asm_insn ("mov{<PTR:imodesuffix>}\t{%<PTR:k>1, %0|%0, %<PTR:k>1}",
+                  operands);
+  if (SImode_address_operand (operands[2], VOIDmode))
     {
-      /* Copy shadow stack pointer to the first slot
-        and stack pointer to the second slot.  */
-      rtx ssp_slot = adjust_address (operands[0], word_mode, 0);
-      stack_slot = adjust_address (operands[0], Pmode, UNITS_PER_WORD);
-
-      rtx reg_ssp = force_reg (word_mode, const0_rtx);
-      emit_insn (gen_rdssp (word_mode, reg_ssp, reg_ssp));
-      emit_move_insn (ssp_slot, reg_ssp);
+      gcc_assert (TARGET_64BIT);
+      return "lea{l}\t{%E2, %k1|%k1, %E2}";
     }
   else
-    stack_slot = adjust_address (operands[0], Pmode, 0);
-  emit_move_insn (stack_slot, operands[1]);
-  DONE;
-})
-
-(define_expand "restore_stack_nonlocal"
-  [(set (match_operand 0 "register_operand" "")
-       (match_operand 1 "memory_operand" ""))]
-  ""
-{
-  rtx stack_slot;
-
-  if (flag_cf_protection & CF_RETURN)
-    {
-      /* Restore shadow stack pointer from the first slot
-        and stack pointer from the second slot.  */
-      rtx ssp_slot = adjust_address (operands[1], word_mode, 0);
-      stack_slot = adjust_address (operands[1], Pmode, UNITS_PER_WORD);
-
-      /* Get the current shadow stack pointer.  The code below will check if
-        SHSTK feature is enabled.  If it is not enabled the RDSSP instruction
-        is a NOP.  */
-      rtx reg_ssp = force_reg (word_mode, const0_rtx);
-      emit_insn (gen_rdssp (word_mode, reg_ssp, reg_ssp));
-
-      /* Compare through subtraction the saved and the current ssp
-        to decide if ssp has to be adjusted.  */
-      reg_ssp = expand_simple_binop (word_mode, MINUS,
-                                    reg_ssp, ssp_slot,
-                                    reg_ssp, 1, OPTAB_DIRECT);
-
-      /* Compare and jump over adjustment code.  */
-      rtx noadj_label = gen_label_rtx ();
-      emit_cmp_and_jump_insns (reg_ssp, const0_rtx, EQ, NULL_RTX,
-                              word_mode, 1, noadj_label);
-
-      /* Compute the number of frames to adjust.  */
-      rtx reg_adj = gen_lowpart (ptr_mode, reg_ssp);
-      rtx reg_adj_neg = expand_simple_unop (ptr_mode, NEG, reg_adj,
-                                           NULL_RTX, 1);
-
-      reg_adj = expand_simple_binop (ptr_mode, LSHIFTRT, reg_adj_neg,
-                                    GEN_INT (exact_log2 (UNITS_PER_WORD)),
-                                    reg_adj, 1, OPTAB_DIRECT);
-
-      /* Check if number of frames <= 255 so no loop is needed.  */
-      rtx inc_label = gen_label_rtx ();
-      emit_cmp_and_jump_insns (reg_adj, GEN_INT (255), LEU, NULL_RTX,
-                              ptr_mode, 1, inc_label);
+    return "lea{<SWI48:imodesuffix>}\t{%E2, %1|%1, %E2}";
+}
+  [(set_attr "type" "multi")
+   (set_attr "length" "24")])
 
-      /* Adjust the ssp in a loop.  */
-      rtx loop_label = gen_label_rtx ();
-      emit_label (loop_label);
-      LABEL_NUSES (loop_label) = 1;
+(define_peephole2
+  [(parallel [(set (match_operand:PTR 0 "memory_operand")
+                  (unspec:PTR [(match_operand:PTR 1 "memory_operand")]
+                              UNSPEC_SP_SET))
+             (set (match_operand 2 "general_reg_operand") (const_int 0))
+             (clobber (reg:CC FLAGS_REG))])
+   (set (match_operand:SWI48 3 "general_reg_operand")
+       (match_operand:SWI48 4 "address_no_seg_operand"))]
+  "GET_MODE (operands[2]) == word_mode
+   && peep2_reg_dead_p (0, operands[3])
+   && peep2_reg_dead_p (1, operands[2])"
+  [(parallel [(set (match_dup 0)
+                  (unspec:PTR [(match_dup 1)] UNSPEC_SP_SET))
+             (set (match_dup 3) (match_dup 4))])])
 
-      rtx reg_255 = force_reg (word_mode, GEN_INT (255));
-      emit_insn (gen_incssp (word_mode, reg_255));
+(define_insn "*stack_protect_set_4z_<mode>_di"
+  [(set (match_operand:PTR 0 "memory_operand" "=m")
+       (unspec:PTR [(match_operand:PTR 3 "memory_operand" "m")]
+                   UNSPEC_SP_SET))
+   (set (match_operand:DI 1 "register_operand" "=&r")
+       (zero_extend:DI (match_operand:SI 2 "nonimmediate_operand" "rm")))]
+  "TARGET_64BIT && reload_completed"
+{
+  output_asm_insn ("mov{<imodesuffix>}\t{%3, %<k>1|%<k>1, %3}", operands);
+  output_asm_insn ("mov{<imodesuffix>}\t{%<k>1, %0|%0, %<k>1}", operands);
+  if (ix86_use_lea_for_mov (insn, operands + 1))
+    return "lea{l}\t{%E2, %k1|%k1, %E2}";
+  else
+    return "mov{l}\t{%2, %k1|%k1, %2}";
+}
+  [(set_attr "type" "multi")
+   (set_attr "length" "24")])
 
-      reg_adj = expand_simple_binop (ptr_mode, MINUS,
-                                    reg_adj, GEN_INT (255),
-                                    reg_adj, 1, OPTAB_DIRECT);
+(define_insn "*stack_protect_set_4s_<mode>_di"
+  [(set (match_operand:PTR 0 "memory_operand" "=m")
+       (unspec:PTR [(match_operand:PTR 3 "memory_operand" "m")]
+                   UNSPEC_SP_SET))
+   (set (match_operand:DI 1 "register_operand" "=&r")
+       (sign_extend:DI (match_operand:SI 2 "nonimmediate_operand" "rm")))]
+  "TARGET_64BIT && reload_completed"
+{
+  output_asm_insn ("mov{<imodesuffix>}\t{%3, %<k>1|%<k>1, %3}", operands);
+  output_asm_insn ("mov{<imodesuffix>}\t{%<k>1, %0|%0, %<k>1}", operands);
+  return "movs{lq|x}\t{%2, %1|%1, %2}";
+}
+  [(set_attr "type" "multi")
+   (set_attr "length" "24")])
 
-      /* Compare and jump to the loop label.  */
-      emit_cmp_and_jump_insns (reg_adj, GEN_INT (255), GTU, NULL_RTX,
-                              ptr_mode, 1, loop_label);
+(define_peephole2
+  [(parallel [(set (match_operand:PTR 0 "memory_operand")
+                  (unspec:PTR [(match_operand:PTR 1 "memory_operand")]
+                              UNSPEC_SP_SET))
+             (set (match_operand 2 "general_reg_operand") (const_int 0))
+             (clobber (reg:CC FLAGS_REG))])
+   (set (match_operand:DI 3 "general_reg_operand")
+       (any_extend:DI
+         (match_operand:SI 4 "nonimmediate_gr_operand")))]
+  "TARGET_64BIT
+   && GET_MODE (operands[2]) == word_mode
+   && peep2_reg_dead_p (0, operands[3])
+   && peep2_reg_dead_p (1, operands[2])"
+  [(parallel [(set (match_dup 0)
+                  (unspec:PTR [(match_dup 1)] UNSPEC_SP_SET))
+             (set (match_dup 3)
+                  (any_extend:DI (match_dup 4)))])])
 
-      emit_label (inc_label);
-      LABEL_NUSES (inc_label) = 1;
+(define_expand "stack_protect_test"
+  [(match_operand 0 "memory_operand")
+   (match_operand 1 "memory_operand")
+   (match_operand 2)]
+  ""
+{
+  rtx flags = gen_rtx_REG (CCZmode, FLAGS_REG);
 
-      emit_insn (gen_incssp (word_mode, reg_ssp));
+  emit_insn (gen_stack_protect_test_1
+            (ptr_mode, flags, operands[0], operands[1]));
 
-      emit_label (noadj_label);
-      LABEL_NUSES (noadj_label) = 1;
-    }
-  else
-    stack_slot = adjust_address (operands[1], Pmode, 0);
-  emit_move_insn (operands[0], stack_slot);
+  emit_jump_insn (gen_cbranchcc4 (gen_rtx_EQ (VOIDmode, flags, const0_rtx),
+                                 flags, const0_rtx, operands[2]));
   DONE;
 })
 
+(define_insn "@stack_protect_test_1_<mode>"
+  [(set (match_operand:CCZ 0 "flags_reg_operand")
+       (unspec:CCZ [(match_operand:PTR 1 "memory_operand" "m")
+                    (match_operand:PTR 2 "memory_operand" "m")]
+                   UNSPEC_SP_TEST))
+   (clobber (match_scratch:PTR 3 "=&r"))]
+  ""
+{
+  output_asm_insn ("mov{<imodesuffix>}\t{%1, %3|%3, %1}", operands);
+  return "sub{<imodesuffix>}\t{%2, %3|%3, %2}";
+}
+  [(set_attr "type" "multi")])
 
 ;; Avoid redundant prefixes by splitting HImode arithmetic to SImode.
 ;; Do not split instructions with mask registers.
        (match_operator 1 "compare_operator"
          [(and:QI
             (subreg:QI
-              (zero_extract:SWI248 (match_operand 2 "int248_register_operand")
-                                   (const_int 8)
-                                   (const_int 8)) 0)
+              (match_operator:SWI248 4 "extract_operator"
+                [(match_operand 2 "int248_register_operand")
+                 (const_int 8)
+                 (const_int 8)]) 0)
             (match_operand 3 "const_int_operand"))
           (const_int 0)]))]
   "! TARGET_PARTIAL_REG_STALL
           (match_op_dup 1
             [(and:QI
                (subreg:QI
-                 (zero_extract:SWI248 (match_dup 2)
-                                      (const_int 8)
-                                      (const_int 8)) 0)
+                 (match_op_dup 4 [(match_dup 2)
+                                  (const_int 8)
+                                  (const_int 8)]) 0)
                (match_dup 3))
              (const_int 0)]))
       (set (zero_extract:SWI248 (match_dup 2)
           (subreg:SWI248
             (and:QI
               (subreg:QI
-                (zero_extract:SWI248 (match_dup 2)
-                                     (const_int 8)
-                                     (const_int 8)) 0)
+                (match_op_dup 4 [(match_dup 2)
+                                 (const_int 8)
+                                 (const_int 8)]) 0)
               (match_dup 3)) 0))])])
 
 ;; Don't do logical operations with memory inputs.
        (symbol_ref "memory_address_length (operands[0], false)"))
    (set_attr "memory" "none")])
 
-(define_expand "stack_protect_set"
-  [(match_operand 0 "memory_operand")
-   (match_operand 1 "memory_operand")]
-  ""
-{
-  emit_insn (gen_stack_protect_set_1
-            (ptr_mode, operands[0], operands[1]));
-  DONE;
-})
-
-(define_insn "@stack_protect_set_1_<mode>"
-  [(set (match_operand:PTR 0 "memory_operand" "=m")
-       (unspec:PTR [(match_operand:PTR 1 "memory_operand" "m")]
-                   UNSPEC_SP_SET))
-   (set (match_scratch:PTR 2 "=&r") (const_int 0))
-   (clobber (reg:CC FLAGS_REG))]
-  ""
-{
-  output_asm_insn ("mov{<imodesuffix>}\t{%1, %2|%2, %1}", operands);
-  output_asm_insn ("mov{<imodesuffix>}\t{%2, %0|%0, %2}", operands);
-  return "xor{l}\t%k2, %k2";
-}
-  [(set_attr "type" "multi")])
-
-;; Patterns and peephole2s to optimize stack_protect_set_1_<mode>
-;; immediately followed by *mov{s,d}i_internal to the same register,
-;; where we can avoid the xor{l} above.  We don't split this, so that
-;; scheduling or anything else doesn't separate the *stack_protect_set*
-;; pattern from the set of the register that overwrites the register
-;; with a new value.
-(define_insn "*stack_protect_set_2_<mode>"
-  [(set (match_operand:PTR 0 "memory_operand" "=m")
-       (unspec:PTR [(match_operand:PTR 3 "memory_operand" "m")]
-                   UNSPEC_SP_SET))
-   (set (match_operand:SI 1 "register_operand" "=&r")
-       (match_operand:SI 2 "general_operand" "g"))
-   (clobber (reg:CC FLAGS_REG))]
-  "reload_completed
-   && !reg_overlap_mentioned_p (operands[1], operands[2])"
-{
-  output_asm_insn ("mov{<imodesuffix>}\t{%3, %<k>1|%<k>1, %3}", operands);
-  output_asm_insn ("mov{<imodesuffix>}\t{%<k>1, %0|%0, %<k>1}", operands);
-  if (pic_32bit_operand (operands[2], SImode)
-      || ix86_use_lea_for_mov (insn, operands + 1))
-    return "lea{l}\t{%E2, %1|%1, %E2}";
-  else
-    return "mov{l}\t{%2, %1|%1, %2}";
-}
-  [(set_attr "type" "multi")
-   (set_attr "length" "24")])
-
-(define_peephole2
- [(parallel [(set (match_operand:PTR 0 "memory_operand")
-                 (unspec:PTR [(match_operand:PTR 1 "memory_operand")]
-                             UNSPEC_SP_SET))
-            (set (match_operand:PTR 2 "general_reg_operand") (const_int 0))
-            (clobber (reg:CC FLAGS_REG))])
-  (set (match_operand:SI 3 "general_reg_operand")
-       (match_operand:SI 4))]
- "REGNO (operands[2]) == REGNO (operands[3])
-  && general_operand (operands[4], SImode)
-  && (general_reg_operand (operands[4], SImode)
-      || memory_operand (operands[4], SImode)
-      || immediate_operand (operands[4], SImode))
-  && !reg_overlap_mentioned_p (operands[3], operands[4])"
- [(parallel [(set (match_dup 0)
-                 (unspec:PTR [(match_dup 1)] UNSPEC_SP_SET))
-            (set (match_dup 3) (match_dup 4))
-            (clobber (reg:CC FLAGS_REG))])])
-
-(define_insn "*stack_protect_set_3"
-  [(set (match_operand:DI 0 "memory_operand" "=m,m,m")
-       (unspec:DI [(match_operand:DI 3 "memory_operand" "m,m,m")]
-                  UNSPEC_SP_SET))
-   (set (match_operand:DI 1 "register_operand" "=&r,r,r")
-       (match_operand:DI 2 "general_operand" "Z,rem,i"))
-   (clobber (reg:CC FLAGS_REG))]
-  "TARGET_64BIT
-   && reload_completed
-   && !reg_overlap_mentioned_p (operands[1], operands[2])"
-{
-  output_asm_insn ("mov{q}\t{%3, %1|%1, %3}", operands);
-  output_asm_insn ("mov{q}\t{%1, %0|%0, %1}", operands);
-  if (pic_32bit_operand (operands[2], DImode))
-    return "lea{q}\t{%E2, %1|%1, %E2}";
-  else if (which_alternative == 0)
-    return "mov{l}\t{%k2, %k1|%k1, %k2}";
-  else if (which_alternative == 2)
-    return "movabs{q}\t{%2, %1|%1, %2}";
-  else if (ix86_use_lea_for_mov (insn, operands + 1))
-    return "lea{q}\t{%E2, %1|%1, %E2}";
-  else
-    return "mov{q}\t{%2, %1|%1, %2}";
-}
-  [(set_attr "type" "multi")
-   (set_attr "length" "24")])
-
-(define_peephole2
- [(parallel [(set (match_operand:DI 0 "memory_operand")
-                 (unspec:DI [(match_operand:DI 1 "memory_operand")]
-                            UNSPEC_SP_SET))
-            (set (match_operand:DI 2 "general_reg_operand") (const_int 0))
-            (clobber (reg:CC FLAGS_REG))])
-  (set (match_dup 2) (match_operand:DI 3))]
- "TARGET_64BIT
-  && general_operand (operands[3], DImode)
-  && (general_reg_operand (operands[3], DImode)
-      || memory_operand (operands[3], DImode)
-      || x86_64_zext_immediate_operand (operands[3], DImode)
-      || x86_64_immediate_operand (operands[3], DImode)
-      || (CONSTANT_P (operands[3])
-         && (!flag_pic || LEGITIMATE_PIC_OPERAND_P (operands[3]))))
-  && !reg_overlap_mentioned_p (operands[2], operands[3])"
- [(parallel [(set (match_dup 0)
-                 (unspec:PTR [(match_dup 1)] UNSPEC_SP_SET))
-            (set (match_dup 2) (match_dup 3))
-            (clobber (reg:CC FLAGS_REG))])])
-
-(define_expand "stack_protect_test"
-  [(match_operand 0 "memory_operand")
-   (match_operand 1 "memory_operand")
-   (match_operand 2)]
-  ""
-{
-  rtx flags = gen_rtx_REG (CCZmode, FLAGS_REG);
-
-  emit_insn (gen_stack_protect_test_1
-            (ptr_mode, flags, operands[0], operands[1]));
-
-  emit_jump_insn (gen_cbranchcc4 (gen_rtx_EQ (VOIDmode, flags, const0_rtx),
-                                 flags, const0_rtx, operands[2]));
-  DONE;
-})
-
-(define_insn "@stack_protect_test_1_<mode>"
-  [(set (match_operand:CCZ 0 "flags_reg_operand")
-       (unspec:CCZ [(match_operand:PTR 1 "memory_operand" "m")
-                    (match_operand:PTR 2 "memory_operand" "m")]
-                   UNSPEC_SP_TEST))
-   (clobber (match_scratch:PTR 3 "=&r"))]
-  ""
-{
-  output_asm_insn ("mov{<imodesuffix>}\t{%1, %3|%3, %1}", operands);
-  return "sub{<imodesuffix>}\t{%2, %3|%3, %2}";
-}
-  [(set_attr "type" "multi")])
-
 (define_insn "sse4_2_crc32<mode>"
   [(set (match_operand:SI 0 "register_operand" "=r")
        (unspec:SI
         (symbol_ref "ix86_attr_length_address_default (insn) + 3"))])
 
+;; Emit the fxsave64 instruction, storing into the BLKmode memory operand 0
+;; (the "memory" attribute is "store").  The pattern is enabled only when
+;; TARGET_64BIT && TARGET_FXSR.  Its length is the default address length
+;; plus 4 bytes.
+;; This change switches the operand constraint from "m" to "jm" and adds
+;; an "addr" attribute of "gpr16".
+;; NOTE(review): presumably both restrict the address to the legacy 16
+;; general registers, since R16-R31 are being introduced -- confirm against
+;; the "j" constraint and the "addr" attribute definitions.
 (define_insn "fxsave64"
-  [(set (match_operand:BLK 0 "memory_operand" "=m")
+  [(set (match_operand:BLK 0 "memory_operand" "=jm")
        (unspec_volatile:BLK [(const_int 0)] UNSPECV_FXSAVE64))]
   "TARGET_64BIT && TARGET_FXSR"
   "fxsave64\t%0"
   [(set_attr "type" "other")
+   (set_attr "addr" "gpr16")
    (set_attr "memory" "store")
    (set (attr "length")
         (symbol_ref "ix86_attr_length_address_default (insn) + 4"))])
         (symbol_ref "ix86_attr_length_address_default (insn) + 3"))])
 
+;; Emit the fxrstor64 instruction, which takes the BLKmode memory operand 0
+;; as input (the "memory" attribute is "load").  The pattern has no
+;; RTL-visible destination: it is a bare unspec_volatile.  It is enabled
+;; only when TARGET_64BIT && TARGET_FXSR, and its length is the default
+;; address length plus 4 bytes.
+;; This change switches the operand constraint from "m" to "jm" and adds
+;; an "addr" attribute of "gpr16".
+;; NOTE(review): presumably both restrict the address to the legacy 16
+;; general registers, since R16-R31 are being introduced -- confirm against
+;; the "j" constraint and the "addr" attribute definitions.
 (define_insn "fxrstor64"
-  [(unspec_volatile [(match_operand:BLK 0 "memory_operand" "m")]
+  [(unspec_volatile [(match_operand:BLK 0 "memory_operand" "jm")]
                    UNSPECV_FXRSTOR64)]
   "TARGET_64BIT && TARGET_FXSR"
   "fxrstor64\t%0"
   [(set_attr "type" "other")
+   (set_attr "addr" "gpr16")
    (set_attr "memory" "load")
    (set (attr "length")
         (symbol_ref "ix86_attr_length_address_default (insn) + 4"))])
         (symbol_ref "ix86_attr_length_address_default (insn) + 3"))])
 
 (define_insn "<xsave>_rex64"
-  [(set (match_operand:BLK 0 "memory_operand" "=m")
+  [(set (match_operand:BLK 0 "memory_operand" "=jm")
        (unspec_volatile:BLK
         [(match_operand:SI 1 "register_operand" "a")
          (match_operand:SI 2 "register_operand" "d")]
   "<xsave>\t%0"
   [(set_attr "type" "other")
    (set_attr "memory" "store")
+   (set_attr "addr" "gpr16")
    (set (attr "length")
         (symbol_ref "ix86_attr_length_address_default (insn) + 3"))])
 
 (define_insn "<xsave>"
-  [(set (match_operand:BLK 0 "memory_operand" "=m")
+  [(set (match_operand:BLK 0 "memory_operand" "=jm")
        (unspec_volatile:BLK
         [(match_operand:SI 1 "register_operand" "a")
          (match_operand:SI 2 "register_operand" "d")]
   "<xsave>\t%0"
   [(set_attr "type" "other")
    (set_attr "memory" "store")
+   (set_attr "addr" "gpr16")
    (set (attr "length")
         (symbol_ref "ix86_attr_length_address_default (insn) + 4"))])
 
 
 (define_insn "<xrstor>_rex64"
    [(unspec_volatile:BLK
-     [(match_operand:BLK 0 "memory_operand" "m")
+     [(match_operand:BLK 0 "memory_operand" "jm")
       (match_operand:SI 1 "register_operand" "a")
       (match_operand:SI 2 "register_operand" "d")]
      ANY_XRSTOR)]
   "<xrstor>\t%0"
   [(set_attr "type" "other")
    (set_attr "memory" "load")
+   (set_attr "addr" "gpr16")
    (set (attr "length")
         (symbol_ref "ix86_attr_length_address_default (insn) + 3"))])
 
 (define_insn "<xrstor>64"
    [(unspec_volatile:BLK
-     [(match_operand:BLK 0 "memory_operand" "m")
+     [(match_operand:BLK 0 "memory_operand" "jm")
       (match_operand:SI 1 "register_operand" "a")
       (match_operand:SI 2 "register_operand" "d")]
      ANY_XRSTOR64)]
   "<xrstor>64\t%0"
   [(set_attr "type" "other")
    (set_attr "memory" "load")
+   (set_attr "addr" "gpr16")
    (set (attr "length")
         (symbol_ref "ix86_attr_length_address_default (insn) + 4"))])
 
   "TARGET_64BIT && TARGET_FSGSBASE"
   "rd<fsgs>base\t%0"
   [(set_attr "type" "other")
-   (set_attr "prefix_extra" "2")])
+   (set_attr "prefix_0f" "1")
+   (set_attr "prefix_rep" "1")])
 
 (define_insn "wr<fsgs>base<mode>"
   [(unspec_volatile [(match_operand:SWI48 0 "register_operand" "r")]
   "TARGET_64BIT && TARGET_FSGSBASE"
   "wr<fsgs>base\t%0"
   [(set_attr "type" "other")
-   (set_attr "prefix_extra" "2")])
+   (set_attr "prefix_0f" "1")
+   (set_attr "prefix_rep" "1")])
 
 (define_insn "ptwrite<mode>"
   [(unspec_volatile [(match_operand:SWI48 0 "nonimmediate_operand" "rm")]
   "TARGET_PTWRITE"
   "ptwrite\t%0"
   [(set_attr "type" "other")
-   (set_attr "prefix_extra" "2")])
+   (set_attr "prefix_0f" "1")
+   (set_attr "prefix_rep" "1")])
 
 (define_insn "@rdrand<mode>"
   [(set (match_operand:SWI248 0 "register_operand" "=r")
   "TARGET_RDRND"
   "rdrand\t%0"
   [(set_attr "type" "other")
-   (set_attr "prefix_extra" "1")])
+   (set_attr "prefix_0f" "1")])
 
 (define_insn "@rdseed<mode>"
   [(set (match_operand:SWI248 0 "register_operand" "=r")
   "TARGET_RDSEED"
   "rdseed\t%0"
   [(set_attr "type" "other")
-   (set_attr "prefix_extra" "1")])
+   (set_attr "prefix_0f" "1")])
 
 (define_expand "pause"
   [(set (match_dup 0)
   DONE;
 })
 
+;; Emit the urdmsr instruction.  Operand 0 is the DImode destination
+;; register.  Operand 1 is a DImode value accepted by
+;; x86_64_szext_nonmemory_operand, with constraints "r", "e" and "Z".
+;; The source is an unspec_volatile (UNSPECV_URDMSR).  The pattern is
+;; enabled only when TARGET_USER_MSR && TARGET_64BIT.  The template prints
+;; "%1, %0" in AT&T syntax and "%0, %1" in Intel syntax.
+;; NOTE(review): operand 1 presumably selects the MSR to read -- confirm
+;; against the USER_MSR instruction documentation.
+(define_insn "urdmsr"
+  [(set (match_operand:DI 0 "register_operand" "=r")
+    (unspec_volatile:DI
+      [(match_operand:DI 1 "x86_64_szext_nonmemory_operand" "reZ")]
+      UNSPECV_URDMSR))]
+  "TARGET_USER_MSR && TARGET_64BIT"
+  "urdmsr\t{%1, %0|%0, %1}"
+  [(set_attr "prefix" "vex")
+   (set_attr "type" "other")])
+
+;; Emit the uwrmsr instruction.  The pattern has no RTL-visible result:
+;; both operands are inputs to an unspec_volatile (UNSPECV_UWRMSR).
+;; Operand 0 is a DImode value accepted by x86_64_szext_nonmemory_operand,
+;; with constraints "r", "e" and "Z".  Operand 1 is a DImode register.
+;; The pattern is enabled only when TARGET_USER_MSR && TARGET_64BIT.
+;; The template prints "%1, %0" in AT&T syntax and "%0, %1" in Intel
+;; syntax.
+;; NOTE(review): presumably operand 0 selects the MSR and operand 1 holds
+;; the value to write -- confirm against the USER_MSR instruction
+;; documentation.
+(define_insn "uwrmsr"
+  [(unspec_volatile
+    [(match_operand:DI 0 "x86_64_szext_nonmemory_operand" "reZ")
+      (match_operand:DI 1 "register_operand" "r")]
+      UNSPECV_UWRMSR)]
+  "TARGET_USER_MSR && TARGET_64BIT"
+  "uwrmsr\t{%1, %0|%0, %1}"
+  [(set_attr "prefix" "vex")
+   (set_attr "type" "other")])
+
 (include "mmx.md")
 (include "sse.md")
 (include "sync.md")