Update copyright years.
[thirdparty/gcc.git] / gcc/config/aarch64/aarch64-simd.md
index c94946563e620ada88df1cfa0f7724143a147df7..4e28cf97516df19e1d502e56c776f6b34f15c116 100644 (file)
@@ -1,5 +1,5 @@
 ;; Machine description for AArch64 AdvSIMD architecture.
-;; Copyright (C) 2011-2017 Free Software Foundation, Inc.
+;; Copyright (C) 2011-2020 Free Software Foundation, Inc.
 ;; Contributed by ARM Ltd.
 ;;
 ;; This file is part of GCC.
 ;; <http://www.gnu.org/licenses/>.
 
 (define_expand "mov<mode>"
-  [(set (match_operand:VALL_F16 0 "nonimmediate_operand" "")
-       (match_operand:VALL_F16 1 "general_operand" ""))]
+  [(set (match_operand:VALL_F16 0 "nonimmediate_operand")
+       (match_operand:VALL_F16 1 "general_operand"))]
   "TARGET_SIMD"
   "
-    if (GET_CODE (operands[0]) == MEM)
+  /* Force the operand into a register if it is not an
+     immediate whose use can be replaced with xzr.
+     If the mode is 16 bytes wide, then we will be doing
+     a stp in DI mode, so we check the validity of that.
+     If the mode is 8 bytes wide, then we will be doing a
+     normal str, so the check need not apply.  */
+  if (GET_CODE (operands[0]) == MEM
+      && !(aarch64_simd_imm_zero (operands[1], <MODE>mode)
+          && ((known_eq (GET_MODE_SIZE (<MODE>mode), 16)
+               && aarch64_mem_pair_operand (operands[0], DImode))
+              || known_eq (GET_MODE_SIZE (<MODE>mode), 8))))
       operands[1] = force_reg (<MODE>mode, operands[1]);
   "
 )
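;; Editorial sketch, not part of this change: with the checks above, a store
;; of a zero vector keeps its immediate form and can later match the "Dz"
;; alternatives below, being emitted as "str xzr" for 8-byte modes or as
;; "stp xzr, xzr" for 16-byte modes.  A minimal C example that could take
;; this path, assuming the block clear is expanded as a vector store:
;;
;;   void clear16 (unsigned char *p)      /* hypothetical helper */
;;   {
;;     __builtin_memset (p, 0, 16);       /* zero store of a 16-byte block */
;;   }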
 
 (define_expand "movmisalign<mode>"
-  [(set (match_operand:VALL 0 "nonimmediate_operand" "")
-        (match_operand:VALL 1 "general_operand" ""))]
+  [(set (match_operand:VALL 0 "nonimmediate_operand")
+        (match_operand:VALL 1 "general_operand"))]
   "TARGET_SIMD"
 {
   /* This pattern is not permitted to fail during expansion: if both arguments
@@ -70,7 +80,7 @@
           )))]
   "TARGET_SIMD"
   {
-    operands[2] = GEN_INT (ENDIAN_LANE_N (<MODE>mode, INTVAL (operands[2])));
+    operands[2] = aarch64_endian_lane_rtx (<MODE>mode, INTVAL (operands[2]));
     return "dup\\t%0.<Vtype>, %1.<Vetype>[%2]";
   }
   [(set_attr "type" "neon_dup<q>")]
           )))]
   "TARGET_SIMD"
   {
-    operands[2] = GEN_INT (ENDIAN_LANE_N (<VSWAP_WIDTH>mode,
-                                         INTVAL (operands[2])));
+    operands[2] = aarch64_endian_lane_rtx (<VSWAP_WIDTH>mode, INTVAL (operands[2]));
     return "dup\\t%0.<Vtype>, %1.<Vetype>[%2]";
   }
   [(set_attr "type" "neon_dup<q>")]
 )
 
-(define_insn "*aarch64_simd_mov<mode>"
+(define_insn "*aarch64_simd_mov<VD:mode>"
   [(set (match_operand:VD 0 "nonimmediate_operand"
-               "=w, m,  w, ?r, ?w, ?r, w")
+               "=w, m,  m,  w, ?r, ?w, ?r, w")
        (match_operand:VD 1 "general_operand"
-               "m,  w,  w,  w,  r,  r, Dn"))]
+               "m,  Dz, w,  w,  w,  r,  r, Dn"))]
   "TARGET_SIMD
    && (register_operand (operands[0], <MODE>mode)
-       || register_operand (operands[1], <MODE>mode))"
+       || aarch64_simd_reg_or_zero (operands[1], <MODE>mode))"
 {
    switch (which_alternative)
      {
-     case 0: return "ldr\\t%d0, %1";
-     case 1: return "str\\t%d1, %0";
-     case 2: return "orr\t%0.<Vbtype>, %1.<Vbtype>, %1.<Vbtype>";
-     case 3: return "umov\t%0, %1.d[0]";
-     case 4: return "fmov\t%d0, %1";
-     case 5: return "mov\t%0, %1";
-     case 6:
-       return aarch64_output_simd_mov_immediate (operands[1],
-                                                 <MODE>mode, 64);
+     case 0: return "ldr\t%d0, %1";
+     case 1: return "str\txzr, %0";
+     case 2: return "str\t%d1, %0";
+     case 3: return "mov\t%0.<Vbtype>, %1.<Vbtype>";
+     case 4: return "umov\t%0, %1.d[0]";
+     case 5: return "fmov\t%d0, %1";
+     case 6: return "mov\t%0, %1";
+     case 7:
+       return aarch64_output_simd_mov_immediate (operands[1], 64);
      default: gcc_unreachable ();
      }
 }
-  [(set_attr "type" "neon_load1_1reg<q>, neon_store1_1reg<q>,\
+  [(set_attr "type" "neon_load1_1reg<q>, store_8, neon_store1_1reg<q>,\
                     neon_logic<q>, neon_to_gp<q>, f_mcr,\
                     mov_reg, neon_move<q>")]
 )
 
-(define_insn "*aarch64_simd_mov<mode>"
+(define_insn "*aarch64_simd_mov<VQ:mode>"
   [(set (match_operand:VQ 0 "nonimmediate_operand"
-               "=w, m,  w, ?r, ?w, ?r, w")
+               "=w, Umn,  m,  w, ?r, ?w, ?r, w")
        (match_operand:VQ 1 "general_operand"
-               "m,  w,  w,  w,  r,  r, Dn"))]
+               "m,  Dz, w,  w,  w,  r,  r, Dn"))]
   "TARGET_SIMD
    && (register_operand (operands[0], <MODE>mode)
-       || register_operand (operands[1], <MODE>mode))"
+       || aarch64_simd_reg_or_zero (operands[1], <MODE>mode))"
 {
   switch (which_alternative)
     {
     case 0:
-       return "ldr\\t%q0, %1";
+       return "ldr\t%q0, %1";
     case 1:
-       return "str\\t%q1, %0";
+       return "stp\txzr, xzr, %0";
     case 2:
-       return "orr\t%0.<Vbtype>, %1.<Vbtype>, %1.<Vbtype>";
+       return "str\t%q1, %0";
     case 3:
+       return "mov\t%0.<Vbtype>, %1.<Vbtype>";
     case 4:
     case 5:
-       return "#";
     case 6:
-       return aarch64_output_simd_mov_immediate (operands[1], <MODE>mode, 128);
+       return "#";
+    case 7:
+       return aarch64_output_simd_mov_immediate (operands[1], 128);
     default:
        gcc_unreachable ();
     }
 }
-  [(set_attr "type" "neon_load1_1reg<q>, neon_store1_1reg<q>,\
-                     neon_logic<q>, multiple, multiple, multiple,\
-                     neon_move<q>")
-   (set_attr "length" "4,4,4,8,8,8,4")]
+  [(set_attr "type" "neon_load1_1reg<q>, store_16, neon_store1_1reg<q>,\
+                    neon_logic<q>, multiple, multiple,\
+                    multiple, neon_move<q>")
+   (set_attr "length" "4,4,4,4,8,8,8,4")]
 )
 
 ;; When storing lane zero we can use the normal STR and its more permissive
        (vec_select:<VEL> (match_operand:VALL_F16 1 "register_operand" "w")
                        (parallel [(match_operand 2 "const_int_operand" "n")])))]
   "TARGET_SIMD
-   && ENDIAN_LANE_N (<MODE>mode, INTVAL (operands[2])) == 0"
+   && ENDIAN_LANE_N (<nunits>, INTVAL (operands[2])) == 0"
   "str\\t%<Vetype>1, %0"
   [(set_attr "type" "neon_store1_1reg<q>")]
 )
 
-(define_insn "load_pair<mode>"
-  [(set (match_operand:VD 0 "register_operand" "=w")
-       (match_operand:VD 1 "aarch64_mem_pair_operand" "Ump"))
-   (set (match_operand:VD 2 "register_operand" "=w")
-       (match_operand:VD 3 "memory_operand" "m"))]
+(define_insn "load_pair<DREG:mode><DREG2:mode>"
+  [(set (match_operand:DREG 0 "register_operand" "=w")
+       (match_operand:DREG 1 "aarch64_mem_pair_operand" "Ump"))
+   (set (match_operand:DREG2 2 "register_operand" "=w")
+       (match_operand:DREG2 3 "memory_operand" "m"))]
   "TARGET_SIMD
    && rtx_equal_p (XEXP (operands[3], 0),
                   plus_constant (Pmode,
                                  XEXP (operands[1], 0),
-                                 GET_MODE_SIZE (<MODE>mode)))"
+                                 GET_MODE_SIZE (<DREG:MODE>mode)))"
   "ldp\\t%d0, %d2, %1"
   [(set_attr "type" "neon_ldp")]
 )
 
-(define_insn "store_pair<mode>"
-  [(set (match_operand:VD 0 "aarch64_mem_pair_operand" "=Ump")
-       (match_operand:VD 1 "register_operand" "w"))
-   (set (match_operand:VD 2 "memory_operand" "=m")
-       (match_operand:VD 3 "register_operand" "w"))]
+(define_insn "vec_store_pair<DREG:mode><DREG2:mode>"
+  [(set (match_operand:DREG 0 "aarch64_mem_pair_operand" "=Ump")
+       (match_operand:DREG 1 "register_operand" "w"))
+   (set (match_operand:DREG2 2 "memory_operand" "=m")
+       (match_operand:DREG2 3 "register_operand" "w"))]
   "TARGET_SIMD
    && rtx_equal_p (XEXP (operands[2], 0),
                   plus_constant (Pmode,
                                  XEXP (operands[0], 0),
-                                 GET_MODE_SIZE (<MODE>mode)))"
+                                 GET_MODE_SIZE (<DREG:MODE>mode)))"
   "stp\\t%d1, %d3, %0"
   [(set_attr "type" "neon_stp")]
 )
 
+(define_insn "load_pair<VQ:mode><VQ2:mode>"
+  [(set (match_operand:VQ 0 "register_operand" "=w")
+       (match_operand:VQ 1 "aarch64_mem_pair_operand" "Ump"))
+   (set (match_operand:VQ2 2 "register_operand" "=w")
+       (match_operand:VQ2 3 "memory_operand" "m"))]
+  "TARGET_SIMD
+    && rtx_equal_p (XEXP (operands[3], 0),
+                   plus_constant (Pmode,
+                              XEXP (operands[1], 0),
+                              GET_MODE_SIZE (<VQ:MODE>mode)))"
+  "ldp\\t%q0, %q2, %1"
+  [(set_attr "type" "neon_ldp_q")]
+)
+
+(define_insn "vec_store_pair<VQ:mode><VQ2:mode>"
+  [(set (match_operand:VQ 0 "aarch64_mem_pair_operand" "=Ump")
+       (match_operand:VQ 1 "register_operand" "w"))
+   (set (match_operand:VQ2 2 "memory_operand" "=m")
+       (match_operand:VQ2 3 "register_operand" "w"))]
+  "TARGET_SIMD && rtx_equal_p (XEXP (operands[2], 0),
+               plus_constant (Pmode,
+                              XEXP (operands[0], 0),
+                              GET_MODE_SIZE (<VQ:MODE>mode)))"
+  "stp\\t%q1, %q3, %0"
+  [(set_attr "type" "neon_stp_q")]
+)
+
+
 (define_split
   [(set (match_operand:VQ 0 "register_operand" "")
       (match_operand:VQ 1 "register_operand" ""))]
   DONE;
 })
 
-(define_expand "aarch64_split_simd_mov<mode>"
+(define_expand "@aarch64_split_simd_mov<mode>"
   [(set (match_operand:VQ 0)
         (match_operand:VQ 1))]
   "TARGET_SIMD"
       {
         rtx dst_low_part = gen_lowpart (<VHALF>mode, dst);
         rtx dst_high_part = gen_highpart (<VHALF>mode, dst);
-        rtx lo = aarch64_simd_vect_par_cnst_half (<MODE>mode, false);
-        rtx hi = aarch64_simd_vect_par_cnst_half (<MODE>mode, true);
+       rtx lo = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, false);
+       rtx hi = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, true);
 
         emit_insn
           (gen_aarch64_simd_mov_from_<mode>low (dst_low_part, src, lo));
   }
 )
 
+(define_expand "xorsign<mode>3"
+  [(match_operand:VHSDF 0 "register_operand")
+   (match_operand:VHSDF 1 "register_operand")
+   (match_operand:VHSDF 2 "register_operand")]
+  "TARGET_SIMD"
+{
+
+  machine_mode imode = <V_INT_EQUIV>mode;
+  rtx v_bitmask = gen_reg_rtx (imode);
+  rtx op1x = gen_reg_rtx (imode);
+  rtx op2x = gen_reg_rtx (imode);
+
+  rtx arg1 = lowpart_subreg (imode, operands[1], <MODE>mode);
+  rtx arg2 = lowpart_subreg (imode, operands[2], <MODE>mode);
+
+  int bits = GET_MODE_UNIT_BITSIZE (<MODE>mode) - 1;
+
+  emit_move_insn (v_bitmask,
+                 aarch64_simd_gen_const_vector_dup (<V_INT_EQUIV>mode,
+                                                    HOST_WIDE_INT_M1U << bits));
+
+  emit_insn (gen_and<v_int_equiv>3 (op2x, v_bitmask, arg2));
+  emit_insn (gen_xor<v_int_equiv>3 (op1x, arg1, op2x));
+  emit_move_insn (operands[0],
+                 lowpart_subreg (<MODE>mode, op1x, imode));
+  DONE;
+}
+)
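;; Editorial sketch, not part of this change: the xorsign expander above
;; computes, per lane and on the integer bit pattern,
;;
;;   res = a ^ (b & SIGN_MASK);    /* SIGN_MASK = (uintN_t) 1 << (bits - 1) */
;;
;; i.e. it yields a with its sign bit flipped whenever b is negative, which
;; is what the AND/XOR sequence on <V_INT_EQUIV>mode implements.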
+
+;; The fcadd and fcmla patterns are made UNSPEC explicitly because their use
+;; must guarantee that the source vectors are contiguous.  It would be wrong
+;; to describe the operation without also being able to describe the required
+;; permute, and even if that were done the permute would have been created as
+;; a LOAD_LANES, which means the values in the registers would be in the
+;; wrong order.
+(define_insn "aarch64_fcadd<rot><mode>"
+  [(set (match_operand:VHSDF 0 "register_operand" "=w")
+       (unspec:VHSDF [(match_operand:VHSDF 1 "register_operand" "w")
+                      (match_operand:VHSDF 2 "register_operand" "w")]
+                      FCADD))]
+  "TARGET_COMPLEX"
+  "fcadd\t%0.<Vtype>, %1.<Vtype>, %2.<Vtype>, #<rot>"
+  [(set_attr "type" "neon_fcadd")]
+)
+
+(define_insn "aarch64_fcmla<rot><mode>"
+  [(set (match_operand:VHSDF 0 "register_operand" "=w")
+       (plus:VHSDF (match_operand:VHSDF 1 "register_operand" "0")
+                   (unspec:VHSDF [(match_operand:VHSDF 2 "register_operand" "w")
+                                  (match_operand:VHSDF 3 "register_operand" "w")]
+                                  FCMLA)))]
+  "TARGET_COMPLEX"
+  "fcmla\t%0.<Vtype>, %2.<Vtype>, %3.<Vtype>, #<rot>"
+  [(set_attr "type" "neon_fcmla")]
+)
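;; Editorial sketch, not part of this change, assuming the usual Armv8.3-A
;; complex-number semantics: with even/odd lanes holding the real/imaginary
;; parts, the rotations behave roughly as
;;
;;   fcadd #90  :  res.re = a.re - b.im;   res.im = a.im + b.re;
;;   fcadd #270 :  res.re = a.re + b.im;   res.im = a.im - b.re;
;;
;; and a pair of fcmla instructions with rotations #0 and #90 (or #180 and
;; #270) accumulates a full complex multiply, which is why the lanes of the
;; source vectors must stay contiguous as described above.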
+
+
+(define_insn "aarch64_fcmla_lane<rot><mode>"
+  [(set (match_operand:VHSDF 0 "register_operand" "=w")
+       (plus:VHSDF (match_operand:VHSDF 1 "register_operand" "0")
+                   (unspec:VHSDF [(match_operand:VHSDF 2 "register_operand" "w")
+                                  (match_operand:VHSDF 3 "register_operand" "w")
+                                  (match_operand:SI 4 "const_int_operand" "n")]
+                                  FCMLA)))]
+  "TARGET_COMPLEX"
+{
+  operands[4] = aarch64_endian_lane_rtx (<VHALF>mode, INTVAL (operands[4]));
+  return "fcmla\t%0.<Vtype>, %2.<Vtype>, %3.<FCMLA_maybe_lane>, #<rot>";
+}
+  [(set_attr "type" "neon_fcmla")]
+)
+
+(define_insn "aarch64_fcmla_laneq<rot>v4hf"
+  [(set (match_operand:V4HF 0 "register_operand" "=w")
+       (plus:V4HF (match_operand:V4HF 1 "register_operand" "0")
+                  (unspec:V4HF [(match_operand:V4HF 2 "register_operand" "w")
+                                (match_operand:V8HF 3 "register_operand" "w")
+                                (match_operand:SI 4 "const_int_operand" "n")]
+                                FCMLA)))]
+  "TARGET_COMPLEX"
+{
+  operands[4] = aarch64_endian_lane_rtx (V4HFmode, INTVAL (operands[4]));
+  return "fcmla\t%0.4h, %2.4h, %3.h[%4], #<rot>";
+}
+  [(set_attr "type" "neon_fcmla")]
+)
+
+(define_insn "aarch64_fcmlaq_lane<rot><mode>"
+  [(set (match_operand:VQ_HSF 0 "register_operand" "=w")
+       (plus:VQ_HSF (match_operand:VQ_HSF 1 "register_operand" "0")
+                    (unspec:VQ_HSF [(match_operand:VQ_HSF 2 "register_operand" "w")
+                                    (match_operand:<VHALF> 3 "register_operand" "w")
+                                    (match_operand:SI 4 "const_int_operand" "n")]
+                                    FCMLA)))]
+  "TARGET_COMPLEX"
+{
+  int nunits = GET_MODE_NUNITS (<VHALF>mode).to_constant ();
+  operands[4]
+    = gen_int_mode (ENDIAN_LANE_N (nunits / 2, INTVAL (operands[4])), SImode);
+  return "fcmla\t%0.<Vtype>, %2.<Vtype>, %3.<FCMLA_maybe_lane>, #<rot>";
+}
+  [(set_attr "type" "neon_fcmla")]
+)
+
+;; These instructions map to the __builtins for the Dot Product operations.
+(define_insn "aarch64_<sur>dot<vsi2qi>"
+  [(set (match_operand:VS 0 "register_operand" "=w")
+       (plus:VS (match_operand:VS 1 "register_operand" "0")
+               (unspec:VS [(match_operand:<VSI2QI> 2 "register_operand" "w")
+                           (match_operand:<VSI2QI> 3 "register_operand" "w")]
+               DOTPROD)))]
+  "TARGET_DOTPROD"
+  "<sur>dot\\t%0.<Vtype>, %2.<Vdottype>, %3.<Vdottype>"
+  [(set_attr "type" "neon_dot<q>")]
+)
+
+;; These expands map to the Dot Product optab the vectorizer checks for.
+;; The auto-vectorizer expects a dot product builtin that also does an
+;; accumulation into the provided register.
+;; Given the following pattern
+;;
+;; for (i=0; i<len; i++) {
+;;     c = a[i] * b[i];
+;;     r += c;
+;; }
+;; return r;
+;;
+;; This can be auto-vectorized to
+;; r  = a[0]*b[0] + a[1]*b[1] + a[2]*b[2] + a[3]*b[3];
+;;
+;; given enough iterations.  However, the vectorizer can keep unrolling the loop
+;; r += a[4]*b[4] + a[5]*b[5] + a[6]*b[6] + a[7]*b[7];
+;; r += a[8]*b[8] + a[9]*b[9] + a[10]*b[10] + a[11]*b[11];
+;; ...
+;;
+;; and so the vectorizer provides r, in which the result has to be accumulated.
+(define_expand "<sur>dot_prod<vsi2qi>"
+  [(set (match_operand:VS 0 "register_operand")
+       (plus:VS (unspec:VS [(match_operand:<VSI2QI> 1 "register_operand")
+                           (match_operand:<VSI2QI> 2 "register_operand")]
+                DOTPROD)
+               (match_operand:VS 3 "register_operand")))]
+  "TARGET_DOTPROD"
+{
+  emit_insn (
+    gen_aarch64_<sur>dot<vsi2qi> (operands[3], operands[3], operands[1],
+                                   operands[2]));
+  emit_insn (gen_rtx_SET (operands[0], operands[3]));
+  DONE;
+})
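;; Editorial sketch, not part of this change: each 32-bit lane of the
;; accumulator receives the sum of four byte products, so the expander above
;; behaves roughly as
;;
;;   for (i = 0; i < nlanes; i++)
;;     r[i] += a[4*i + 0] * b[4*i + 0] + a[4*i + 1] * b[4*i + 1]
;;             + a[4*i + 2] * b[4*i + 2] + a[4*i + 3] * b[4*i + 3];
;;
;; with operand 3 providing the accumulator r and operands 1 and 2 the
;; narrow inputs a and b.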
+
+;; These instructions map to the __builtins for the Dot Product
+;; indexed operations.
+(define_insn "aarch64_<sur>dot_lane<vsi2qi>"
+  [(set (match_operand:VS 0 "register_operand" "=w")
+       (plus:VS (match_operand:VS 1 "register_operand" "0")
+               (unspec:VS [(match_operand:<VSI2QI> 2 "register_operand" "w")
+                           (match_operand:V8QI 3 "register_operand" "<h_con>")
+                           (match_operand:SI 4 "immediate_operand" "i")]
+               DOTPROD)))]
+  "TARGET_DOTPROD"
+  {
+    operands[4] = aarch64_endian_lane_rtx (V8QImode, INTVAL (operands[4]));
+    return "<sur>dot\\t%0.<Vtype>, %2.<Vdottype>, %3.4b[%4]";
+  }
+  [(set_attr "type" "neon_dot<q>")]
+)
+
+(define_insn "aarch64_<sur>dot_laneq<vsi2qi>"
+  [(set (match_operand:VS 0 "register_operand" "=w")
+       (plus:VS (match_operand:VS 1 "register_operand" "0")
+               (unspec:VS [(match_operand:<VSI2QI> 2 "register_operand" "w")
+                           (match_operand:V16QI 3 "register_operand" "<h_con>")
+                           (match_operand:SI 4 "immediate_operand" "i")]
+               DOTPROD)))]
+  "TARGET_DOTPROD"
+  {
+    operands[4] = aarch64_endian_lane_rtx (V16QImode, INTVAL (operands[4]));
+    return "<sur>dot\\t%0.<Vtype>, %2.<Vdottype>, %3.4b[%4]";
+  }
+  [(set_attr "type" "neon_dot<q>")]
+)
+
 (define_expand "copysign<mode>3"
   [(match_operand:VHSDF 0 "register_operand")
    (match_operand:VHSDF 1 "register_operand")
    (match_operand:VHSDF 2 "register_operand")]
   "TARGET_FLOAT && TARGET_SIMD"
 {
-  rtx v_bitmask = gen_reg_rtx (<V_cmp_result>mode);
+  rtx v_bitmask = gen_reg_rtx (<V_INT_EQUIV>mode);
   int bits = GET_MODE_UNIT_BITSIZE (<MODE>mode) - 1;
 
   emit_move_insn (v_bitmask,
-                 aarch64_simd_gen_const_vector_dup (<V_cmp_result>mode,
+                 aarch64_simd_gen_const_vector_dup (<V_INT_EQUIV>mode,
                                                     HOST_WIDE_INT_M1U << bits));
   emit_insn (gen_aarch64_simd_bsl<mode> (operands[0], v_bitmask,
                                         operands[2], operands[1]));
       (match_operand:VMUL 3 "register_operand" "w")))]
   "TARGET_SIMD"
   {
-    operands[2] = GEN_INT (ENDIAN_LANE_N (<MODE>mode, INTVAL (operands[2])));
+    operands[2] = aarch64_endian_lane_rtx (<MODE>mode, INTVAL (operands[2]));
     return "<f>mul\\t%0.<Vtype>, %3.<Vtype>, %1.<Vetype>[%2]";
   }
   [(set_attr "type" "neon<fp>_mul_<stype>_scalar<q>")]
       (match_operand:VMUL_CHANGE_NLANES 3 "register_operand" "w")))]
   "TARGET_SIMD"
   {
-    operands[2] = GEN_INT (ENDIAN_LANE_N (<VSWAP_WIDTH>mode,
-                                         INTVAL (operands[2])));
+    operands[2] = aarch64_endian_lane_rtx (<VSWAP_WIDTH>mode, INTVAL (operands[2]));
     return "<f>mul\\t%0.<Vtype>, %3.<Vtype>, %1.<Vetype>[%2]";
   }
   [(set_attr "type" "neon<fp>_mul_<Vetype>_scalar<q>")]
   [(set_attr "type" "neon<fp>_mul_<stype>_scalar<q>")]
 )
 
-(define_insn "aarch64_rsqrte<mode>"
+(define_insn "@aarch64_rsqrte<mode>"
   [(set (match_operand:VHSDF_HSDF 0 "register_operand" "=w")
        (unspec:VHSDF_HSDF [(match_operand:VHSDF_HSDF 1 "register_operand" "w")]
                     UNSPEC_RSQRTE))]
   "frsqrte\\t%<v>0<Vmtype>, %<v>1<Vmtype>"
   [(set_attr "type" "neon_fp_rsqrte_<stype><q>")])
 
-(define_insn "aarch64_rsqrts<mode>"
+(define_insn "@aarch64_rsqrts<mode>"
   [(set (match_operand:VHSDF_HSDF 0 "register_operand" "=w")
        (unspec:VHSDF_HSDF [(match_operand:VHSDF_HSDF 1 "register_operand" "w")
                            (match_operand:VHSDF_HSDF 2 "register_operand" "w")]
   [(set_attr "type" "neon_fp_rsqrts_<stype><q>")])
 
 (define_expand "rsqrt<mode>2"
-  [(set (match_operand:VALLF 0 "register_operand" "=w")
-       (unspec:VALLF [(match_operand:VALLF 1 "register_operand" "w")]
+  [(set (match_operand:VALLF 0 "register_operand")
+       (unspec:VALLF [(match_operand:VALLF 1 "register_operand")]
                     UNSPEC_RSQRT))]
   "TARGET_SIMD"
 {
        (match_operand:DF 3 "register_operand" "w")))]
   "TARGET_SIMD"
   {
-    operands[2] = GEN_INT (ENDIAN_LANE_N (V2DFmode, INTVAL (operands[2])));
+    operands[2] = aarch64_endian_lane_rtx (V2DFmode, INTVAL (operands[2]));
     return "fmul\\t%0.2d, %3.2d, %1.d[%2]";
   }
   [(set_attr "type" "neon_fp_mul_d_scalar_q")]
   [(set_attr "type" "neon_abs<q>")]
 )
 
-(define_insn "abd<mode>_3"
+;; It's tempting to represent SABD as ABS (MINUS op1 op2).
+;; This isn't accurate, as ABS always treats its input as a signed value.
+;; So (ABS:QI (minus:QI 64 -128)) == (ABS:QI (192 or -64 signed)) == 64,
+;; whereas SABD would return 192 (-64 signed) for the above example.
+;; Use MINUS ([us]max (op1, op2), [us]min (op1, op2)) instead.
+(define_insn "aarch64_<su>abd<mode>_3"
   [(set (match_operand:VDQ_BHSI 0 "register_operand" "=w")
-       (abs:VDQ_BHSI (minus:VDQ_BHSI
-                      (match_operand:VDQ_BHSI 1 "register_operand" "w")
-                      (match_operand:VDQ_BHSI 2 "register_operand" "w"))))]
+       (minus:VDQ_BHSI
+         (USMAX:VDQ_BHSI
+           (match_operand:VDQ_BHSI 1 "register_operand" "w")
+           (match_operand:VDQ_BHSI 2 "register_operand" "w"))
+         (<max_opp>:VDQ_BHSI
+           (match_dup 1)
+           (match_dup 2))))]
+  "TARGET_SIMD"
+  "<su>abd\t%0.<Vtype>, %1.<Vtype>, %2.<Vtype>"
+  [(set_attr "type" "neon_abd<q>")]
+)
+
+(define_insn "aarch64_<sur>abdl2<mode>_3"
+  [(set (match_operand:<VDBLW> 0 "register_operand" "=w")
+       (unspec:<VDBLW> [(match_operand:VDQV_S 1 "register_operand" "w")
+                         (match_operand:VDQV_S 2 "register_operand" "w")]
+       ABDL2))]
   "TARGET_SIMD"
-  "sabd\t%0.<Vtype>, %1.<Vtype>, %2.<Vtype>"
+  "<sur>abdl2\t%0.<Vwtype>, %1.<Vtype>, %2.<Vtype>"
   [(set_attr "type" "neon_abd<q>")]
 )
 
+(define_insn "aarch64_<sur>abal<mode>_4"
+  [(set (match_operand:<VDBLW> 0 "register_operand" "=w")
+       (unspec:<VDBLW> [(match_operand:VDQV_S 1 "register_operand" "w")
+                         (match_operand:VDQV_S 2 "register_operand" "w")
+                        (match_operand:<VDBLW> 3 "register_operand" "0")]
+       ABAL))]
+  "TARGET_SIMD"
+  "<sur>abal\t%0.<Vwtype>, %1.<Vhalftype>, %2.<Vhalftype>"
+  [(set_attr "type" "neon_arith_acc<q>")]
+)
+
+(define_insn "aarch64_<sur>adalp<mode>_3"
+  [(set (match_operand:<VDBLW> 0 "register_operand" "=w")
+       (unspec:<VDBLW> [(match_operand:VDQV_S 1 "register_operand" "w")
+                         (match_operand:<VDBLW> 2 "register_operand" "0")]
+       ADALP))]
+  "TARGET_SIMD"
+  "<sur>adalp\t%0.<Vwtype>, %1.<Vtype>"
+  [(set_attr "type" "neon_reduc_add<q>")]
+)
+
+;; Emit a sequence to produce a sum-of-absolute-differences of the V16QI
+;; inputs in operands 1 and 2.  The sequence also has to perform a widening
+;; reduction of the differences into a V4SI vector and accumulate that into
+;; operand 3 before copying that into the result operand 0.
+;; Perform that with a sequence of:
+;; UABDL2      tmp.8h, op1.16b, op2.16b
+;; UABAL       tmp.8h, op1.16b, op2.16b
+;; UADALP      op3.4s, tmp.8h
+;; MOV         op0, op3 // should be eliminated in later passes.
+;;
+;; For TARGET_DOTPROD we do:
+;; MOV tmp1.16b, #1 // Can be CSE'd and hoisted out of loops.
+;; UABD        tmp2.16b, op1.16b, op2.16b
+;; UDOT        op3.4s, tmp2.16b, tmp1.16b
+;; MOV op0, op3 // RA will tie the operands of UDOT appropriately.
+;;
+;; The signed version just uses the signed variants of the above instructions
+;; but for TARGET_DOTPROD still emits a UDOT as the absolute difference is
+;; unsigned.
+
+(define_expand "<sur>sadv16qi"
+  [(use (match_operand:V4SI 0 "register_operand"))
+   (unspec:V16QI [(use (match_operand:V16QI 1 "register_operand"))
+                 (use (match_operand:V16QI 2 "register_operand"))] ABAL)
+   (use (match_operand:V4SI 3 "register_operand"))]
+  "TARGET_SIMD"
+  {
+    if (TARGET_DOTPROD)
+      {
+       rtx ones = force_reg (V16QImode, CONST1_RTX (V16QImode));
+       rtx abd = gen_reg_rtx (V16QImode);
+       emit_insn (gen_aarch64_<sur>abdv16qi_3 (abd, operands[1], operands[2]));
+       emit_insn (gen_aarch64_udotv16qi (operands[0], operands[3],
+                                         abd, ones));
+       DONE;
+      }
+    rtx reduc = gen_reg_rtx (V8HImode);
+    emit_insn (gen_aarch64_<sur>abdl2v16qi_3 (reduc, operands[1],
+                                              operands[2]));
+    emit_insn (gen_aarch64_<sur>abalv16qi_4 (reduc, operands[1],
+                                             operands[2], reduc));
+    emit_insn (gen_aarch64_<sur>adalpv8hi_3 (operands[3], reduc,
+                                             operands[3]));
+    emit_move_insn (operands[0], operands[3]);
+    DONE;
+  }
+)
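;; Editorial sketch, not part of this change: semantically the <sur>sadv16qi
;; expander above computes a widening sum of absolute differences, roughly
;;
;;   for (i = 0; i < 16; i++)
;;     sum += abs (a[i] - b[i]);          /* widened to avoid overflow */
;;
;; with the 16 partial results reduced into the four 32-bit lanes of
;; operand 3; the TARGET_DOTPROD path gets the same effect by dotting the
;; absolute differences against a vector of ones.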
+
 (define_insn "aba<mode>_3"
   [(set (match_operand:VDQ_BHSI 0 "register_operand" "=w")
        (plus:VDQ_BHSI (abs:VDQ_BHSI (minus:VDQ_BHSI
   [(set_attr "type" "neon_fp_abd_<stype><q>")]
 )
 
+;; For AND (vector, register) and BIC (vector, immediate)
 (define_insn "and<mode>3"
-  [(set (match_operand:VDQ_I 0 "register_operand" "=w")
-        (and:VDQ_I (match_operand:VDQ_I 1 "register_operand" "w")
-                (match_operand:VDQ_I 2 "register_operand" "w")))]
+  [(set (match_operand:VDQ_I 0 "register_operand" "=w,w")
+       (and:VDQ_I (match_operand:VDQ_I 1 "register_operand" "w,0")
+                  (match_operand:VDQ_I 2 "aarch64_reg_or_bic_imm" "w,Db")))]
   "TARGET_SIMD"
-  "and\t%0.<Vbtype>, %1.<Vbtype>, %2.<Vbtype>"
+  {
+    switch (which_alternative)
+      {
+      case 0:
+       return "and\t%0.<Vbtype>, %1.<Vbtype>, %2.<Vbtype>";
+      case 1:
+       return aarch64_output_simd_mov_immediate (operands[2], <bitsize>,
+                                                 AARCH64_CHECK_BIC);
+      default:
+       gcc_unreachable ();
+      }
+  }
   [(set_attr "type" "neon_logic<q>")]
 )
 
+;; For ORR (vector, register) and ORR (vector, immediate)
 (define_insn "ior<mode>3"
-  [(set (match_operand:VDQ_I 0 "register_operand" "=w")
-        (ior:VDQ_I (match_operand:VDQ_I 1 "register_operand" "w")
-                (match_operand:VDQ_I 2 "register_operand" "w")))]
+  [(set (match_operand:VDQ_I 0 "register_operand" "=w,w")
+       (ior:VDQ_I (match_operand:VDQ_I 1 "register_operand" "w,0")
+                  (match_operand:VDQ_I 2 "aarch64_reg_or_orr_imm" "w,Do")))]
   "TARGET_SIMD"
-  "orr\t%0.<Vbtype>, %1.<Vbtype>, %2.<Vbtype>"
+  {
+    switch (which_alternative)
+      {
+      case 0:
+       return "orr\t%0.<Vbtype>, %1.<Vbtype>, %2.<Vbtype>";
+      case 1:
+       return aarch64_output_simd_mov_immediate (operands[2], <bitsize>,
+                                                 AARCH64_CHECK_ORR);
+      default:
+       gcc_unreachable ();
+      }
+  }
   [(set_attr "type" "neon_logic<q>")]
 )
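;; Editorial sketch, not part of this change: the second alternatives of the
;; two patterns above let a logical operation with a suitable per-lane
;; constant use the immediate forms instead of materialising the constant in
;; a register, e.g. (assuming the constants are encodable)
;;
;;   x & 0xffffff00 per 32-bit lane  ->  bic  v0.4s, #255
;;   x | 0x000000ff per 32-bit lane  ->  orr  v0.4s, #255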
 
 )
 
 (define_insn "aarch64_simd_vec_set<mode>"
-  [(set (match_operand:VDQ_BHSI 0 "register_operand" "=w,w,w")
-        (vec_merge:VDQ_BHSI
-           (vec_duplicate:VDQ_BHSI
-               (match_operand:<VEL> 1 "aarch64_simd_general_operand" "r,w,Utv"))
-           (match_operand:VDQ_BHSI 3 "register_operand" "0,0,0")
+  [(set (match_operand:VALL_F16 0 "register_operand" "=w,w,w")
+       (vec_merge:VALL_F16
+           (vec_duplicate:VALL_F16
+               (match_operand:<VEL> 1 "aarch64_simd_general_operand" "w,?r,Utv"))
+           (match_operand:VALL_F16 3 "register_operand" "0,0,0")
            (match_operand:SI 2 "immediate_operand" "i,i,i")))]
   "TARGET_SIMD"
   {
-   int elt = ENDIAN_LANE_N (<MODE>mode, exact_log2 (INTVAL (operands[2])));
+   int elt = ENDIAN_LANE_N (<nunits>, exact_log2 (INTVAL (operands[2])));
    operands[2] = GEN_INT ((HOST_WIDE_INT) 1 << elt);
    switch (which_alternative)
      {
      case 0:
-       return "ins\\t%0.<Vetype>[%p2], %w1";
-     case 1:
        return "ins\\t%0.<Vetype>[%p2], %1.<Vetype>[0]";
+     case 1:
+       return "ins\\t%0.<Vetype>[%p2], %<vwcore>1";
      case 2:
         return "ld1\\t{%0.<Vetype>}[%p2], %1";
      default:
        gcc_unreachable ();
      }
   }
-  [(set_attr "type" "neon_from_gp<q>, neon_ins<q>, neon_load1_one_lane<q>")]
+  [(set_attr "type" "neon_ins<q>, neon_from_gp<q>, neon_load1_one_lane<q>")]
 )
 
 (define_insn "*aarch64_simd_vec_copy_lane<mode>"
            (match_operand:SI 2 "immediate_operand" "i")))]
   "TARGET_SIMD"
   {
-    int elt = ENDIAN_LANE_N (<MODE>mode, exact_log2 (INTVAL (operands[2])));
+    int elt = ENDIAN_LANE_N (<nunits>, exact_log2 (INTVAL (operands[2])));
     operands[2] = GEN_INT (HOST_WIDE_INT_1 << elt);
-    operands[4] = GEN_INT (ENDIAN_LANE_N (<MODE>mode, INTVAL (operands[4])));
+    operands[4] = aarch64_endian_lane_rtx (<MODE>mode, INTVAL (operands[4]));
 
     return "ins\t%0.<Vetype>[%p2], %3.<Vetype>[%4]";
   }
            (match_operand:SI 2 "immediate_operand" "i")))]
   "TARGET_SIMD"
   {
-    int elt = ENDIAN_LANE_N (<MODE>mode, exact_log2 (INTVAL (operands[2])));
+    int elt = ENDIAN_LANE_N (<nunits>, exact_log2 (INTVAL (operands[2])));
     operands[2] = GEN_INT (HOST_WIDE_INT_1 << elt);
-    operands[4] = GEN_INT (ENDIAN_LANE_N (<VSWAP_WIDTH>mode,
-                          INTVAL (operands[4])));
+    operands[4] = aarch64_endian_lane_rtx (<VSWAP_WIDTH>mode,
+                                          INTVAL (operands[4]));
 
     return "ins\t%0.<Vetype>[%p2], %3.<Vetype>[%4]";
   }
   [(set_attr "type" "neon_ins<q>")]
 )
 
+(define_expand "signbit<mode>2"
+  [(use (match_operand:<V_INT_EQUIV> 0 "register_operand"))
+   (use (match_operand:VDQSF 1 "register_operand"))]
+  "TARGET_SIMD"
+{
+  int shift_amount = GET_MODE_UNIT_BITSIZE (<V_INT_EQUIV>mode) - 1;
+  rtx shift_vector = aarch64_simd_gen_const_vector_dup (<V_INT_EQUIV>mode,
+                                                        shift_amount);
+  operands[1] = lowpart_subreg (<V_INT_EQUIV>mode, operands[1], <MODE>mode);
+
+  emit_insn (gen_aarch64_simd_lshr<v_int_equiv> (operands[0], operands[1],
+                                                 shift_vector));
+  DONE;
+})
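;; Editorial sketch, not part of this change: the signbit expander above
;; reinterprets the float vector as integers and shifts each lane right
;; logically by (element bits - 1), so each result lane is 1 when the
;; corresponding input is negative and 0 otherwise; for 32-bit lanes this is
;;
;;   res[i] = float_bits (x[i]) >> 31;    /* unsigned shift of the bit pattern */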
+
 (define_insn "aarch64_simd_lshr<mode>"
  [(set (match_operand:VDQ_I 0 "register_operand" "=w")
        (lshiftrt:VDQ_I (match_operand:VDQ_I 1 "register_operand" "w")
   [(set_attr "type" "neon_shift_imm<q>")]
 )
 
+(define_insn "*aarch64_simd_sra<mode>"
+ [(set (match_operand:VDQ_I 0 "register_operand" "=w")
+       (plus:VDQ_I
+          (SHIFTRT:VDQ_I
+               (match_operand:VDQ_I 1 "register_operand" "w")
+               (match_operand:VDQ_I 2 "aarch64_simd_rshift_imm" "Dr"))
+          (match_operand:VDQ_I 3 "register_operand" "0")))]
+  "TARGET_SIMD"
+  "<sra_op>sra\t%0.<Vtype>, %1.<Vtype>, %2"
+  [(set_attr "type" "neon_shift_acc<q>")]
+)
+
 (define_insn "aarch64_simd_imm_shl<mode>"
  [(set (match_operand:VDQ_I 0 "register_operand" "=w")
        (ashift:VDQ_I (match_operand:VDQ_I 1 "register_operand" "w")
 )
 
 (define_expand "ashl<mode>3"
-  [(match_operand:VDQ_I 0 "register_operand" "")
-   (match_operand:VDQ_I 1 "register_operand" "")
-   (match_operand:SI  2 "general_operand" "")]
+  [(match_operand:VDQ_I 0 "register_operand")
+   (match_operand:VDQ_I 1 "register_operand")
+   (match_operand:SI  2 "general_operand")]
  "TARGET_SIMD"
 {
   int bit_width = GET_MODE_UNIT_SIZE (<MODE>mode) * BITS_PER_UNIT;
 )
 
 (define_expand "lshr<mode>3"
-  [(match_operand:VDQ_I 0 "register_operand" "")
-   (match_operand:VDQ_I 1 "register_operand" "")
-   (match_operand:SI  2 "general_operand" "")]
+  [(match_operand:VDQ_I 0 "register_operand")
+   (match_operand:VDQ_I 1 "register_operand")
+   (match_operand:SI  2 "general_operand")]
  "TARGET_SIMD"
 {
   int bit_width = GET_MODE_UNIT_SIZE (<MODE>mode) * BITS_PER_UNIT;
 )
 
 (define_expand "ashr<mode>3"
-  [(match_operand:VDQ_I 0 "register_operand" "")
-   (match_operand:VDQ_I 1 "register_operand" "")
-   (match_operand:SI  2 "general_operand" "")]
+  [(match_operand:VDQ_I 0 "register_operand")
+   (match_operand:VDQ_I 1 "register_operand")
+   (match_operand:SI  2 "general_operand")]
  "TARGET_SIMD"
 {
   int bit_width = GET_MODE_UNIT_SIZE (<MODE>mode) * BITS_PER_UNIT;
 )
 
 (define_expand "vashl<mode>3"
- [(match_operand:VDQ_I 0 "register_operand" "")
-  (match_operand:VDQ_I 1 "register_operand" "")
-  (match_operand:VDQ_I 2 "register_operand" "")]
+ [(match_operand:VDQ_I 0 "register_operand")
+  (match_operand:VDQ_I 1 "register_operand")
+  (match_operand:VDQ_I 2 "register_operand")]
  "TARGET_SIMD"
 {
   emit_insn (gen_aarch64_simd_reg_sshl<mode> (operands[0], operands[1],
 ;; Negating individual lanes most certainly offsets the
 ;; gain from vectorization.
 (define_expand "vashr<mode>3"
- [(match_operand:VDQ_BHSI 0 "register_operand" "")
-  (match_operand:VDQ_BHSI 1 "register_operand" "")
-  (match_operand:VDQ_BHSI 2 "register_operand" "")]
+ [(match_operand:VDQ_BHSI 0 "register_operand")
+  (match_operand:VDQ_BHSI 1 "register_operand")
+  (match_operand:VDQ_BHSI 2 "register_operand")]
  "TARGET_SIMD"
 {
   rtx neg = gen_reg_rtx (<MODE>mode);
 
 ;; DI vector shift
 (define_expand "aarch64_ashr_simddi"
-  [(match_operand:DI 0 "register_operand" "=w")
-   (match_operand:DI 1 "register_operand" "w")
-   (match_operand:SI 2 "aarch64_shift_imm64_di" "")]
+  [(match_operand:DI 0 "register_operand")
+   (match_operand:DI 1 "register_operand")
+   (match_operand:SI 2 "aarch64_shift_imm64_di")]
   "TARGET_SIMD"
   {
     /* An arithmetic shift right by 64 fills the result with copies of the sign
 )
 
 (define_expand "vlshr<mode>3"
- [(match_operand:VDQ_BHSI 0 "register_operand" "")
-  (match_operand:VDQ_BHSI 1 "register_operand" "")
-  (match_operand:VDQ_BHSI 2 "register_operand" "")]
+ [(match_operand:VDQ_BHSI 0 "register_operand")
+  (match_operand:VDQ_BHSI 1 "register_operand")
+  (match_operand:VDQ_BHSI 2 "register_operand")]
  "TARGET_SIMD"
 {
   rtx neg = gen_reg_rtx (<MODE>mode);
 })
 
 (define_expand "aarch64_lshr_simddi"
-  [(match_operand:DI 0 "register_operand" "=w")
-   (match_operand:DI 1 "register_operand" "w")
-   (match_operand:SI 2 "aarch64_shift_imm64_di" "")]
+  [(match_operand:DI 0 "register_operand")
+   (match_operand:DI 1 "register_operand")
+   (match_operand:SI 2 "aarch64_shift_imm64_di")]
   "TARGET_SIMD"
   {
     if (INTVAL (operands[2]) == 64)
   }
 )
 
-(define_expand "vec_set<mode>"
-  [(match_operand:VDQ_BHSI 0 "register_operand")
-   (match_operand:<VEL> 1 "register_operand")
-   (match_operand:SI 2 "immediate_operand")]
-  "TARGET_SIMD"
-  {
-    HOST_WIDE_INT elem = (HOST_WIDE_INT) 1 << INTVAL (operands[2]);
-    emit_insn (gen_aarch64_simd_vec_set<mode> (operands[0], operands[1],
-                                           GEN_INT (elem), operands[0]));
-    DONE;
-  }
-)
-
 ;; For 64-bit modes we use ushl/r, as this does not require a SIMD zero.
 (define_insn "vec_shr_<mode>"
   [(set (match_operand:VD 0 "register_operand" "=w")
   [(set_attr "type" "neon_shift_imm")]
 )
 
-(define_insn "aarch64_simd_vec_setv2di"
-  [(set (match_operand:V2DI 0 "register_operand" "=w,w")
-        (vec_merge:V2DI
-           (vec_duplicate:V2DI
-               (match_operand:DI 1 "register_operand" "r,w"))
-           (match_operand:V2DI 3 "register_operand" "0,0")
-           (match_operand:SI 2 "immediate_operand" "i,i")))]
-  "TARGET_SIMD"
-  {
-    int elt = ENDIAN_LANE_N (V2DImode, exact_log2 (INTVAL (operands[2])));
-    operands[2] = GEN_INT ((HOST_WIDE_INT) 1 << elt);
-    switch (which_alternative)
-      {
-      case 0:
-       return "ins\\t%0.d[%p2], %1";
-      case 1:
-        return "ins\\t%0.d[%p2], %1.d[0]";
-      default:
-       gcc_unreachable ();
-      }
-  }
-  [(set_attr "type" "neon_from_gp, neon_ins_q")]
-)
-
-(define_expand "vec_setv2di"
-  [(match_operand:V2DI 0 "register_operand")
-   (match_operand:DI 1 "register_operand")
-   (match_operand:SI 2 "immediate_operand")]
-  "TARGET_SIMD"
-  {
-    HOST_WIDE_INT elem = (HOST_WIDE_INT) 1 << INTVAL (operands[2]);
-    emit_insn (gen_aarch64_simd_vec_setv2di (operands[0], operands[1],
-                                         GEN_INT (elem), operands[0]));
-    DONE;
-  }
-)
-
-(define_insn "aarch64_simd_vec_set<mode>"
-  [(set (match_operand:VDQF_F16 0 "register_operand" "=w")
-       (vec_merge:VDQF_F16
-           (vec_duplicate:VDQF_F16
-               (match_operand:<VEL> 1 "register_operand" "w"))
-           (match_operand:VDQF_F16 3 "register_operand" "0")
-           (match_operand:SI 2 "immediate_operand" "i")))]
-  "TARGET_SIMD"
-  {
-    int elt = ENDIAN_LANE_N (<MODE>mode, exact_log2 (INTVAL (operands[2])));
-
-    operands[2] = GEN_INT ((HOST_WIDE_INT)1 << elt);
-    return "ins\t%0.<Vetype>[%p2], %1.<Vetype>[0]";
-  }
-  [(set_attr "type" "neon_ins<q>")]
-)
-
 (define_expand "vec_set<mode>"
-  [(match_operand:VDQF_F16 0 "register_operand" "+w")
-   (match_operand:<VEL> 1 "register_operand" "w")
-   (match_operand:SI 2 "immediate_operand" "")]
+  [(match_operand:VALL_F16 0 "register_operand")
+   (match_operand:<VEL> 1 "register_operand")
+   (match_operand:SI 2 "immediate_operand")]
   "TARGET_SIMD"
   {
     HOST_WIDE_INT elem = (HOST_WIDE_INT) 1 << INTVAL (operands[2]);
         (match_operand:VDQHS 4 "register_operand" "0")))]
  "TARGET_SIMD"
   {
-    operands[2] = GEN_INT (ENDIAN_LANE_N (<MODE>mode, INTVAL (operands[2])));
+    operands[2] = aarch64_endian_lane_rtx (<MODE>mode, INTVAL (operands[2]));
     return "mla\t%0.<Vtype>, %3.<Vtype>, %1.<Vtype>[%2]";
   }
   [(set_attr "type" "neon_mla_<Vetype>_scalar<q>")]
         (match_operand:VDQHS 4 "register_operand" "0")))]
  "TARGET_SIMD"
   {
-    operands[2] = GEN_INT (ENDIAN_LANE_N (<VSWAP_WIDTH>mode,
-                                         INTVAL (operands[2])));
+    operands[2] = aarch64_endian_lane_rtx (<VSWAP_WIDTH>mode, INTVAL (operands[2]));
     return "mla\t%0.<Vtype>, %3.<Vtype>, %1.<Vtype>[%2]";
   }
   [(set_attr "type" "neon_mla_<Vetype>_scalar<q>")]
 )
 
+(define_insn "*aarch64_mla_elt_merge<mode>"
+  [(set (match_operand:VDQHS 0 "register_operand" "=w")
+       (plus:VDQHS
+         (mult:VDQHS (vec_duplicate:VDQHS
+                 (match_operand:<VEL> 1 "register_operand" "<h_con>"))
+               (match_operand:VDQHS 2 "register_operand" "w"))
+         (match_operand:VDQHS 3 "register_operand" "0")))]
+ "TARGET_SIMD"
+ "mla\t%0.<Vtype>, %2.<Vtype>, %1.<Vetype>[0]"
+  [(set_attr "type" "neon_mla_<Vetype>_scalar<q>")]
+)
+
 (define_insn "aarch64_mls<mode>"
  [(set (match_operand:VDQ_BHSI 0 "register_operand" "=w")
        (minus:VDQ_BHSI (match_operand:VDQ_BHSI 1 "register_operand" "0")
           (match_operand:VDQHS 3 "register_operand" "w"))))]
  "TARGET_SIMD"
   {
-    operands[2] = GEN_INT (ENDIAN_LANE_N (<MODE>mode, INTVAL (operands[2])));
+    operands[2] = aarch64_endian_lane_rtx (<MODE>mode, INTVAL (operands[2]));
     return "mls\t%0.<Vtype>, %3.<Vtype>, %1.<Vtype>[%2]";
   }
   [(set_attr "type" "neon_mla_<Vetype>_scalar<q>")]
           (match_operand:VDQHS 3 "register_operand" "w"))))]
  "TARGET_SIMD"
   {
-    operands[2] = GEN_INT (ENDIAN_LANE_N (<VSWAP_WIDTH>mode,
-                                         INTVAL (operands[2])));
+    operands[2] = aarch64_endian_lane_rtx (<VSWAP_WIDTH>mode, INTVAL (operands[2]));
     return "mls\t%0.<Vtype>, %3.<Vtype>, %1.<Vtype>[%2]";
   }
   [(set_attr "type" "neon_mla_<Vetype>_scalar<q>")]
 )
 
+(define_insn "*aarch64_mls_elt_merge<mode>"
+  [(set (match_operand:VDQHS 0 "register_operand" "=w")
+       (minus:VDQHS
+         (match_operand:VDQHS 1 "register_operand" "0")
+         (mult:VDQHS (vec_duplicate:VDQHS
+                 (match_operand:<VEL> 2 "register_operand" "<h_con>"))
+               (match_operand:VDQHS 3 "register_operand" "w"))))]
+  "TARGET_SIMD"
+  "mls\t%0.<Vtype>, %3.<Vtype>, %2.<Vetype>[0]"
+  [(set_attr "type" "neon_mla_<Vetype>_scalar<q>")]
+)
+
 ;; Max/Min operations.
 (define_insn "<su><maxmin><mode>3"
  [(set (match_operand:VDQ_BHSI 0 "register_operand" "=w")
 )
 
 (define_expand "<su><maxmin>v2di3"
- [(set (match_operand:V2DI 0 "register_operand" "")
-       (MAXMIN:V2DI (match_operand:V2DI 1 "register_operand" "")
-                    (match_operand:V2DI 2 "register_operand" "")))]
+ [(set (match_operand:V2DI 0 "register_operand")
+       (MAXMIN:V2DI (match_operand:V2DI 1 "register_operand")
+                    (match_operand:V2DI 2 "register_operand")))]
  "TARGET_SIMD"
 {
   enum rtx_code cmp_operator;
    fmov\\t%d0, %1
    dup\\t%d0, %1"
   [(set_attr "type" "neon_dup<q>,f_mcr,neon_dup<q>")
-   (set_attr "simd" "yes,*,yes")
-   (set_attr "fp" "*,yes,*")
-   (set_attr "length" "4")]
+   (set_attr "length" "4")
+   (set_attr "arch" "simd,fp,simd")]
 )
 
 (define_insn "move_lo_quad_internal_<mode>"
    fmov\\t%d0, %1
    dup\\t%d0, %1"
   [(set_attr "type" "neon_dup<q>,f_mcr,neon_dup<q>")
-   (set_attr "simd" "yes,*,yes")
-   (set_attr "fp" "*,yes,*")
-   (set_attr "length" "4")]
+   (set_attr "length" "4")
+   (set_attr "arch" "simd,fp,simd")]
 )
 
 (define_insn "move_lo_quad_internal_be_<mode>"
    fmov\\t%d0, %1
    dup\\t%d0, %1"
   [(set_attr "type" "neon_dup<q>,f_mcr,neon_dup<q>")
-   (set_attr "simd" "yes,*,yes")
-   (set_attr "fp" "*,yes,*")
-   (set_attr "length" "4")]
+   (set_attr "length" "4")
+   (set_attr "arch" "simd,fp,simd")]
 )
 
 (define_insn "move_lo_quad_internal_be_<mode>"
    fmov\\t%d0, %1
    dup\\t%d0, %1"
   [(set_attr "type" "neon_dup<q>,f_mcr,neon_dup<q>")
-   (set_attr "simd" "yes,*,yes")
-   (set_attr "fp" "*,yes,*")
-   (set_attr "length" "4")]
+   (set_attr "length" "4")
+   (set_attr "arch" "simd,fp,simd")]
 )
 
 (define_expand "move_lo_quad_<mode>"
 )
 
 (define_expand "move_hi_quad_<mode>"
- [(match_operand:VQ 0 "register_operand" "")
-  (match_operand:<VHALF> 1 "register_operand" "")]
+ [(match_operand:VQ 0 "register_operand")
+  (match_operand:<VHALF> 1 "register_operand")]
  "TARGET_SIMD"
 {
-  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, false);
+  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, false);
   if (BYTES_BIG_ENDIAN)
     emit_insn (gen_aarch64_simd_move_hi_quad_be_<mode> (operands[0],
                    operands[1], p));
 )
 
 (define_expand "vec_pack_trunc_<mode>"
- [(match_operand:<VNARROWD> 0 "register_operand" "")
-  (match_operand:VDN 1 "register_operand" "")
-  (match_operand:VDN 2 "register_operand" "")]
+ [(match_operand:<VNARROWD> 0 "register_operand")
+  (match_operand:VDN 1 "register_operand")
+  (match_operand:VDN 2 "register_operand")]
  "TARGET_SIMD"
 {
   rtx tempreg = gen_reg_rtx (<VDBL>mode);
                               (match_operand:VQW 2 "vect_par_cnst_lo_half" "")
                            )))]
   "TARGET_SIMD"
-  "<su>shll\t%0.<Vwtype>, %1.<Vhalftype>, 0"
+  "<su>xtl\t%0.<Vwtype>, %1.<Vhalftype>"
   [(set_attr "type" "neon_shift_imm_long")]
 )
 
                               (match_operand:VQW 2 "vect_par_cnst_hi_half" "")
                            )))]
   "TARGET_SIMD"
-  "<su>shll2\t%0.<Vwtype>, %1.<Vtype>, 0"
+  "<su>xtl2\t%0.<Vwtype>, %1.<Vtype>"
   [(set_attr "type" "neon_shift_imm_long")]
 )
 
 (define_expand "vec_unpack<su>_hi_<mode>"
-  [(match_operand:<VWIDE> 0 "register_operand" "")
+  [(match_operand:<VWIDE> 0 "register_operand")
    (ANY_EXTEND:<VWIDE> (match_operand:VQW 1 "register_operand"))]
   "TARGET_SIMD"
   {
-    rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true);
+    rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, true);
     emit_insn (gen_aarch64_simd_vec_unpack<su>_hi_<mode> (operands[0],
                                                          operands[1], p));
     DONE;
 )
 
 (define_expand "vec_unpack<su>_lo_<mode>"
-  [(match_operand:<VWIDE> 0 "register_operand" "")
-   (ANY_EXTEND:<VWIDE> (match_operand:VQW 1 "register_operand" ""))]
+  [(match_operand:<VWIDE> 0 "register_operand")
+   (ANY_EXTEND:<VWIDE> (match_operand:VQW 1 "register_operand"))]
   "TARGET_SIMD"
   {
-    rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, false);
+    rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, false);
     emit_insn (gen_aarch64_simd_vec_unpack<su>_lo_<mode> (operands[0],
                                                          operands[1], p));
     DONE;
 )
 
 (define_expand "vec_widen_<su>mult_lo_<mode>"
-  [(match_operand:<VWIDE> 0 "register_operand" "")
-   (ANY_EXTEND:<VWIDE> (match_operand:VQW 1 "register_operand" ""))
-   (ANY_EXTEND:<VWIDE> (match_operand:VQW 2 "register_operand" ""))]
+  [(match_operand:<VWIDE> 0 "register_operand")
+   (ANY_EXTEND:<VWIDE> (match_operand:VQW 1 "register_operand"))
+   (ANY_EXTEND:<VWIDE> (match_operand:VQW 2 "register_operand"))]
  "TARGET_SIMD"
  {
-   rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, false);
+   rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, false);
    emit_insn (gen_aarch64_simd_vec_<su>mult_lo_<mode> (operands[0],
                                                       operands[1],
                                                       operands[2], p));
 )
 
 (define_expand "vec_widen_<su>mult_hi_<mode>"
-  [(match_operand:<VWIDE> 0 "register_operand" "")
-   (ANY_EXTEND:<VWIDE> (match_operand:VQW 1 "register_operand" ""))
-   (ANY_EXTEND:<VWIDE> (match_operand:VQW 2 "register_operand" ""))]
+  [(match_operand:<VWIDE> 0 "register_operand")
+   (ANY_EXTEND:<VWIDE> (match_operand:VQW 1 "register_operand"))
+   (ANY_EXTEND:<VWIDE> (match_operand:VQW 2 "register_operand"))]
  "TARGET_SIMD"
  {
-   rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true);
+   rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, true);
    emit_insn (gen_aarch64_simd_vec_<su>mult_hi_<mode> (operands[0],
                                                       operands[1],
                                                       operands[2], p));
 )
 
 (define_expand "div<mode>3"
- [(set (match_operand:VHSDF 0 "register_operand" "=w")
-       (div:VHSDF (match_operand:VHSDF 1 "register_operand" "w")
-                 (match_operand:VHSDF 2 "register_operand" "w")))]
+ [(set (match_operand:VHSDF 0 "register_operand")
+       (div:VHSDF (match_operand:VHSDF 1 "register_operand")
+                 (match_operand:VHSDF 2 "register_operand")))]
  "TARGET_SIMD"
 {
   if (aarch64_emit_approx_div (operands[0], operands[1], operands[2]))
       (match_operand:VDQF 4 "register_operand" "0")))]
   "TARGET_SIMD"
   {
-    operands[2] = GEN_INT (ENDIAN_LANE_N (<MODE>mode, INTVAL (operands[2])));
+    operands[2] = aarch64_endian_lane_rtx (<MODE>mode, INTVAL (operands[2]));
     return "fmla\\t%0.<Vtype>, %3.<Vtype>, %1.<Vtype>[%2]";
   }
   [(set_attr "type" "neon_fp_mla_<Vetype>_scalar<q>")]
       (match_operand:VDQSF 4 "register_operand" "0")))]
   "TARGET_SIMD"
   {
-    operands[2] = GEN_INT (ENDIAN_LANE_N (<VSWAP_WIDTH>mode,
-                                         INTVAL (operands[2])));
+    operands[2] = aarch64_endian_lane_rtx (<VSWAP_WIDTH>mode, INTVAL (operands[2]));
     return "fmla\\t%0.<Vtype>, %3.<Vtype>, %1.<Vtype>[%2]";
   }
   [(set_attr "type" "neon_fp_mla_<Vetype>_scalar<q>")]
       (match_operand:DF 4 "register_operand" "0")))]
   "TARGET_SIMD"
   {
-    operands[2] = GEN_INT (ENDIAN_LANE_N (V2DFmode, INTVAL (operands[2])));
+    operands[2] = aarch64_endian_lane_rtx (V2DFmode, INTVAL (operands[2]));
     return "fmla\\t%0.2d, %3.2d, %1.2d[%2]";
   }
   [(set_attr "type" "neon_fp_mla_d_scalar_q")]
 (define_insn "fnma<mode>4"
   [(set (match_operand:VHSDF 0 "register_operand" "=w")
        (fma:VHSDF
-         (match_operand:VHSDF 1 "register_operand" "w")
-          (neg:VHSDF
-           (match_operand:VHSDF 2 "register_operand" "w"))
+         (neg:VHSDF (match_operand:VHSDF 1 "register_operand" "w"))
+         (match_operand:VHSDF 2 "register_operand" "w")
          (match_operand:VHSDF 3 "register_operand" "0")))]
   "TARGET_SIMD"
   "fmls\\t%0.<Vtype>, %1.<Vtype>, %2.<Vtype>"
       (match_operand:VDQF 4 "register_operand" "0")))]
   "TARGET_SIMD"
   {
-    operands[2] = GEN_INT (ENDIAN_LANE_N (<MODE>mode, INTVAL (operands[2])));
+    operands[2] = aarch64_endian_lane_rtx (<MODE>mode, INTVAL (operands[2]));
     return "fmls\\t%0.<Vtype>, %3.<Vtype>, %1.<Vtype>[%2]";
   }
   [(set_attr "type" "neon_fp_mla_<Vetype>_scalar<q>")]
       (match_operand:VDQSF 4 "register_operand" "0")))]
   "TARGET_SIMD"
   {
-    operands[2] = GEN_INT (ENDIAN_LANE_N (<VSWAP_WIDTH>mode,
-                                         INTVAL (operands[2])));
+    operands[2] = aarch64_endian_lane_rtx (<VSWAP_WIDTH>mode, INTVAL (operands[2]));
     return "fmls\\t%0.<Vtype>, %3.<Vtype>, %1.<Vtype>[%2]";
   }
   [(set_attr "type" "neon_fp_mla_<Vetype>_scalar<q>")]
       (match_operand:DF 4 "register_operand" "0")))]
   "TARGET_SIMD"
   {
-    operands[2] = GEN_INT (ENDIAN_LANE_N (V2DFmode, INTVAL (operands[2])));
+    operands[2] = aarch64_endian_lane_rtx (V2DFmode, INTVAL (operands[2]));
     return "fmls\\t%0.2d, %3.2d, %1.2d[%2]";
   }
   [(set_attr "type" "neon_fp_mla_d_scalar_q")]
 ;; other big-endian patterns their behavior is as required.
 
 (define_expand "vec_unpacks_lo_<mode>"
-  [(match_operand:<VWIDE> 0 "register_operand" "")
-   (match_operand:VQ_HSF 1 "register_operand" "")]
+  [(match_operand:<VWIDE> 0 "register_operand")
+   (match_operand:VQ_HSF 1 "register_operand")]
   "TARGET_SIMD"
   {
-    rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, false);
+    rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, false);
     emit_insn (gen_aarch64_simd_vec_unpacks_lo_<mode> (operands[0],
                                                       operands[1], p));
     DONE;
 )
 
 (define_expand "vec_unpacks_hi_<mode>"
-  [(match_operand:<VWIDE> 0 "register_operand" "")
-   (match_operand:VQ_HSF 1 "register_operand" "")]
+  [(match_operand:<VWIDE> 0 "register_operand")
+   (match_operand:VQ_HSF 1 "register_operand")]
   "TARGET_SIMD"
   {
-    rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true);
+    rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, true);
     emit_insn (gen_aarch64_simd_vec_unpacks_lo_<mode> (operands[0],
                                                       operands[1], p));
     DONE;
 )
 
 (define_expand "aarch64_float_truncate_hi_<Vdbl>"
-  [(match_operand:<VDBL> 0 "register_operand" "=w")
-   (match_operand:VDF 1 "register_operand" "0")
-   (match_operand:<VWIDE> 2 "register_operand" "w")]
+  [(match_operand:<VDBL> 0 "register_operand")
+   (match_operand:VDF 1 "register_operand")
+   (match_operand:<VWIDE> 2 "register_operand")]
   "TARGET_SIMD"
 {
   rtx (*gen) (rtx, rtx, rtx) = BYTES_BIG_ENDIAN
 ;; Max/Min are introduced by idiom recognition by GCC's mid-end.  An
 ;; expression like:
 ;;      a = (b < c) ? b : c;
-;; is idiom-matched as MIN_EXPR<b,c> only if -ffinite-math-only is enabled
-;; either explicitly or indirectly via -ffast-math.
+;; is idiom-matched as MIN_EXPR<b,c> only if -ffinite-math-only and
+;; -fno-signed-zeros are enabled either explicitly or indirectly via
+;; -ffast-math.
 ;;
 ;; MIN_EXPR and MAX_EXPR eventually map to 'smin' and 'smax' in RTL.
 ;; The 'smax' and 'smin' RTL standard pattern names do not specify which
 ;; 'across lanes' add.
 
 (define_expand "reduc_plus_scal_<mode>"
-  [(match_operand:<VEL> 0 "register_operand" "=w")
-   (unspec:VDQ_I [(match_operand:VDQ_I 1 "register_operand" "w")]
+  [(match_operand:<VEL> 0 "register_operand")
+   (unspec:VDQ_I [(match_operand:VDQ_I 1 "register_operand")]
               UNSPEC_ADDV)]
   "TARGET_SIMD"
   {
-    rtx elt = GEN_INT (ENDIAN_LANE_N (<MODE>mode, 0));
+    rtx elt = aarch64_endian_lane_rtx (<MODE>mode, 0);
     rtx scratch = gen_reg_rtx (<MODE>mode);
     emit_insn (gen_aarch64_reduc_plus_internal<mode> (scratch, operands[1]));
     emit_insn (gen_aarch64_get_lane<mode> (operands[0], scratch, elt));
                    UNSPEC_FADDV))]
  "TARGET_SIMD"
 {
-  rtx elt = GEN_INT (ENDIAN_LANE_N (V4SFmode, 0));
+  rtx elt = aarch64_endian_lane_rtx (V4SFmode, 0);
   rtx scratch = gen_reg_rtx (V4SFmode);
   emit_insn (gen_aarch64_faddpv4sf (scratch, operands[1], operands[1]));
   emit_insn (gen_aarch64_faddpv4sf (scratch, scratch, scratch));
 ;; 'across lanes' max and min ops.
 
 ;; Template for outputting a scalar, so we can create __builtins which can be
-;; gimple_fold'd to the REDUC_(MAX|MIN)_EXPR tree code.  (This is FP smax/smin).
+;; gimple_fold'd to the IFN_REDUC_(MAX|MIN) function.  (This is FP smax/smin).
 (define_expand "reduc_<maxmin_uns>_scal_<mode>"
   [(match_operand:<VEL> 0 "register_operand")
    (unspec:VHSDF [(match_operand:VHSDF 1 "register_operand")]
                  FMAXMINV)]
   "TARGET_SIMD"
   {
-    rtx elt = GEN_INT (ENDIAN_LANE_N (<MODE>mode, 0));
+    rtx elt = aarch64_endian_lane_rtx (<MODE>mode, 0);
     rtx scratch = gen_reg_rtx (<MODE>mode);
     emit_insn (gen_aarch64_reduc_<maxmin_uns>_internal<mode> (scratch,
                                                              operands[1]));
                    MAXMINV)]
   "TARGET_SIMD"
   {
-    rtx elt = GEN_INT (ENDIAN_LANE_N (<MODE>mode, 0));
+    rtx elt = aarch64_endian_lane_rtx (<MODE>mode, 0);
     rtx scratch = gen_reg_rtx (<MODE>mode);
     emit_insn (gen_aarch64_reduc_<maxmin_uns>_internal<mode> (scratch,
                                                              operands[1]));
 ;; in *aarch64_simd_bsl<mode>_alt.
 
 (define_insn "aarch64_simd_bsl<mode>_internal"
-  [(set (match_operand:VSDQ_I_DI 0 "register_operand" "=w,w,w")
-       (xor:VSDQ_I_DI
-          (and:VSDQ_I_DI
-            (xor:VSDQ_I_DI
-              (match_operand:<V_cmp_result> 3 "register_operand" "w,0,w")
-              (match_operand:VSDQ_I_DI 2 "register_operand" "w,w,0"))
-            (match_operand:VSDQ_I_DI 1 "register_operand" "0,w,w"))
-         (match_dup:<V_cmp_result> 3)
+  [(set (match_operand:VDQ_I 0 "register_operand" "=w,w,w")
+       (xor:VDQ_I
+          (and:VDQ_I
+            (xor:VDQ_I
+              (match_operand:<V_INT_EQUIV> 3 "register_operand" "w,0,w")
+              (match_operand:VDQ_I 2 "register_operand" "w,w,0"))
+            (match_operand:VDQ_I 1 "register_operand" "0,w,w"))
+         (match_dup:<V_INT_EQUIV> 3)
        ))]
   "TARGET_SIMD"
   "@
 ;; permutations of commutative operations, we have to have a separate pattern.
 
 (define_insn "*aarch64_simd_bsl<mode>_alt"
-  [(set (match_operand:VSDQ_I_DI 0 "register_operand" "=w,w,w")
-       (xor:VSDQ_I_DI
-          (and:VSDQ_I_DI
-            (xor:VSDQ_I_DI
-              (match_operand:VSDQ_I_DI 3 "register_operand" "w,w,0")
-              (match_operand:VSDQ_I_DI 2 "register_operand" "w,0,w"))
-             (match_operand:VSDQ_I_DI 1 "register_operand" "0,w,w"))
-         (match_dup:VSDQ_I_DI 2)))]
+  [(set (match_operand:VDQ_I 0 "register_operand" "=w,w,w")
+       (xor:VDQ_I
+          (and:VDQ_I
+            (xor:VDQ_I
+              (match_operand:VDQ_I 3 "register_operand" "w,w,0")
+              (match_operand:<V_INT_EQUIV> 2 "register_operand" "w,0,w"))
+             (match_operand:VDQ_I 1 "register_operand" "0,w,w"))
+         (match_dup:<V_INT_EQUIV> 2)))]
   "TARGET_SIMD"
   "@
   bsl\\t%0.<Vbtype>, %3.<Vbtype>, %2.<Vbtype>
   [(set_attr "type" "neon_bsl<q>")]
 )
 
+;; DImode is special: we want to avoid computing, in the vector registers,
+;; operations that are more naturally computed in general-purpose registers.
+;; If we did that, we would need to move all three operands from general-
+;; purpose registers to vector registers and then back again.  However, we
+;; don't want to make this pattern an UNSPEC, as we'd lose scope for
+;; optimizations based on the component operations of a BSL.
+;;
+;; That means we need a splitter back to the individual operations, if they
+;; would be better calculated on the integer side.
+
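;; Editorial sketch, not part of this change: the RTL below is the usual
;; bit-select identity
;;
;;   bsl (mask, a, b) = (a & mask) | (b & ~mask) = ((a ^ b) & mask) ^ b
;;
;; which is why the splitter can re-emit it as EOR/AND/EOR on the general
;; registers when the destination ends up in the integer register file.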
+(define_insn_and_split "aarch64_simd_bsldi_internal"
+  [(set (match_operand:DI 0 "register_operand" "=w,w,w,&r")
+       (xor:DI
+          (and:DI
+            (xor:DI
+              (match_operand:DI 3 "register_operand" "w,0,w,r")
+              (match_operand:DI 2 "register_operand" "w,w,0,r"))
+            (match_operand:DI 1 "register_operand" "0,w,w,r"))
+         (match_dup:DI 3)
+       ))]
+  "TARGET_SIMD"
+  "@
+  bsl\\t%0.8b, %2.8b, %3.8b
+  bit\\t%0.8b, %2.8b, %1.8b
+  bif\\t%0.8b, %3.8b, %1.8b
+  #"
+  "&& REG_P (operands[0]) && GP_REGNUM_P (REGNO (operands[0]))"
+  [(match_dup 1) (match_dup 1) (match_dup 2) (match_dup 3)]
+{
+  /* Split back to individual operations.  If we're before reload, and
+     able to create a temporary register, do so.  If we're after reload,
+     we've got an early-clobber destination register, so use that.
+     Otherwise, we can't create pseudos and we can't yet guarantee that
+     operands[0] is safe to write, so FAIL to split.  */
+
+  rtx scratch;
+  if (reload_completed)
+    scratch = operands[0];
+  else if (can_create_pseudo_p ())
+    scratch = gen_reg_rtx (DImode);
+  else
+    FAIL;
+
+  emit_insn (gen_xordi3 (scratch, operands[2], operands[3]));
+  emit_insn (gen_anddi3 (scratch, scratch, operands[1]));
+  emit_insn (gen_xordi3 (operands[0], scratch, operands[3]));
+  DONE;
+}
+  [(set_attr "type" "neon_bsl,neon_bsl,neon_bsl,multiple")
+   (set_attr "length" "4,4,4,12")]
+)
+
+(define_insn_and_split "aarch64_simd_bsldi_alt"
+  [(set (match_operand:DI 0 "register_operand" "=w,w,w,&r")
+       (xor:DI
+          (and:DI
+            (xor:DI
+              (match_operand:DI 3 "register_operand" "w,w,0,r")
+              (match_operand:DI 2 "register_operand" "w,0,w,r"))
+            (match_operand:DI 1 "register_operand" "0,w,w,r"))
+         (match_dup:DI 2)
+       ))]
+  "TARGET_SIMD"
+  "@
+  bsl\\t%0.8b, %3.8b, %2.8b
+  bit\\t%0.8b, %3.8b, %1.8b
+  bif\\t%0.8b, %2.8b, %1.8b
+  #"
+  "&& REG_P (operands[0]) && GP_REGNUM_P (REGNO (operands[0]))"
+  [(match_dup 0) (match_dup 1) (match_dup 2) (match_dup 3)]
+{
+  /* Split back to individual operations.  If we're before reload, and
+     able to create a temporary register, do so.  If we're after reload,
+     we've got an early-clobber destination register, so use that.
+     Otherwise, we can't create pseudos and we can't yet guarantee that
+     operands[0] is safe to write, so FAIL to split.  */
+
+  rtx scratch;
+  if (reload_completed)
+    scratch = operands[0];
+  else if (can_create_pseudo_p ())
+    scratch = gen_reg_rtx (DImode);
+  else
+    FAIL;
+
+  emit_insn (gen_xordi3 (scratch, operands[2], operands[3]));
+  emit_insn (gen_anddi3 (scratch, scratch, operands[1]));
+  emit_insn (gen_xordi3 (operands[0], scratch, operands[2]));
+  DONE;
+}
+  [(set_attr "type" "neon_bsl,neon_bsl,neon_bsl,multiple")
+   (set_attr "length" "4,4,4,12")]
+)
+
 (define_expand "aarch64_simd_bsl<mode>"
   [(match_operand:VALLDIF 0 "register_operand")
-   (match_operand:<V_cmp_result> 1 "register_operand")
+   (match_operand:<V_INT_EQUIV> 1 "register_operand")
    (match_operand:VALLDIF 2 "register_operand")
    (match_operand:VALLDIF 3 "register_operand")]
  "TARGET_SIMD"
   rtx tmp = operands[0];
   if (FLOAT_MODE_P (<MODE>mode))
     {
-      operands[2] = gen_lowpart (<V_cmp_result>mode, operands[2]);
-      operands[3] = gen_lowpart (<V_cmp_result>mode, operands[3]);
-      tmp = gen_reg_rtx (<V_cmp_result>mode);
+      operands[2] = gen_lowpart (<V_INT_EQUIV>mode, operands[2]);
+      operands[3] = gen_lowpart (<V_INT_EQUIV>mode, operands[3]);
+      tmp = gen_reg_rtx (<V_INT_EQUIV>mode);
     }
-  operands[1] = gen_lowpart (<V_cmp_result>mode, operands[1]);
-  emit_insn (gen_aarch64_simd_bsl<v_cmp_result>_internal (tmp,
-                                                         operands[1],
-                                                         operands[2],
-                                                         operands[3]));
+  operands[1] = gen_lowpart (<V_INT_EQUIV>mode, operands[1]);
+  emit_insn (gen_aarch64_simd_bsl<v_int_equiv>_internal (tmp,
+                                                        operands[1],
+                                                        operands[2],
+                                                        operands[3]));
   if (tmp != operands[0])
     emit_move_insn (operands[0], gen_lowpart (<MODE>mode, tmp));
 
   DONE;
 })
 
-(define_expand "vcond_mask_<mode><v_cmp_result>"
+(define_expand "vcond_mask_<mode><v_int_equiv>"
   [(match_operand:VALLDI 0 "register_operand")
    (match_operand:VALLDI 1 "nonmemory_operand")
    (match_operand:VALLDI 2 "nonmemory_operand")
-   (match_operand:<V_cmp_result> 3 "register_operand")]
+   (match_operand:<V_INT_EQUIV> 3 "register_operand")]
   "TARGET_SIMD"
 {
   /* If we have (a = (P) ? -1 : 0);
   /* Similarly, (a = (P) ? 0 : -1) is just inverting the generated mask.  */
   else if (operands[1] == CONST0_RTX (<MODE>mode)
           && operands[2] == CONSTM1_RTX (<MODE>mode))
-    emit_insn (gen_one_cmpl<v_cmp_result>2 (operands[0], operands[3]));
+    emit_insn (gen_one_cmpl<v_int_equiv>2 (operands[0], operands[3]));
   else
     {
       if (!REG_P (operands[1]))
     case NE:
       /* Handle NE as !EQ.  */
       emit_insn (gen_aarch64_cmeq<mode> (mask, operands[2], operands[3]));
-      emit_insn (gen_one_cmpl<v_cmp_result>2 (mask, mask));
+      emit_insn (gen_one_cmpl<v_int_equiv>2 (mask, mask));
       break;
 
     case EQ:
   DONE;
 })
 
-(define_expand "vec_cmp<mode><v_cmp_result>"
-  [(set (match_operand:<V_cmp_result> 0 "register_operand")
+(define_expand "vec_cmp<mode><v_int_equiv>"
+  [(set (match_operand:<V_INT_EQUIV> 0 "register_operand")
        (match_operator 1 "comparison_operator"
            [(match_operand:VDQF 2 "register_operand")
             (match_operand:VDQF 3 "nonmemory_operand")]))]
 {
   int use_zero_form = 0;
   enum rtx_code code = GET_CODE (operands[1]);
-  rtx tmp = gen_reg_rtx (<V_cmp_result>mode);
+  rtx tmp = gen_reg_rtx (<V_INT_EQUIV>mode);
 
   rtx (*comparison) (rtx, rtx, rtx) = NULL;
 
          break;
        }
       /* Fall through.  */
-    case UNGE:
+    case UNLT:
       std::swap (operands[2], operands[3]);
       /* Fall through.  */
-    case UNLE:
+    case UNGT:
     case GT:
       comparison = gen_aarch64_cmgt<mode>;
       break;
          break;
        }
       /* Fall through.  */
-    case UNGT:
+    case UNLE:
       std::swap (operands[2], operands[3]);
       /* Fall through.  */
-    case UNLT:
+    case UNGE:
     case GE:
       comparison = gen_aarch64_cmge<mode>;
       break;
     case UNEQ:
     case ORDERED:
     case UNORDERED:
+    case LTGT:
       break;
     default:
       gcc_unreachable ();
     case UNGT:
     case UNLE:
     case UNLT:
-    case NE:
-      /* FCM returns false for lanes which are unordered, so if we use
-        the inverse of the comparison we actually want to emit, then
-        invert the result, we will end up with the correct result.
-        Note that a NE NaN and NaN NE b are true for all a, b.
-
-        Our transformations are:
-        a UNGE b -> !(b GT a)
-        a UNGT b -> !(b GE a)
-        a UNLE b -> !(a GT b)
-        a UNLT b -> !(a GE b)
-        a   NE b -> !(a EQ b)  */
-      gcc_assert (comparison != NULL);
-      emit_insn (comparison (operands[0], operands[2], operands[3]));
-      emit_insn (gen_one_cmpl<v_cmp_result>2 (operands[0], operands[0]));
+      {
+       /* All of the above must not raise any FP exceptions.  Thus we first
+          check each operand for NaNs and force any elements containing NaN to
+          zero before using them in the compare.
+          Example: UN<cc> (a, b) -> UNORDERED (a, b) |
+                                    (cm<cc> (isnan (a) ? 0.0 : a,
+                                             isnan (b) ? 0.0 : b))
+          We use the following transformations for the comparisons:
+          a UNGE b -> a GE b
+          a UNGT b -> a GT b
+          a UNLE b -> b GE a
+          a UNLT b -> b GT a.  */
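+
+       /* As a concrete lane example, take a UNGE b with a = NaN, b = 1.0:
+          tmp2 is all-zeros (the pair is unordered), the NaN lane of a is
+          masked to 0.0, the cmge (0.0, 1.0) lane is false, and the final
+          ORN (IOR of the inverted tmp2 with that result) yields all-ones,
+          i.e. true, with no FP exception raised.  */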
+
+       rtx tmp0 = gen_reg_rtx (<V_INT_EQUIV>mode);
+       rtx tmp1 = gen_reg_rtx (<V_INT_EQUIV>mode);
+       rtx tmp2 = gen_reg_rtx (<V_INT_EQUIV>mode);
+       emit_insn (gen_aarch64_cmeq<mode> (tmp0, operands[2], operands[2]));
+       emit_insn (gen_aarch64_cmeq<mode> (tmp1, operands[3], operands[3]));
+       emit_insn (gen_and<v_int_equiv>3 (tmp2, tmp0, tmp1));
+       emit_insn (gen_and<v_int_equiv>3 (tmp0, tmp0,
+                                         lowpart_subreg (<V_INT_EQUIV>mode,
+                                                         operands[2],
+                                                         <MODE>mode)));
+       emit_insn (gen_and<v_int_equiv>3 (tmp1, tmp1,
+                                         lowpart_subreg (<V_INT_EQUIV>mode,
+                                                         operands[3],
+                                                         <MODE>mode)));
+       gcc_assert (comparison != NULL);
+       emit_insn (comparison (operands[0],
+                              lowpart_subreg (<MODE>mode,
+                                              tmp0, <V_INT_EQUIV>mode),
+                              lowpart_subreg (<MODE>mode,
+                                              tmp1, <V_INT_EQUIV>mode)));
+       emit_insn (gen_orn<v_int_equiv>3 (operands[0], tmp2, operands[0]));
+      }
       break;
 
     case LT:
     case GT:
     case GE:
     case EQ:
+    case NE:
       /* The easy case.  Here we emit one of FCMGE, FCMGT or FCMEQ.
         As a LT b <=> b GE a && a LE b <=> b GT a.  Our transformations are:
         a GE b -> a GE b
         a GT b -> a GT b
         a LE b -> b GE a
         a LT b -> b GT a
-        a EQ b -> a EQ b  */
+        a EQ b -> a EQ b
+        a NE b -> ~(a EQ b)  */
       gcc_assert (comparison != NULL);
       emit_insn (comparison (operands[0], operands[2], operands[3]));
+      if (code == NE)
+       emit_insn (gen_one_cmpl<v_int_equiv>2 (operands[0], operands[0]));
       break;
 
-    case UNEQ:
-      /* We first check (a > b ||  b > a) which is !UNEQ, inverting
-        this result will then give us (a == b || a UNORDERED b).  */
+    case LTGT:
+      /* LTGT is not guaranteed not to raise an FP exception, so take the
+        faster route: ((a > b) || (b > a)).  */
       emit_insn (gen_aarch64_cmgt<mode> (operands[0],
                                         operands[2], operands[3]));
       emit_insn (gen_aarch64_cmgt<mode> (tmp, operands[3], operands[2]));
-      emit_insn (gen_ior<v_cmp_result>3 (operands[0], operands[0], tmp));
-      emit_insn (gen_one_cmpl<v_cmp_result>2 (operands[0], operands[0]));
-      break;
-
-    case UNORDERED:
-      /* Operands are ORDERED iff (a > b || b >= a), so we can compute
-        UNORDERED as !ORDERED.  */
-      emit_insn (gen_aarch64_cmgt<mode> (tmp, operands[2], operands[3]));
-      emit_insn (gen_aarch64_cmge<mode> (operands[0],
-                                        operands[3], operands[2]));
-      emit_insn (gen_ior<v_cmp_result>3 (operands[0], operands[0], tmp));
-      emit_insn (gen_one_cmpl<v_cmp_result>2 (operands[0], operands[0]));
+      emit_insn (gen_ior<v_int_equiv>3 (operands[0], operands[0], tmp));
       break;
 
     case ORDERED:
-      emit_insn (gen_aarch64_cmgt<mode> (tmp, operands[2], operands[3]));
-      emit_insn (gen_aarch64_cmge<mode> (operands[0],
-                                        operands[3], operands[2]));
-      emit_insn (gen_ior<v_cmp_result>3 (operands[0], operands[0], tmp));
+    case UNORDERED:
+    case UNEQ:
+      /* cmeq (a, a) & cmeq (b, b).  */
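+      /* The AND of the two self-compares gives the ORDERED mask; UNORDERED
+        is its inverse, and UNEQ is that inverse IORed with an ordinary EQ
+        compare of the operands (the ORN below).  */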
+      emit_insn (gen_aarch64_cmeq<mode> (operands[0],
+                                        operands[2], operands[2]));
+      emit_insn (gen_aarch64_cmeq<mode> (tmp, operands[3], operands[3]));
+      emit_insn (gen_and<v_int_equiv>3 (operands[0], operands[0], tmp));
+
+      if (code == UNORDERED)
+       emit_insn (gen_one_cmpl<v_int_equiv>2 (operands[0], operands[0]));
+      else if (code == UNEQ)
+       {
+         emit_insn (gen_aarch64_cmeq<mode> (tmp, operands[2], operands[3]));
+         emit_insn (gen_orn<v_int_equiv>3 (operands[0], operands[0], tmp));
+       }
       break;
 
     default:
          (match_operand:VALLDI 2 "nonmemory_operand")))]
   "TARGET_SIMD"
 {
-  rtx mask = gen_reg_rtx (<V_cmp_result>mode);
+  rtx mask = gen_reg_rtx (<V_INT_EQUIV>mode);
   enum rtx_code code = GET_CODE (operands[3]);
 
   /* NE is handled as !EQ in vec_cmp patterns, we can explicitly invert
                                    operands[4], operands[5]);
       std::swap (operands[1], operands[2]);
     }
-  emit_insn (gen_vec_cmp<mode><v_cmp_result> (mask, operands[3],
-                                             operands[4], operands[5]));
-  emit_insn (gen_vcond_mask_<mode><v_cmp_result> (operands[0], operands[1],
-                                                 operands[2], mask));
+  emit_insn (gen_vec_cmp<mode><v_int_equiv> (mask, operands[3],
+                                            operands[4], operands[5]));
+  emit_insn (gen_vcond_mask_<mode><v_int_equiv> (operands[0], operands[1],
+                                                operands[2], mask));
 
   DONE;
 })
          (match_operand:<V_cmp_mixed> 2 "nonmemory_operand")))]
   "TARGET_SIMD"
 {
-  rtx mask = gen_reg_rtx (<V_cmp_result>mode);
+  rtx mask = gen_reg_rtx (<V_INT_EQUIV>mode);
   enum rtx_code code = GET_CODE (operands[3]);
 
   /* NE is handled as !EQ in vec_cmp patterns, we can explicitly invert
                                    operands[4], operands[5]);
       std::swap (operands[1], operands[2]);
     }
-  emit_insn (gen_vec_cmp<mode><v_cmp_result> (mask, operands[3],
-                                             operands[4], operands[5]));
-  emit_insn (gen_vcond_mask_<v_cmp_mixed><v_cmp_result> (
+  emit_insn (gen_vec_cmp<mode><v_int_equiv> (mask, operands[3],
+                                            operands[4], operands[5]));
+  emit_insn (gen_vcond_mask_<v_cmp_mixed><v_int_equiv> (
                                                operands[0], operands[1],
                                                operands[2], mask));
 
     }
   emit_insn (gen_vec_cmp<mode><mode> (mask, operands[3],
                                      operands[4], operands[5]));
-  emit_insn (gen_vcond_mask_<mode><v_cmp_result> (operands[0], operands[1],
-                                                 operands[2], mask));
+  emit_insn (gen_vcond_mask_<mode><v_int_equiv> (operands[0], operands[1],
+                                                operands[2], mask));
   DONE;
 })
 
          (match_operand:VDQF 2 "nonmemory_operand")))]
   "TARGET_SIMD"
 {
-  rtx mask = gen_reg_rtx (<V_cmp_result>mode);
+  rtx mask = gen_reg_rtx (<V_INT_EQUIV>mode);
   enum rtx_code code = GET_CODE (operands[3]);
 
   /* NE is handled as !EQ in vec_cmp patterns, we can explicitly invert
   emit_insn (gen_vec_cmp<v_cmp_mixed><v_cmp_mixed> (
                                                  mask, operands[3],
                                                  operands[4], operands[5]));
-  emit_insn (gen_vcond_mask_<mode><v_cmp_result> (operands[0], operands[1],
-                                                 operands[2], mask));
+  emit_insn (gen_vcond_mask_<mode><v_int_equiv> (operands[0], operands[1],
+                                                operands[2], mask));
   DONE;
 })
 
 (define_insn "*aarch64_get_lane_extend<GPI:mode><VDQQH:mode>"
   [(set (match_operand:GPI 0 "register_operand" "=r")
        (sign_extend:GPI
-         (vec_select:<VEL>
+         (vec_select:<VDQQH:VEL>
            (match_operand:VDQQH 1 "register_operand" "w")
            (parallel [(match_operand:SI 2 "immediate_operand" "i")]))))]
   "TARGET_SIMD"
   {
-    operands[2] = GEN_INT (ENDIAN_LANE_N (<MODE>mode, INTVAL (operands[2])));
+    operands[2] = aarch64_endian_lane_rtx (<VDQQH:MODE>mode,
+                                          INTVAL (operands[2]));
     return "smov\\t%<GPI:w>0, %1.<VDQQH:Vetype>[%2]";
   }
-  [(set_attr "type" "neon_to_gp<q>")]
+  [(set_attr "type" "neon_to_gp<VDQQH:q>")]
 )
 
-(define_insn "*aarch64_get_lane_zero_extendsi<mode>"
-  [(set (match_operand:SI 0 "register_operand" "=r")
-       (zero_extend:SI
-         (vec_select:<VEL>
+(define_insn "*aarch64_get_lane_zero_extend<GPI:mode><VDQQH:mode>"
+  [(set (match_operand:GPI 0 "register_operand" "=r")
+       (zero_extend:GPI
+         (vec_select:<VDQQH:VEL>
            (match_operand:VDQQH 1 "register_operand" "w")
            (parallel [(match_operand:SI 2 "immediate_operand" "i")]))))]
   "TARGET_SIMD"
   {
-    operands[2] = GEN_INT (ENDIAN_LANE_N (<MODE>mode, INTVAL (operands[2])));
-    return "umov\\t%w0, %1.<Vetype>[%2]";
+    operands[2] = aarch64_endian_lane_rtx (<VDQQH:MODE>mode,
+                                          INTVAL (operands[2]));
+    return "umov\\t%w0, %1.<VDQQH:Vetype>[%2]";
   }
-  [(set_attr "type" "neon_to_gp<q>")]
+  [(set_attr "type" "neon_to_gp<VDQQH:q>")]
 )
 
 ;; Lane extraction of a value, neither sign nor zero extension
 ;; is guaranteed so upper bits should be considered undefined.
 ;; RTL uses GCC vector extension indices throughout so flip only for assembly.
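 ;; On big endian, the GCC vector extension element 0 ends up in the
 ;; highest-numbered architectural lane, so aarch64_endian_lane_rtx remaps a
 ;; constant lane index n to nunits - 1 - n before the operand is printed.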
 (define_insn "aarch64_get_lane<mode>"
-  [(set (match_operand:<VEL> 0 "aarch64_simd_nonimmediate_operand" "=r, w, Utv")
+  [(set (match_operand:<VEL> 0 "aarch64_simd_nonimmediate_operand" "=?r, w, Utv")
        (vec_select:<VEL>
          (match_operand:VALL_F16 1 "register_operand" "w, w, w")
          (parallel [(match_operand:SI 2 "immediate_operand" "i, i, i")])))]
   "TARGET_SIMD"
   {
-    operands[2] = GEN_INT (ENDIAN_LANE_N (<MODE>mode, INTVAL (operands[2])));
+    operands[2] = aarch64_endian_lane_rtx (<MODE>mode, INTVAL (operands[2]));
     switch (which_alternative)
       {
        case 0:
   [(set_attr "type" "neon_to_gp<q>, neon_dup<q>, neon_store1_one_lane<q>")]
 )
 
-;; In this insn, operand 1 should be low, and operand 2 the high part of the
-;; dest vector.
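+;; Load two contiguous 64-bit values into the low and high halves of a single
+;; 128-bit register.  The second memory operand must address the doubleword
+;; immediately after the first, so that a single ldr of a Q register covers
+;; both loads.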
+(define_insn "load_pair_lanes<mode>"
+  [(set (match_operand:<VDBL> 0 "register_operand" "=w")
+       (vec_concat:<VDBL>
+          (match_operand:VDC 1 "memory_operand" "Utq")
+          (match_operand:VDC 2 "memory_operand" "m")))]
+  "TARGET_SIMD && !STRICT_ALIGNMENT
+   && rtx_equal_p (XEXP (operands[2], 0),
+                  plus_constant (Pmode,
+                                 XEXP (operands[1], 0),
+                                 GET_MODE_SIZE (<MODE>mode)))"
+  "ldr\\t%q0, %1"
+  [(set_attr "type" "neon_load1_1reg_q")]
+)
 
-(define_insn "*aarch64_combinez<mode>"
-  [(set (match_operand:<VDBL> 0 "register_operand" "=w,w,w")
-        (vec_concat:<VDBL>
-          (match_operand:VD_BHSI 1 "general_operand" "w,?r,m")
-          (match_operand:VD_BHSI 2 "aarch64_simd_imm_zero" "Dz,Dz,Dz")))]
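+;; Store the two 64-bit halves of a 128-bit value with a single stp, either
+;; from a pair of SIMD registers or from a pair of general purpose registers.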
+(define_insn "store_pair_lanes<mode>"
+  [(set (match_operand:<VDBL> 0 "aarch64_mem_pair_lanes_operand" "=Umn, Umn")
+       (vec_concat:<VDBL>
+          (match_operand:VDC 1 "register_operand" "w, r")
+          (match_operand:VDC 2 "register_operand" "w, r")))]
+  "TARGET_SIMD"
+  "@
+   stp\\t%d1, %d2, %y0
+   stp\\t%x1, %x2, %y0"
+  [(set_attr "type" "neon_stp, store_16")]
+)
+
+;; In this insn, operand 1 should be the low part, and operand 2 the high
+;; part, of the dest vector.
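+;; Because a write to a 64-bit SIMD register implicitly zeroes the upper half
+;; of the corresponding 128-bit register, each alternative needs only a single
+;; mov, fmov or ldr of the low part.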
+
+(define_insn "@aarch64_combinez<mode>"
+  [(set (match_operand:<VDBL> 0 "register_operand" "=w,w,w")
+       (vec_concat:<VDBL>
+         (match_operand:VDC 1 "general_operand" "w,?r,m")
+         (match_operand:VDC 2 "aarch64_simd_or_scalar_imm_zero")))]
   "TARGET_SIMD && !BYTES_BIG_ENDIAN"
   "@
    mov\\t%0.8b, %1.8b
    fmov\t%d0, %1
    ldr\\t%d0, %1"
   [(set_attr "type" "neon_move<q>, neon_from_gp, neon_load1_1reg")
-   (set_attr "simd" "yes,*,yes")
-   (set_attr "fp" "*,yes,*")]
+   (set_attr "arch" "simd,fp,simd")]
 )
 
-(define_insn "*aarch64_combinez_be<mode>"
+(define_insn "@aarch64_combinez_be<mode>"
   [(set (match_operand:<VDBL> 0 "register_operand" "=w,w,w")
         (vec_concat:<VDBL>
-          (match_operand:VD_BHSI 2 "aarch64_simd_imm_zero" "Dz,Dz,Dz")
-          (match_operand:VD_BHSI 1 "general_operand" "w,?r,m")))]
+         (match_operand:VDC 2 "aarch64_simd_or_scalar_imm_zero")
+         (match_operand:VDC 1 "general_operand" "w,?r,m")))]
   "TARGET_SIMD && BYTES_BIG_ENDIAN"
   "@
    mov\\t%0.8b, %1.8b
    fmov\t%d0, %1
    ldr\\t%d0, %1"
   [(set_attr "type" "neon_move<q>, neon_from_gp, neon_load1_1reg")
-   (set_attr "simd" "yes,*,yes")
-   (set_attr "fp" "*,yes,*")]
+   (set_attr "arch" "simd,fp,simd")]
 )
 
 (define_expand "aarch64_combine<mode>"
    (match_operand:VDC 2 "register_operand")]
   "TARGET_SIMD"
 {
-  rtx op1, op2;
-  if (BYTES_BIG_ENDIAN)
-    {
-      op1 = operands[2];
-      op2 = operands[1];
-    }
-  else
-    {
-      op1 = operands[1];
-      op2 = operands[2];
-    }
-  emit_insn (gen_aarch64_combine_internal<mode> (operands[0], op1, op2));
-  DONE;
-}
-)
+  aarch64_split_simd_combine (operands[0], operands[1], operands[2]);
 
-(define_insn_and_split "aarch64_combine_internal<mode>"
-  [(set (match_operand:<VDBL> 0 "register_operand" "=&w")
-        (vec_concat:<VDBL> (match_operand:VDC 1 "register_operand" "w")
-                          (match_operand:VDC 2 "register_operand" "w")))]
-  "TARGET_SIMD"
-  "#"
-  "&& reload_completed"
-  [(const_int 0)]
-{
-  if (BYTES_BIG_ENDIAN)
-    aarch64_split_simd_combine (operands[0], operands[2], operands[1]);
-  else
-    aarch64_split_simd_combine (operands[0], operands[1], operands[2]);
   DONE;
 }
-[(set_attr "type" "multiple")]
 )
 
-(define_expand "aarch64_simd_combine<mode>"
+(define_expand "@aarch64_simd_combine<mode>"
   [(match_operand:<VDBL> 0 "register_operand")
    (match_operand:VDC 1 "register_operand")
    (match_operand:VDC 2 "register_operand")]
 
 
 (define_expand "aarch64_saddl2<mode>"
-  [(match_operand:<VWIDE> 0 "register_operand" "=w")
-   (match_operand:VQW 1 "register_operand" "w")
-   (match_operand:VQW 2 "register_operand" "w")]
+  [(match_operand:<VWIDE> 0 "register_operand")
+   (match_operand:VQW 1 "register_operand")
+   (match_operand:VQW 2 "register_operand")]
   "TARGET_SIMD"
 {
-  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true);
+  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, true);
   emit_insn (gen_aarch64_saddl<mode>_hi_internal (operands[0], operands[1],
                                                   operands[2], p));
   DONE;
 })
 
 (define_expand "aarch64_uaddl2<mode>"
-  [(match_operand:<VWIDE> 0 "register_operand" "=w")
-   (match_operand:VQW 1 "register_operand" "w")
-   (match_operand:VQW 2 "register_operand" "w")]
+  [(match_operand:<VWIDE> 0 "register_operand")
+   (match_operand:VQW 1 "register_operand")
+   (match_operand:VQW 2 "register_operand")]
   "TARGET_SIMD"
 {
-  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true);
+  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, true);
   emit_insn (gen_aarch64_uaddl<mode>_hi_internal (operands[0], operands[1],
                                                   operands[2], p));
   DONE;
 })
 
 (define_expand "aarch64_ssubl2<mode>"
-  [(match_operand:<VWIDE> 0 "register_operand" "=w")
-   (match_operand:VQW 1 "register_operand" "w")
-   (match_operand:VQW 2 "register_operand" "w")]
+  [(match_operand:<VWIDE> 0 "register_operand")
+   (match_operand:VQW 1 "register_operand")
+   (match_operand:VQW 2 "register_operand")]
   "TARGET_SIMD"
 {
-  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true);
+  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, true);
   emit_insn (gen_aarch64_ssubl<mode>_hi_internal (operands[0], operands[1],
                                                operands[2], p));
   DONE;
 })
 
 (define_expand "aarch64_usubl2<mode>"
-  [(match_operand:<VWIDE> 0 "register_operand" "=w")
-   (match_operand:VQW 1 "register_operand" "w")
-   (match_operand:VQW 2 "register_operand" "w")]
+  [(match_operand:<VWIDE> 0 "register_operand")
+   (match_operand:VQW 1 "register_operand")
+   (match_operand:VQW 2 "register_operand")]
   "TARGET_SIMD"
 {
-  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true);
+  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, true);
   emit_insn (gen_aarch64_usubl<mode>_hi_internal (operands[0], operands[1],
                                                operands[2], p));
   DONE;
 ;; <su><addsub>w<q>.
 
 (define_expand "widen_ssum<mode>3"
-  [(set (match_operand:<VDBLW> 0 "register_operand" "")
+  [(set (match_operand:<VDBLW> 0 "register_operand")
        (plus:<VDBLW> (sign_extend:<VDBLW> 
-                       (match_operand:VQW 1 "register_operand" ""))
-                     (match_operand:<VDBLW> 2 "register_operand" "")))]
+                       (match_operand:VQW 1 "register_operand"))
+                     (match_operand:<VDBLW> 2 "register_operand")))]
   "TARGET_SIMD"
   {
-    rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, false);
+    rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, false);
     rtx temp = gen_reg_rtx (GET_MODE (operands[0]));
 
     emit_insn (gen_aarch64_saddw<mode>_internal (temp, operands[2],
 )
 
 (define_expand "widen_ssum<mode>3"
-  [(set (match_operand:<VWIDE> 0 "register_operand" "")
+  [(set (match_operand:<VWIDE> 0 "register_operand")
        (plus:<VWIDE> (sign_extend:<VWIDE>
-                       (match_operand:VD_BHSI 1 "register_operand" ""))
-                     (match_operand:<VWIDE> 2 "register_operand" "")))]
+                       (match_operand:VD_BHSI 1 "register_operand"))
+                     (match_operand:<VWIDE> 2 "register_operand")))]
   "TARGET_SIMD"
 {
   emit_insn (gen_aarch64_saddw<mode> (operands[0], operands[2], operands[1]));
 })
 
 (define_expand "widen_usum<mode>3"
-  [(set (match_operand:<VDBLW> 0 "register_operand" "")
+  [(set (match_operand:<VDBLW> 0 "register_operand")
        (plus:<VDBLW> (zero_extend:<VDBLW> 
-                       (match_operand:VQW 1 "register_operand" ""))
-                     (match_operand:<VDBLW> 2 "register_operand" "")))]
+                       (match_operand:VQW 1 "register_operand"))
+                     (match_operand:<VDBLW> 2 "register_operand")))]
   "TARGET_SIMD"
   {
-    rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, false);
+    rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, false);
     rtx temp = gen_reg_rtx (GET_MODE (operands[0]));
 
     emit_insn (gen_aarch64_uaddw<mode>_internal (temp, operands[2],
 )
 
 (define_expand "widen_usum<mode>3"
-  [(set (match_operand:<VWIDE> 0 "register_operand" "")
+  [(set (match_operand:<VWIDE> 0 "register_operand")
        (plus:<VWIDE> (zero_extend:<VWIDE>
-                       (match_operand:VD_BHSI 1 "register_operand" ""))
-                     (match_operand:<VWIDE> 2 "register_operand" "")))]
+                       (match_operand:VD_BHSI 1 "register_operand"))
+                     (match_operand:<VWIDE> 2 "register_operand")))]
   "TARGET_SIMD"
 {
   emit_insn (gen_aarch64_uaddw<mode> (operands[0], operands[2], operands[1]));
   DONE;
 })
 
-(define_insn "aarch64_<ANY_EXTEND:su><ADDSUB:optab>w<mode>"
+(define_insn "aarch64_<ANY_EXTEND:su>subw<mode>"
   [(set (match_operand:<VWIDE> 0 "register_operand" "=w")
-        (ADDSUB:<VWIDE> (match_operand:<VWIDE> 1 "register_operand" "w")
-                       (ANY_EXTEND:<VWIDE>
-                         (match_operand:VD_BHSI 2 "register_operand" "w"))))]
+       (minus:<VWIDE> (match_operand:<VWIDE> 1 "register_operand" "w")
+         (ANY_EXTEND:<VWIDE>
+           (match_operand:VD_BHSI 2 "register_operand" "w"))))]
   "TARGET_SIMD"
-  "<ANY_EXTEND:su><ADDSUB:optab>w\\t%0.<Vwtype>, %1.<Vwtype>, %2.<Vtype>"
-  [(set_attr "type" "neon_<ADDSUB:optab>_widen")]
+  "<ANY_EXTEND:su>subw\\t%0.<Vwtype>, %1.<Vwtype>, %2.<Vtype>"
+  [(set_attr "type" "neon_sub_widen")]
 )
 
-(define_insn "aarch64_<ANY_EXTEND:su><ADDSUB:optab>w<mode>_internal"
+(define_insn "aarch64_<ANY_EXTEND:su>subw<mode>_internal"
   [(set (match_operand:<VWIDE> 0 "register_operand" "=w")
-        (ADDSUB:<VWIDE> (match_operand:<VWIDE> 1 "register_operand" "w")
-                       (ANY_EXTEND:<VWIDE>
-                         (vec_select:<VHALF>
-                          (match_operand:VQW 2 "register_operand" "w")
-                          (match_operand:VQW 3 "vect_par_cnst_lo_half" "")))))]
+       (minus:<VWIDE> (match_operand:<VWIDE> 1 "register_operand" "w")
+         (ANY_EXTEND:<VWIDE>
+           (vec_select:<VHALF>
+             (match_operand:VQW 2 "register_operand" "w")
+             (match_operand:VQW 3 "vect_par_cnst_lo_half" "")))))]
   "TARGET_SIMD"
-  "<ANY_EXTEND:su><ADDSUB:optab>w\\t%0.<Vwtype>, %1.<Vwtype>, %2.<Vhalftype>"
-  [(set_attr "type" "neon_<ADDSUB:optab>_widen")]
+  "<ANY_EXTEND:su>subw\\t%0.<Vwtype>, %1.<Vwtype>, %2.<Vhalftype>"
+  [(set_attr "type" "neon_sub_widen")]
 )
 
-(define_insn "aarch64_<ANY_EXTEND:su><ADDSUB:optab>w2<mode>_internal"
+(define_insn "aarch64_<ANY_EXTEND:su>subw2<mode>_internal"
   [(set (match_operand:<VWIDE> 0 "register_operand" "=w")
-        (ADDSUB:<VWIDE> (match_operand:<VWIDE> 1 "register_operand" "w")
-                       (ANY_EXTEND:<VWIDE>
-                         (vec_select:<VHALF>
-                          (match_operand:VQW 2 "register_operand" "w")
-                          (match_operand:VQW 3 "vect_par_cnst_hi_half" "")))))]
+       (minus:<VWIDE> (match_operand:<VWIDE> 1 "register_operand" "w")
+         (ANY_EXTEND:<VWIDE>
+           (vec_select:<VHALF>
+             (match_operand:VQW 2 "register_operand" "w")
+             (match_operand:VQW 3 "vect_par_cnst_hi_half" "")))))]
   "TARGET_SIMD"
-  "<ANY_EXTEND:su><ADDSUB:optab>w2\\t%0.<Vwtype>, %1.<Vwtype>, %2.<Vtype>"
-  [(set_attr "type" "neon_<ADDSUB:optab>_widen")]
+  "<ANY_EXTEND:su>subw2\\t%0.<Vwtype>, %1.<Vwtype>, %2.<Vtype>"
+  [(set_attr "type" "neon_sub_widen")]
+)
+
+(define_insn "aarch64_<ANY_EXTEND:su>addw<mode>"
+  [(set (match_operand:<VWIDE> 0 "register_operand" "=w")
+       (plus:<VWIDE>
+         (ANY_EXTEND:<VWIDE> (match_operand:VD_BHSI 2 "register_operand" "w"))
+         (match_operand:<VWIDE> 1 "register_operand" "w")))]
+  "TARGET_SIMD"
+  "<ANY_EXTEND:su>addw\\t%0.<Vwtype>, %1.<Vwtype>, %2.<Vtype>"
+  [(set_attr "type" "neon_add_widen")]
+)
+
+(define_insn "aarch64_<ANY_EXTEND:su>addw<mode>_internal"
+  [(set (match_operand:<VWIDE> 0 "register_operand" "=w")
+       (plus:<VWIDE>
+         (ANY_EXTEND:<VWIDE>
+           (vec_select:<VHALF>
+             (match_operand:VQW 2 "register_operand" "w")
+             (match_operand:VQW 3 "vect_par_cnst_lo_half" "")))
+         (match_operand:<VWIDE> 1 "register_operand" "w")))]
+  "TARGET_SIMD"
+  "<ANY_EXTEND:su>addw\\t%0.<Vwtype>, %1.<Vwtype>, %2.<Vhalftype>"
+  [(set_attr "type" "neon_add_widen")]
+)
+
+(define_insn "aarch64_<ANY_EXTEND:su>addw2<mode>_internal"
+  [(set (match_operand:<VWIDE> 0 "register_operand" "=w")
+       (plus:<VWIDE>
+         (ANY_EXTEND:<VWIDE>
+           (vec_select:<VHALF>
+             (match_operand:VQW 2 "register_operand" "w")
+             (match_operand:VQW 3 "vect_par_cnst_hi_half" "")))
+         (match_operand:<VWIDE> 1 "register_operand" "w")))]
+  "TARGET_SIMD"
+  "<ANY_EXTEND:su>addw2\\t%0.<Vwtype>, %1.<Vwtype>, %2.<Vtype>"
+  [(set_attr "type" "neon_add_widen")]
 )
 
 (define_expand "aarch64_saddw2<mode>"
-  [(match_operand:<VWIDE> 0 "register_operand" "=w")
-   (match_operand:<VWIDE> 1 "register_operand" "w")
-   (match_operand:VQW 2 "register_operand" "w")]
+  [(match_operand:<VWIDE> 0 "register_operand")
+   (match_operand:<VWIDE> 1 "register_operand")
+   (match_operand:VQW 2 "register_operand")]
   "TARGET_SIMD"
 {
-  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true);
+  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, true);
   emit_insn (gen_aarch64_saddw2<mode>_internal (operands[0], operands[1],
                                                operands[2], p));
   DONE;
 })
 
 (define_expand "aarch64_uaddw2<mode>"
-  [(match_operand:<VWIDE> 0 "register_operand" "=w")
-   (match_operand:<VWIDE> 1 "register_operand" "w")
-   (match_operand:VQW 2 "register_operand" "w")]
+  [(match_operand:<VWIDE> 0 "register_operand")
+   (match_operand:<VWIDE> 1 "register_operand")
+   (match_operand:VQW 2 "register_operand")]
   "TARGET_SIMD"
 {
-  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true);
+  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, true);
   emit_insn (gen_aarch64_uaddw2<mode>_internal (operands[0], operands[1],
                                                operands[2], p));
   DONE;
 
 
 (define_expand "aarch64_ssubw2<mode>"
-  [(match_operand:<VWIDE> 0 "register_operand" "=w")
-   (match_operand:<VWIDE> 1 "register_operand" "w")
-   (match_operand:VQW 2 "register_operand" "w")]
+  [(match_operand:<VWIDE> 0 "register_operand")
+   (match_operand:<VWIDE> 1 "register_operand")
+   (match_operand:VQW 2 "register_operand")]
   "TARGET_SIMD"
 {
-  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true);
+  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, true);
   emit_insn (gen_aarch64_ssubw2<mode>_internal (operands[0], operands[1],
                                                operands[2], p));
   DONE;
 })
 
 (define_expand "aarch64_usubw2<mode>"
-  [(match_operand:<VWIDE> 0 "register_operand" "=w")
-   (match_operand:<VWIDE> 1 "register_operand" "w")
-   (match_operand:VQW 2 "register_operand" "w")]
+  [(match_operand:<VWIDE> 0 "register_operand")
+   (match_operand:<VWIDE> 1 "register_operand")
+   (match_operand:VQW 2 "register_operand")]
   "TARGET_SIMD"
 {
-  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true);
+  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, true);
   emit_insn (gen_aarch64_usubw2<mode>_internal (operands[0], operands[1],
                                                operands[2], p));
   DONE;
 
 ;; <su><r>h<addsub>.
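+;; The <u>avg<mode>3_floor and <u>avg<mode>3_ceil expanders below expose these
+;; (rounding) halving-add instructions to the vectorizer: floor averaging maps
+;; to [s|u]hadd, ceiling averaging to [s|u]rhadd.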
 
+(define_expand "<u>avg<mode>3_floor"
+  [(set (match_operand:VDQ_BHSI 0 "register_operand")
+       (unspec:VDQ_BHSI [(match_operand:VDQ_BHSI 1 "register_operand")
+                         (match_operand:VDQ_BHSI 2 "register_operand")]
+                        HADD))]
+  "TARGET_SIMD"
+)
+
+(define_expand "<u>avg<mode>3_ceil"
+  [(set (match_operand:VDQ_BHSI 0 "register_operand")
+       (unspec:VDQ_BHSI [(match_operand:VDQ_BHSI 1 "register_operand")
+                         (match_operand:VDQ_BHSI 2 "register_operand")]
+                        RHADD))]
+  "TARGET_SIMD"
+)
+
 (define_insn "aarch64_<sur>h<addsub><mode>"
   [(set (match_operand:VDQ_BHSI 0 "register_operand" "=w")
         (unspec:VDQ_BHSI [(match_operand:VDQ_BHSI 1 "register_operand" "w")
         UNSPEC_FMULX))]
   "TARGET_SIMD"
   {
-    operands[3] = GEN_INT (ENDIAN_LANE_N (<VSWAP_WIDTH>mode,
-                                         INTVAL (operands[3])));
+    operands[3] = aarch64_endian_lane_rtx (<VSWAP_WIDTH>mode, INTVAL (operands[3]));
     return "fmulx\t%<v>0<Vmtype>, %<v>1<Vmtype>, %2.<Vetype>[%3]";
   }
   [(set_attr "type" "neon_fp_mul_<Vetype>_scalar<q>")]
         UNSPEC_FMULX))]
   "TARGET_SIMD"
   {
-    operands[3] = GEN_INT (ENDIAN_LANE_N (<MODE>mode, INTVAL (operands[3])));
+    operands[3] = aarch64_endian_lane_rtx (<MODE>mode, INTVAL (operands[3]));
     return "fmulx\t%<v>0<Vmtype>, %<v>1<Vmtype>, %2.<Vetype>[%3]";
   }
   [(set_attr "type" "neon_fp_mul_<Vetype><q>")]
         UNSPEC_FMULX))]
   "TARGET_SIMD"
   {
-    operands[3] = GEN_INT (ENDIAN_LANE_N (<MODE>mode, INTVAL (operands[3])));
+    operands[3] = aarch64_endian_lane_rtx (<MODE>mode, INTVAL (operands[3]));
     return "fmulx\t%<Vetype>0, %<Vetype>1, %2.<Vetype>[%3]";
   }
   [(set_attr "type" "fmul<Vetype>")]
         VQDMULH))]
   "TARGET_SIMD"
   "*
-   operands[3] = GEN_INT (ENDIAN_LANE_N (<VCOND>mode, INTVAL (operands[3])));
+   operands[3] = aarch64_endian_lane_rtx (<VCOND>mode, INTVAL (operands[3]));
    return \"sq<r>dmulh\\t%0.<Vtype>, %1.<Vtype>, %2.<Vetype>[%3]\";"
   [(set_attr "type" "neon_sat_mul_<Vetype>_scalar<q>")]
 )
         VQDMULH))]
   "TARGET_SIMD"
   "*
-   operands[3] = GEN_INT (ENDIAN_LANE_N (<VCONQ>mode, INTVAL (operands[3])));
+   operands[3] = aarch64_endian_lane_rtx (<VCONQ>mode, INTVAL (operands[3]));
    return \"sq<r>dmulh\\t%0.<Vtype>, %1.<Vtype>, %2.<Vetype>[%3]\";"
   [(set_attr "type" "neon_sat_mul_<Vetype>_scalar<q>")]
 )
         VQDMULH))]
   "TARGET_SIMD"
   "*
-   operands[3] = GEN_INT (ENDIAN_LANE_N (<VCOND>mode, INTVAL (operands[3])));
+   operands[3] = aarch64_endian_lane_rtx (<VCOND>mode, INTVAL (operands[3]));
    return \"sq<r>dmulh\\t%<v>0, %<v>1, %2.<v>[%3]\";"
   [(set_attr "type" "neon_sat_mul_<Vetype>_scalar<q>")]
 )
         VQDMULH))]
   "TARGET_SIMD"
   "*
-   operands[3] = GEN_INT (ENDIAN_LANE_N (<VCONQ>mode, INTVAL (operands[3])));
+   operands[3] = aarch64_endian_lane_rtx (<VCONQ>mode, INTVAL (operands[3]));
    return \"sq<r>dmulh\\t%<v>0, %<v>1, %2.<v>[%3]\";"
   [(set_attr "type" "neon_sat_mul_<Vetype>_scalar<q>")]
 )
          SQRDMLH_AS))]
    "TARGET_SIMD_RDMA"
    {
-     operands[4] = GEN_INT (ENDIAN_LANE_N (<VCOND>mode, INTVAL (operands[4])));
+     operands[4] = aarch64_endian_lane_rtx (<VCOND>mode, INTVAL (operands[4]));
      return
       "sqrdml<SQRDMLH_AS:rdma_as>h\\t%0.<Vtype>, %2.<Vtype>, %3.<Vetype>[%4]";
    }
          SQRDMLH_AS))]
    "TARGET_SIMD_RDMA"
    {
-     operands[4] = GEN_INT (ENDIAN_LANE_N (<VCOND>mode, INTVAL (operands[4])));
+     operands[4] = aarch64_endian_lane_rtx (<VCOND>mode, INTVAL (operands[4]));
      return
       "sqrdml<SQRDMLH_AS:rdma_as>h\\t%<v>0, %<v>2, %3.<Vetype>[%4]";
    }
          SQRDMLH_AS))]
    "TARGET_SIMD_RDMA"
    {
-     operands[4] = GEN_INT (ENDIAN_LANE_N (<VCONQ>mode, INTVAL (operands[4])));
+     operands[4] = aarch64_endian_lane_rtx (<VCONQ>mode, INTVAL (operands[4]));
      return
       "sqrdml<SQRDMLH_AS:rdma_as>h\\t%0.<Vtype>, %2.<Vtype>, %3.<Vetype>[%4]";
    }
          SQRDMLH_AS))]
    "TARGET_SIMD_RDMA"
    {
-     operands[4] = GEN_INT (ENDIAN_LANE_N (<VCONQ>mode, INTVAL (operands[4])));
+     operands[4] = aarch64_endian_lane_rtx (<VCONQ>mode, INTVAL (operands[4]));
      return
       "sqrdml<SQRDMLH_AS:rdma_as>h\\t%<v>0, %<v>2, %3.<v>[%4]";
    }
            (const_int 1))))]
   "TARGET_SIMD"
   {
-    operands[4] = GEN_INT (ENDIAN_LANE_N (<VCOND>mode, INTVAL (operands[4])));
+    operands[4] = aarch64_endian_lane_rtx (<VCOND>mode, INTVAL (operands[4]));
     return
       "sqdml<SBINQOPS:as>l\\t%<vw2>0<Vmwtype>, %<v>2<Vmtype>, %3.<Vetype>[%4]";
   }
            (const_int 1))))]
   "TARGET_SIMD"
   {
-    operands[4] = GEN_INT (ENDIAN_LANE_N (<VCONQ>mode, INTVAL (operands[4])));
+    operands[4] = aarch64_endian_lane_rtx (<VCONQ>mode, INTVAL (operands[4]));
     return
       "sqdml<SBINQOPS:as>l\\t%<vw2>0<Vmwtype>, %<v>2<Vmtype>, %3.<Vetype>[%4]";
   }
            (const_int 1))))]
   "TARGET_SIMD"
   {
-    operands[4] = GEN_INT (ENDIAN_LANE_N (<VCOND>mode, INTVAL (operands[4])));
+    operands[4] = aarch64_endian_lane_rtx (<VCOND>mode, INTVAL (operands[4]));
     return
       "sqdml<SBINQOPS:as>l\\t%<vw2>0<Vmwtype>, %<v>2<Vmtype>, %3.<Vetype>[%4]";
   }
            (const_int 1))))]
   "TARGET_SIMD"
   {
-    operands[4] = GEN_INT (ENDIAN_LANE_N (<VCONQ>mode, INTVAL (operands[4])));
+    operands[4] = aarch64_endian_lane_rtx (<VCONQ>mode, INTVAL (operands[4]));
     return
       "sqdml<SBINQOPS:as>l\\t%<vw2>0<Vmwtype>, %<v>2<Vmtype>, %3.<Vetype>[%4]";
   }
 )
 
 (define_expand "aarch64_sqdmlal2<mode>"
-  [(match_operand:<VWIDE> 0 "register_operand" "=w")
-   (match_operand:<VWIDE> 1 "register_operand" "w")
-   (match_operand:VQ_HSI 2 "register_operand" "w")
-   (match_operand:VQ_HSI 3 "register_operand" "w")]
+  [(match_operand:<VWIDE> 0 "register_operand")
+   (match_operand:<VWIDE> 1 "register_operand")
+   (match_operand:VQ_HSI 2 "register_operand")
+   (match_operand:VQ_HSI 3 "register_operand")]
   "TARGET_SIMD"
 {
-  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true);
+  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, true);
   emit_insn (gen_aarch64_sqdmlal2<mode>_internal (operands[0], operands[1],
                                                  operands[2], operands[3], p));
   DONE;
 })
 
 (define_expand "aarch64_sqdmlsl2<mode>"
-  [(match_operand:<VWIDE> 0 "register_operand" "=w")
-   (match_operand:<VWIDE> 1 "register_operand" "w")
-   (match_operand:VQ_HSI 2 "register_operand" "w")
-   (match_operand:VQ_HSI 3 "register_operand" "w")]
+  [(match_operand:<VWIDE> 0 "register_operand")
+   (match_operand:<VWIDE> 1 "register_operand")
+   (match_operand:VQ_HSI 2 "register_operand")
+   (match_operand:VQ_HSI 3 "register_operand")]
   "TARGET_SIMD"
 {
-  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true);
+  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, true);
   emit_insn (gen_aarch64_sqdmlsl2<mode>_internal (operands[0], operands[1],
                                                  operands[2], operands[3], p));
   DONE;
              (const_int 1))))]
   "TARGET_SIMD"
   {
-    operands[4] = GEN_INT (ENDIAN_LANE_N (<VCOND>mode, INTVAL (operands[4])));
+    operands[4] = aarch64_endian_lane_rtx (<VCOND>mode, INTVAL (operands[4]));
     return
      "sqdml<SBINQOPS:as>l2\\t%<vw2>0<Vmwtype>, %<v>2<Vmtype>, %3.<Vetype>[%4]";
   }
              (const_int 1))))]
   "TARGET_SIMD"
   {
-    operands[4] = GEN_INT (ENDIAN_LANE_N (<VCONQ>mode, INTVAL (operands[4])));
+    operands[4] = aarch64_endian_lane_rtx (<VCONQ>mode, INTVAL (operands[4]));
     return
      "sqdml<SBINQOPS:as>l2\\t%<vw2>0<Vmwtype>, %<v>2<Vmtype>, %3.<Vetype>[%4]";
   }
 )
 
 (define_expand "aarch64_sqdmlal2_lane<mode>"
-  [(match_operand:<VWIDE> 0 "register_operand" "=w")
-   (match_operand:<VWIDE> 1 "register_operand" "w")
-   (match_operand:VQ_HSI 2 "register_operand" "w")
-   (match_operand:<VCOND> 3 "register_operand" "<vwx>")
-   (match_operand:SI 4 "immediate_operand" "i")]
+  [(match_operand:<VWIDE> 0 "register_operand")
+   (match_operand:<VWIDE> 1 "register_operand")
+   (match_operand:VQ_HSI 2 "register_operand")
+   (match_operand:<VCOND> 3 "register_operand")
+   (match_operand:SI 4 "immediate_operand")]
   "TARGET_SIMD"
 {
-  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true);
+  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, true);
   emit_insn (gen_aarch64_sqdmlal2_lane<mode>_internal (operands[0], operands[1],
                                                       operands[2], operands[3],
                                                       operands[4], p));
 })
 
 (define_expand "aarch64_sqdmlal2_laneq<mode>"
-  [(match_operand:<VWIDE> 0 "register_operand" "=w")
-   (match_operand:<VWIDE> 1 "register_operand" "w")
-   (match_operand:VQ_HSI 2 "register_operand" "w")
-   (match_operand:<VCONQ> 3 "register_operand" "<vwx>")
-   (match_operand:SI 4 "immediate_operand" "i")]
+  [(match_operand:<VWIDE> 0 "register_operand")
+   (match_operand:<VWIDE> 1 "register_operand")
+   (match_operand:VQ_HSI 2 "register_operand")
+   (match_operand:<VCONQ> 3 "register_operand")
+   (match_operand:SI 4 "immediate_operand")]
   "TARGET_SIMD"
 {
-  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true);
+  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, true);
   emit_insn (gen_aarch64_sqdmlal2_laneq<mode>_internal (operands[0], operands[1],
                                                       operands[2], operands[3],
                                                       operands[4], p));
 })
 
 (define_expand "aarch64_sqdmlsl2_lane<mode>"
-  [(match_operand:<VWIDE> 0 "register_operand" "=w")
-   (match_operand:<VWIDE> 1 "register_operand" "w")
-   (match_operand:VQ_HSI 2 "register_operand" "w")
-   (match_operand:<VCOND> 3 "register_operand" "<vwx>")
-   (match_operand:SI 4 "immediate_operand" "i")]
+  [(match_operand:<VWIDE> 0 "register_operand")
+   (match_operand:<VWIDE> 1 "register_operand")
+   (match_operand:VQ_HSI 2 "register_operand")
+   (match_operand:<VCOND> 3 "register_operand")
+   (match_operand:SI 4 "immediate_operand")]
   "TARGET_SIMD"
 {
-  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true);
+  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, true);
   emit_insn (gen_aarch64_sqdmlsl2_lane<mode>_internal (operands[0], operands[1],
                                                       operands[2], operands[3],
                                                       operands[4], p));
 })
 
 (define_expand "aarch64_sqdmlsl2_laneq<mode>"
-  [(match_operand:<VWIDE> 0 "register_operand" "=w")
-   (match_operand:<VWIDE> 1 "register_operand" "w")
-   (match_operand:VQ_HSI 2 "register_operand" "w")
-   (match_operand:<VCONQ> 3 "register_operand" "<vwx>")
-   (match_operand:SI 4 "immediate_operand" "i")]
+  [(match_operand:<VWIDE> 0 "register_operand")
+   (match_operand:<VWIDE> 1 "register_operand")
+   (match_operand:VQ_HSI 2 "register_operand")
+   (match_operand:<VCONQ> 3 "register_operand")
+   (match_operand:SI 4 "immediate_operand")]
   "TARGET_SIMD"
 {
-  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true);
+  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, true);
   emit_insn (gen_aarch64_sqdmlsl2_laneq<mode>_internal (operands[0], operands[1],
                                                       operands[2], operands[3],
                                                       operands[4], p));
 )
 
 (define_expand "aarch64_sqdmlal2_n<mode>"
-  [(match_operand:<VWIDE> 0 "register_operand" "=w")
-   (match_operand:<VWIDE> 1 "register_operand" "w")
-   (match_operand:VQ_HSI 2 "register_operand" "w")
-   (match_operand:<VEL> 3 "register_operand" "w")]
+  [(match_operand:<VWIDE> 0 "register_operand")
+   (match_operand:<VWIDE> 1 "register_operand")
+   (match_operand:VQ_HSI 2 "register_operand")
+   (match_operand:<VEL> 3 "register_operand")]
   "TARGET_SIMD"
 {
-  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true);
+  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, true);
   emit_insn (gen_aarch64_sqdmlal2_n<mode>_internal (operands[0], operands[1],
                                                    operands[2], operands[3],
                                                    p));
 })
 
 (define_expand "aarch64_sqdmlsl2_n<mode>"
-  [(match_operand:<VWIDE> 0 "register_operand" "=w")
-   (match_operand:<VWIDE> 1 "register_operand" "w")
-   (match_operand:VQ_HSI 2 "register_operand" "w")
-   (match_operand:<VEL> 3 "register_operand" "w")]
+  [(match_operand:<VWIDE> 0 "register_operand")
+   (match_operand:<VWIDE> 1 "register_operand")
+   (match_operand:VQ_HSI 2 "register_operand")
+   (match_operand:<VEL> 3 "register_operand")]
   "TARGET_SIMD"
 {
-  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true);
+  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, true);
   emit_insn (gen_aarch64_sqdmlsl2_n<mode>_internal (operands[0], operands[1],
                                                    operands[2], operands[3],
                                                    p));
             (const_int 1)))]
   "TARGET_SIMD"
   {
-    operands[3] = GEN_INT (ENDIAN_LANE_N (<VCOND>mode, INTVAL (operands[3])));
+    operands[3] = aarch64_endian_lane_rtx (<VCOND>mode, INTVAL (operands[3]));
     return "sqdmull\\t%<vw2>0<Vmwtype>, %<v>1<Vmtype>, %2.<Vetype>[%3]";
   }
   [(set_attr "type" "neon_sat_mul_<Vetype>_scalar_long")]
             (const_int 1)))]
   "TARGET_SIMD"
   {
-    operands[3] = GEN_INT (ENDIAN_LANE_N (<VCONQ>mode, INTVAL (operands[3])));
+    operands[3] = aarch64_endian_lane_rtx (<VCONQ>mode, INTVAL (operands[3]));
     return "sqdmull\\t%<vw2>0<Vmwtype>, %<v>1<Vmtype>, %2.<Vetype>[%3]";
   }
   [(set_attr "type" "neon_sat_mul_<Vetype>_scalar_long")]
             (const_int 1)))]
   "TARGET_SIMD"
   {
-    operands[3] = GEN_INT (ENDIAN_LANE_N (<VCOND>mode, INTVAL (operands[3])));
+    operands[3] = aarch64_endian_lane_rtx (<VCOND>mode, INTVAL (operands[3]));
     return "sqdmull\\t%<vw2>0<Vmwtype>, %<v>1<Vmtype>, %2.<Vetype>[%3]";
   }
   [(set_attr "type" "neon_sat_mul_<Vetype>_scalar_long")]
             (const_int 1)))]
   "TARGET_SIMD"
   {
-    operands[3] = GEN_INT (ENDIAN_LANE_N (<VCONQ>mode, INTVAL (operands[3])));
+    operands[3] = aarch64_endian_lane_rtx (<VCONQ>mode, INTVAL (operands[3]));
     return "sqdmull\\t%<vw2>0<Vmwtype>, %<v>1<Vmtype>, %2.<Vetype>[%3]";
   }
   [(set_attr "type" "neon_sat_mul_<Vetype>_scalar_long")]
 )
 
 (define_expand "aarch64_sqdmull2<mode>"
-  [(match_operand:<VWIDE> 0 "register_operand" "=w")
-   (match_operand:VQ_HSI 1 "register_operand" "w")
-   (match_operand:VQ_HSI 2 "register_operand" "w")]
+  [(match_operand:<VWIDE> 0 "register_operand")
+   (match_operand:VQ_HSI 1 "register_operand")
+   (match_operand:VQ_HSI 2 "register_operand")]
   "TARGET_SIMD"
 {
-  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true);
+  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, true);
   emit_insn (gen_aarch64_sqdmull2<mode>_internal (operands[0], operands[1],
                                                  operands[2], p));
   DONE;
             (const_int 1)))]
   "TARGET_SIMD"
   {
-    operands[3] = GEN_INT (ENDIAN_LANE_N (<VCOND>mode, INTVAL (operands[3])));
+    operands[3] = aarch64_endian_lane_rtx (<VCOND>mode, INTVAL (operands[3]));
     return "sqdmull2\\t%<vw2>0<Vmwtype>, %<v>1<Vmtype>, %2.<Vetype>[%3]";
   }
   [(set_attr "type" "neon_sat_mul_<Vetype>_scalar_long")]
             (const_int 1)))]
   "TARGET_SIMD"
   {
-    operands[3] = GEN_INT (ENDIAN_LANE_N (<VCONQ>mode, INTVAL (operands[3])));
+    operands[3] = aarch64_endian_lane_rtx (<VCONQ>mode, INTVAL (operands[3]));
     return "sqdmull2\\t%<vw2>0<Vmwtype>, %<v>1<Vmtype>, %2.<Vetype>[%3]";
   }
   [(set_attr "type" "neon_sat_mul_<Vetype>_scalar_long")]
 )
 
 (define_expand "aarch64_sqdmull2_lane<mode>"
-  [(match_operand:<VWIDE> 0 "register_operand" "=w")
-   (match_operand:VQ_HSI 1 "register_operand" "w")
-   (match_operand:<VCOND> 2 "register_operand" "<vwx>")
-   (match_operand:SI 3 "immediate_operand" "i")]
+  [(match_operand:<VWIDE> 0 "register_operand")
+   (match_operand:VQ_HSI 1 "register_operand")
+   (match_operand:<VCOND> 2 "register_operand")
+   (match_operand:SI 3 "immediate_operand")]
   "TARGET_SIMD"
 {
-  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true);
+  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, true);
   emit_insn (gen_aarch64_sqdmull2_lane<mode>_internal (operands[0], operands[1],
                                                       operands[2], operands[3],
                                                       p));
 })
 
 (define_expand "aarch64_sqdmull2_laneq<mode>"
-  [(match_operand:<VWIDE> 0 "register_operand" "=w")
-   (match_operand:VQ_HSI 1 "register_operand" "w")
-   (match_operand:<VCONQ> 2 "register_operand" "<vwx>")
-   (match_operand:SI 3 "immediate_operand" "i")]
+  [(match_operand:<VWIDE> 0 "register_operand")
+   (match_operand:VQ_HSI 1 "register_operand")
+   (match_operand:<VCONQ> 2 "register_operand")
+   (match_operand:SI 3 "immediate_operand")]
   "TARGET_SIMD"
 {
-  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true);
+  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, true);
   emit_insn (gen_aarch64_sqdmull2_laneq<mode>_internal (operands[0], operands[1],
                                                       operands[2], operands[3],
                                                       p));
 )
 
 (define_expand "aarch64_sqdmull2_n<mode>"
-  [(match_operand:<VWIDE> 0 "register_operand" "=w")
-   (match_operand:VQ_HSI 1 "register_operand" "w")
-   (match_operand:<VEL> 2 "register_operand" "w")]
+  [(match_operand:<VWIDE> 0 "register_operand")
+   (match_operand:VQ_HSI 1 "register_operand")
+   (match_operand:<VEL> 2 "register_operand")]
   "TARGET_SIMD"
 {
-  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true);
+  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, true);
   emit_insn (gen_aarch64_sqdmull2_n<mode>_internal (operands[0], operands[1],
                                                    operands[2], p));
   DONE;
 ;; have different ideas of what should be passed to this pattern.
 
 (define_insn "aarch64_cm<optab><mode>"
-  [(set (match_operand:<V_cmp_result> 0 "register_operand" "=w,w")
-       (neg:<V_cmp_result>
-         (COMPARISONS:<V_cmp_result>
+  [(set (match_operand:<V_INT_EQUIV> 0 "register_operand" "=w,w")
+       (neg:<V_INT_EQUIV>
+         (COMPARISONS:<V_INT_EQUIV>
            (match_operand:VDQ_I 1 "register_operand" "w,w")
            (match_operand:VDQ_I 2 "aarch64_simd_reg_or_zero" "w,ZDz")
          )))]
      (clobber (reg:CC CC_REGNUM))]
   "TARGET_SIMD"
   "#"
-  "reload_completed"
+  "&& reload_completed"
   [(set (match_operand:DI 0 "register_operand")
        (neg:DI
          (COMPARISONS:DI
 ;; cm(hs|hi)
 
 (define_insn "aarch64_cm<optab><mode>"
-  [(set (match_operand:<V_cmp_result> 0 "register_operand" "=w")
-       (neg:<V_cmp_result>
-         (UCOMPARISONS:<V_cmp_result>
+  [(set (match_operand:<V_INT_EQUIV> 0 "register_operand" "=w")
+       (neg:<V_INT_EQUIV>
+         (UCOMPARISONS:<V_INT_EQUIV>
            (match_operand:VDQ_I 1 "register_operand" "w")
            (match_operand:VDQ_I 2 "register_operand" "w")
          )))]
     (clobber (reg:CC CC_REGNUM))]
   "TARGET_SIMD"
   "#"
-  "reload_completed"
+  "&& reload_completed"
   [(set (match_operand:DI 0 "register_operand")
        (neg:DI
          (UCOMPARISONS:DI
 ;; plus (eq (and x y) 0) -1.
 
 (define_insn "aarch64_cmtst<mode>"
-  [(set (match_operand:<V_cmp_result> 0 "register_operand" "=w")
-       (plus:<V_cmp_result>
-         (eq:<V_cmp_result>
+  [(set (match_operand:<V_INT_EQUIV> 0 "register_operand" "=w")
+       (plus:<V_INT_EQUIV>
+         (eq:<V_INT_EQUIV>
            (and:VDQ_I
              (match_operand:VDQ_I 1 "register_operand" "w")
              (match_operand:VDQ_I 2 "register_operand" "w"))
            (match_operand:VDQ_I 3 "aarch64_simd_imm_zero"))
-         (match_operand:<V_cmp_result> 4 "aarch64_simd_imm_minus_one")))
+         (match_operand:<V_INT_EQUIV> 4 "aarch64_simd_imm_minus_one")))
   ]
   "TARGET_SIMD"
   "cmtst\t%<v>0<Vmtype>, %<v>1<Vmtype>, %<v>2<Vmtype>"
     (clobber (reg:CC CC_REGNUM))]
   "TARGET_SIMD"
   "#"
-  "reload_completed"
+  "&& reload_completed"
   [(set (match_operand:DI 0 "register_operand")
        (neg:DI
          (ne:DI
 ;; fcm(eq|ge|gt|le|lt)
 
 (define_insn "aarch64_cm<optab><mode>"
-  [(set (match_operand:<V_cmp_result> 0 "register_operand" "=w,w")
-       (neg:<V_cmp_result>
-         (COMPARISONS:<V_cmp_result>
+  [(set (match_operand:<V_INT_EQUIV> 0 "register_operand" "=w,w")
+       (neg:<V_INT_EQUIV>
+         (COMPARISONS:<V_INT_EQUIV>
            (match_operand:VHSDF_HSDF 1 "register_operand" "w,w")
            (match_operand:VHSDF_HSDF 2 "aarch64_simd_reg_or_zero" "w,YDz")
          )))]
 ;; generating fac(ge|gt).
 
 (define_insn "aarch64_fac<optab><mode>"
-  [(set (match_operand:<V_cmp_result> 0 "register_operand" "=w")
-       (neg:<V_cmp_result>
-         (FAC_COMPARISONS:<V_cmp_result>
+  [(set (match_operand:<V_INT_EQUIV> 0 "register_operand" "=w")
+       (neg:<V_INT_EQUIV>
+         (FAC_COMPARISONS:<V_INT_EQUIV>
            (abs:VHSDF_HSDF
              (match_operand:VHSDF_HSDF 1 "register_operand" "w"))
            (abs:VHSDF_HSDF
 ;; sqrt
 
 (define_expand "sqrt<mode>2"
-  [(set (match_operand:VHSDF 0 "register_operand" "=w")
-       (sqrt:VHSDF (match_operand:VHSDF 1 "register_operand" "w")))]
+  [(set (match_operand:VHSDF 0 "register_operand")
+       (sqrt:VHSDF (match_operand:VHSDF 1 "register_operand")))]
   "TARGET_SIMD"
 {
   if (aarch64_emit_approx_sqrt (operands[0], operands[1], false))
                   UNSPEC_LD2_LANE))]
   "TARGET_SIMD"
   {
-    operands[3] = GEN_INT (ENDIAN_LANE_N (<MODE>mode, INTVAL (operands[3])));
+    operands[3] = aarch64_endian_lane_rtx (<MODE>mode, INTVAL (operands[3]));
     return "ld2\\t{%S0.<Vetype> - %T0.<Vetype>}[%3], %1";
   }
   [(set_attr "type" "neon_load2_one_lane")]
 )
 
 (define_expand "vec_load_lanesoi<mode>"
-  [(set (match_operand:OI 0 "register_operand" "=w")
-       (unspec:OI [(match_operand:OI 1 "aarch64_simd_struct_operand" "Utv")
+  [(set (match_operand:OI 0 "register_operand")
+       (unspec:OI [(match_operand:OI 1 "aarch64_simd_struct_operand")
                    (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
                   UNSPEC_LD2))]
   "TARGET_SIMD"
   if (BYTES_BIG_ENDIAN)
     {
       rtx tmp = gen_reg_rtx (OImode);
-      rtx mask = aarch64_reverse_mask (<MODE>mode);
+      rtx mask = aarch64_reverse_mask (<MODE>mode, <nunits>);
       emit_insn (gen_aarch64_simd_ld2<mode> (tmp, operands[1]));
       emit_insn (gen_aarch64_rev_reglistoi (operands[0], tmp, mask));
     }
                   UNSPEC_ST2_LANE))]
   "TARGET_SIMD"
   {
-    operands[2] = GEN_INT (ENDIAN_LANE_N (<MODE>mode, INTVAL (operands[2])));
+    operands[2] = aarch64_endian_lane_rtx (<MODE>mode, INTVAL (operands[2]));
     return "st2\\t{%S1.<Vetype> - %T1.<Vetype>}[%2], %0";
   }
   [(set_attr "type" "neon_store2_one_lane<q>")]
 )
 
 (define_expand "vec_store_lanesoi<mode>"
-  [(set (match_operand:OI 0 "aarch64_simd_struct_operand" "=Utv")
-       (unspec:OI [(match_operand:OI 1 "register_operand" "w")
+  [(set (match_operand:OI 0 "aarch64_simd_struct_operand")
+       (unspec:OI [(match_operand:OI 1 "register_operand")
                     (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
                    UNSPEC_ST2))]
   "TARGET_SIMD"
   if (BYTES_BIG_ENDIAN)
     {
       rtx tmp = gen_reg_rtx (OImode);
-      rtx mask = aarch64_reverse_mask (<MODE>mode);
+      rtx mask = aarch64_reverse_mask (<MODE>mode, <nunits>);
       emit_insn (gen_aarch64_rev_reglistoi (tmp, operands[1], mask));
       emit_insn (gen_aarch64_simd_st2<mode> (operands[0], tmp));
     }
                   UNSPEC_LD3_LANE))]
   "TARGET_SIMD"
 {
-    operands[3] = GEN_INT (ENDIAN_LANE_N (<MODE>mode, INTVAL (operands[3])));
+    operands[3] = aarch64_endian_lane_rtx (<MODE>mode, INTVAL (operands[3]));
     return "ld3\\t{%S0.<Vetype> - %U0.<Vetype>}[%3], %1";
 }
   [(set_attr "type" "neon_load3_one_lane")]
 )
 
 (define_expand "vec_load_lanesci<mode>"
-  [(set (match_operand:CI 0 "register_operand" "=w")
-       (unspec:CI [(match_operand:CI 1 "aarch64_simd_struct_operand" "Utv")
+  [(set (match_operand:CI 0 "register_operand")
+       (unspec:CI [(match_operand:CI 1 "aarch64_simd_struct_operand")
                    (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
                   UNSPEC_LD3))]
   "TARGET_SIMD"
   if (BYTES_BIG_ENDIAN)
     {
       rtx tmp = gen_reg_rtx (CImode);
-      rtx mask = aarch64_reverse_mask (<MODE>mode);
+      rtx mask = aarch64_reverse_mask (<MODE>mode, <nunits>);
       emit_insn (gen_aarch64_simd_ld3<mode> (tmp, operands[1]));
       emit_insn (gen_aarch64_rev_reglistci (operands[0], tmp, mask));
     }
                    UNSPEC_ST3_LANE))]
   "TARGET_SIMD"
   {
-    operands[2] = GEN_INT (ENDIAN_LANE_N (<MODE>mode, INTVAL (operands[2])));
+    operands[2] = aarch64_endian_lane_rtx (<MODE>mode, INTVAL (operands[2]));
     return "st3\\t{%S1.<Vetype> - %U1.<Vetype>}[%2], %0";
   }
   [(set_attr "type" "neon_store3_one_lane<q>")]
 )
 
 (define_expand "vec_store_lanesci<mode>"
-  [(set (match_operand:CI 0 "aarch64_simd_struct_operand" "=Utv")
-       (unspec:CI [(match_operand:CI 1 "register_operand" "w")
+  [(set (match_operand:CI 0 "aarch64_simd_struct_operand")
+       (unspec:CI [(match_operand:CI 1 "register_operand")
                     (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
                    UNSPEC_ST3))]
   "TARGET_SIMD"
   if (BYTES_BIG_ENDIAN)
     {
       rtx tmp = gen_reg_rtx (CImode);
-      rtx mask = aarch64_reverse_mask (<MODE>mode);
+      rtx mask = aarch64_reverse_mask (<MODE>mode, <nunits>);
       emit_insn (gen_aarch64_rev_reglistci (tmp, operands[1], mask));
       emit_insn (gen_aarch64_simd_st3<mode> (operands[0], tmp));
     }
                   UNSPEC_LD4_LANE))]
   "TARGET_SIMD"
 {
-    operands[3] = GEN_INT (ENDIAN_LANE_N (<MODE>mode, INTVAL (operands[3])));
+    operands[3] = aarch64_endian_lane_rtx (<MODE>mode, INTVAL (operands[3]));
     return "ld4\\t{%S0.<Vetype> - %V0.<Vetype>}[%3], %1";
 }
   [(set_attr "type" "neon_load4_one_lane")]
 )
 
 (define_expand "vec_load_lanesxi<mode>"
-  [(set (match_operand:XI 0 "register_operand" "=w")
-       (unspec:XI [(match_operand:XI 1 "aarch64_simd_struct_operand" "Utv")
+  [(set (match_operand:XI 0 "register_operand")
+       (unspec:XI [(match_operand:XI 1 "aarch64_simd_struct_operand")
                    (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
                   UNSPEC_LD4))]
   "TARGET_SIMD"
   if (BYTES_BIG_ENDIAN)
     {
       rtx tmp = gen_reg_rtx (XImode);
-      rtx mask = aarch64_reverse_mask (<MODE>mode);
+      rtx mask = aarch64_reverse_mask (<MODE>mode, <nunits>);
       emit_insn (gen_aarch64_simd_ld4<mode> (tmp, operands[1]));
       emit_insn (gen_aarch64_rev_reglistxi (operands[0], tmp, mask));
     }
                    UNSPEC_ST4_LANE))]
   "TARGET_SIMD"
   {
-    operands[2] = GEN_INT (ENDIAN_LANE_N (<MODE>mode, INTVAL (operands[2])));
+    operands[2] = aarch64_endian_lane_rtx (<MODE>mode, INTVAL (operands[2]));
     return "st4\\t{%S1.<Vetype> - %V1.<Vetype>}[%2], %0";
   }
   [(set_attr "type" "neon_store4_one_lane<q>")]
 )
 
 (define_expand "vec_store_lanesxi<mode>"
-  [(set (match_operand:XI 0 "aarch64_simd_struct_operand" "=Utv")
-       (unspec:XI [(match_operand:XI 1 "register_operand" "w")
+  [(set (match_operand:XI 0 "aarch64_simd_struct_operand")
+       (unspec:XI [(match_operand:XI 1 "register_operand")
                     (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
                    UNSPEC_ST4))]
   "TARGET_SIMD"
   if (BYTES_BIG_ENDIAN)
     {
       rtx tmp = gen_reg_rtx (XImode);
-      rtx mask = aarch64_reverse_mask (<MODE>mode);
+      rtx mask = aarch64_reverse_mask (<MODE>mode, <nunits>);
       emit_insn (gen_aarch64_rev_reglistxi (tmp, operands[1], mask));
       emit_insn (gen_aarch64_simd_st4<mode> (operands[0], tmp));
     }
 ;; Reload patterns for AdvSIMD register list operands.
 
 (define_expand "mov<mode>"
-  [(set (match_operand:VSTRUCT 0 "nonimmediate_operand" "")
-       (match_operand:VSTRUCT 1 "general_operand" ""))]
+  [(set (match_operand:VSTRUCT 0 "nonimmediate_operand")
+       (match_operand:VSTRUCT 1 "general_operand"))]
   "TARGET_SIMD"
 {
   if (can_create_pseudo_p ())
     }
 })
 
+
+(define_expand "aarch64_ld1x3<VALLDIF:mode>"
+  [(match_operand:CI 0 "register_operand")
+   (match_operand:DI 1 "register_operand")
+   (unspec:VALLDIF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+  "TARGET_SIMD"
+{
+  rtx mem = gen_rtx_MEM (CImode, operands[1]);
+  emit_insn (gen_aarch64_ld1_x3_<VALLDIF:mode> (operands[0], mem));
+  DONE;
+})
+
+(define_insn "aarch64_ld1_x3_<mode>"
+  [(set (match_operand:CI 0 "register_operand" "=w")
+        (unspec:CI
+         [(match_operand:CI 1 "aarch64_simd_struct_operand" "Utv")
+          (unspec:VALLDIF [(const_int 3)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_LD1))]
+  "TARGET_SIMD"
+  "ld1\\t{%S0.<Vtype> - %U0.<Vtype>}, %1"
+  [(set_attr "type" "neon_load1_3reg<q>")]
+)
+
+(define_expand "aarch64_ld1x4<VALLDIF:mode>"
+  [(match_operand:XI 0 "register_operand")
+   (match_operand:DI 1 "register_operand")
+   (unspec:VALLDIF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+  "TARGET_SIMD"
+{
+  rtx mem = gen_rtx_MEM (XImode, operands[1]);
+  emit_insn (gen_aarch64_ld1_x4_<VALLDIF:mode> (operands[0], mem));
+  DONE;
+})
+
+(define_insn "aarch64_ld1_x4_<mode>"
+  [(set (match_operand:XI 0 "register_operand" "=w")
+       (unspec:XI
+         [(match_operand:XI 1 "aarch64_simd_struct_operand" "Utv")
+          (unspec:VALLDIF [(const_int 4)] UNSPEC_VSTRUCTDUMMY)]
+       UNSPEC_LD1))]
+  "TARGET_SIMD"
+  "ld1\\t{%S0.<Vtype> - %V0.<Vtype>}, %1"
+  [(set_attr "type" "neon_load1_4reg<q>")]
+)
+
+(define_expand "aarch64_st1x2<VALLDIF:mode>"
+  [(match_operand:DI 0 "register_operand")
+   (match_operand:OI 1 "register_operand")
+   (unspec:VALLDIF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+  "TARGET_SIMD"
+{
+  rtx mem = gen_rtx_MEM (OImode, operands[0]);
+  emit_insn (gen_aarch64_st1_x2_<VALLDIF:mode> (mem, operands[1]));
+  DONE;
+})
+
+(define_insn "aarch64_st1_x2_<mode>"
+   [(set (match_operand:OI 0 "aarch64_simd_struct_operand" "=Utv")
+        (unspec:OI
+         [(match_operand:OI 1 "register_operand" "w")
+          (unspec:VALLDIF [(const_int 2)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_ST1))]
+  "TARGET_SIMD"
+  "st1\\t{%S1.<Vtype> - %T1.<Vtype>}, %0"
+  [(set_attr "type" "neon_store1_2reg<q>")]
+)
+
+(define_expand "aarch64_st1x3<VALLDIF:mode>"
+  [(match_operand:DI 0 "register_operand")
+   (match_operand:CI 1 "register_operand")
+   (unspec:VALLDIF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+  "TARGET_SIMD"
+{
+  rtx mem = gen_rtx_MEM (CImode, operands[0]);
+  emit_insn (gen_aarch64_st1_x3_<VALLDIF:mode> (mem, operands[1]));
+  DONE;
+})
+
+(define_insn "aarch64_st1_x3_<mode>"
+   [(set (match_operand:CI 0 "aarch64_simd_struct_operand" "=Utv")
+       (unspec:CI
+         [(match_operand:CI 1 "register_operand" "w")
+         (unspec:VALLDIF [(const_int 3)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_ST1))]
+  "TARGET_SIMD"
+  "st1\\t{%S1.<Vtype> - %U1.<Vtype>}, %0"
+  [(set_attr "type" "neon_store1_3reg<q>")]
+)
+
+(define_expand "aarch64_st1x4<VALLDIF:mode>"
+  [(match_operand:DI 0 "register_operand")
+   (match_operand:XI 1 "register_operand")
+   (unspec:VALLDIF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+  "TARGET_SIMD"
+{
+  rtx mem = gen_rtx_MEM (XImode, operands[0]);
+  emit_insn (gen_aarch64_st1_x4_<VALLDIF:mode> (mem, operands[1]));
+  DONE;
+})
+
+(define_insn "aarch64_st1_x4_<mode>"
+  [(set (match_operand:XI 0 "aarch64_simd_struct_operand" "=Utv")
+       (unspec:XI
+          [(match_operand:XI 1 "register_operand" "w")
+          (unspec:VALLDIF [(const_int 4)] UNSPEC_VSTRUCTDUMMY)]
+       UNSPEC_ST1))]
+  "TARGET_SIMD"
+  "st1\\t{%S1.<Vtype> - %V1.<Vtype>}, %0"
+  [(set_attr "type" "neon_store1_4reg<q>")]
+)
+
 (define_insn "*aarch64_mov<mode>"
   [(set (match_operand:VSTRUCT 0 "aarch64_simd_nonimmediate_operand" "=w,Utv,w")
        (match_operand:VSTRUCT 1 "aarch64_simd_general_operand" " w,w,Utv"))]
 })
 
 (define_expand "aarch64_ld<VSTRUCT:nregs>r<VALLDIF:mode>"
-  [(match_operand:VSTRUCT 0 "register_operand" "=w")
-   (match_operand:DI 1 "register_operand" "w")
+  [(match_operand:VSTRUCT 0 "register_operand")
+   (match_operand:DI 1 "register_operand")
    (unspec:VALLDIF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
   "TARGET_SIMD"
 {
   DONE;
 })
 
-(define_insn "aarch64_ld2<mode>_dreg_le"
-  [(set (match_operand:OI 0 "register_operand" "=w")
-       (subreg:OI
-         (vec_concat:<VRL2>
-           (vec_concat:<VDBL>
-            (unspec:VD
-               [(match_operand:BLK 1 "aarch64_simd_struct_operand" "Utv")]
-               UNSPEC_LD2)
-            (vec_duplicate:VD (const_int 0)))
-           (vec_concat:<VDBL>
-            (unspec:VD [(match_dup 1)]
-                       UNSPEC_LD2)
-            (vec_duplicate:VD (const_int 0)))) 0))]
-  "TARGET_SIMD && !BYTES_BIG_ENDIAN"
-  "ld2\\t{%S0.<Vtype> - %T0.<Vtype>}, %1"
-  [(set_attr "type" "neon_load2_2reg<q>")]
-)
-
-(define_insn "aarch64_ld2<mode>_dreg_be"
+(define_insn "aarch64_ld2<mode>_dreg"
   [(set (match_operand:OI 0 "register_operand" "=w")
-       (subreg:OI
-         (vec_concat:<VRL2>
-           (vec_concat:<VDBL>
-            (vec_duplicate:VD (const_int 0))
-            (unspec:VD
-               [(match_operand:BLK 1 "aarch64_simd_struct_operand" "Utv")]
-               UNSPEC_LD2))
-           (vec_concat:<VDBL>
-            (vec_duplicate:VD (const_int 0))
-            (unspec:VD [(match_dup 1)]
-                       UNSPEC_LD2))) 0))]
-  "TARGET_SIMD && BYTES_BIG_ENDIAN"
+       (unspec:OI [(match_operand:BLK 1 "aarch64_simd_struct_operand" "Utv")
+                   (unspec:VD [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+                  UNSPEC_LD2_DREG))]
+  "TARGET_SIMD"
   "ld2\\t{%S0.<Vtype> - %T0.<Vtype>}, %1"
   [(set_attr "type" "neon_load2_2reg<q>")]
 )
 
-(define_insn "aarch64_ld2<mode>_dreg_le"
+(define_insn "aarch64_ld2<mode>_dreg"
   [(set (match_operand:OI 0 "register_operand" "=w")
-       (subreg:OI
-         (vec_concat:<VRL2>
-           (vec_concat:<VDBL>
-            (unspec:DX
-               [(match_operand:BLK 1 "aarch64_simd_struct_operand" "Utv")]
-               UNSPEC_LD2)
-            (const_int 0))
-           (vec_concat:<VDBL>
-            (unspec:DX [(match_dup 1)]
-                       UNSPEC_LD2)
-            (const_int 0))) 0))]
-  "TARGET_SIMD && !BYTES_BIG_ENDIAN"
-  "ld1\\t{%S0.1d - %T0.1d}, %1"
-  [(set_attr "type" "neon_load1_2reg<q>")]
-)
-
-(define_insn "aarch64_ld2<mode>_dreg_be"
-  [(set (match_operand:OI 0 "register_operand" "=w")
-       (subreg:OI
-         (vec_concat:<VRL2>
-           (vec_concat:<VDBL>
-            (const_int 0)
-            (unspec:DX
-               [(match_operand:BLK 1 "aarch64_simd_struct_operand" "Utv")]
-               UNSPEC_LD2))
-           (vec_concat:<VDBL>
-            (const_int 0)
-            (unspec:DX [(match_dup 1)]
-                       UNSPEC_LD2))) 0))]
-  "TARGET_SIMD && BYTES_BIG_ENDIAN"
+       (unspec:OI [(match_operand:BLK 1 "aarch64_simd_struct_operand" "Utv")
+                   (unspec:DX [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+                  UNSPEC_LD2_DREG))]
+  "TARGET_SIMD"
   "ld1\\t{%S0.1d - %T0.1d}, %1"
   [(set_attr "type" "neon_load1_2reg<q>")]
 )
 
-(define_insn "aarch64_ld3<mode>_dreg_le"
-  [(set (match_operand:CI 0 "register_operand" "=w")
-       (subreg:CI
-        (vec_concat:<VRL3>
-         (vec_concat:<VRL2>
-           (vec_concat:<VDBL>
-            (unspec:VD
-               [(match_operand:BLK 1 "aarch64_simd_struct_operand" "Utv")]
-               UNSPEC_LD3)
-            (vec_duplicate:VD (const_int 0)))
-           (vec_concat:<VDBL>
-            (unspec:VD [(match_dup 1)]
-                       UNSPEC_LD3)
-            (vec_duplicate:VD (const_int 0))))
-         (vec_concat:<VDBL>
-            (unspec:VD [(match_dup 1)]
-                       UNSPEC_LD3)
-            (vec_duplicate:VD (const_int 0)))) 0))]
-  "TARGET_SIMD && !BYTES_BIG_ENDIAN"
-  "ld3\\t{%S0.<Vtype> - %U0.<Vtype>}, %1"
-  [(set_attr "type" "neon_load3_3reg<q>")]
-)
-
-(define_insn "aarch64_ld3<mode>_dreg_be"
+(define_insn "aarch64_ld3<mode>_dreg"
   [(set (match_operand:CI 0 "register_operand" "=w")
-       (subreg:CI
-        (vec_concat:<VRL3>
-         (vec_concat:<VRL2>
-           (vec_concat:<VDBL>
-            (vec_duplicate:VD (const_int 0))
-            (unspec:VD
-               [(match_operand:BLK 1 "aarch64_simd_struct_operand" "Utv")]
-               UNSPEC_LD3))
-           (vec_concat:<VDBL>
-            (vec_duplicate:VD (const_int 0))
-            (unspec:VD [(match_dup 1)]
-                       UNSPEC_LD3)))
-         (vec_concat:<VDBL>
-            (vec_duplicate:VD (const_int 0))
-            (unspec:VD [(match_dup 1)]
-                       UNSPEC_LD3))) 0))]
-  "TARGET_SIMD && BYTES_BIG_ENDIAN"
+       (unspec:CI [(match_operand:BLK 1 "aarch64_simd_struct_operand" "Utv")
+                   (unspec:VD [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+                  UNSPEC_LD3_DREG))]
+  "TARGET_SIMD"
   "ld3\\t{%S0.<Vtype> - %U0.<Vtype>}, %1"
   [(set_attr "type" "neon_load3_3reg<q>")]
 )
 
-(define_insn "aarch64_ld3<mode>_dreg_le"
+(define_insn "aarch64_ld3<mode>_dreg"
   [(set (match_operand:CI 0 "register_operand" "=w")
-       (subreg:CI
-        (vec_concat:<VRL3>
-         (vec_concat:<VRL2>
-           (vec_concat:<VDBL>
-            (unspec:DX
-               [(match_operand:BLK 1 "aarch64_simd_struct_operand" "Utv")]
-               UNSPEC_LD3)
-            (const_int 0))
-           (vec_concat:<VDBL>
-            (unspec:DX [(match_dup 1)]
-                       UNSPEC_LD3)
-            (const_int 0)))
-         (vec_concat:<VDBL>
-            (unspec:DX [(match_dup 1)]
-                       UNSPEC_LD3)
-            (const_int 0))) 0))]
-  "TARGET_SIMD && !BYTES_BIG_ENDIAN"
-  "ld1\\t{%S0.1d - %U0.1d}, %1"
-  [(set_attr "type" "neon_load1_3reg<q>")]
-)
-
-(define_insn "aarch64_ld3<mode>_dreg_be"
-  [(set (match_operand:CI 0 "register_operand" "=w")
-       (subreg:CI
-        (vec_concat:<VRL3>
-         (vec_concat:<VRL2>
-           (vec_concat:<VDBL>
-            (const_int 0)
-            (unspec:DX
-               [(match_operand:BLK 1 "aarch64_simd_struct_operand" "Utv")]
-               UNSPEC_LD3))
-           (vec_concat:<VDBL>
-            (const_int 0)
-            (unspec:DX [(match_dup 1)]
-                       UNSPEC_LD3)))
-         (vec_concat:<VDBL>
-            (const_int 0)
-            (unspec:DX [(match_dup 1)]
-                       UNSPEC_LD3))) 0))]
-  "TARGET_SIMD && BYTES_BIG_ENDIAN"
+       (unspec:CI [(match_operand:BLK 1 "aarch64_simd_struct_operand" "Utv")
+                   (unspec:DX [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+                  UNSPEC_LD3_DREG))]
+  "TARGET_SIMD"
   "ld1\\t{%S0.1d - %U0.1d}, %1"
   [(set_attr "type" "neon_load1_3reg<q>")]
 )
 
-(define_insn "aarch64_ld4<mode>_dreg_le"
-  [(set (match_operand:XI 0 "register_operand" "=w")
-       (subreg:XI
-        (vec_concat:<VRL4>
-          (vec_concat:<VRL2>
-            (vec_concat:<VDBL>
-              (unspec:VD
-               [(match_operand:BLK 1 "aarch64_simd_struct_operand" "Utv")]
-               UNSPEC_LD4)
-              (vec_duplicate:VD (const_int 0)))
-             (vec_concat:<VDBL>
-               (unspec:VD [(match_dup 1)]
-                       UNSPEC_LD4)
-               (vec_duplicate:VD (const_int 0))))
-          (vec_concat:<VRL2>
-            (vec_concat:<VDBL>
-              (unspec:VD [(match_dup 1)]
-                       UNSPEC_LD4)
-              (vec_duplicate:VD (const_int 0)))
-            (vec_concat:<VDBL>
-              (unspec:VD [(match_dup 1)]
-                       UNSPEC_LD4)
-              (vec_duplicate:VD (const_int 0))))) 0))]
-  "TARGET_SIMD && !BYTES_BIG_ENDIAN"
-  "ld4\\t{%S0.<Vtype> - %V0.<Vtype>}, %1"
-  [(set_attr "type" "neon_load4_4reg<q>")]
-)
-
-(define_insn "aarch64_ld4<mode>_dreg_be"
+(define_insn "aarch64_ld4<mode>_dreg"
   [(set (match_operand:XI 0 "register_operand" "=w")
-       (subreg:XI
-        (vec_concat:<VRL4>
-          (vec_concat:<VRL2>
-            (vec_concat:<VDBL>
-              (vec_duplicate:VD (const_int 0))
-              (unspec:VD
-               [(match_operand:BLK 1 "aarch64_simd_struct_operand" "Utv")]
-               UNSPEC_LD4))
-             (vec_concat:<VDBL>
-               (vec_duplicate:VD (const_int 0))
-               (unspec:VD [(match_dup 1)]
-                       UNSPEC_LD4)))
-          (vec_concat:<VRL2>
-            (vec_concat:<VDBL>
-              (vec_duplicate:VD (const_int 0))
-              (unspec:VD [(match_dup 1)]
-                       UNSPEC_LD4))
-            (vec_concat:<VDBL>
-              (vec_duplicate:VD (const_int 0))
-              (unspec:VD [(match_dup 1)]
-                       UNSPEC_LD4)))) 0))]
-  "TARGET_SIMD && BYTES_BIG_ENDIAN"
+       (unspec:XI [(match_operand:BLK 1 "aarch64_simd_struct_operand" "Utv")
+                   (unspec:VD [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+                  UNSPEC_LD4_DREG))]
+  "TARGET_SIMD"
   "ld4\\t{%S0.<Vtype> - %V0.<Vtype>}, %1"
   [(set_attr "type" "neon_load4_4reg<q>")]
 )
 
-(define_insn "aarch64_ld4<mode>_dreg_le"
-  [(set (match_operand:XI 0 "register_operand" "=w")
-       (subreg:XI
-        (vec_concat:<VRL4>
-          (vec_concat:<VRL2>
-            (vec_concat:<VDBL>
-              (unspec:DX
-               [(match_operand:BLK 1 "aarch64_simd_struct_operand" "Utv")]
-               UNSPEC_LD4)
-              (const_int 0))
-             (vec_concat:<VDBL>
-               (unspec:DX [(match_dup 1)]
-                       UNSPEC_LD4)
-               (const_int 0)))
-          (vec_concat:<VRL2>
-            (vec_concat:<VDBL>
-              (unspec:DX [(match_dup 1)]
-                       UNSPEC_LD4)
-              (const_int 0))
-            (vec_concat:<VDBL>
-              (unspec:DX [(match_dup 1)]
-                       UNSPEC_LD4)
-              (const_int 0)))) 0))]
-  "TARGET_SIMD && !BYTES_BIG_ENDIAN"
-  "ld1\\t{%S0.1d - %V0.1d}, %1"
-  [(set_attr "type" "neon_load1_4reg<q>")]
-)
-
-(define_insn "aarch64_ld4<mode>_dreg_be"
+(define_insn "aarch64_ld4<mode>_dreg"
   [(set (match_operand:XI 0 "register_operand" "=w")
-       (subreg:XI
-        (vec_concat:<VRL4>
-          (vec_concat:<VRL2>
-            (vec_concat:<VDBL>
-              (const_int 0)
-              (unspec:DX
-               [(match_operand:BLK 1 "aarch64_simd_struct_operand" "Utv")]
-               UNSPEC_LD4))
-             (vec_concat:<VDBL>
-               (const_int 0)
-               (unspec:DX [(match_dup 1)]
-                       UNSPEC_LD4)))
-          (vec_concat:<VRL2>
-            (vec_concat:<VDBL>
-              (const_int 0)
-              (unspec:DX [(match_dup 1)]
-                       UNSPEC_LD4))
-            (vec_concat:<VDBL>
-              (const_int 0)
-              (unspec:DX [(match_dup 1)]
-                       UNSPEC_LD4)))) 0))]
-  "TARGET_SIMD && BYTES_BIG_ENDIAN"
+       (unspec:XI [(match_operand:BLK 1 "aarch64_simd_struct_operand" "Utv")
+                   (unspec:DX [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+                  UNSPEC_LD4_DREG))]
+  "TARGET_SIMD"
   "ld1\\t{%S0.1d - %V0.1d}, %1"
   [(set_attr "type" "neon_load1_4reg<q>")]
 )
 
 (define_expand "aarch64_ld<VSTRUCT:nregs><VDC:mode>"
- [(match_operand:VSTRUCT 0 "register_operand" "=w")
-  (match_operand:DI 1 "register_operand" "r")
+ [(match_operand:VSTRUCT 0 "register_operand")
+  (match_operand:DI 1 "register_operand")
   (unspec:VDC [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
   "TARGET_SIMD"
 {
   rtx mem = gen_rtx_MEM (BLKmode, operands[1]);
   set_mem_size (mem, <VSTRUCT:nregs> * 8);
 
-  if (BYTES_BIG_ENDIAN)
-    emit_insn (gen_aarch64_ld<VSTRUCT:nregs><VDC:mode>_dreg_be (operands[0],
-                                                               mem));
-  else
-    emit_insn (gen_aarch64_ld<VSTRUCT:nregs><VDC:mode>_dreg_le (operands[0],
-                                                               mem));
+  emit_insn (gen_aarch64_ld<VSTRUCT:nregs><VDC:mode>_dreg (operands[0], mem));
   DONE;
 })
 
 })
 
 (define_expand "aarch64_ld<VSTRUCT:nregs><VQ:mode>"
- [(match_operand:VSTRUCT 0 "register_operand" "=w")
-  (match_operand:DI 1 "register_operand" "r")
+ [(match_operand:VSTRUCT 0 "register_operand")
+  (match_operand:DI 1 "register_operand")
   (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
   "TARGET_SIMD"
 {
   DONE;
 })
 
+(define_expand "aarch64_ld1x2<VQ:mode>"
+ [(match_operand:OI 0 "register_operand")
+  (match_operand:DI 1 "register_operand")
+  (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+  "TARGET_SIMD"
+{
+  machine_mode mode = OImode;
+  rtx mem = gen_rtx_MEM (mode, operands[1]);
+
+  emit_insn (gen_aarch64_simd_ld1<VQ:mode>_x2 (operands[0], mem));
+  DONE;
+})
+
+(define_expand "aarch64_ld1x2<VDC:mode>"
+ [(match_operand:OI 0 "register_operand")
+  (match_operand:DI 1 "register_operand")
+  (unspec:VDC [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+  "TARGET_SIMD"
+{
+  machine_mode mode = OImode;
+  rtx mem = gen_rtx_MEM (mode, operands[1]);
+
+  emit_insn (gen_aarch64_simd_ld1<VDC:mode>_x2 (operands[0], mem));
+  DONE;
+})
+
+
 (define_expand "aarch64_ld<VSTRUCT:nregs>_lane<VALLDIF:mode>"
-  [(match_operand:VSTRUCT 0 "register_operand" "=w")
-       (match_operand:DI 1 "register_operand" "w")
-       (match_operand:VSTRUCT 2 "register_operand" "0")
-       (match_operand:SI 3 "immediate_operand" "i")
+  [(match_operand:VSTRUCT 0 "register_operand")
+       (match_operand:DI 1 "register_operand")
+       (match_operand:VSTRUCT 2 "register_operand")
+       (match_operand:SI 3 "immediate_operand")
        (unspec:VALLDIF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
   "TARGET_SIMD"
 {
   set_mem_size (mem, GET_MODE_SIZE (GET_MODE_INNER (<VALLDIF:MODE>mode))
                     * <VSTRUCT:nregs>);
 
-  aarch64_simd_lane_bounds (operands[3], 0,
-                           GET_MODE_NUNITS (<VALLDIF:MODE>mode),
-                           NULL);
+  aarch64_simd_lane_bounds (operands[3], 0, <VALLDIF:nunits>, NULL);
   emit_insn (gen_aarch64_vec_load_lanes<VSTRUCT:mode>_lane<VALLDIF:mode> (
        operands[0], mem, operands[2], operands[3]));
   DONE;
 ;; D-register list.
 
 (define_expand "aarch64_get_dreg<VSTRUCT:mode><VDC:mode>"
- [(match_operand:VDC 0 "register_operand" "=w")
-  (match_operand:VSTRUCT 1 "register_operand" "w")
-  (match_operand:SI 2 "immediate_operand" "i")]
+ [(match_operand:VDC 0 "register_operand")
+  (match_operand:VSTRUCT 1 "register_operand")
+  (match_operand:SI 2 "immediate_operand")]
   "TARGET_SIMD"
 {
   int part = INTVAL (operands[2]);
 ;; Q-register list.
 
 (define_expand "aarch64_get_qreg<VSTRUCT:mode><VQ:mode>"
- [(match_operand:VQ 0 "register_operand" "=w")
-  (match_operand:VSTRUCT 1 "register_operand" "w")
-  (match_operand:SI 2 "immediate_operand" "i")]
+ [(match_operand:VQ 0 "register_operand")
+  (match_operand:VSTRUCT 1 "register_operand")
+  (match_operand:SI 2 "immediate_operand")]
   "TARGET_SIMD"
 {
   int part = INTVAL (operands[2]);
 
 ;; vec_perm support
 
-(define_expand "vec_perm_const<mode>"
-  [(match_operand:VALL_F16 0 "register_operand")
-   (match_operand:VALL_F16 1 "register_operand")
-   (match_operand:VALL_F16 2 "register_operand")
-   (match_operand:<V_cmp_result> 3)]
-  "TARGET_SIMD"
-{
-  if (aarch64_expand_vec_perm_const (operands[0], operands[1],
-                                    operands[2], operands[3]))
-    DONE;
-  else
-    FAIL;
-})
-
 (define_expand "vec_perm<mode>"
   [(match_operand:VB 0 "register_operand")
    (match_operand:VB 1 "register_operand")
   "TARGET_SIMD"
 {
   aarch64_expand_vec_perm (operands[0], operands[1],
-                          operands[2], operands[3]);
+                          operands[2], operands[3], <nunits>);
   DONE;
 })
 
 [(set_attr "type" "multiple")]
 )
 
-(define_insn "aarch64_<PERMUTE:perm_insn><PERMUTE:perm_hilo><mode>"
+;; This instruction's pattern is generated directly by
+;; aarch64_expand_vec_perm_const, so any changes to the pattern would
+;; need corresponding changes there.
+(define_insn "aarch64_<PERMUTE:perm_insn><mode>"
   [(set (match_operand:VALL_F16 0 "register_operand" "=w")
        (unspec:VALL_F16 [(match_operand:VALL_F16 1 "register_operand" "w")
                          (match_operand:VALL_F16 2 "register_operand" "w")]
         PERMUTE))]
   "TARGET_SIMD"
-  "<PERMUTE:perm_insn><PERMUTE:perm_hilo>\\t%0.<Vtype>, %1.<Vtype>, %2.<Vtype>"
+  "<PERMUTE:perm_insn>\\t%0.<Vtype>, %1.<Vtype>, %2.<Vtype>"
   [(set_attr "type" "neon_permute<q>")]
 )
 
-;; Note immediate (third) operand is lane index not byte index.
+;; This instruction's pattern is generated directly by
+;; aarch64_expand_vec_perm_const, so any changes to the pattern would
+;; need corresponding changes there.  Note that the immediate (third)
+;; operand is a lane index, not a byte index.
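+;; For example, a lane index of 1 on a V4SI operand corresponds to the
+;; byte immediate #4 in the emitted EXT instruction.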
 (define_insn "aarch64_ext<mode>"
   [(set (match_operand:VALL_F16 0 "register_operand" "=w")
         (unspec:VALL_F16 [(match_operand:VALL_F16 1 "register_operand" "w")
   [(set_attr "type" "neon_ext<q>")]
 )
 
+;; This instruction's pattern is generated directly by
+;; aarch64_expand_vec_perm_const, so any changes to the pattern would
+;; need corresponding changes there.
 (define_insn "aarch64_rev<REVERSE:rev_op><mode>"
   [(set (match_operand:VALL_F16 0 "register_operand" "=w")
        (unspec:VALL_F16 [(match_operand:VALL_F16 1 "register_operand" "w")]
 )
 
 (define_expand "aarch64_st<VSTRUCT:nregs><VDC:mode>"
- [(match_operand:DI 0 "register_operand" "r")
-  (match_operand:VSTRUCT 1 "register_operand" "w")
+ [(match_operand:DI 0 "register_operand")
+  (match_operand:VSTRUCT 1 "register_operand")
   (unspec:VDC [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
   "TARGET_SIMD"
 {
 })
 
 (define_expand "aarch64_st<VSTRUCT:nregs><VQ:mode>"
- [(match_operand:DI 0 "register_operand" "r")
-  (match_operand:VSTRUCT 1 "register_operand" "w")
+ [(match_operand:DI 0 "register_operand")
+  (match_operand:VSTRUCT 1 "register_operand")
   (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
   "TARGET_SIMD"
 {
 })
 
 (define_expand "aarch64_st<VSTRUCT:nregs>_lane<VALLDIF:mode>"
- [(match_operand:DI 0 "register_operand" "r")
-  (match_operand:VSTRUCT 1 "register_operand" "w")
+ [(match_operand:DI 0 "register_operand")
+  (match_operand:VSTRUCT 1 "register_operand")
   (unspec:VALLDIF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)
   (match_operand:SI 2 "immediate_operand")]
   "TARGET_SIMD"
 ;; extend them in arm_neon.h and insert the resulting Q-regs.
 
 (define_expand "aarch64_set_qreg<VSTRUCT:mode><VQ:mode>"
- [(match_operand:VSTRUCT 0 "register_operand" "+w")
-  (match_operand:VSTRUCT 1 "register_operand" "0")
-  (match_operand:VQ 2 "register_operand" "w")
-  (match_operand:SI 3 "immediate_operand" "i")]
+ [(match_operand:VSTRUCT 0 "register_operand")
+  (match_operand:VSTRUCT 1 "register_operand")
+  (match_operand:VQ 2 "register_operand")
+  (match_operand:SI 3 "immediate_operand")]
   "TARGET_SIMD"
 {
   int part = INTVAL (operands[3]);
   DONE;
 })
 
-;; Standard pattern name vec_init<mode>.
+;; Standard pattern name vec_init<mode><Vel>.
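+;; (For a V4SI vector initialised from SI elements this gives the name
+;; "vec_initv4sisi".)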
+
+(define_expand "vec_init<mode><Vel>"
+  [(match_operand:VALL_F16 0 "register_operand")
+   (match_operand 1 "" "")]
+  "TARGET_SIMD"
+{
+  aarch64_expand_vector_init (operands[0], operands[1]);
+  DONE;
+})
 
-(define_expand "vec_init<mode>"
-  [(match_operand:VALL_F16 0 "register_operand" "")
+(define_expand "vec_init<mode><Vhalf>"
+  [(match_operand:VQ_NO2E 0 "register_operand")
    (match_operand 1 "" "")]
   "TARGET_SIMD"
 {
   [(set_attr "type" "neon_load1_all_lanes")]
 )
 
-(define_insn "aarch64_frecpe<mode>"
-  [(set (match_operand:VHSDF 0 "register_operand" "=w")
-       (unspec:VHSDF [(match_operand:VHSDF 1 "register_operand" "w")]
+(define_insn "aarch64_simd_ld1<mode>_x2"
+  [(set (match_operand:OI 0 "register_operand" "=w")
+       (unspec:OI [(match_operand:OI 1 "aarch64_simd_struct_operand" "Utv")
+                   (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+                  UNSPEC_LD1))]
+  "TARGET_SIMD"
+  "ld1\\t{%S0.<Vtype> - %T0.<Vtype>}, %1"
+  [(set_attr "type" "neon_load1_2reg<q>")]
+)
+
+(define_insn "aarch64_simd_ld1<mode>_x2"
+  [(set (match_operand:OI 0 "register_operand" "=w")
+       (unspec:OI [(match_operand:OI 1 "aarch64_simd_struct_operand" "Utv")
+                   (unspec:VDC [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+                  UNSPEC_LD1))]
+  "TARGET_SIMD"
+  "ld1\\t{%S0.<Vtype> - %T0.<Vtype>}, %1"
+  [(set_attr "type" "neon_load1_2reg<q>")]
+)
+
+
+(define_insn "@aarch64_frecpe<mode>"
+  [(set (match_operand:VHSDF_HSDF 0 "register_operand" "=w")
+       (unspec:VHSDF_HSDF
+        [(match_operand:VHSDF_HSDF 1 "register_operand" "w")]
         UNSPEC_FRECPE))]
   "TARGET_SIMD"
-  "frecpe\\t%0.<Vtype>, %1.<Vtype>"
+  "frecpe\t%<v>0<Vmtype>, %<v>1<Vmtype>"
   [(set_attr "type" "neon_fp_recpe_<stype><q>")]
 )
 
-(define_insn "aarch64_frecp<FRECP:frecp_suffix><mode>"
+(define_insn "aarch64_frecpx<mode>"
   [(set (match_operand:GPF_F16 0 "register_operand" "=w")
        (unspec:GPF_F16 [(match_operand:GPF_F16 1 "register_operand" "w")]
-        FRECP))]
+        UNSPEC_FRECPX))]
   "TARGET_SIMD"
-  "frecp<FRECP:frecp_suffix>\\t%<s>0, %<s>1"
-  [(set_attr "type" "neon_fp_recp<FRECP:frecp_suffix>_<GPF_F16:stype>")]
+  "frecpx\t%<s>0, %<s>1"
+  [(set_attr "type" "neon_fp_recpx_<GPF_F16:stype>")]
 )
 
-(define_insn "aarch64_frecps<mode>"
+(define_insn "@aarch64_frecps<mode>"
   [(set (match_operand:VHSDF_HSDF 0 "register_operand" "=w")
        (unspec:VHSDF_HSDF
          [(match_operand:VHSDF_HSDF 1 "register_operand" "w")
  "urecpe\\t%0.<Vtype>, %1.<Vtype>"
   [(set_attr "type" "neon_fp_recpe_<Vetype><q>")])
 
-;; Standard pattern name vec_extract<mode>.
+;; Standard pattern name vec_extract<mode><Vel>.
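+;; (For extracting an SI element from a V4SI vector this gives the name
+;; "vec_extractv4sisi".)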
 
-(define_expand "vec_extract<mode>"
-  [(match_operand:<VEL> 0 "aarch64_simd_nonimmediate_operand" "")
-   (match_operand:VALL_F16 1 "register_operand" "")
-   (match_operand:SI 2 "immediate_operand" "")]
+(define_expand "vec_extract<mode><Vel>"
+  [(match_operand:<VEL> 0 "aarch64_simd_nonimmediate_operand")
+   (match_operand:VALL_F16 1 "register_operand")
+   (match_operand:SI 2 "immediate_operand")]
   "TARGET_SIMD"
 {
     emit_insn
 
 (define_insn "aarch64_crypto_aes<aes_op>v16qi"
   [(set (match_operand:V16QI 0 "register_operand" "=w")
-        (unspec:V16QI [(match_operand:V16QI 1 "register_operand" "0")
-                      (match_operand:V16QI 2 "register_operand" "w")]
+       (unspec:V16QI
+               [(xor:V16QI
+                (match_operand:V16QI 1 "register_operand" "%0")
+                (match_operand:V16QI 2 "register_operand" "w"))]
          CRYPTO_AES))]
-  "TARGET_SIMD && TARGET_CRYPTO"
+  "TARGET_SIMD && TARGET_AES"
   "aes<aes_op>\\t%0.16b, %2.16b"
   [(set_attr "type" "crypto_aese")]
 )
 
-;; When AES/AESMC fusion is enabled we want the register allocation to
-;; look like:
-;;    AESE Vn, _
-;;    AESMC Vn, Vn
-;; So prefer to tie operand 1 to operand 0 when fusing.
-
 (define_insn "aarch64_crypto_aes<aesmc_op>v16qi"
-  [(set (match_operand:V16QI 0 "register_operand" "=w,w")
-       (unspec:V16QI [(match_operand:V16QI 1 "register_operand" "0,w")]
+  [(set (match_operand:V16QI 0 "register_operand" "=w")
+       (unspec:V16QI [(match_operand:V16QI 1 "register_operand" "w")]
         CRYPTO_AESMC))]
-  "TARGET_SIMD && TARGET_CRYPTO"
+  "TARGET_SIMD && TARGET_AES"
   "aes<aesmc_op>\\t%0.16b, %1.16b"
-  [(set_attr "type" "crypto_aesmc")
-   (set_attr_alternative "enabled"
-     [(if_then_else (match_test
-                      "aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC)")
-                    (const_string "yes" )
-                    (const_string "no"))
-      (const_string "yes")])]
+  [(set_attr "type" "crypto_aesmc")]
+)
+
+;; When AESE/AESMC fusion is enabled we really want to keep the two together
+;; and enforce the register dependency without scheduling or register
+;; allocation messing up the order or introducing moves in between.
+;; Mash the two together during combine.
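+;; For example, with operands 0 and 2 allocated to v0 and v1, the fused
+;; pattern below emits the pair back to back:
+;;    aese  v0.16b, v1.16b
+;;    aesmc v0.16b, v0.16b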
+
+(define_insn "*aarch64_crypto_aese_fused"
+  [(set (match_operand:V16QI 0 "register_operand" "=w")
+       (unspec:V16QI
+         [(unspec:V16QI
+          [(xor:V16QI
+               (match_operand:V16QI 1 "register_operand" "%0")
+               (match_operand:V16QI 2 "register_operand" "w"))]
+            UNSPEC_AESE)]
+       UNSPEC_AESMC))]
+  "TARGET_SIMD && TARGET_AES
+   && aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC)"
+  "aese\\t%0.16b, %2.16b\;aesmc\\t%0.16b, %0.16b"
+  [(set_attr "type" "crypto_aese")
+   (set_attr "length" "8")]
+)
+
+;; When AESD/AESIMC fusion is enabled we really want to keep the two together
+;; and enforce the register dependency without scheduling or register
+;; allocation messing up the order or introducing moves in between.
+;; Mash the two together during combine.
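+;; As with the AESE/AESMC pair above, the fused output is, e.g.:
+;;    aesd   v0.16b, v1.16b
+;;    aesimc v0.16b, v0.16b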
+
+(define_insn "*aarch64_crypto_aesd_fused"
+  [(set (match_operand:V16QI 0 "register_operand" "=w")
+       (unspec:V16QI
+         [(unspec:V16QI
+                   [(xor:V16QI
+                       (match_operand:V16QI 1 "register_operand" "%0")
+                       (match_operand:V16QI 2 "register_operand" "w"))]
+               UNSPEC_AESD)]
+         UNSPEC_AESIMC))]
+  "TARGET_SIMD && TARGET_AES
+   && aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC)"
+  "aesd\\t%0.16b, %2.16b\;aesimc\\t%0.16b, %0.16b"
+  [(set_attr "type" "crypto_aese")
+   (set_attr "length" "8")]
 )
 
 ;; sha1
         (unspec:SI [(match_operand:SI 1
                        "register_operand" "w")]
          UNSPEC_SHA1H))]
-  "TARGET_SIMD && TARGET_CRYPTO"
+  "TARGET_SIMD && TARGET_SHA2"
   "sha1h\\t%s0, %s1"
   [(set_attr "type" "crypto_sha1_fast")]
 )
        (unspec:SI [(vec_select:SI (match_operand:V4SI 1 "register_operand" "w")
                     (parallel [(const_int 0)]))]
         UNSPEC_SHA1H))]
-  "TARGET_SIMD && TARGET_CRYPTO && !BYTES_BIG_ENDIAN"
+  "TARGET_SIMD && TARGET_SHA2 && !BYTES_BIG_ENDIAN"
   "sha1h\\t%s0, %s1"
   [(set_attr "type" "crypto_sha1_fast")]
 )
        (unspec:SI [(vec_select:SI (match_operand:V4SI 1 "register_operand" "w")
                     (parallel [(const_int 3)]))]
         UNSPEC_SHA1H))]
-  "TARGET_SIMD && TARGET_CRYPTO && BYTES_BIG_ENDIAN"
+  "TARGET_SIMD && TARGET_SHA2 && BYTES_BIG_ENDIAN"
   "sha1h\\t%s0, %s1"
   [(set_attr "type" "crypto_sha1_fast")]
 )
         (unspec:V4SI [(match_operand:V4SI 1 "register_operand" "0")
                       (match_operand:V4SI 2 "register_operand" "w")]
          UNSPEC_SHA1SU1))]
-  "TARGET_SIMD && TARGET_CRYPTO"
+  "TARGET_SIMD && TARGET_SHA2"
   "sha1su1\\t%0.4s, %2.4s"
   [(set_attr "type" "crypto_sha1_fast")]
 )
                       (match_operand:SI 2 "register_operand" "w")
                       (match_operand:V4SI 3 "register_operand" "w")]
          CRYPTO_SHA1))]
-  "TARGET_SIMD && TARGET_CRYPTO"
+  "TARGET_SIMD && TARGET_SHA2"
   "sha1<sha1_op>\\t%q0, %s2, %3.4s"
   [(set_attr "type" "crypto_sha1_slow")]
 )
                       (match_operand:V4SI 2 "register_operand" "w")
                       (match_operand:V4SI 3 "register_operand" "w")]
          UNSPEC_SHA1SU0))]
-  "TARGET_SIMD && TARGET_CRYPTO"
+  "TARGET_SIMD && TARGET_SHA2"
   "sha1su0\\t%0.4s, %2.4s, %3.4s"
   [(set_attr "type" "crypto_sha1_xor")]
 )
                       (match_operand:V4SI 2 "register_operand" "w")
                       (match_operand:V4SI 3 "register_operand" "w")]
          CRYPTO_SHA256))]
-  "TARGET_SIMD && TARGET_CRYPTO"
+  "TARGET_SIMD && TARGET_SHA2"
   "sha256h<sha256_op>\\t%q0, %q2, %3.4s"
   [(set_attr "type" "crypto_sha256_slow")]
 )
         (unspec:V4SI [(match_operand:V4SI 1 "register_operand" "0")
                       (match_operand:V4SI 2 "register_operand" "w")]
          UNSPEC_SHA256SU0))]
-  "TARGET_SIMD &&TARGET_CRYPTO"
+  "TARGET_SIMD && TARGET_SHA2"
   "sha256su0\\t%0.4s, %2.4s"
   [(set_attr "type" "crypto_sha256_fast")]
 )
                       (match_operand:V4SI 2 "register_operand" "w")
                       (match_operand:V4SI 3 "register_operand" "w")]
          UNSPEC_SHA256SU1))]
-  "TARGET_SIMD &&TARGET_CRYPTO"
+  "TARGET_SIMD && TARGET_SHA2"
   "sha256su1\\t%0.4s, %2.4s, %3.4s"
   [(set_attr "type" "crypto_sha256_slow")]
 )
 
+;; sha512
+
+(define_insn "aarch64_crypto_sha512h<sha512_op>qv2di"
+  [(set (match_operand:V2DI 0 "register_operand" "=w")
+        (unspec:V2DI [(match_operand:V2DI 1 "register_operand" "0")
+                      (match_operand:V2DI 2 "register_operand" "w")
+                      (match_operand:V2DI 3 "register_operand" "w")]
+         CRYPTO_SHA512))]
+  "TARGET_SIMD && TARGET_SHA3"
+  "sha512h<sha512_op>\\t%q0, %q2, %3.2d"
+  [(set_attr "type" "crypto_sha512")]
+)
+
+(define_insn "aarch64_crypto_sha512su0qv2di"
+  [(set (match_operand:V2DI 0 "register_operand" "=w")
+        (unspec:V2DI [(match_operand:V2DI 1 "register_operand" "0")
+                      (match_operand:V2DI 2 "register_operand" "w")]
+         UNSPEC_SHA512SU0))]
+  "TARGET_SIMD && TARGET_SHA3"
+  "sha512su0\\t%0.2d, %2.2d"
+  [(set_attr "type" "crypto_sha512")]
+)
+
+(define_insn "aarch64_crypto_sha512su1qv2di"
+  [(set (match_operand:V2DI 0 "register_operand" "=w")
+        (unspec:V2DI [(match_operand:V2DI 1 "register_operand" "0")
+                      (match_operand:V2DI 2 "register_operand" "w")
+                      (match_operand:V2DI 3 "register_operand" "w")]
+         UNSPEC_SHA512SU1))]
+  "TARGET_SIMD && TARGET_SHA3"
+  "sha512su1\\t%0.2d, %2.2d, %3.2d"
+  [(set_attr "type" "crypto_sha512")]
+)
+
+;; sha3
+
+(define_insn "eor3q<mode>4"
+  [(set (match_operand:VQ_I 0 "register_operand" "=w")
+       (xor:VQ_I
+        (xor:VQ_I
+         (match_operand:VQ_I 2 "register_operand" "w")
+         (match_operand:VQ_I 3 "register_operand" "w"))
+        (match_operand:VQ_I 1 "register_operand" "w")))]
+  "TARGET_SIMD && TARGET_SHA3"
+  "eor3\\t%0.16b, %1.16b, %2.16b, %3.16b"
+  [(set_attr "type" "crypto_sha3")]
+)
+
+(define_insn "aarch64_rax1qv2di"
+  [(set (match_operand:V2DI 0 "register_operand" "=w")
+       (xor:V2DI
+        (rotate:V2DI
+         (match_operand:V2DI 2 "register_operand" "w")
+         (const_int 1))
+        (match_operand:V2DI 1 "register_operand" "w")))]
+  "TARGET_SIMD && TARGET_SHA3"
+  "rax1\\t%0.2d, %1.2d, %2.2d"
+  [(set_attr "type" "crypto_sha3")]
+)
+
+(define_insn "aarch64_xarqv2di"
+  [(set (match_operand:V2DI 0 "register_operand" "=w")
+       (rotatert:V2DI
+        (xor:V2DI
+         (match_operand:V2DI 1 "register_operand" "%w")
+         (match_operand:V2DI 2 "register_operand" "w"))
+        (match_operand:SI 3 "aarch64_simd_shift_imm_di" "Usd")))]
+  "TARGET_SIMD && TARGET_SHA3"
+  "xar\\t%0.2d, %1.2d, %2.2d, %3"
+  [(set_attr "type" "crypto_sha3")]
+)
+
+(define_insn "bcaxq<mode>4"
+  [(set (match_operand:VQ_I 0 "register_operand" "=w")
+       (xor:VQ_I
+        (and:VQ_I
+         (not:VQ_I (match_operand:VQ_I 3 "register_operand" "w"))
+         (match_operand:VQ_I 2 "register_operand" "w"))
+        (match_operand:VQ_I 1 "register_operand" "w")))]
+  "TARGET_SIMD && TARGET_SHA3"
+  "bcax\\t%0.16b, %1.16b, %2.16b, %3.16b"
+  [(set_attr "type" "crypto_sha3")]
+)
+
+;; SM3
+
+(define_insn "aarch64_sm3ss1qv4si"
+  [(set (match_operand:V4SI 0 "register_operand" "=w")
+       (unspec:V4SI [(match_operand:V4SI 1 "register_operand" "w")
+                     (match_operand:V4SI 2 "register_operand" "w")
+                     (match_operand:V4SI 3 "register_operand" "w")]
+        UNSPEC_SM3SS1))]
+  "TARGET_SIMD && TARGET_SM4"
+  "sm3ss1\\t%0.4s, %1.4s, %2.4s, %3.4s"
+  [(set_attr "type" "crypto_sm3")]
+)
+
+
+(define_insn "aarch64_sm3tt<sm3tt_op>qv4si"
+  [(set (match_operand:V4SI 0 "register_operand" "=w")
+       (unspec:V4SI [(match_operand:V4SI 1 "register_operand" "0")
+                     (match_operand:V4SI 2 "register_operand" "w")
+                     (match_operand:V4SI 3 "register_operand" "w")
+                     (match_operand:SI 4 "aarch64_imm2" "Ui2")]
+        CRYPTO_SM3TT))]
+  "TARGET_SIMD && TARGET_SM4"
+  "sm3tt<sm3tt_op>\\t%0.4s, %2.4s, %3.4s[%4]"
+  [(set_attr "type" "crypto_sm3")]
+)
+
+(define_insn "aarch64_sm3partw<sm3part_op>qv4si"
+  [(set (match_operand:V4SI 0 "register_operand" "=w")
+       (unspec:V4SI [(match_operand:V4SI 1 "register_operand" "0")
+                     (match_operand:V4SI 2 "register_operand" "w")
+                     (match_operand:V4SI 3 "register_operand" "w")]
+        CRYPTO_SM3PART))]
+  "TARGET_SIMD && TARGET_SM4"
+  "sm3partw<sm3part_op>\\t%0.4s, %2.4s, %3.4s"
+  [(set_attr "type" "crypto_sm3")]
+)
+
+;; SM4
+
+(define_insn "aarch64_sm4eqv4si"
+  [(set (match_operand:V4SI 0 "register_operand" "=w")
+       (unspec:V4SI [(match_operand:V4SI 1 "register_operand" "0")
+                     (match_operand:V4SI 2 "register_operand" "w")]
+        UNSPEC_SM4E))]
+  "TARGET_SIMD && TARGET_SM4"
+  "sm4e\\t%0.4s, %2.4s"
+  [(set_attr "type" "crypto_sm4")]
+)
+
+(define_insn "aarch64_sm4ekeyqv4si"
+  [(set (match_operand:V4SI 0 "register_operand" "=w")
+       (unspec:V4SI [(match_operand:V4SI 1 "register_operand" "w")
+                     (match_operand:V4SI 2 "register_operand" "w")]
+        UNSPEC_SM4EKEY))]
+  "TARGET_SIMD && TARGET_SM4"
+  "sm4ekey\\t%0.4s, %1.4s, %2.4s"
+  [(set_attr "type" "crypto_sm4")]
+)
+
+;; fp16fml
+
+(define_expand "aarch64_fml<f16mac1>l<f16quad>_low<mode>"
+  [(set (match_operand:VDQSF 0 "register_operand")
+       (unspec:VDQSF
+        [(match_operand:VDQSF 1 "register_operand")
+         (match_operand:<VFMLA_W> 2 "register_operand")
+         (match_operand:<VFMLA_W> 3 "register_operand")]
+        VFMLA16_LOW))]
+  "TARGET_F16FML"
+{
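+  /* Build PARALLELs that select the low half of each half-precision
+     input: lanes 0-1 of the V4HF operands when the result is V2SF, or
+     lanes 0-3 of the V8HF operands when the result is V4SF.  */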
+  rtx p1 = aarch64_simd_vect_par_cnst_half (<VFMLA_W>mode,
+                                           <nunits> * 2, false);
+  rtx p2 = aarch64_simd_vect_par_cnst_half (<VFMLA_W>mode,
+                                           <nunits> * 2, false);
+
+  emit_insn (gen_aarch64_simd_fml<f16mac1>l<f16quad>_low<mode> (operands[0],
+                                                               operands[1],
+                                                               operands[2],
+                                                               operands[3],
+                                                               p1, p2));
+  DONE;
+
+})
+
+(define_expand "aarch64_fml<f16mac1>l<f16quad>_high<mode>"
+  [(set (match_operand:VDQSF 0 "register_operand")
+       (unspec:VDQSF
+        [(match_operand:VDQSF 1 "register_operand")
+         (match_operand:<VFMLA_W> 2 "register_operand")
+         (match_operand:<VFMLA_W> 3 "register_operand")]
+        VFMLA16_HIGH))]
+  "TARGET_F16FML"
+{
+  rtx p1 = aarch64_simd_vect_par_cnst_half (<VFMLA_W>mode, <nunits> * 2, true);
+  rtx p2 = aarch64_simd_vect_par_cnst_half (<VFMLA_W>mode, <nunits> * 2, true);
+
+  emit_insn (gen_aarch64_simd_fml<f16mac1>l<f16quad>_high<mode> (operands[0],
+                                                                operands[1],
+                                                                operands[2],
+                                                                operands[3],
+                                                                p1, p2));
+  DONE;
+})
+
+(define_insn "aarch64_simd_fmlal<f16quad>_low<mode>"
+  [(set (match_operand:VDQSF 0 "register_operand" "=w")
+       (fma:VDQSF
+        (float_extend:VDQSF
+         (vec_select:<VFMLA_SEL_W>
+          (match_operand:<VFMLA_W> 2 "register_operand" "w")
+          (match_operand:<VFMLA_W> 4 "vect_par_cnst_lo_half" "")))
+        (float_extend:VDQSF
+         (vec_select:<VFMLA_SEL_W>
+          (match_operand:<VFMLA_W> 3 "register_operand" "w")
+          (match_operand:<VFMLA_W> 5 "vect_par_cnst_lo_half" "")))
+        (match_operand:VDQSF 1 "register_operand" "0")))]
+  "TARGET_F16FML"
+  "fmlal\\t%0.<nunits>s, %2.<nunits>h, %3.<nunits>h"
+  [(set_attr "type" "neon_fp_mul_s")]
+)
+
+(define_insn "aarch64_simd_fmlsl<f16quad>_low<mode>"
+  [(set (match_operand:VDQSF 0 "register_operand" "=w")
+       (fma:VDQSF
+        (float_extend:VDQSF
+         (neg:<VFMLA_SEL_W>
+          (vec_select:<VFMLA_SEL_W>
+           (match_operand:<VFMLA_W> 2 "register_operand" "w")
+           (match_operand:<VFMLA_W> 4 "vect_par_cnst_lo_half" ""))))
+        (float_extend:VDQSF
+         (vec_select:<VFMLA_SEL_W>
+          (match_operand:<VFMLA_W> 3 "register_operand" "w")
+          (match_operand:<VFMLA_W> 5 "vect_par_cnst_lo_half" "")))
+        (match_operand:VDQSF 1 "register_operand" "0")))]
+  "TARGET_F16FML"
+  "fmlsl\\t%0.<nunits>s, %2.<nunits>h, %3.<nunits>h"
+  [(set_attr "type" "neon_fp_mul_s")]
+)
+
+(define_insn "aarch64_simd_fmlal<f16quad>_high<mode>"
+  [(set (match_operand:VDQSF 0 "register_operand" "=w")
+       (fma:VDQSF
+        (float_extend:VDQSF
+         (vec_select:<VFMLA_SEL_W>
+          (match_operand:<VFMLA_W> 2 "register_operand" "w")
+          (match_operand:<VFMLA_W> 4 "vect_par_cnst_hi_half" "")))
+        (float_extend:VDQSF
+         (vec_select:<VFMLA_SEL_W>
+          (match_operand:<VFMLA_W> 3 "register_operand" "w")
+          (match_operand:<VFMLA_W> 5 "vect_par_cnst_hi_half" "")))
+        (match_operand:VDQSF 1 "register_operand" "0")))]
+  "TARGET_F16FML"
+  "fmlal2\\t%0.<nunits>s, %2.<nunits>h, %3.<nunits>h"
+  [(set_attr "type" "neon_fp_mul_s")]
+)
+
+(define_insn "aarch64_simd_fmlsl<f16quad>_high<mode>"
+  [(set (match_operand:VDQSF 0 "register_operand" "=w")
+       (fma:VDQSF
+        (float_extend:VDQSF
+         (neg:<VFMLA_SEL_W>
+          (vec_select:<VFMLA_SEL_W>
+           (match_operand:<VFMLA_W> 2 "register_operand" "w")
+           (match_operand:<VFMLA_W> 4 "vect_par_cnst_hi_half" ""))))
+        (float_extend:VDQSF
+         (vec_select:<VFMLA_SEL_W>
+          (match_operand:<VFMLA_W> 3 "register_operand" "w")
+          (match_operand:<VFMLA_W> 5 "vect_par_cnst_hi_half" "")))
+        (match_operand:VDQSF 1 "register_operand" "0")))]
+  "TARGET_F16FML"
+  "fmlsl2\\t%0.<nunits>s, %2.<nunits>h, %3.<nunits>h"
+  [(set_attr "type" "neon_fp_mul_s")]
+)
+
+(define_expand "aarch64_fml<f16mac1>l_lane_lowv2sf"
+  [(set (match_operand:V2SF 0 "register_operand")
+       (unspec:V2SF [(match_operand:V2SF 1 "register_operand")
+                          (match_operand:V4HF 2 "register_operand")
+                          (match_operand:V4HF 3 "register_operand")
+                          (match_operand:SI 4 "aarch64_imm2")]
+        VFMLA16_LOW))]
+  "TARGET_F16FML"
+{
+    rtx p1 = aarch64_simd_vect_par_cnst_half (V4HFmode, 4, false);
+    rtx lane = aarch64_endian_lane_rtx (V4HFmode, INTVAL (operands[4]));
+
+    emit_insn (gen_aarch64_simd_fml<f16mac1>l_lane_lowv2sf (operands[0],
+                                                           operands[1],
+                                                           operands[2],
+                                                           operands[3],
+                                                           p1, lane));
+    DONE;
+}
+)
+
+(define_expand "aarch64_fml<f16mac1>l_lane_highv2sf"
+  [(set (match_operand:V2SF 0 "register_operand")
+       (unspec:V2SF [(match_operand:V2SF 1 "register_operand")
+                          (match_operand:V4HF 2 "register_operand")
+                          (match_operand:V4HF 3 "register_operand")
+                          (match_operand:SI 4 "aarch64_imm2")]
+        VFMLA16_HIGH))]
+  "TARGET_F16FML"
+{
+    rtx p1 = aarch64_simd_vect_par_cnst_half (V4HFmode, 4, true);
+    rtx lane = aarch64_endian_lane_rtx (V4HFmode, INTVAL (operands[4]));
+
+    emit_insn (gen_aarch64_simd_fml<f16mac1>l_lane_highv2sf (operands[0],
+                                                            operands[1],
+                                                            operands[2],
+                                                            operands[3],
+                                                            p1, lane));
+    DONE;
+})
+
+(define_insn "aarch64_simd_fmlal_lane_lowv2sf"
+  [(set (match_operand:V2SF 0 "register_operand" "=w")
+       (fma:V2SF
+        (float_extend:V2SF
+          (vec_select:V2HF
+           (match_operand:V4HF 2 "register_operand" "w")
+           (match_operand:V4HF 4 "vect_par_cnst_lo_half" "")))
+        (float_extend:V2SF
+          (vec_duplicate:V2HF
+           (vec_select:HF
+            (match_operand:V4HF 3 "register_operand" "x")
+            (parallel [(match_operand:SI 5 "aarch64_imm2" "Ui2")]))))
+        (match_operand:V2SF 1 "register_operand" "0")))]
+  "TARGET_F16FML"
+  "fmlal\\t%0.2s, %2.2h, %3.h[%5]"
+  [(set_attr "type" "neon_fp_mul_s")]
+)
+
+(define_insn "aarch64_simd_fmlsl_lane_lowv2sf"
+  [(set (match_operand:V2SF 0 "register_operand" "=w")
+       (fma:V2SF
+        (float_extend:V2SF
+         (neg:V2HF
+          (vec_select:V2HF
+           (match_operand:V4HF 2 "register_operand" "w")
+           (match_operand:V4HF 4 "vect_par_cnst_lo_half" ""))))
+        (float_extend:V2SF
+         (vec_duplicate:V2HF
+          (vec_select:HF
+           (match_operand:V4HF 3 "register_operand" "x")
+           (parallel [(match_operand:SI 5 "aarch64_imm2" "Ui2")]))))
+        (match_operand:V2SF 1 "register_operand" "0")))]
+  "TARGET_F16FML"
+  "fmlsl\\t%0.2s, %2.2h, %3.h[%5]"
+  [(set_attr "type" "neon_fp_mul_s")]
+)
+
+(define_insn "aarch64_simd_fmlal_lane_highv2sf"
+  [(set (match_operand:V2SF 0 "register_operand" "=w")
+       (fma:V2SF
+        (float_extend:V2SF
+          (vec_select:V2HF
+           (match_operand:V4HF 2 "register_operand" "w")
+           (match_operand:V4HF 4 "vect_par_cnst_hi_half" "")))
+        (float_extend:V2SF
+          (vec_duplicate:V2HF
+           (vec_select:HF
+            (match_operand:V4HF 3 "register_operand" "x")
+            (parallel [(match_operand:SI 5 "aarch64_imm2" "Ui2")]))))
+        (match_operand:V2SF 1 "register_operand" "0")))]
+  "TARGET_F16FML"
+  "fmlal2\\t%0.2s, %2.2h, %3.h[%5]"
+  [(set_attr "type" "neon_fp_mul_s")]
+)
+
+(define_insn "aarch64_simd_fmlsl_lane_highv2sf"
+  [(set (match_operand:V2SF 0 "register_operand" "=w")
+       (fma:V2SF
+        (float_extend:V2SF
+          (neg:V2HF
+           (vec_select:V2HF
+            (match_operand:V4HF 2 "register_operand" "w")
+            (match_operand:V4HF 4 "vect_par_cnst_hi_half" ""))))
+        (float_extend:V2SF
+          (vec_duplicate:V2HF
+           (vec_select:HF
+            (match_operand:V4HF 3 "register_operand" "x")
+            (parallel [(match_operand:SI 5 "aarch64_imm2" "Ui2")]))))
+        (match_operand:V2SF 1 "register_operand" "0")))]
+  "TARGET_F16FML"
+  "fmlsl2\\t%0.2s, %2.2h, %3.h[%5]"
+  [(set_attr "type" "neon_fp_mul_s")]
+)
+
+(define_expand "aarch64_fml<f16mac1>lq_laneq_lowv4sf"
+  [(set (match_operand:V4SF 0 "register_operand")
+       (unspec:V4SF [(match_operand:V4SF 1 "register_operand")
+                          (match_operand:V8HF 2 "register_operand")
+                          (match_operand:V8HF 3 "register_operand")
+                          (match_operand:SI 4 "aarch64_lane_imm3")]
+        VFMLA16_LOW))]
+  "TARGET_F16FML"
+{
+    rtx p1 = aarch64_simd_vect_par_cnst_half (V8HFmode, 8, false);
+    rtx lane = aarch64_endian_lane_rtx (V8HFmode, INTVAL (operands[4]));
+
+    emit_insn (gen_aarch64_simd_fml<f16mac1>lq_laneq_lowv4sf (operands[0],
+                                                             operands[1],
+                                                             operands[2],
+                                                             operands[3],
+                                                             p1, lane));
+    DONE;
+})
+
+(define_expand "aarch64_fml<f16mac1>lq_laneq_highv4sf"
+  [(set (match_operand:V4SF 0 "register_operand")
+       (unspec:V4SF [(match_operand:V4SF 1 "register_operand")
+                          (match_operand:V8HF 2 "register_operand")
+                          (match_operand:V8HF 3 "register_operand")
+                          (match_operand:SI 4 "aarch64_lane_imm3")]
+        VFMLA16_HIGH))]
+  "TARGET_F16FML"
+{
+    rtx p1 = aarch64_simd_vect_par_cnst_half (V8HFmode, 8, true);
+    rtx lane = aarch64_endian_lane_rtx (V8HFmode, INTVAL (operands[4]));
+
+    emit_insn (gen_aarch64_simd_fml<f16mac1>lq_laneq_highv4sf (operands[0],
+                                                              operands[1],
+                                                              operands[2],
+                                                              operands[3],
+                                                              p1, lane));
+    DONE;
+})
+
+(define_insn "aarch64_simd_fmlalq_laneq_lowv4sf"
+  [(set (match_operand:V4SF 0 "register_operand" "=w")
+       (fma:V4SF
+        (float_extend:V4SF
+         (vec_select:V4HF
+           (match_operand:V8HF 2 "register_operand" "w")
+           (match_operand:V8HF 4 "vect_par_cnst_lo_half" "")))
+        (float_extend:V4SF
+         (vec_duplicate:V4HF
+          (vec_select:HF
+           (match_operand:V8HF 3 "register_operand" "x")
+           (parallel [(match_operand:SI 5 "aarch64_lane_imm3" "Ui7")]))))
+        (match_operand:V4SF 1 "register_operand" "0")))]
+  "TARGET_F16FML"
+  "fmlal\\t%0.4s, %2.4h, %3.h[%5]"
+  [(set_attr "type" "neon_fp_mul_s")]
+)
+
+(define_insn "aarch64_simd_fmlslq_laneq_lowv4sf"
+  [(set (match_operand:V4SF 0 "register_operand" "=w")
+       (fma:V4SF
+         (float_extend:V4SF
+          (neg:V4HF
+           (vec_select:V4HF
+            (match_operand:V8HF 2 "register_operand" "w")
+            (match_operand:V8HF 4 "vect_par_cnst_lo_half" ""))))
+        (float_extend:V4SF
+         (vec_duplicate:V4HF
+          (vec_select:HF
+           (match_operand:V8HF 3 "register_operand" "x")
+           (parallel [(match_operand:SI 5 "aarch64_lane_imm3" "Ui7")]))))
+        (match_operand:V4SF 1 "register_operand" "0")))]
+  "TARGET_F16FML"
+  "fmlsl\\t%0.4s, %2.4h, %3.h[%5]"
+  [(set_attr "type" "neon_fp_mul_s")]
+)
+
+(define_insn "aarch64_simd_fmlalq_laneq_highv4sf"
+  [(set (match_operand:V4SF 0 "register_operand" "=w")
+       (fma:V4SF
+        (float_extend:V4SF
+         (vec_select:V4HF
+           (match_operand:V8HF 2 "register_operand" "w")
+           (match_operand:V8HF 4 "vect_par_cnst_hi_half" "")))
+        (float_extend:V4SF
+         (vec_duplicate:V4HF
+          (vec_select:HF
+           (match_operand:V8HF 3 "register_operand" "x")
+           (parallel [(match_operand:SI 5 "aarch64_lane_imm3" "Ui7")]))))
+        (match_operand:V4SF 1 "register_operand" "0")))]
+  "TARGET_F16FML"
+  "fmlal2\\t%0.4s, %2.4h, %3.h[%5]"
+  [(set_attr "type" "neon_fp_mul_s")]
+)
+
+(define_insn "aarch64_simd_fmlslq_laneq_highv4sf"
+  [(set (match_operand:V4SF 0 "register_operand" "=w")
+       (fma:V4SF
+        (float_extend:V4SF
+         (neg:V4HF
+          (vec_select:V4HF
+           (match_operand:V8HF 2 "register_operand" "w")
+           (match_operand:V8HF 4 "vect_par_cnst_hi_half" ""))))
+        (float_extend:V4SF
+         (vec_duplicate:V4HF
+          (vec_select:HF
+           (match_operand:V8HF 3 "register_operand" "x")
+           (parallel [(match_operand:SI 5 "aarch64_lane_imm3" "Ui7")]))))
+        (match_operand:V4SF 1 "register_operand" "0")))]
+  "TARGET_F16FML"
+  "fmlsl2\\t%0.4s, %2.4h, %3.h[%5]"
+  [(set_attr "type" "neon_fp_mul_s")]
+)
+
+(define_expand "aarch64_fml<f16mac1>l_laneq_lowv2sf"
+  [(set (match_operand:V2SF 0 "register_operand")
+       (unspec:V2SF [(match_operand:V2SF 1 "register_operand")
+                     (match_operand:V4HF 2 "register_operand")
+                     (match_operand:V8HF 3 "register_operand")
+                     (match_operand:SI 4 "aarch64_lane_imm3")]
+        VFMLA16_LOW))]
+  "TARGET_F16FML"
+{
+    rtx p1 = aarch64_simd_vect_par_cnst_half (V4HFmode, 4, false);
+    rtx lane = aarch64_endian_lane_rtx (V8HFmode, INTVAL (operands[4]));
+
+    emit_insn (gen_aarch64_simd_fml<f16mac1>l_laneq_lowv2sf (operands[0],
+                                                            operands[1],
+                                                            operands[2],
+                                                            operands[3],
+                                                            p1, lane));
+    DONE;
+})
+
+(define_expand "aarch64_fml<f16mac1>l_laneq_highv2sf"
+  [(set (match_operand:V2SF 0 "register_operand")
+       (unspec:V2SF [(match_operand:V2SF 1 "register_operand")
+                     (match_operand:V4HF 2 "register_operand")
+                     (match_operand:V8HF 3 "register_operand")
+                     (match_operand:SI 4 "aarch64_lane_imm3")]
+        VFMLA16_HIGH))]
+  "TARGET_F16FML"
+{
+    rtx p1 = aarch64_simd_vect_par_cnst_half (V4HFmode, 4, true);
+    rtx lane = aarch64_endian_lane_rtx (V8HFmode, INTVAL (operands[4]));
+
+    emit_insn (gen_aarch64_simd_fml<f16mac1>l_laneq_highv2sf (operands[0],
+                                                             operands[1],
+                                                             operands[2],
+                                                             operands[3],
+                                                             p1, lane));
+    DONE;
+})
+
+(define_insn "aarch64_simd_fmlal_laneq_lowv2sf"
+  [(set (match_operand:V2SF 0 "register_operand" "=w")
+       (fma:V2SF
+        (float_extend:V2SF
+          (vec_select:V2HF
+           (match_operand:V4HF 2 "register_operand" "w")
+           (match_operand:V4HF 4 "vect_par_cnst_lo_half" "")))
+        (float_extend:V2SF
+         (vec_duplicate:V2HF
+          (vec_select:HF
+           (match_operand:V8HF 3 "register_operand" "x")
+           (parallel [(match_operand:SI 5 "aarch64_lane_imm3" "Ui7")]))))
+        (match_operand:V2SF 1 "register_operand" "0")))]
+  "TARGET_F16FML"
+  "fmlal\\t%0.2s, %2.2h, %3.h[%5]"
+  [(set_attr "type" "neon_fp_mul_s")]
+)
+
+(define_insn "aarch64_simd_fmlsl_laneq_lowv2sf"
+  [(set (match_operand:V2SF 0 "register_operand" "=w")
+       (fma:V2SF
+        (float_extend:V2SF
+         (neg:V2HF
+          (vec_select:V2HF
+           (match_operand:V4HF 2 "register_operand" "w")
+           (match_operand:V4HF 4 "vect_par_cnst_lo_half" ""))))
+        (float_extend:V2SF
+         (vec_duplicate:V2HF
+          (vec_select:HF
+           (match_operand:V8HF 3 "register_operand" "x")
+           (parallel [(match_operand:SI 5 "aarch64_lane_imm3" "Ui7")]))))
+        (match_operand:V2SF 1 "register_operand" "0")))]
+  "TARGET_F16FML"
+  "fmlsl\\t%0.2s, %2.2h, %3.h[%5]"
+  [(set_attr "type" "neon_fp_mul_s")]
+)
+
+(define_insn "aarch64_simd_fmlal_laneq_highv2sf"
+  [(set (match_operand:V2SF 0 "register_operand" "=w")
+       (fma:V2SF
+        (float_extend:V2SF
+          (vec_select:V2HF
+           (match_operand:V4HF 2 "register_operand" "w")
+           (match_operand:V4HF 4 "vect_par_cnst_hi_half" "")))
+        (float_extend:V2SF
+         (vec_duplicate:V2HF
+          (vec_select:HF
+           (match_operand:V8HF 3 "register_operand" "x")
+           (parallel [(match_operand:SI 5 "aarch64_lane_imm3" "Ui7")]))))
+        (match_operand:V2SF 1 "register_operand" "0")))]
+  "TARGET_F16FML"
+  "fmlal2\\t%0.2s, %2.2h, %3.h[%5]"
+  [(set_attr "type" "neon_fp_mul_s")]
+)
+
+(define_insn "aarch64_simd_fmlsl_laneq_highv2sf"
+  [(set (match_operand:V2SF 0 "register_operand" "=w")
+       (fma:V2SF
+        (float_extend:V2SF
+         (neg:V2HF
+          (vec_select:V2HF
+           (match_operand:V4HF 2 "register_operand" "w")
+           (match_operand:V4HF 4 "vect_par_cnst_hi_half" ""))))
+        (float_extend:V2SF
+         (vec_duplicate:V2HF
+          (vec_select:HF
+           (match_operand:V8HF 3 "register_operand" "x")
+           (parallel [(match_operand:SI 5 "aarch64_lane_imm3" "Ui7")]))))
+        (match_operand:V2SF 1 "register_operand" "0")))]
+  "TARGET_F16FML"
+  "fmlsl2\\t%0.2s, %2.2h, %3.h[%5]"
+  [(set_attr "type" "neon_fp_mul_s")]
+)
+
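+;; The "_lane_" variants below differ from the "_laneq_" variants above only
+;; in where the multiplier element comes from: a 64-bit V4HF operand indexed
+;; by 0..3 (predicate aarch64_imm2, constraint "Ui2") instead of a 128-bit
+;; V8HF operand indexed by 0..7 (aarch64_lane_imm3, "Ui7").  In ACLE-style
+;; terms (prototype names assumed for illustration, not defined here):
+;;
+;;   float32x4_t vfmlalq_lane_low_f16  (float32x4_t r, float16x8_t a,
+;;                                      float16x4_t b, const int lane);
+;;   float32x4_t vfmlalq_laneq_low_f16 (float32x4_t r, float16x8_t a,
+;;                                      float16x8_t b, const int lane);
+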
+(define_expand "aarch64_fml<f16mac1>lq_lane_lowv4sf"
+  [(set (match_operand:V4SF 0 "register_operand")
+       (unspec:V4SF [(match_operand:V4SF 1 "register_operand")
+                     (match_operand:V8HF 2 "register_operand")
+                     (match_operand:V4HF 3 "register_operand")
+                     (match_operand:SI 4 "aarch64_imm2")]
+        VFMLA16_LOW))]
+  "TARGET_F16FML"
+{
+    rtx p1 = aarch64_simd_vect_par_cnst_half (V8HFmode, 8, false);
+    rtx lane = aarch64_endian_lane_rtx (V4HFmode, INTVAL (operands[4]));
+
+    emit_insn (gen_aarch64_simd_fml<f16mac1>lq_lane_lowv4sf (operands[0],
+                                                            operands[1],
+                                                            operands[2],
+                                                            operands[3],
+                                                            p1, lane));
+    DONE;
+})
+
+(define_expand "aarch64_fml<f16mac1>lq_lane_highv4sf"
+  [(set (match_operand:V4SF 0 "register_operand")
+       (unspec:V4SF [(match_operand:V4SF 1 "register_operand")
+                     (match_operand:V8HF 2 "register_operand")
+                     (match_operand:V4HF 3 "register_operand")
+                     (match_operand:SI 4 "aarch64_imm2")]
+        VFMLA16_HIGH))]
+  "TARGET_F16FML"
+{
+    rtx p1 = aarch64_simd_vect_par_cnst_half (V8HFmode, 8, true);
+    rtx lane = aarch64_endian_lane_rtx (V4HFmode, INTVAL (operands[4]));
+
+    emit_insn (gen_aarch64_simd_fml<f16mac1>lq_lane_highv4sf (operands[0],
+                                                             operands[1],
+                                                             operands[2],
+                                                             operands[3],
+                                                             p1, lane));
+    DONE;
+})
+
+(define_insn "aarch64_simd_fmlalq_lane_lowv4sf"
+  [(set (match_operand:V4SF 0 "register_operand" "=w")
+       (fma:V4SF
+        (float_extend:V4SF
+         (vec_select:V4HF
+          (match_operand:V8HF 2 "register_operand" "w")
+          (match_operand:V8HF 4 "vect_par_cnst_lo_half" "")))
+        (float_extend:V4SF
+         (vec_duplicate:V4HF
+          (vec_select:HF
+           (match_operand:V4HF 3 "register_operand" "x")
+           (parallel [(match_operand:SI 5 "aarch64_imm2" "Ui2")]))))
+        (match_operand:V4SF 1 "register_operand" "0")))]
+  "TARGET_F16FML"
+  "fmlal\\t%0.4s, %2.4h, %3.h[%5]"
+  [(set_attr "type" "neon_fp_mul_s")]
+)
+
+(define_insn "aarch64_simd_fmlslq_lane_lowv4sf"
+  [(set (match_operand:V4SF 0 "register_operand" "=w")
+       (fma:V4SF
+        (float_extend:V4SF
+         (neg:V4HF
+          (vec_select:V4HF
+           (match_operand:V8HF 2 "register_operand" "w")
+           (match_operand:V8HF 4 "vect_par_cnst_lo_half" ""))))
+        (float_extend:V4SF
+         (vec_duplicate:V4HF
+          (vec_select:HF
+           (match_operand:V4HF 3 "register_operand" "x")
+           (parallel [(match_operand:SI 5 "aarch64_imm2" "Ui2")]))))
+        (match_operand:V4SF 1 "register_operand" "0")))]
+  "TARGET_F16FML"
+  "fmlsl\\t%0.4s, %2.4h, %3.h[%5]"
+  [(set_attr "type" "neon_fp_mul_s")]
+)
+
+(define_insn "aarch64_simd_fmlalq_lane_highv4sf"
+  [(set (match_operand:V4SF 0 "register_operand" "=w")
+       (fma:V4SF
+        (float_extend:V4SF
+         (vec_select:V4HF
+          (match_operand:V8HF 2 "register_operand" "w")
+          (match_operand:V8HF 4 "vect_par_cnst_hi_half" "")))
+        (float_extend:V4SF
+         (vec_duplicate:V4HF
+          (vec_select:HF
+           (match_operand:V4HF 3 "register_operand" "x")
+           (parallel [(match_operand:SI 5 "aarch64_imm2" "Ui2")]))))
+        (match_operand:V4SF 1 "register_operand" "0")))]
+  "TARGET_F16FML"
+  "fmlal2\\t%0.4s, %2.4h, %3.h[%5]"
+  [(set_attr "type" "neon_fp_mul_s")]
+)
+
+(define_insn "aarch64_simd_fmlslq_lane_highv4sf"
+  [(set (match_operand:V4SF 0 "register_operand" "=w")
+       (fma:V4SF
+        (float_extend:V4SF
+         (neg:V4HF
+          (vec_select:V4HF
+           (match_operand:V8HF 2 "register_operand" "w")
+           (match_operand:V8HF 4 "vect_par_cnst_hi_half" ""))))
+        (float_extend:V4SF
+         (vec_duplicate:V4HF
+          (vec_select:HF
+           (match_operand:V4HF 3 "register_operand" "x")
+           (parallel [(match_operand:SI 5 "aarch64_imm2" "Ui2")]))))
+        (match_operand:V4SF 1 "register_operand" "0")))]
+  "TARGET_F16FML"
+  "fmlsl2\\t%0.4s, %2.4h, %3.h[%5]"
+  [(set_attr "type" "neon_fp_mul_s")]
+)
+
 ;; pmull
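+;; PMULL/PMULL2 perform a 64x64->128-bit carry-less (polynomial) multiply
+;; over GF(2); PMULL2 takes its inputs from the high halves of the V2DI
+;; operands.  A rough C model of the operation (illustration only, using the
+;; GCC __int128 extension):
+;;
+;;   unsigned __int128 r = 0;
+;;   for (int i = 0; i < 64; i++)
+;;     if ((a >> i) & 1)
+;;       r ^= (unsigned __int128) b << i;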
 
 (define_insn "aarch64_crypto_pmulldi"
         (unspec:TI  [(match_operand:DI 1 "register_operand" "w")
                     (match_operand:DI 2 "register_operand" "w")]
                    UNSPEC_PMULL))]
- "TARGET_SIMD && TARGET_CRYPTO"
+ "TARGET_SIMD && TARGET_AES"
  "pmull\\t%0.1q, %1.1d, %2.1d"
   [(set_attr "type" "crypto_pmull")]
 )
        (unspec:TI [(match_operand:V2DI 1 "register_operand" "w")
                   (match_operand:V2DI 2 "register_operand" "w")]
                  UNSPEC_PMULL2))]
-  "TARGET_SIMD && TARGET_CRYPTO"
+  "TARGET_SIMD && TARGET_AES"
   "pmull2\\t%0.1q, %1.2d, %2.2d"
   [(set_attr "type" "crypto_pmull")]
 )
+
+;; Sign- or zero-extend a 64-bit integer vector to a 128-bit vector.
+(define_insn "<optab><Vnarrowq><mode>2"
+  [(set (match_operand:VQN 0 "register_operand" "=w")
+       (ANY_EXTEND:VQN (match_operand:<VNARROWQ> 1 "register_operand" "w")))]
+  "TARGET_SIMD"
+  "<su>xtl\t%0.<Vtype>, %1.<Vntype>"
+  [(set_attr "type" "neon_shift_imm_long")]
+)
+
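+;; This extend pattern provides the standard widening conversion the
+;; vectorizer can use; as a hedged sketch (exact code generation depends on
+;; the target options and costs), a loop such as
+;;
+;;   void widen (int32_t *restrict d, const int16_t *restrict s, int n)
+;;   {
+;;     for (int i = 0; i < n; i++)
+;;       d[i] = s[i];                    /* sign-extend 16 -> 32 bits.  */
+;;   }
+;;
+;; is the kind of code that can end up using sxtl on the vector halves.
+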
+;; Truncate a 128-bit integer vector to a 64-bit vector.
+(define_insn "trunc<mode><Vnarrowq>2"
+  [(set (match_operand:<VNARROWQ> 0 "register_operand" "=w")
+       (truncate:<VNARROWQ> (match_operand:VQN 1 "register_operand" "w")))]
+  "TARGET_SIMD"
+  "xtn\t%0.<Vntype>, %1.<Vtype>"
+  [(set_attr "type" "neon_shift_imm_narrow_q")]
+)
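+
+;; Likewise in the narrowing direction: a conversion loop such as the
+;; following (again only a sketch; the vectorizer chooses the modes) can
+;; map onto the xtn pattern above.
+;;
+;;   void narrow (int16_t *restrict d, const int32_t *restrict s, int n)
+;;   {
+;;     for (int i = 0; i < n; i++)
+;;       d[i] = (int16_t) s[i];          /* truncate 32 -> 16 bits.  */
+;;   }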