[(set_attr "type" "neon_shift_imm_long")]
)
-(define_insn "aarch64_simd_vec_unpack<su>_hi_<mode>"
+(define_insn_and_split "aarch64_simd_vec_unpack<su>_hi_<mode>"
[(set (match_operand:<VWIDE> 0 "register_operand" "=w")
(ANY_EXTEND:<VWIDE> (vec_select:<VHALF>
			       (match_operand:VQW 1 "register_operand" "w")
			       (match_operand:VQW 2 "vect_par_cnst_hi_half" "")
)))]
"TARGET_SIMD"
"<su>xtl2\t%0.<Vwtype>, %1.<Vtype>"
- [(set_attr "type" "neon_shift_imm_long")]
-)
-
-(define_expand "vec_unpacku_hi_<mode>"
- [(match_operand:<VWIDE> 0 "register_operand")
- (match_operand:VQW 1 "register_operand")]
- "TARGET_SIMD"
+ "&& <CODE> == ZERO_EXTEND
+ && aarch64_split_simd_shift_p (insn)"
+ [(const_int 0)]
{
- rtx res = gen_reg_rtx (<MODE>mode);
- rtx tmp = aarch64_gen_shareable_zero (<MODE>mode);
- if (BYTES_BIG_ENDIAN)
- emit_insn (gen_aarch64_zip2<mode> (res, tmp, operands[1]));
- else
- emit_insn (gen_aarch64_zip2<mode> (res, operands[1], tmp));
- emit_move_insn (operands[0],
- simplify_gen_subreg (<VWIDE>mode, res, <MODE>mode, 0));
+ /* On many cores, it is cheaper to implement UXTL2 using a ZIP2 with zero,
+ provided that the cost of the zero can be amortized over several
+ operations. We'll later recombine the zero and zip if there are
+ not sufficient uses of the zero to make the split worthwhile. */
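+    /* For the V16QI variant, for instance, the split replaces
+	  uxtl2	v0.8h, v1.16b
+       with
+	  zip2	v0.16b, v1.16b, vZ.16b
+       where vZ stands for whichever register ends up holding the shared
+       zero (register numbers here are purely illustrative).  */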
+ rtx res = simplify_gen_subreg (<MODE>mode, operands[0], <VWIDE>mode, 0);
+ rtx zero = aarch64_gen_shareable_zero (<MODE>mode);
+ emit_insn (gen_aarch64_zip2<mode> (res, operands[1], zero));
DONE;
}
+ [(set_attr "type" "neon_shift_imm_long")]
)
-(define_expand "vec_unpacks_hi_<mode>"
+(define_expand "vec_unpack<su>_hi_<mode>"
[(match_operand:<VWIDE> 0 "register_operand")
- (match_operand:VQW 1 "register_operand")]
+ (ANY_EXTEND:<VWIDE> (match_operand:VQW 1 "register_operand"))]
"TARGET_SIMD"
{
rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, true);
- emit_insn (gen_aarch64_simd_vec_unpacks_hi_<mode> (operands[0],
- operands[1], p));
+ emit_insn (gen_aarch64_simd_vec_unpack<su>_hi_<mode> (operands[0],
+ operands[1], p));
DONE;
}
)
-(define_expand "vec_unpacku_lo_<mode>"
+(define_expand "vec_unpack<su>_lo_<mode>"
[(match_operand:<VWIDE> 0 "register_operand")
- (match_operand:VQW 1 "register_operand")]
- "TARGET_SIMD"
- {
- rtx res = gen_reg_rtx (<MODE>mode);
- rtx tmp = aarch64_gen_shareable_zero (<MODE>mode);
- if (BYTES_BIG_ENDIAN)
- emit_insn (gen_aarch64_zip1<mode> (res, tmp, operands[1]));
- else
- emit_insn (gen_aarch64_zip1<mode> (res, operands[1], tmp));
- emit_move_insn (operands[0],
- simplify_gen_subreg (<VWIDE>mode, res, <MODE>mode, 0));
- DONE;
- }
-)
-
-(define_expand "vec_unpacks_lo_<mode>"
- [(match_operand:<VWIDE> 0 "register_operand")
- (match_operand:VQW 1 "register_operand")]
+ (ANY_EXTEND:<VWIDE> (match_operand:VQW 1 "register_operand"))]
"TARGET_SIMD"
{
rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, false);
- emit_insn (gen_aarch64_simd_vec_unpacks_lo_<mode> (operands[0],
- operands[1], p));
+ emit_insn (gen_aarch64_simd_vec_unpack<su>_lo_<mode> (operands[0],
+ operands[1], p));
DONE;
}
)
[(set_attr "type" "neon_sub_widen")]
)
-(define_insn "aarch64_usubw<mode>_lo_zip"
- [(set (match_operand:<VWIDE> 0 "register_operand" "=w")
- (minus:<VWIDE>
- (match_operand:<VWIDE> 1 "register_operand" "w")
- (subreg:<VWIDE>
- (unspec:<MODE> [
- (match_operand:VQW 2 "register_operand" "w")
- (match_operand:VQW 3 "aarch64_simd_imm_zero")
- ] UNSPEC_ZIP1) 0)))]
- "TARGET_SIMD"
- "usubw\\t%0.<Vwtype>, %1.<Vwtype>, %2.<Vhalftype>"
- [(set_attr "type" "neon_sub_widen")]
-)
-
-(define_insn "aarch64_uaddw<mode>_lo_zip"
- [(set (match_operand:<VWIDE> 0 "register_operand" "=w")
- (plus:<VWIDE>
- (subreg:<VWIDE>
- (unspec:<MODE> [
- (match_operand:VQW 2 "register_operand" "w")
- (match_operand:VQW 3 "aarch64_simd_imm_zero")
- ] UNSPEC_ZIP1) 0)
- (match_operand:<VWIDE> 1 "register_operand" "w")))]
- "TARGET_SIMD"
- "uaddw\\t%0.<Vwtype>, %1.<Vwtype>, %2.<Vhalftype>"
- [(set_attr "type" "neon_add_widen")]
-)
-
-(define_insn "aarch64_usubw<mode>_hi_zip"
- [(set (match_operand:<VWIDE> 0 "register_operand" "=w")
- (minus:<VWIDE>
- (match_operand:<VWIDE> 1 "register_operand" "w")
- (subreg:<VWIDE>
- (unspec:<MODE> [
- (match_operand:VQW 2 "register_operand" "w")
- (match_operand:VQW 3 "aarch64_simd_imm_zero")
- ] UNSPEC_ZIP2) 0)))]
- "TARGET_SIMD"
- "usubw2\\t%0.<Vwtype>, %1.<Vwtype>, %2.<Vtype>"
- [(set_attr "type" "neon_sub_widen")]
-)
-
-(define_insn "aarch64_uaddw<mode>_hi_zip"
- [(set (match_operand:<VWIDE> 0 "register_operand" "=w")
- (plus:<VWIDE>
- (subreg:<VWIDE>
- (unspec:<MODE> [
- (match_operand:VQW 2 "register_operand" "w")
- (match_operand:VQW 3 "aarch64_simd_imm_zero")
- ] UNSPEC_ZIP2) 0)
- (match_operand:<VWIDE> 1 "register_operand" "w")))]
- "TARGET_SIMD"
- "uaddw2\\t%0.<Vwtype>, %1.<Vwtype>, %2.<Vtype>"
- [(set_attr "type" "neon_add_widen")]
-)
-
(define_insn "aarch64_<ANY_EXTEND:su>addw<mode>"
[(set (match_operand:<VWIDE> 0 "register_operand" "=w")
(plus:<VWIDE>
)
;; Sign- or zero-extend a 64-bit integer vector to a 128-bit vector.
-(define_insn "<optab><Vnarrowq><mode>2"
+(define_insn_and_split "<optab><Vnarrowq><mode>2"
[(set (match_operand:VQN 0 "register_operand" "=w")
(ANY_EXTEND:VQN (match_operand:<VNARROWQ> 1 "register_operand" "w")))]
"TARGET_SIMD"
"<su>xtl\t%0.<Vtype>, %1.<Vntype>"
+ "&& <CODE> == ZERO_EXTEND
+ && aarch64_split_simd_shift_p (insn)"
+ [(const_int 0)]
+ {
+ /* On many cores, it is cheaper to implement UXTL using a ZIP1 with zero,
+ provided that the cost of the zero can be amortized over several
+ operations. We'll later recombine the zero and zip if there are
+ not sufficient uses of the zero to make the split worthwhile. */
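+    /* For the V8QI-to-V8HI case, for instance, the split replaces
+	  uxtl	v0.8h, v1.8b
+       with
+	  zip1	v0.16b, v1.16b, vZ.16b
+       where vZ stands for whichever register ends up holding the shared
+       zero (register numbers here are purely illustrative).  */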
+ rtx res = simplify_gen_subreg (<VNARROWQ2>mode, operands[0],
+ <MODE>mode, 0);
+ rtx zero = aarch64_gen_shareable_zero (<VNARROWQ2>mode);
+ rtx op = lowpart_subreg (<VNARROWQ2>mode, operands[1], <VNARROWQ>mode);
+ emit_insn (gen_aarch64_zip1<Vnarrowq2> (res, op, zero));
+ DONE;
+ }
[(set_attr "type" "neon_shift_imm_long")]
)
== SYMBOL_TINY_ABSOLUTE;
}
+/* Return a function-invariant register that contains VALUE. *CACHED_INSN
+ caches instructions that set up such registers, so that they can be
+ reused by future calls. */
+
+static rtx
+aarch64_get_shareable_reg (rtx_insn **cached_insn, rtx value)
+{
+ rtx_insn *insn = *cached_insn;
+ if (insn && INSN_P (insn) && !insn->deleted ())
+ {
+ rtx pat = PATTERN (insn);
+ if (GET_CODE (pat) == SET)
+ {
+ rtx dest = SET_DEST (pat);
+ if (REG_P (dest)
+ && !HARD_REGISTER_P (dest)
+ && rtx_equal_p (SET_SRC (pat), value))
+ return dest;
+ }
+ }
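+  /* Either no register has been cached yet or the cached insn is no longer
+     usable, so create a fresh pseudo and emit its initialization at the
+     start of the function, where it dominates (and so can be reused by)
+     any later request for the same VALUE.  */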
+ rtx reg = gen_reg_rtx (GET_MODE (value));
+ *cached_insn = emit_insn_before (gen_rtx_SET (reg, value),
+ function_beg_insn);
+ return reg;
+}
+
/* Create a 0 constant that is based on V4SI to allow CSE to optimally share
the constant creation. */
rtx
aarch64_gen_shareable_zero (machine_mode mode)
{
- machine_mode zmode = V4SImode;
- rtx tmp = gen_reg_rtx (zmode);
- emit_move_insn (tmp, CONST0_RTX (zmode));
- return lowpart_subreg (mode, tmp, zmode);
+ rtx reg = aarch64_get_shareable_reg (&cfun->machine->advsimd_zero_insn,
+ CONST0_RTX (V4SImode));
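+  /* Every requested mode shares the single V4SI pseudo; asking for, say,
+     V16QImode just returns (subreg:V16QI (reg:V4SI Z) 0), so normally only
+     one zeroing instruction is emitted per function.  */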
+ return lowpart_subreg (mode, reg, GET_MODE (reg));
+}
+
+/* INSN is some form of extension or shift that can be split into a
+ permutation involving a shared zero. Return true if we should
+ perform such a split.
+
+ ??? For now, make sure that the split instruction executes more
+ frequently than the zero that feeds it. In future it would be good
+ to split without that restriction and instead recombine shared zeros
+ if they turn out not to be worthwhile. This would allow splits in
+ single-block functions and would also cope more naturally with
+ rematerialization. */
+
+bool
+aarch64_split_simd_shift_p (rtx_insn *insn)
+{
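+  /* Requiring the insn's block to be hotter than the entry block roughly
+     limits the split to code inside loops, where the cost of the hoisted
+     zero can be spread over many executions.  */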
+ return (can_create_pseudo_p ()
+ && optimize_bb_for_speed_p (BLOCK_FOR_INSN (insn))
+ && (ENTRY_BLOCK_PTR_FOR_FN (cfun)->count
+ < BLOCK_FOR_INSN (insn)->count));
}
/* Return a const_int vector of VAL. */