[(set_attr "type" "neon_ins<q>, neon_from_gp<q>, neon_load1_one_lane<q>")]
)
+;; Inserting from the zero register into a vector lane is treated as an
+;; expensive GP->FP move on all CPUs. Avoid it when optimizing for speed.
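+;; For example (illustrative only; register numbers are arbitrary and the
+;; tests accept any): zeroing lane 2 of a V8HI can be done either as
+;;   ins v0.h[2], wzr                  (one instruction; good for size)
+;; or as
+;;   movi v31.8h, #0
+;;   ins v0.h[2], v31.h[0]             (no GP->FP transfer; good for speed)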
(define_insn "aarch64_simd_vec_set_zero<mode>"
[(set (match_operand:VALL_F16 0 "register_operand" "=w")
(vec_merge:VALL_F16
(match_operand:VALL_F16 1 "register_operand" "0")
(match_operand:VALL_F16 3 "aarch64_simd_imm_zero" "")
(match_operand:SI 2 "immediate_operand" "i")))]
- "TARGET_SIMD && aarch64_exact_log2_inverse (<nunits>, operands[2]) >= 0"
+ "TARGET_SIMD && aarch64_exact_log2_inverse (<nunits>, operands[2]) >= 0
+ && optimize_function_for_size_p (cfun)"
{
int elt = ENDIAN_LANE_N (<nunits>,
aarch64_exact_log2_inverse (<nunits>,
--- /dev/null
+/* { dg-do compile } */
+/* { dg-additional-options "-O2" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+/* Avoid INS from WZR when optimizing for speed.  */
+
+#include <arm_neon.h>
+
+/*
+** foo:
+** movi? [vdz]([0-9]+)\.?(?:[0-9]*[bhsd])?, #?0
+** ins v0\.h\[2\], v\1\.h\[0\]
+** ret
+*/
+uint16x8_t foo(uint16x8_t a) {
+  a[2] = 0;
+  return a;
+}
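+
+/* A possible companion check (a sketch, not from the original patch; the
+   name foo_os and the attribute-based approach are assumptions): with the
+   per-function optimize("Os") attribute, optimize_function_for_size_p (cfun)
+   is true, so the single-instruction INS from WZR form should still be
+   chosen.  */
+/*
+** foo_os:
+** ins v0\.h\[2\], wzr
+** ret
+*/
+__attribute__((optimize("Os")))
+uint16x8_t foo_os(uint16x8_t a) {
+  a[2] = 0;
+  return a;
+}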
/*
** test_set_lane4:
+** (
** ins v0.b\[6\], wzr
+** |
+** movi? [vdz]([0-9]+)\.?(?:[0-9]*[bhsd])?, #?0
+** ins v0\.b\[6\], v\1\.b\[0\]
+** )
** ret
*/
mfloat8x8_t test_set_lane4(mfloat8x8_t a)
/*
** test_setq_lane4:
+** (
** ins v0.b\[14\], wzr
+** |
+** movi? [vdz]([0-9]+)\.?(?:[0-9]*[bhsd])?, #?0
+** ins v0\.b\[14\], v\1\.b\[0\]
+** )
** ret
*/
mfloat8x16_t test_setq_lane4(mfloat8x16_t a)