char *aarch64_output_simd_xor_imm (rtx, unsigned);
char *aarch64_output_fmov (rtx);
+char *aarch64_output_simd_mov_imm_low (rtx *);
char *aarch64_output_sve_mov_immediate (rtx);
char *aarch64_output_sve_ptrues (rtx);
+bool aarch64_const_vec_fmov_p (rtx);
bool aarch64_pad_reg_upward (machine_mode, const_tree, bool);
bool aarch64_regno_ok_for_base_p (int, bool);
bool aarch64_regno_ok_for_index_p (int, bool);
}
else if (!aarch64_simd_imm_zero (operands[1], <MODE>mode)
&& !aarch64_simd_special_constant_p (operands[1], <MODE>mode)
- && !aarch64_simd_valid_mov_imm (operands[1]))
+ && !aarch64_simd_valid_mov_imm (operands[1])
+ && !aarch64_const_vec_fmov_p (operands[1]))
{
rtx x;
/* Expand into VDUP. */
[?r, w ; neon_to_gp<q> , * , *] fmov\t%x0, %d1
[?w, r ; f_mcr , * , *] fmov\t%d0, %1
[?r, r ; mov_reg , * , *] mov\t%0, %1
+ [w , Dc; fmov , * , *] << aarch64_output_simd_mov_imm_low (operands);
[w , Dn; neon_move<q> , simd , *] << aarch64_output_simd_mov_imm (operands[1], 64);
[w , Dz; f_mcr , * , *] fmov\t%d0, xzr
[w , Dx; neon_move , simd , 8] #
[?r , w ; multiple , * , 8] #
[?w , r ; multiple , * , 8] #
[?r , r ; multiple , * , 8] #
+ [w , Dc; fmov , * , 4] << aarch64_output_simd_mov_imm_low (operands);
[w , Dn; neon_move<q> , simd, 4] << aarch64_output_simd_mov_imm (operands[1], 128);
[w , Dz; fmov , * , 4] fmov\t%d0, xzr
[w , Dx; neon_move , simd, 8] #
return aarch64_simd_valid_imm (op, NULL, AARCH64_CHECK_MOV);
}
+
+/* Return true if OP is an FP constant vector in which the low register
+   element can be materialized using FMOV and all other elements are zero.
+
+   The low register element is element 0 of OP for little-endian and the
+   last element of OP for big-endian.  Only HFmode (and then only with
+   TARGET_FP_F16INST), SFmode and DFmode elements are accepted.  Vectors
+   whose number of elements is not a compile-time constant are rejected.  */
+bool
+aarch64_const_vec_fmov_p (rtx op)
+{
+  if (!CONST_VECTOR_P (op))
+    return false;
+
+  machine_mode mode = GET_MODE (op);
+  scalar_mode inner_mode = GET_MODE_INNER (mode);
+
+  /* Only HF, SF and DF elements are handled, and HF additionally needs
+     TARGET_FP_F16INST.  */
+  if (inner_mode == E_HFmode)
+    {
+      if (!TARGET_FP_F16INST)
+        return false;
+    }
+  else if (inner_mode != E_SFmode && inner_mode != E_DFmode)
+    return false;
+
+  /* Reject variable-length vectors here instead of letting to_constant ()
+     assert on them.  */
+  unsigned int nunits;
+  if (!GET_MODE_NUNITS (mode).is_constant (&nunits))
+    return false;
+
+  /* Index within OP of the element that lands in the low register lane.  */
+  unsigned int const_idx = BYTES_BIG_ENDIAN ? nunits - 1 : 0;
+
+  rtx elt = CONST_VECTOR_ELT (op, const_idx);
+  if (!CONST_DOUBLE_P (elt))
+    return false;
+
+  REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
+  if (!aarch64_real_float_const_representable_p (r))
+    return false;
+
+  /* Every other element must be exactly +0.0; rtx_equal_p against
+     CONST0_RTX rejects anything else.  */
+  rtx zero = CONST0_RTX (inner_mode);
+  for (unsigned int i = 0; i < nunits; ++i)
+    if (i != const_idx && !rtx_equal_p (CONST_VECTOR_ELT (op, i), zero))
+      return false;
+
+  return true;
+}
+
+/* Output a move of an FP constant vector in which the low register element is
+   materialized using FMOV and all other elements are zero.
+
+   OPERANDS[0] is the destination and OPERANDS[1] is the constant vector.
+   The element of OPERANDS[1] that occupies the low register lane (element 0
+   for little-endian, the last element for big-endian) is printed as the
+   FMOV source.  The instruction is emitted here with output_asm_insn, so
+   the returned template is always empty.  */
+char *
+aarch64_output_simd_mov_imm_low (rtx *operands)
+{
+  machine_mode mode = GET_MODE (operands[1]);
+  unsigned int nunits = GET_MODE_NUNITS (mode).to_constant ();
+  unsigned int const_idx = BYTES_BIG_ENDIAN ? nunits - 1 : 0;
+
+  /* Substitute the scalar element for the vector so that "%1" prints the
+     element rather than the whole constant vector.  */
+  rtx xop[2] = { operands[0], CONST_VECTOR_ELT (operands[1], const_idx) };
+
+  /* Pick the destination register view that matches the element size.  */
+  const char *insn_templ;
+  switch (GET_MODE_INNER (mode))
+    {
+    case E_HFmode:
+      insn_templ = "fmov\t%h0, %1";
+      break;
+
+    case E_SFmode:
+      insn_templ = "fmov\t%s0, %1";
+      break;
+
+    case E_DFmode:
+      insn_templ = "fmov\t%d0, %1";
+      break;
+
+    default:
+      gcc_unreachable ();
+    }
+
+  output_asm_insn (insn_templ, xop);
+
+  /* Nothing is left for the caller to print.  The empty template lives in
+     writable storage because the return type is 'char *', and a string
+     literal does not convert to 'char *' in ISO C++.  */
+  static char empty_templ[] = "";
+  return empty_templ;
+}
+
/* Return true if OP is a valid SIMD orr immediate for SVE or AdvSIMD. */
bool
aarch64_simd_valid_orr_imm (rtx op)
(and (match_code "const_vector")
(match_test "aarch64_simd_valid_xor_imm (op)")))
+;; "Dc" accepts a const_vector for which aarch64_const_vec_fmov_p returns
+;; true.
+(define_constraint "Dc"
+ "@internal
+ A constraint that matches an FP constant vector in which the low register
+ element can be materialized using FMOV and all other elements are zero."
+ (and (match_code "const_vector")
+ (match_test "aarch64_const_vec_fmov_p (op)")))
+
(define_constraint "Dn"
"@internal
A constraint that matches vector of immediates."
--- /dev/null
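+/* { dg-do compile } */
+/* { dg-require-effective-target aarch64_little_endian } */
+/* { dg-options "-O2" } */
+/* { dg-additional-options "-march=armv8-a+fp16" } */
+
+/* Check that FP vector constants with only the low element nonzero are
+   materialized with scalar FMOV rather than a literal pool load.
+
+   The initializers below put the nonzero value in element 0, which is the
+   low register element only for little-endian, so the test is restricted
+   to little-endian targets.
+
+   PR target/113856.  */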
+
+typedef float vect64_float __attribute__((vector_size(8)));
+typedef float vect128_float __attribute__((vector_size(16)));
+typedef _Float16 vect64_half __attribute__((vector_size(8)));
+typedef _Float16 vect128_half __attribute__((vector_size(16)));
+typedef double vect128_double __attribute__((vector_size(16)));
+
+/* 64-bit float vector: element 0 is 1.0, element 1 is zero.  */
+vect64_float
+f1 (void)
+{
+ return (vect64_float) { 1.0f, 0.0f };
+}
+
+/* Existing duplicated-lane case. */
+vect64_float
+f2 (void)
+{
+ return (vect64_float) { 1.0f, 1.0f };
+}
+
+/* 128-bit float vector: element 0 is 1.0, elements 1-3 are zero.  */
+vect128_float
+f3 (void)
+{
+ return (vect128_float) { 1.0f, 0.0f, 0.0f, 0.0f };
+}
+
+/* 64-bit _Float16 vector: element 0 is 1.0, elements 1-3 are zero.  */
+vect64_half
+f4 (void)
+{
+ return (vect64_half) { (_Float16) 1.0, (_Float16) 0.0,
+ (_Float16) 0.0, (_Float16) 0.0 };
+}
+
+/* 128-bit _Float16 vector: element 0 is 1.0, elements 1-7 are zero.  */
+vect128_half
+f5 (void)
+{
+ return (vect128_half) { (_Float16) 1.0, (_Float16) 0.0,
+ (_Float16) 0.0, (_Float16) 0.0,
+ (_Float16) 0.0, (_Float16) 0.0,
+ (_Float16) 0.0, (_Float16) 0.0 };
+}
+
+/* 128-bit double vector: element 0 is 1.0, element 1 is zero.  */
+vect128_double
+f6 (void)
+{
+ return (vect128_double) { 1.0, 0.0 };
+}
+
+/* f1 and f3: scalar FMOV into the low SF element. */
+/* { dg-final { scan-assembler-times {\tfmov\ts[0-9]+, 1\.0} 2 } } */
+
+/* f2: existing vector duplicated-FMOV case. */
+/* { dg-final { scan-assembler-times {\tfmov\tv[0-9]+\.2s, 1\.0} 1 } } */
+
+/* f4 and f5: scalar FMOV into the low HF element. */
+/* { dg-final { scan-assembler-times {\tfmov\th[0-9]+, 1\.0} 2 } } */
+
+/* f6: scalar FMOV into the low DF element. */
+/* { dg-final { scan-assembler-times {\tfmov\td[0-9]+, 1\.0} 1 } } */
+
+/* None of them should need a literal pool load; a 64-bit vector would be
+   loaded into a D register and a 128-bit vector into a Q register.  */
+/* { dg-final { scan-assembler-not {\tldr\t[dq][0-9]+,} } } */