[PATCH] [AArch64]: Use fmov for some low-lane FP SIMD constant vectors

author Naveen <naveen.siddegowda@oss.qualcomm.com>

Mon, 11 May 2026 05:45:41 +0000 (22:45 -0700)

committer Naveen <naveen.siddegowda@oss.qualcomm.com>

Mon, 11 May 2026 05:49:24 +0000 (22:49 -0700)
author Naveen <naveen.siddegowda@oss.qualcomm.com>
Mon, 11 May 2026 05:45:41 +0000 (22:45 -0700)
committer Naveen <naveen.siddegowda@oss.qualcomm.com>
Mon, 11 May 2026 05:49:24 +0000 (22:49 -0700)
diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h

index 16b58f39a97e1f170c2629c55397f0048f22d18f..0798546809de32557c04954d6e4ecd87981b9e55 100644 (file)
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -952,8 +952,10 @@ char *aarch64_output_simd_and_imm (rtx, unsigned);
  char *aarch64_output_simd_xor_imm (rtx, unsigned);
  char *aarch64_output_fmov (rtx);
  
+char *aarch64_output_simd_mov_imm_low (rtx *);
  char *aarch64_output_sve_mov_immediate (rtx);
  char *aarch64_output_sve_ptrues (rtx);
+bool aarch64_const_vec_fmov_p (rtx);
  bool aarch64_pad_reg_upward (machine_mode, const_tree, bool);
  bool aarch64_regno_ok_for_base_p (int, bool);
  bool aarch64_regno_ok_for_index_p (int, bool);
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md

index c314e85927d3824ad8efc61dadd9b6eedf113433..2e142b1e1ee763dd613b268454e7795979141ed9 100644 (file)
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -79,7 +79,8 @@
         }
        else if (!aarch64_simd_imm_zero (operands[1], <MODE>mode)
                && !aarch64_simd_special_constant_p (operands[1], <MODE>mode)
-              && !aarch64_simd_valid_mov_imm (operands[1]))
+              && !aarch64_simd_valid_mov_imm (operands[1])
+              && !aarch64_const_vec_fmov_p (operands[1]))
         {
           rtx x;
           /* Expand into VDUP.  */
@@ -183,6 +184,7 @@
       [?r, w ; neon_to_gp<q>      , *        , *] fmov\t%x0, %d1
       [?w, r ; f_mcr              , *        , *] fmov\t%d0, %1
       [?r, r ; mov_reg            , *        , *] mov\t%0, %1
+     [w , Dc; fmov               , *        , *] << aarch64_output_simd_mov_imm_low (operands);
       [w , Dn; neon_move<q>       , simd     , *] << aarch64_output_simd_mov_imm (operands[1], 64);
       [w , Dz; f_mcr              , *        , *] fmov\t%d0, xzr
       [w , Dx; neon_move          , simd     , 8] #
@@ -212,6 +214,7 @@
       [?r , w ; multiple           , *   , 8] #
       [?w , r ; multiple           , *   , 8] #
       [?r , r ; multiple           , *   , 8] #
+     [w  , Dc; fmov               , *   , 4] << aarch64_output_simd_mov_imm_low (operands);
       [w  , Dn; neon_move<q>       , simd, 4] << aarch64_output_simd_mov_imm (operands[1], 128);
       [w  , Dz; fmov               , *   , 4] fmov\t%d0, xzr
       [w  , Dx; neon_move          , simd, 8] #
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc

index 3816df92b185ae5232e9a204f7af1a2f0879032c..619c2a6d2265aba2db837559646fbf51e571c00e 100644 (file)
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -24604,6 +24604,85 @@ aarch64_simd_valid_mov_imm (rtx op)
    return aarch64_simd_valid_imm (op, NULL, AARCH64_CHECK_MOV);
  }
  
+
+/* Return true if OP is an FP constant vector in which the low register
+   element can be materialized using FMOV and all other elements are zero.  */
+bool
+aarch64_const_vec_fmov_p (rtx op)
+{
+  if (!CONST_VECTOR_P (op))
+    return false;
+
+  machine_mode mode = GET_MODE (op);
+  scalar_mode inner_mode = GET_MODE_INNER (mode);
+
+  if (inner_mode != E_HFmode
+      && inner_mode != E_SFmode
+      && inner_mode != E_DFmode)
+    return false;
+
+  if (inner_mode == E_HFmode && !TARGET_FP_F16INST)
+    return false;
+
+  unsigned int nunits = GET_MODE_NUNITS (mode).to_constant ();
+  unsigned int const_idx = BYTES_BIG_ENDIAN ? nunits - 1 : 0;
+
+  rtx elt = CONST_VECTOR_ELT (op, const_idx);
+  if (!CONST_DOUBLE_P (elt))
+    return false;
+
+  REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
+  if (!aarch64_real_float_const_representable_p (r))
+    return false;
+
+  for (unsigned int i = 0; i < nunits; ++i)
+    {
+      if (i == const_idx)
+       continue;
+
+      rtx x = CONST_VECTOR_ELT (op, i);
+      if (!rtx_equal_p (x, CONST0_RTX (inner_mode)))
+       return false;
+    }
+
+  return true;
+}
+
+/* Output a move of an FP constant vector in which the low register element is
+   materialized using FMOV and all other elements are zero.  */
+char *
+aarch64_output_simd_mov_imm_low (rtx *operands)
+{
+  machine_mode mode = GET_MODE (operands[1]);
+  scalar_mode inner_mode = GET_MODE_INNER (mode);
+  unsigned int nunits = GET_MODE_NUNITS (mode).to_constant ();
+  unsigned int const_idx = BYTES_BIG_ENDIAN ? nunits - 1 : 0;
+  rtx elt = CONST_VECTOR_ELT (operands[1], const_idx);
+  rtx xop[2];
+
+  xop[0] = operands[0];
+  xop[1] = elt;
+
+  switch (inner_mode)
+    {
+      case E_HFmode:
+       output_asm_insn ("fmov\t%h0, %1", xop);
+       break;
+
+      case E_SFmode:
+       output_asm_insn ("fmov\t%s0, %1", xop);
+       break;
+
+      case E_DFmode:
+       output_asm_insn ("fmov\t%d0, %1", xop);
+       break;
+
+      default:
+       gcc_unreachable ();
+    }
+  return "";
+}
+
  /* Return true if OP is a valid SIMD orr immediate for SVE or AdvSIMD.  */
  bool
  aarch64_simd_valid_orr_imm (rtx op)
diff --git a/gcc/config/aarch64/constraints.md b/gcc/config/aarch64/constraints.md

index 3d166fe3a1765f39e289ccf07b61687aab0d274c..8760220835b74e87497a696865a5ea7304b9d406 100644 (file)
--- a/gcc/config/aarch64/constraints.md
+++ b/gcc/config/aarch64/constraints.md
@@ -503,6 +503,13 @@
   (and (match_code "const_vector")
        (match_test "aarch64_simd_valid_xor_imm (op)")))
  
+(define_constraint "Dc"
+ "@internal
+  A constraint that matches an FP constant vector in which the low register
+  element can be materialized using FMOV and all other elements are zero."
+ (and (match_code "const_vector")  ;; Only CONST_VECTOR rtxes are considered.
+      (match_test "aarch64_const_vec_fmov_p (op)")))  ;; Value check done in C.
+
  (define_constraint "Dn"
    "@internal
   A constraint that matches vector of immediates."
diff --git a/gcc/testsuite/gcc.target/aarch64/pr113856.c b/gcc/testsuite/gcc.target/aarch64/pr113856.c

new file mode 100644 (file)

index 0000000..f0facbc
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr113856.c
@@ -0,0 +1,70 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+/* { dg-additional-options "-march=armv8-a+fp16" } */
+
+/* Check that FP vector constants with only the low element nonzero are
+   materialized with scalar FMOV rather than a literal pool load.
+
+   PR target/113856.  */
+
+typedef float vect64_float __attribute__((vector_size(8)));
+typedef float vect128_float __attribute__((vector_size(16)));
+typedef _Float16 vect64_half __attribute__((vector_size(8)));
+typedef _Float16 vect128_half __attribute__((vector_size(16)));
+typedef double vect128_double __attribute__((vector_size(16)));
+
+vect64_float
+f1 (void)
+{
+  return (vect64_float) { 1.0f, 0.0f };
+}
+
+/* Existing duplicated-lane case.  */
+vect64_float
+f2 (void)
+{
+  return (vect64_float) { 1.0f, 1.0f };
+}
+
+vect128_float
+f3 (void)
+{
+  return (vect128_float) { 1.0f, 0.0f, 0.0f, 0.0f };
+}
+
+vect64_half
+f4 (void)
+{
+  return (vect64_half) { (_Float16) 1.0, (_Float16) 0.0,
+                         (_Float16) 0.0, (_Float16) 0.0 };
+}
+
+vect128_half
+f5 (void)
+{
+  return (vect128_half) { (_Float16) 1.0, (_Float16) 0.0,
+                          (_Float16) 0.0, (_Float16) 0.0,
+                          (_Float16) 0.0, (_Float16) 0.0,
+                          (_Float16) 0.0, (_Float16) 0.0 };
+}
+
+vect128_double
+f6 (void)
+{
+  return (vect128_double) { 1.0, 0.0 };
+}
+
+/* f1 and f3: scalar FMOV into the low SF element.  */
+/* { dg-final { scan-assembler-times {\tfmov\ts[0-9]+, 1\.0} 2 } } */
+
+/* f2: existing vector duplicated-FMOV case.  */
+/* { dg-final { scan-assembler-times {\tfmov\tv[0-9]+\.2s, 1\.0} 1 } } */
+
+/* f4 and f5: scalar FMOV into the low HF element.  */
+/* { dg-final { scan-assembler-times {\tfmov\th[0-9]+, 1\.0} 2 } } */
+
+/* f6: scalar FMOV into the low DF element.  */
+/* { dg-final { scan-assembler-times {\tfmov\td[0-9]+, 1\.0} 1 } } */
+
+/* None of them should need a literal pool load (ldr into Dn or Qn).  */
+/* { dg-final { scan-assembler-not {\tldr\t[dq][0-9]+,} } } */
author	Naveen <naveen.siddegowda@oss.qualcomm.com>
	Mon, 11 May 2026 05:45:41 +0000 (22:45 -0700)
committer	Naveen <naveen.siddegowda@oss.qualcomm.com>
	Mon, 11 May 2026 05:49:24 +0000 (22:49 -0700)
gcc/config/aarch64/aarch64-protos.h		patch \| blob \| blame \| history
gcc/config/aarch64/aarch64-simd.md		patch \| blob \| blame \| history
gcc/config/aarch64/aarch64.cc		patch \| blob \| blame \| history
gcc/config/aarch64/constraints.md		patch \| blob \| blame \| history
gcc/testsuite/gcc.target/aarch64/pr113856.c	[new file with mode: 0644]	patch \| blob