GCC 15 is the first release to support FP8 intrinsics.
The underlying instructions depend on the value of a new register,
FPMR. Unlike FPCR, FPMR is a normal call-clobbered/caller-save
register rather than a global register. So:
- The FP8 intrinsics take a final uint64_t argument that
specifies what value FPMR should have.
- If an FP8 operation is split across multiple functions,
it is likely that those functions would have a similar argument.
If the object code has the structure:
for (...)
fp8_kernel (..., fpmr_value);
then fp8_kernel would set FPMR to fpmr_value each time it is
called, even though FPMR will already have that value for at
least the second and subsequent calls (and possibly the first).
The working assumption for the ABI has been that writes to
registers like FPMR can in general be more expensive than
reads and so it would be better to use a conditional write like:
mrs tmp, fpmr
cmp tmp, <value>
beq 1f
msr fpmr, <value>
1:
instead of writing the same value to FPMR repeatedly.
This patch implements that. It also adds a tuning flag that suppresses
the behaviour, both to make testing easier and to support any future
cores that (for example) are able to rename FPMR.
Hopefully this really is the last part of the FP8 enablement.
gcc/
* config/aarch64/aarch64-tuning-flags.def
(AARCH64_EXTRA_TUNE_CHEAP_FPMR_WRITE): New tuning flag.
* config/aarch64/aarch64.h (TARGET_CHEAP_FPMR_WRITE): New macro.
* config/aarch64/aarch64.md: Split moves into FPMR into a test
of the current value and a branch around the write.
(aarch64_write_fpmr): New pattern.
gcc/testsuite/
* g++.target/aarch64/sve2/acle/aarch64-sve2-acle-asm.exp: Add
cheap_fpmr_write by default.
* gcc.target/aarch64/sve2/acle/aarch64-sve2-acle-asm.exp: Likewise.
* gcc.target/aarch64/acle/fp8.c: Add cheap_fpmr_write.
* gcc.target/aarch64/acle/fpmr-2.c: Likewise.
* gcc.target/aarch64/simd/vcvt_fpm.c: Likewise.
* gcc.target/aarch64/simd/vdot2_fpm.c: Likewise.
* gcc.target/aarch64/simd/vdot4_fpm.c: Likewise.
* gcc.target/aarch64/simd/vmla_fpm.c: Likewise.
* gcc.target/aarch64/acle/fpmr-6.c: New test.
rather than re-use an input predicate register. */
AARCH64_EXTRA_TUNING_OPTION ("avoid_pred_rmw", AVOID_PRED_RMW)
+/* Whether writes to the FPMR are cheap enough that:
+
+ msr fpmr, <value>
+
+ is better than:
+
+ mrs tmp, fpmr
+ cmp tmp, <value>
+ beq 1f
+ msr fpmr, <value>
+ 1:
+
+ even when the branch is predictably taken.
+
+ Cores that are able to rename FPMR (for example) may want to set this
+ flag.  Note that it is not included in AARCH64_EXTRA_TUNE_BASE below,
+ so the conditional-write sequence is the default behaviour. */
+AARCH64_EXTRA_TUNING_OPTION ("cheap_fpmr_write", CHEAP_FPMR_WRITE)
+
/* Baseline tuning settings suitable for all modern cores. */
#define AARCH64_EXTRA_TUNE_BASE (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND \
| AARCH64_EXTRA_TUNE_FULLY_PIPELINED_FMA)
/* fp8 instructions are enabled through +fp8. */
#define TARGET_FP8 AARCH64_HAVE_ISA (FP8)
+/* True if the current tuning considers FPMR writes cheap enough that
+   an unconditional "msr fpmr, <value>" is better than testing the
+   current value and branching around the write.  See the comment above
+   cheap_fpmr_write in aarch64-tuning-flags.def for details. */
+#define TARGET_CHEAP_FPMR_WRITE \
+ (bool (aarch64_tune_params.extra_tuning_flags \
+ & AARCH64_EXTRA_TUNE_CHEAP_FPMR_WRITE))
+
/* Combinatorial tests. */
#define TARGET_SVE2_OR_SME2 \
UNSPEC_UPDATE_FFRT
UNSPEC_RDFFR
UNSPEC_WRFFR
+ UNSPEC_WRITE_FPMR
UNSPEC_SYSREG_RDI
UNSPEC_SYSREG_RTI
UNSPEC_SYSREG_WDI
}
)
+;; The preferred way of writing to the FPMR is to test whether it already
+;; has the desired value and branch around the write if so. This reduces
+;; the number of redundant FPMR writes caused by ABI boundaries, such as in:
+;;
+;; for (...)
+;; fp8_kernel (..., fpmr_value);
+;;
+;; Without this optimization, fp8_kernel would set FPMR to fpmr_value each
+;; time that it is called.
+;;
+;; We do this as a split so that hardreg_pre can optimize the moves first.
+;; The can_create_pseudo_p condition restricts the split to before register
+;; allocation, since copy_to_reg below needs a fresh pseudo to hold the
+;; current FPMR value.
+(define_split
+  [(set (reg:DI FPM_REGNUM)
+	(match_operand:DI 0 "aarch64_reg_or_zero"))]
+  "TARGET_FP8 && !TARGET_CHEAP_FPMR_WRITE && can_create_pseudo_p ()"
+  [(const_int 0)]
+  {
+    auto label = gen_label_rtx ();
+    rtx current = copy_to_reg (gen_rtx_REG (DImode, FPM_REGNUM));
+    rtx cond = gen_rtx_EQ (VOIDmode, current, operands[0]);
+    emit_jump_insn (gen_cbranchdi4 (cond, current, operands[0], label));
+    emit_insn (gen_aarch64_write_fpmr (operands[0]));
+    emit_label (label);
+    DONE;
+  }
+)
+
+
+;; A write to the FPMR that is already protected by a conditional branch.
+;; Since this instruction is introduced late, it shouldn't matter too much
+;; that we're using an unspec for a move.
+;; The "rZ" constraint also accepts a zero operand, which is emitted as
+;; "msr fpmr, xzr".
+(define_insn "aarch64_write_fpmr"
+  [(set (reg:DI FPM_REGNUM)
+	(unspec:DI [(match_operand:DI 0 "aarch64_reg_or_zero" "rZ")]
+		   UNSPEC_WRITE_FPMR))]
+  "TARGET_FP8"
+  "msr\tfpmr, %x0"
+)
+
(define_expand "aarch64_cpymemdi"
[(parallel
[(set (match_operand 2) (const_int 0))
# Turn off any codegen tweaks by default that may affect expected assembly.
# Tests relying on those should turn them on explicitly.
-set sve2_flags "$sve2_flags -mtune=generic -moverride=tune=none"
+set sve2_flags "$sve2_flags -mtune=generic -moverride=tune=none -moverride=tune=cheap_fpmr_write"
set gcc_subdir [string replace $subdir 0 2 gcc]
lappend extra_flags "-fno-ipa-icf" "-I$srcdir/$gcc_subdir/../../sve/acle/asm"
/* Test the fp8 ACLE intrinsics family. */
/* { dg-do compile } */
-/* { dg-options "-O1 -march=armv8-a" } */
+/* { dg-options "-O1 -march=armv8-a -moverride=tune=cheap_fpmr_write" } */
/* { dg-final { check-function-bodies "**" "" "" } } */
#include <arm_acle.h>
/* { dg-do compile } */
-/* { dg-options "-O1 -march=armv8-a+fp8fma" } */
+/* { dg-options "-O1 -march=armv8-a+fp8fma -moverride=tune=cheap_fpmr_write" } */
#include <arm_neon.h>
--- /dev/null
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 -march=armv9-a+fp8dot4 -moverride=tune=none" } */
+/* { dg-final { check-function-bodies "**" "" "" { target *-*-* } {\.L[0-9]+} } } */
+
+#include "arm_neon.h"
+
+/* The FPMR value "d" is only known at run time, so the move into FPMR
+   should be split into a read of the current value, a compare, and a
+   branch around the write.  */
+/*
+** f1:
+** mrs (x[0-9]+), fpmr
+** cmp \1, x0
+** beq ([^\n]+)
+** msr fpmr, x0
+** ?\2:
+** fdot v0.2s, v1.8b, v2.8b
+** ret
+*/
+float32x2_t
+f1 (float32x2_t a, mfloat8x8_t b, mfloat8x8_t c, fpm_t d)
+{
+  return vdot_f32_mf8_fpm (a, b, c, d);
+}
+
+/* A constant zero FPMR value can be tested with a single cbz and
+   written from xzr.  */
+/*
+** f2:
+** mrs (x[0-9]+), fpmr
+** cbz \1, ([^\n]+)
+** msr fpmr, xzr
+** ?\2:
+** fdot v0.2s, v1.8b, v2.8b
+** ret
+*/
+float32x2_t
+f2 (float32x2_t a, mfloat8x8_t b, mfloat8x8_t c)
+{
+  return vdot_f32_mf8_fpm (a, b, c, 0);
+}
/* { dg-do compile } */
-/* { dg-additional-options "-O3 -march=armv9-a+fp8" } */
+/* { dg-additional-options "-O3 -march=armv9-a+fp8 -moverride=tune=cheap_fpmr_write" } */
/* { dg-final { check-function-bodies "**" "" } } */
#include "arm_neon.h"
/* { dg-do compile } */
-/* { dg-additional-options "-O3 -march=armv9-a+fp8dot2" } */
+/* { dg-additional-options "-O3 -march=armv9-a+fp8dot2 -moverride=tune=cheap_fpmr_write" } */
/* { dg-final { check-function-bodies "**" "" } } */
#include "arm_neon.h"
/* { dg-do compile } */
-/* { dg-additional-options "-O3 -march=armv9-a+fp8dot4" } */
+/* { dg-additional-options "-O3 -march=armv9-a+fp8dot4 -moverride=tune=cheap_fpmr_write" } */
/* { dg-final { check-function-bodies "**" "" } } */
#include "arm_neon.h"
/* { dg-do compile } */
-/* { dg-additional-options "-O3 -march=armv9-a+fp8fma" } */
+/* { dg-additional-options "-O3 -march=armv9-a+fp8fma -moverride=tune=cheap_fpmr_write" } */
/* { dg-final { check-function-bodies "**" "" } } */
#include "arm_neon.h"
# Turn off any codegen tweaks by default that may affect expected assembly.
# Tests relying on those should turn them on explicitly.
-set sve2_flags "$sve2_flags -mtune=generic -moverride=tune=none"
+set sve2_flags "$sve2_flags -mtune=generic -moverride=tune=none -moverride=tune=cheap_fpmr_write"
lappend extra_flags "-fno-ipa-icf"