]> git.ipfire.org Git - thirdparty/gcc.git/commitdiff
aarch64: Enable selective LDAPUR generation for cores with RCPC2
authorSoumya AR <soumyaa@nvidia.com>
Tue, 15 Jul 2025 13:58:44 +0000 (19:28 +0530)
committerSoumya AR <soumyaa@nvidia.com>
Tue, 15 Jul 2025 14:13:38 +0000 (19:43 +0530)
This patch adds the ability to fold the address computation into the addressing
mode for LDAPR instructions using LDAPUR when RCPC2 is available.

LDAPUR emission is enabled by default when RCPC2 is available, but can be
disabled using the avoid_ldapur tune flag on a per-core basis.

Currently, it is disabled for neoverse-v2, neoverse-v3, neoverse-v3-ae,
cortex-x925, and architectures before armv8.8-a.

Earlier, the following code:

uint64_t
foo (std::atomic<uint64_t> *x)
{
  return x[1].load(std::memory_order_acquire);
}

would generate:

foo(std::atomic<unsigned long>*):
add     x0, x0, 8
ldapr   x0, [x0]
ret

but now generates:

foo(std::atomic<unsigned long>*):
ldapur  x0, [x0, 8]
ret

The patch was bootstrapped and regtested on aarch64-linux-gnu, no regression.
OK for mainline?

Signed-off-by: Soumya AR <soumyaa@nvidia.com>
gcc/ChangeLog:

* config/aarch64/aarch64-tuning-flags.def (AARCH64_EXTRA_TUNING_OPTION):
Add AVOID_LDAPUR tuning flag.
* config/aarch64/aarch64.cc (aarch64_adjust_generic_arch_tuning):
Set AVOID_LDAPUR for architectures before armv8.8-a.
(aarch64_override_options_internal): Apply generic tuning adjustments
to generic_armv8_a_tunings and generic_armv9_a_tunings.
* config/aarch64/aarch64.h (TARGET_ENABLE_LDAPUR): New macro to
control LDAPUR usage based on RCPC2 and tuning flags.
* config/aarch64/aarch64.md: Add enable_ldapur attribute.
* config/aarch64/atomics.md (aarch64_atomic_load<mode>_rcpc): Modify
to emit LDAPUR for cores with RCPC2.
(*aarch64_atomic_load<ALLX:mode>_rcpc_zext): Likewise.
(*aarch64_atomic_load<ALLX:mode>_rcpc_sext): Update constraint to Ust.
* config/aarch64/tuning_models/cortexx925.h: Add AVOID_LDAPUR flag.
* config/aarch64/tuning_models/neoversev2.h: Likewise.
* config/aarch64/tuning_models/neoversev3.h: Likewise.
* config/aarch64/tuning_models/neoversev3ae.h: Likewise.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/ldapr-sext.c: Update expected output to include
offsets.
* gcc.target/aarch64/ldapur.c: New test for LDAPUR.
* gcc.target/aarch64/ldapur_avoid.c: New test for AVOID_LDAPUR flag.

12 files changed:
gcc/config/aarch64/aarch64-tuning-flags.def
gcc/config/aarch64/aarch64.cc
gcc/config/aarch64/aarch64.h
gcc/config/aarch64/aarch64.md
gcc/config/aarch64/atomics.md
gcc/config/aarch64/tuning_models/cortexx925.h
gcc/config/aarch64/tuning_models/neoversev2.h
gcc/config/aarch64/tuning_models/neoversev3.h
gcc/config/aarch64/tuning_models/neoversev3ae.h
gcc/testsuite/gcc.target/aarch64/ldapr-sext.c
gcc/testsuite/gcc.target/aarch64/ldapur.c [new file with mode: 0644]
gcc/testsuite/gcc.target/aarch64/ldapur_avoid.c [new file with mode: 0644]

index f2c916e9d770e24c6248c6bc7bd190e5e4d12396..dd91324e9c8063c6aa4b032838426ee499604eef 100644 (file)
@@ -44,6 +44,8 @@ AARCH64_EXTRA_TUNING_OPTION ("avoid_cross_loop_fma", AVOID_CROSS_LOOP_FMA)
 
 AARCH64_EXTRA_TUNING_OPTION ("fully_pipelined_fma", FULLY_PIPELINED_FMA)
 
+AARCH64_EXTRA_TUNING_OPTION ("avoid_ldapur", AVOID_LDAPUR)
+
 /* Enable is the target prefers to use a fresh register for predicate outputs
    rather than re-use an input predicate register.  */
 AARCH64_EXTRA_TUNING_OPTION ("avoid_pred_rmw", AVOID_PRED_RMW)
index 6e16763f9571d97a783e9de7a57b87f923efe06c..0485f695941c70285e075453adadae2452b02dab 100644 (file)
@@ -18793,6 +18793,8 @@ aarch64_adjust_generic_arch_tuning (struct tune_params &current_tune)
   if (TARGET_SVE2)
     current_tune.extra_tuning_flags
       &= ~AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS;
+  if (!AARCH64_HAVE_ISA(V8_8A))
+    aarch64_tune_params.extra_tuning_flags |= AARCH64_EXTRA_TUNE_AVOID_LDAPUR;
 }
 
 static void
@@ -18857,7 +18859,10 @@ aarch64_override_options_internal (struct gcc_options *opts)
   /* Make a copy of the tuning parameters attached to the core, which
      we may later overwrite.  */
   aarch64_tune_params = *(tune->tune);
-  if (tune->tune == &generic_tunings)
+
+  if (tune->tune == &generic_tunings
+      || tune->tune == &generic_armv8_a_tunings
+      || tune->tune == &generic_armv9_a_tunings)
     aarch64_adjust_generic_arch_tuning (aarch64_tune_params);
 
   if (opts->x_aarch64_override_tune_string)
index d5c4a42e96d95f7439952f753e1849767505c449..096c853af7ff0215c49da42f1991b0af53cddeab 100644 (file)
@@ -493,6 +493,11 @@ constexpr auto AARCH64_FL_DEFAULT_ISA_MODE ATTRIBUTE_UNUSED
   (bool (aarch64_tune_params.extra_tuning_flags \
         & AARCH64_EXTRA_TUNE_CHEAP_FPMR_WRITE))
 
+/* Enable folding address computation into LDAPUR when RCPC2 is available.  */
+#define TARGET_ENABLE_LDAPUR (TARGET_RCPC2 \
+                             && !(aarch64_tune_params.extra_tuning_flags \
+                                  & AARCH64_EXTRA_TUNE_AVOID_LDAPUR))
+
 /* Combinatorial tests.  */
 
 #define TARGET_SVE2_OR_SME2 \
index 27efc9155dcb0fe2d57ff2cbedbf5cce75fa82ed..a4ae6859da018c95cddb66988ce985d280f2aff0 100644 (file)
 ;; clobber for SVE predicates.
 (define_attr "pred_clobber" "any,no,yes" (const_string "any"))
 
+(define_attr "enable_ldapur" "any,no,yes" (const_string "any"))
+
 ;; [For compatibility with Arm in pipeline models]
 ;; Attribute that specifies whether or not the instruction touches fp
 ;; registers.
          (eq_attr "pred_clobber" "yes")
          (match_test "TARGET_SVE_PRED_CLOBBER"))
        (eq_attr "pred_clobber" "any"))
-
+      (ior
+       (and
+         (eq_attr "enable_ldapur" "yes")
+         (match_test "TARGET_ENABLE_LDAPUR"))
+       (and
+         (eq_attr "enable_ldapur" "no")
+         (match_test "!TARGET_ENABLE_LDAPUR"))
+       (eq_attr "enable_ldapur" "any"))
       (ior
        (eq_attr "arch" "any")
 
index 36b0dbd1f57fe9d6085f925b334be29032fa5f56..ea4a9367fc880ed449870f80aecd571bf1d61d7b 100644 (file)
 )
 
 (define_insn "aarch64_atomic_load<mode>_rcpc"
-  [(set (match_operand:ALLI 0 "register_operand" "=r")
+  [(set (match_operand:ALLI 0 "register_operand")
     (unspec_volatile:ALLI
-      [(match_operand:ALLI 1 "aarch64_sync_memory_operand" "Q")
+      [(match_operand:ALLI 1 "aarch64_rcpc_memory_operand")
        (match_operand:SI 2 "const_int_operand")]                       ;; model
       UNSPECV_LDAP))]
   "TARGET_RCPC"
-  "ldapr<atomic_sfx>\t%<w>0, %1"
+  {@ [ cons: =0 , 1   ; attrs: enable_ldapur  ]
+     [ r        , Q   ; any                   ] ldapr<atomic_sfx>\t%<w>0, %1
+     [ r        , Ust ; yes                   ] ldapur<atomic_sfx>\t%<w>0, %1
+  }
 )
 
 (define_insn "aarch64_atomic_load<mode>"
 )
 
 (define_insn "*aarch64_atomic_load<ALLX:mode>_rcpc_zext"
-  [(set (match_operand:SD_HSDI 0 "register_operand" "=r")
+  [(set (match_operand:SD_HSDI 0 "register_operand")
     (zero_extend:SD_HSDI
       (unspec_volatile:ALLX
-        [(match_operand:ALLX 1 "aarch64_sync_memory_operand" "Q")
+        [(match_operand:ALLX 1 "aarch64_rcpc_memory_operand")
          (match_operand:SI 2 "const_int_operand")]                     ;; model
        UNSPECV_LDAP)))]
   "TARGET_RCPC && (<SD_HSDI:sizen> > <ALLX:sizen>)"
-  "ldapr<ALLX:atomic_sfx>\t%w0, %1"
+  {@ [ cons: =0 , 1   ; attrs: enable_ldapur ]
+     [ r        , Q   ; any                  ] ldapr<ALLX:atomic_sfx>\t%w0, %1
+     [ r        , Ust ; yes                  ] ldapur<ALLX:atomic_sfx>\t%w0, %1
+  }
 )
 
 (define_insn "*aarch64_atomic_load<ALLX:mode>_rcpc_sext"
   [(set (match_operand:GPI  0 "register_operand" "=r")
     (sign_extend:GPI
       (unspec_volatile:ALLX
-        [(match_operand:ALLX 1 "aarch64_sync_memory_operand" "Q")
+        [(match_operand:ALLX 1 "aarch64_rcpc_memory_operand" "Ust")
          (match_operand:SI 2 "const_int_operand")]                     ;; model
        UNSPECV_LDAP)))]
   "TARGET_RCPC2 && (<GPI:sizen> > <ALLX:sizen>)"
index 7d0162eae54c1823eff7b954d5e1d7564eb31dab..f448493b1bc5a2751c9e8444190d7574db05a802 100644 (file)
@@ -222,7 +222,8 @@ static const struct tune_params cortexx925_tunings =
   (AARCH64_EXTRA_TUNE_BASE
    | AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
    | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT
-   | AARCH64_EXTRA_TUNE_AVOID_PRED_RMW),       /* tune_flags.  */
+   | AARCH64_EXTRA_TUNE_AVOID_PRED_RMW
+   | AARCH64_EXTRA_TUNE_AVOID_LDAPUR), /* tune_flags.  */
   &generic_armv9a_prefetch_tune,
   AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
   AARCH64_LDP_STP_POLICY_ALWAYS           /* stp_policy_model.  */
index b000fb46570953f368c553d991a66661f448c728..266d8f190a250fa0dcde97b778ce320be20d4452 100644 (file)
@@ -220,7 +220,8 @@ static const struct tune_params neoversev2_tunings =
   (AARCH64_EXTRA_TUNE_BASE
    | AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
    | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT
-   | AARCH64_EXTRA_TUNE_AVOID_PRED_RMW),       /* tune_flags.  */
+   | AARCH64_EXTRA_TUNE_AVOID_PRED_RMW
+   | AARCH64_EXTRA_TUNE_AVOID_LDAPUR), /* tune_flags.  */
   &generic_armv9a_prefetch_tune,
   AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
   AARCH64_LDP_STP_POLICY_ALWAYS           /* stp_policy_model.  */
index ad3cd222512df8c55f298c3849bb782cc092f677..f5566d270dacaec05a658fcba1bdc10f8a1304c8 100644 (file)
@@ -220,7 +220,8 @@ static const struct tune_params neoversev3_tunings =
   (AARCH64_EXTRA_TUNE_BASE
    | AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
    | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT
-   | AARCH64_EXTRA_TUNE_AVOID_PRED_RMW),       /* tune_flags.  */
+   | AARCH64_EXTRA_TUNE_AVOID_PRED_RMW
+   | AARCH64_EXTRA_TUNE_AVOID_LDAPUR), /* tune_flags.  */
   &generic_armv9a_prefetch_tune,
   AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
   AARCH64_LDP_STP_POLICY_ALWAYS           /* stp_policy_model.  */
index a0adef00824d906747502f0f7740e29f8bf1c04a..5796e52a266724e0e4295ad6868c49dd91faa491 100644 (file)
@@ -220,7 +220,8 @@ static const struct tune_params neoversev3ae_tunings =
   (AARCH64_EXTRA_TUNE_BASE
    | AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
    | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT
-   | AARCH64_EXTRA_TUNE_AVOID_PRED_RMW),       /* tune_flags.  */
+   | AARCH64_EXTRA_TUNE_AVOID_PRED_RMW
+   | AARCH64_EXTRA_TUNE_AVOID_LDAPUR), /* tune_flags.  */
   &generic_armv9a_prefetch_tune,
   AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
   AARCH64_LDP_STP_POLICY_ALWAYS           /* stp_policy_model.  */
index f57c09d0580624d884fd538075b4e2028aeea733..e8a545a01f9cf98ad3f58c366266b1ed1256a021 100644 (file)
@@ -33,7 +33,7 @@ TEST(s8_s64, s8, long long)
 /*
 **test_s16_s64:
 **...
-**     ldapursh        x0, \[x[0-9]+\]
+**     ldapursh        x0, \[x[0-9]+, [0-9]+\]
 **     ret
 */
 
@@ -42,7 +42,7 @@ TEST(s16_s64, s16, long long)
 /*
 **test_s32_s64:
 **...
-**     ldapursw        x0, \[x[0-9]+\]
+**     ldapursw        x0, \[x[0-9]+, [0-9]+\]
 **     ret
 */
 
@@ -60,7 +60,7 @@ TEST(s8_s32, s8, int)
 /*
 **test_s16_s32:
 **...
-**     ldapursh        w0, \[x[0-9]+\]
+**     ldapursh        w0, \[x[0-9]+, [0-9]+\]
 **     ret
 */
 
diff --git a/gcc/testsuite/gcc.target/aarch64/ldapur.c b/gcc/testsuite/gcc.target/aarch64/ldapur.c
new file mode 100644 (file)
index 0000000..5c68bdd
--- /dev/null
@@ -0,0 +1,77 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -std=c99" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#include <stdatomic.h>
+#include <stdint.h>
+
+#pragma GCC target "arch=armv8.8-a"
+
+atomic_ullong u64;
+atomic_uint u32;
+atomic_ushort u16;
+atomic_uchar u8[2]; /* Force an offset for u8 */
+
+#define TEST(name, ldsize, rettype)                            \
+rettype                                                                \
+test_##name (void)                                             \
+{                                                              \
+  return atomic_load_explicit (&ldsize, memory_order_acquire); \
+}                                                              \
+
+
+/*
+** test_u8_u64:
+**     ...
+**     ldapurb w[0-9]+, \[x[0-9]+, [0-9]+\]
+**     ret
+*/
+TEST(u8_u64, u8[1], uint64_t)
+
+/*
+** test_u16_u64:
+**     ...
+**     ldapurh w[0-9]+, \[x[0-9]+, [0-9]+\]
+**     ret
+*/
+TEST(u16_u64, u16, uint64_t)
+
+/*
+**test_u32_u64:
+**     ...
+**     ldapur  w[0-9]+, \[x[0-9]+, [0-9]+\]
+**     ret
+*/
+TEST(u32_u64, u32, uint64_t)
+
+/*
+**test_u64_u64:
+**     ...
+**     ldapur  x[0-9]+, \[x[0-9]+, [0-9]+\]
+**     ret
+*/
+TEST(u64_u64, u64, uint64_t)
+
+/*
+**test_u8_u32:
+**     ...
+**     ldapurb w[0-9]+, \[x[0-9]+, [0-9]+\]
+**     ret
+*/
+TEST(u8_u32, u8[1], uint32_t)
+
+/*
+**test_u16_u32:
+**     ...
+**     ldapurh w[0-9]+, \[x[0-9]+, [0-9]+\]
+**     ret
+*/
+TEST(u16_u32, u16, uint32_t)
+
+/*
+**test_u32_u32:
+**     ...
+**     ldapur  w[0-9]+, \[x[0-9]+, [0-9]+\]
+**     ret
+*/
+TEST(u32_u32, u32, uint32_t)
\ No newline at end of file
diff --git a/gcc/testsuite/gcc.target/aarch64/ldapur_avoid.c b/gcc/testsuite/gcc.target/aarch64/ldapur_avoid.c
new file mode 100644 (file)
index 0000000..ad87a30
--- /dev/null
@@ -0,0 +1,37 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -std=c99 -moverride=tune=avoid_ldapur" } */
+
+#include <stdatomic.h>
+#include <stdint.h>
+
+#pragma GCC target "arch=armv8.8-a"
+/* LDAPUR is only avoided for armv8.4 to armv8.7. This checks for the working
+of avoid_ldapur flag. */
+
+/* { dg-final { scan-assembler-not "ldapur\t" } } */
+
+atomic_ullong u64;
+atomic_uint u32;
+atomic_ushort u16;
+atomic_uchar u8[2]; /* Force an offset for u8 */
+
+#define TEST(name, ldsize, rettype)                            \
+rettype                                                                \
+test_##name (void)                                             \
+{                                                              \
+  return atomic_load_explicit (&ldsize, memory_order_acquire); \
+}                                                              \
+
+TEST(u8_u64, u8[1], uint64_t)
+TEST(u16_u64, u16, uint64_t)
+TEST(u32_u64, u32, uint64_t)
+TEST(u64_u64, u64, uint64_t)
+TEST(u8_u32, u8[1], uint32_t)
+TEST(u16_u32, u16, uint32_t)
+TEST(u32_u32, u32, uint32_t)
+
+/* { dg-final { scan-assembler-times "ldapr\t" 3 } } */
+/* { dg-final { scan-assembler-times "ldaprh\t" 2 } } */
+/* { dg-final { scan-assembler-times "ldaprb\t" 2 } } */
+
+