From: Soumya AR
Date: Tue, 15 Jul 2025 13:58:44 +0000 (+0530)
Subject: aarch64: Enable selective LDAPUR generation for cores with RCPC2
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=6b76dfad9b2c80a43b2e775d0027ba4b636d6022;p=thirdparty%2Fgcc.git

aarch64: Enable selective LDAPUR generation for cores with RCPC2

This patch adds the ability to fold the address computation into the
addressing mode for LDAPR instructions, using LDAPUR when RCPC2 is
available.

LDAPUR emission is enabled by default when RCPC2 is available, but can be
disabled using the avoid_ldapur tune flag on a per-core basis. Currently,
it is disabled for neoverse-v2, neoverse-v3, cortex-x925, and
architectures before armv8.8-a.

Earlier, the following code:

uint64_t
foo (std::atomic<uint64_t> *x)
{
  return x[1].load(std::memory_order_acquire);
}

would generate:

foo(std::atomic<uint64_t>*):
        add     x0, x0, 8
        ldapr   x0, [x0]
        ret

but now generates:

foo(std::atomic<uint64_t>*):
        ldapur  x0, [x0, 8]
        ret

The patch was bootstrapped and regtested on aarch64-linux-gnu with no
regressions. OK for mainline?

Signed-off-by: Soumya AR

gcc/ChangeLog:

	* config/aarch64/aarch64-tuning-flags.def (AARCH64_EXTRA_TUNING_OPTION):
	Add AVOID_LDAPUR tuning flag.
	* config/aarch64/aarch64.cc (aarch64_adjust_generic_arch_tuning):
	Set AVOID_LDAPUR for architectures before armv8.8-a.
	(aarch64_override_options_internal): Apply generic tuning adjustments
	to generic_armv8_a_tunings and generic_armv9_a_tunings.
	* config/aarch64/aarch64.h (TARGET_ENABLE_LDAPUR): New macro to
	control LDAPUR usage based on RCPC2 and tuning flags.
	* config/aarch64/aarch64.md: Add enable_ldapur attribute.
	* config/aarch64/atomics.md (aarch64_atomic_load<mode>_rcpc): Modify
	to emit LDAPUR for cores with RCPC2.
	(*aarch64_atomic_load<ALLX:mode>_rcpc_zext): Likewise.
	(*aarch64_atomic_load<ALLX:mode>_rcpc_sext): Update constraint to Ust.
	* config/aarch64/tuning_models/cortexx925.h: Add AVOID_LDAPUR flag.
	* config/aarch64/tuning_models/neoversev2.h: Likewise.
	* config/aarch64/tuning_models/neoversev3.h: Likewise.
	* config/aarch64/tuning_models/neoversev3ae.h: Likewise.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/ldapr-sext.c: Update expected output to include
	offsets.
	* gcc.target/aarch64/ldapur.c: New test for LDAPUR.
	* gcc.target/aarch64/ldapur_avoid.c: New test for AVOID_LDAPUR flag.
---
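(Note, not part of the commit: the C++ example above can also be reproduced
in plain C11, in the style of the new tests further down. This is a minimal
sketch; the file name, compile command, and exact register allocation are
assumptions, not part of the patch.)

/* repro.c -- assumed invocation: gcc -O2 -march=armv8.8-a -S repro.c  */
#include <stdatomic.h>
#include <stdint.h>

uint64_t
load_second (atomic_ullong *x)
{
  /* Acquire load of x[1], i.e. base + 8.  Before this patch the offset
     needed a separate "add x0, x0, 8" before "ldapr x0, [x0]"; with
     LDAPUR enabled it folds into the addressing mode:
       ldapur  x0, [x0, 8]  */
  return atomic_load_explicit (&x[1], memory_order_acquire);
}
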
diff --git a/gcc/config/aarch64/aarch64-tuning-flags.def b/gcc/config/aarch64/aarch64-tuning-flags.def
index f2c916e9d77..dd91324e9c8 100644
--- a/gcc/config/aarch64/aarch64-tuning-flags.def
+++ b/gcc/config/aarch64/aarch64-tuning-flags.def
@@ -44,6 +44,8 @@ AARCH64_EXTRA_TUNING_OPTION ("avoid_cross_loop_fma", AVOID_CROSS_LOOP_FMA)
 
 AARCH64_EXTRA_TUNING_OPTION ("fully_pipelined_fma", FULLY_PIPELINED_FMA)
 
+AARCH64_EXTRA_TUNING_OPTION ("avoid_ldapur", AVOID_LDAPUR)
+
 /* Enable is the target prefers to use a fresh register for predicate outputs
    rather than re-use an input predicate register.  */
 AARCH64_EXTRA_TUNING_OPTION ("avoid_pred_rmw", AVOID_PRED_RMW)
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 6e16763f957..0485f695941 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -18793,6 +18793,8 @@ aarch64_adjust_generic_arch_tuning (struct tune_params &current_tune)
   if (TARGET_SVE2)
     current_tune.extra_tuning_flags
       &= ~AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS;
+  if (!AARCH64_HAVE_ISA(V8_8A))
+    aarch64_tune_params.extra_tuning_flags |= AARCH64_EXTRA_TUNE_AVOID_LDAPUR;
 }
 
 static void
@@ -18857,7 +18859,10 @@ aarch64_override_options_internal (struct gcc_options *opts)
   /* Make a copy of the tuning parameters attached to the core, which
      we may later overwrite.  */
   aarch64_tune_params = *(tune->tune);
-  if (tune->tune == &generic_tunings)
+
+  if (tune->tune == &generic_tunings
+      || tune->tune == &generic_armv8_a_tunings
+      || tune->tune == &generic_armv9_a_tunings)
     aarch64_adjust_generic_arch_tuning (aarch64_tune_params);
 
   if (opts->x_aarch64_override_tune_string)
diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
index d5c4a42e96d..096c853af7f 100644
--- a/gcc/config/aarch64/aarch64.h
+++ b/gcc/config/aarch64/aarch64.h
@@ -493,6 +493,11 @@ constexpr auto AARCH64_FL_DEFAULT_ISA_MODE ATTRIBUTE_UNUSED
   (bool (aarch64_tune_params.extra_tuning_flags \
 	 & AARCH64_EXTRA_TUNE_CHEAP_FPMR_WRITE))
 
+/* Enable folding address computation into LDAPUR when RCPC2 is available.  */
+#define TARGET_ENABLE_LDAPUR (TARGET_RCPC2 \
+			      && !(aarch64_tune_params.extra_tuning_flags \
+				   & AARCH64_EXTRA_TUNE_AVOID_LDAPUR))
+
 /* Combinatorial tests.  */
 
 #define TARGET_SVE2_OR_SME2 \
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 27efc9155dc..a4ae6859da0 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -482,6 +482,8 @@
 ;; clobber for SVE predicates.
 (define_attr "pred_clobber" "any,no,yes" (const_string "any"))
 
+(define_attr "enable_ldapur" "any,no,yes" (const_string "any"))
+
 ;; [For compatibility with Arm in pipeline models]
 ;; Attribute that specifies whether or not the instruction touches fp
 ;; registers.
@@ -506,7 +508,14 @@
 	 (eq_attr "pred_clobber" "yes")
 	 (match_test "TARGET_SVE_PRED_CLOBBER"))
        (eq_attr "pred_clobber" "any"))
-
+     (ior
+       (and
+	 (eq_attr "enable_ldapur" "yes")
+	 (match_test "TARGET_ENABLE_LDAPUR"))
+       (and
+	 (eq_attr "enable_ldapur" "no")
+	 (match_test "!TARGET_ENABLE_LDAPUR"))
+       (eq_attr "enable_ldapur" "any"))
 
     (ior
       (eq_attr "arch" "any")
diff --git a/gcc/config/aarch64/atomics.md b/gcc/config/aarch64/atomics.md
index 36b0dbd1f57..ea4a9367fc8 100644
--- a/gcc/config/aarch64/atomics.md
+++ b/gcc/config/aarch64/atomics.md
@@ -679,13 +679,16 @@
 )
 
 (define_insn "aarch64_atomic_load<mode>_rcpc"
-  [(set (match_operand:ALLI 0 "register_operand" "=r")
+  [(set (match_operand:ALLI 0 "register_operand")
     (unspec_volatile:ALLI
-      [(match_operand:ALLI 1 "aarch64_sync_memory_operand" "Q")
+      [(match_operand:ALLI 1 "aarch64_rcpc_memory_operand")
        (match_operand:SI 2 "const_int_operand")]	;; model
       UNSPECV_LDAP))]
   "TARGET_RCPC"
-  "ldapr<atomic_sfx>\t%<w>0, %1"
+  {@ [ cons: =0 , 1   ; attrs: enable_ldapur ]
+     [ r        , Q   ; any ] ldapr<atomic_sfx>\t%<w>0, %1
+     [ r        , Ust ; yes ] ldapur<atomic_sfx>\t%<w>0, %1
+  }
 )
 
 (define_insn "aarch64_atomic_load<mode>"
@@ -705,21 +708,24 @@
 )
 
 (define_insn "*aarch64_atomic_load<ALLX:mode>_rcpc_zext"
-  [(set (match_operand:SD_HSDI 0 "register_operand" "=r")
+  [(set (match_operand:SD_HSDI 0 "register_operand")
     (zero_extend:SD_HSDI
       (unspec_volatile:ALLX
-        [(match_operand:ALLX 1 "aarch64_sync_memory_operand" "Q")
+        [(match_operand:ALLX 1 "aarch64_rcpc_memory_operand")
          (match_operand:SI 2 "const_int_operand")]	;; model
         UNSPECV_LDAP)))]
   "TARGET_RCPC && (<SD_HSDI:sizen> > <ALLX:sizen>)"
-  "ldapr<ALLX:atomic_sfx>\t%w0, %1"
+  {@ [ cons: =0 , 1   ; attrs: enable_ldapur ]
+     [ r        , Q   ; any ] ldapr<ALLX:atomic_sfx>\t%w0, %1
+     [ r        , Ust ; yes ] ldapur<ALLX:atomic_sfx>\t%w0, %1
+  }
 )
 
 (define_insn "*aarch64_atomic_load<ALLX:mode>_rcpc_sext"
   [(set (match_operand:GPI 0 "register_operand" "=r")
     (sign_extend:GPI
       (unspec_volatile:ALLX
-        [(match_operand:ALLX 1 "aarch64_sync_memory_operand" "Q")
+        [(match_operand:ALLX 1 "aarch64_rcpc_memory_operand" "Ust")
         (match_operand:SI 2 "const_int_operand")]	;; model
        UNSPECV_LDAP)))]
   "TARGET_RCPC2 && (<GPI:sizen> > <ALLX:sizen>)"
diff --git a/gcc/config/aarch64/tuning_models/cortexx925.h b/gcc/config/aarch64/tuning_models/cortexx925.h
index 7d0162eae54..f448493b1bc 100644
--- a/gcc/config/aarch64/tuning_models/cortexx925.h
+++ b/gcc/config/aarch64/tuning_models/cortexx925.h
@@ -222,7 +222,8 @@ static const struct tune_params cortexx925_tunings =
   (AARCH64_EXTRA_TUNE_BASE
    | AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
    | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT
-   | AARCH64_EXTRA_TUNE_AVOID_PRED_RMW),	/* tune_flags.  */
+   | AARCH64_EXTRA_TUNE_AVOID_PRED_RMW
+   | AARCH64_EXTRA_TUNE_AVOID_LDAPUR),	/* tune_flags.  */
   &generic_armv9a_prefetch_tune,
   AARCH64_LDP_STP_POLICY_ALWAYS,	/* ldp_policy_model.  */
   AARCH64_LDP_STP_POLICY_ALWAYS	/* stp_policy_model.  */
diff --git a/gcc/config/aarch64/tuning_models/neoversev2.h b/gcc/config/aarch64/tuning_models/neoversev2.h
index b000fb46570..266d8f190a2 100644
--- a/gcc/config/aarch64/tuning_models/neoversev2.h
+++ b/gcc/config/aarch64/tuning_models/neoversev2.h
@@ -220,7 +220,8 @@ static const struct tune_params neoversev2_tunings =
   (AARCH64_EXTRA_TUNE_BASE
    | AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
    | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT
-   | AARCH64_EXTRA_TUNE_AVOID_PRED_RMW),	/* tune_flags.  */
+   | AARCH64_EXTRA_TUNE_AVOID_PRED_RMW
+   | AARCH64_EXTRA_TUNE_AVOID_LDAPUR),	/* tune_flags.  */
   &generic_armv9a_prefetch_tune,
   AARCH64_LDP_STP_POLICY_ALWAYS,	/* ldp_policy_model.  */
   AARCH64_LDP_STP_POLICY_ALWAYS	/* stp_policy_model.  */
diff --git a/gcc/config/aarch64/tuning_models/neoversev3.h b/gcc/config/aarch64/tuning_models/neoversev3.h
index ad3cd222512..f5566d270da 100644
--- a/gcc/config/aarch64/tuning_models/neoversev3.h
+++ b/gcc/config/aarch64/tuning_models/neoversev3.h
@@ -220,7 +220,8 @@ static const struct tune_params neoversev3_tunings =
   (AARCH64_EXTRA_TUNE_BASE
    | AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
    | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT
-   | AARCH64_EXTRA_TUNE_AVOID_PRED_RMW),	/* tune_flags.  */
+   | AARCH64_EXTRA_TUNE_AVOID_PRED_RMW
+   | AARCH64_EXTRA_TUNE_AVOID_LDAPUR),	/* tune_flags.  */
   &generic_armv9a_prefetch_tune,
   AARCH64_LDP_STP_POLICY_ALWAYS,	/* ldp_policy_model.  */
   AARCH64_LDP_STP_POLICY_ALWAYS	/* stp_policy_model.  */
diff --git a/gcc/config/aarch64/tuning_models/neoversev3ae.h b/gcc/config/aarch64/tuning_models/neoversev3ae.h
index a0adef00824..5796e52a266 100644
--- a/gcc/config/aarch64/tuning_models/neoversev3ae.h
+++ b/gcc/config/aarch64/tuning_models/neoversev3ae.h
@@ -220,7 +220,8 @@ static const struct tune_params neoversev3ae_tunings =
   (AARCH64_EXTRA_TUNE_BASE
    | AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
    | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT
-   | AARCH64_EXTRA_TUNE_AVOID_PRED_RMW),	/* tune_flags.  */
+   | AARCH64_EXTRA_TUNE_AVOID_PRED_RMW
+   | AARCH64_EXTRA_TUNE_AVOID_LDAPUR),	/* tune_flags.  */
   &generic_armv9a_prefetch_tune,
   AARCH64_LDP_STP_POLICY_ALWAYS,	/* ldp_policy_model.  */
   AARCH64_LDP_STP_POLICY_ALWAYS	/* stp_policy_model.  */
diff --git a/gcc/testsuite/gcc.target/aarch64/ldapr-sext.c b/gcc/testsuite/gcc.target/aarch64/ldapr-sext.c
index f57c09d0580..e8a545a01f9 100644
--- a/gcc/testsuite/gcc.target/aarch64/ldapr-sext.c
+++ b/gcc/testsuite/gcc.target/aarch64/ldapr-sext.c
@@ -33,7 +33,7 @@ TEST(s8_s64, s8, long long)
 /*
 **test_s16_s64:
 **...
-**	ldapursh	x0, \[x[0-9]+\]
+**	ldapursh	x0, \[x[0-9]+, [0-9]+\]
 **	ret
 */
 
@@ -42,7 +42,7 @@ TEST(s16_s64, s16, long long)
 /*
 **test_s32_s64:
 **...
-**	ldapursw	x0, \[x[0-9]+\]
+**	ldapursw	x0, \[x[0-9]+, [0-9]+\]
 **	ret
 */
 
@@ -60,7 +60,7 @@ TEST(s8_s32, s8, int)
 /*
 **test_s16_s32:
 **...
-**	ldapursh	w0, \[x[0-9]+\]
+**	ldapursh	w0, \[x[0-9]+, [0-9]+\]
 **	ret
 */
 
diff --git a/gcc/testsuite/gcc.target/aarch64/ldapur.c b/gcc/testsuite/gcc.target/aarch64/ldapur.c
new file mode 100644
index 00000000000..5c68bdde35d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/ldapur.c
@@ -0,0 +1,77 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -std=c99" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#include <stdatomic.h>
+#include <stdint.h>
+
+#pragma GCC target "arch=armv8.8-a"
+
+atomic_ullong u64;
+atomic_uint u32;
+atomic_ushort u16;
+atomic_uchar u8[2]; /* Force an offset for u8 */
+
+#define TEST(name, ldsize, rettype)				\
+rettype								\
+test_##name (void)						\
+{								\
+  return atomic_load_explicit (&ldsize, memory_order_acquire); \
+}								\
+
+
+/*
+** test_u8_u64:
+** ...
+**	ldapurb	w[0-9]+, \[x[0-9]+, [0-9]+\]
+**	ret
+*/
+TEST(u8_u64, u8[1], uint64_t)
+
+/*
+** test_u16_u64:
+** ...
+**	ldapurh	w[0-9]+, \[x[0-9]+, [0-9]+\]
+**	ret
+*/
+TEST(u16_u64, u16, uint64_t)
+
+/*
+**test_u32_u64:
+** ...
+**	ldapur	w[0-9]+, \[x[0-9]+, [0-9]+\]
+**	ret
+*/
+TEST(u32_u64, u32, uint64_t)
+
+/*
+**test_u64_u64:
+** ...
+**	ldapur	x[0-9]+, \[x[0-9]+, [0-9]+\]
+**	ret
+*/
+TEST(u64_u64, u64, uint64_t)
+
+/*
+**test_u8_u32:
+** ...
+**	ldapurb	w[0-9]+, \[x[0-9]+, [0-9]+\]
+**	ret
+*/
+TEST(u8_u32, u8[1], uint32_t)
+
+/*
+**test_u16_u32:
+** ...
+**	ldapurh	w[0-9]+, \[x[0-9]+, [0-9]+\]
+**	ret
+*/
+TEST(u16_u32, u16, uint32_t)
+
+/*
+**test_u32_u32:
+** ...
+**	ldapur	w[0-9]+, \[x[0-9]+, [0-9]+\]
+**	ret
+*/
+TEST(u32_u32, u32, uint32_t)
\ No newline at end of file
diff --git a/gcc/testsuite/gcc.target/aarch64/ldapur_avoid.c b/gcc/testsuite/gcc.target/aarch64/ldapur_avoid.c
new file mode 100644
index 00000000000..ad87a30752a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/ldapur_avoid.c
@@ -0,0 +1,37 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -std=c99 -moverride=tune=avoid_ldapur" } */
+
+#include <stdatomic.h>
+#include <stdint.h>
+
+#pragma GCC target "arch=armv8.8-a"
+/* LDAPUR is only avoided for armv8.4 to armv8.7.  This checks for the working
+   of avoid_ldapur flag.  */
+
+/* { dg-final { scan-assembler-not "ldapur\t" } } */
+
+atomic_ullong u64;
+atomic_uint u32;
+atomic_ushort u16;
+atomic_uchar u8[2]; /* Force an offset for u8 */
+
+#define TEST(name, ldsize, rettype)				\
+rettype								\
+test_##name (void)						\
+{								\
+  return atomic_load_explicit (&ldsize, memory_order_acquire); \
+}								\
+
+TEST(u8_u64, u8[1], uint64_t)
+TEST(u16_u64, u16, uint64_t)
+TEST(u32_u64, u32, uint64_t)
+TEST(u64_u64, u64, uint64_t)
+TEST(u8_u32, u8[1], uint32_t)
+TEST(u16_u32, u16, uint32_t)
+TEST(u32_u32, u32, uint32_t)
+
+/* { dg-final { scan-assembler-times "ldapr\t" 3 } } */
+/* { dg-final { scan-assembler-times "ldaprh\t" 2 } } */
+/* { dg-final { scan-assembler-times "ldaprb\t" 2 } } */
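
(Also not part of the commit: the avoid_ldapur tuning flag described above can
be observed outside the testsuite by compiling one function twice. The
invocations and expected codegen below are assumptions based on the dg-options
in ldapur_avoid.c; only -moverride=tune=avoid_ldapur differs between them.)

/* probe.c -- assumed invocations on a patched GCC:
     gcc -O2 -march=armv8.8-a -S probe.c
       -> ldapur  x0, [x0, 8]
     gcc -O2 -march=armv8.8-a -moverride=tune=avoid_ldapur -S probe.c
       -> add x0, x0, 8 ; ldapr x0, [x0]  */
#include <stdatomic.h>
#include <stdint.h>

uint64_t
probe (atomic_ullong *x)
{
  return atomic_load_explicit (&x[1], memory_order_acquire);
}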