From: Philipp Tomsich Date: Thu, 23 Mar 2023 18:47:57 +0000 (+0100) Subject: aarch64: disable LDP via tuning structure for -mcpu=ampere1 X-Git-Tag: releases/gcc-10.5.0~194 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=cd963f8d6bf24ef8bebe4fc4e51c13b463c358c4;p=thirdparty%2Fgcc.git aarch64: disable LDP via tuning structure for -mcpu=ampere1 AmpereOne (-mcpu=ampere1) breaks LDP instructions into two uops. Given the chance that this causes instructions to slip into the next decoding cycle and the additional overheads when handling cacheline-crossing LDP instructions, we disable the generation of LDP instructions through the tuning structure from instruction combining (such as in peephole2). Given the code-density benefits in builtins and prologue/epilogue expansion, we allow LDPs there. This commit: * adds a new tuning option AARCH64_EXTRA_TUNE_NO_LDP_COMBINE * allows -moverride=tune=... to override this These changes are benchmark-driven, yielding the following changes (with a net-overall improvement): 503.bwaves_r. -0.88% 507.cactuBSSN_r 0.35% 508.namd_r 3.09% 510.parest_r -2.99% 511.povray_r 5.54% 519.lbm_r 15.83% 521.wrf_r 0.56% 526.blender_r 2.47% 527.cam4_r 0.70% 538.imagick_r 0.00% 544.nab_r -0.33% 549.fotonik3d_r. -0.42% 554.roms_r 0.00% ------------------------- = total 1.79% Signed-off-by: Philipp Tomsich Co-Authored-By: Di Zhao gcc/ChangeLog: * config/aarch64/aarch64-tuning-flags.def (AARCH64_EXTRA_TUNING_OPTION): Add AARCH64_EXTRA_TUNE_NO_LDP_COMBINE. * config/aarch64/aarch64.c (aarch64_operands_ok_for_ldpstp): Check for the above tuning option when processing loads. gcc/testsuite/ChangeLog: * gcc.target/aarch64/ampere1-no_ldp_combine.c: New test. 
(cherry picked from commit f200c56787f2c6f93ffb739d57d01a294ab72f68) --- diff --git a/gcc/config/aarch64/aarch64-tuning-flags.def b/gcc/config/aarch64/aarch64-tuning-flags.def index 7677ec0b0b40..9758bfaa65df 100644 --- a/gcc/config/aarch64/aarch64-tuning-flags.def +++ b/gcc/config/aarch64/aarch64-tuning-flags.def @@ -44,6 +44,9 @@ AARCH64_EXTRA_TUNING_OPTION ("cheap_shift_extend", CHEAP_SHIFT_EXTEND) /* Disallow load/store pair instructions on Q-registers. */ AARCH64_EXTRA_TUNING_OPTION ("no_ldp_stp_qregs", NO_LDP_STP_QREGS) +/* Disallow load-pair instructions to be formed in combine/peephole. */ +AARCH64_EXTRA_TUNING_OPTION ("no_ldp_combine", NO_LDP_COMBINE) + AARCH64_EXTRA_TUNING_OPTION ("rename_load_regs", RENAME_LOAD_REGS) /* Prefer Advanced SIMD over SVE for auto-vectorization. */ diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index 9aebbbd49100..7bc6e5153add 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -1440,7 +1440,7 @@ static const struct tune_params ampere1_tunings = 2, /* min_div_recip_mul_df. */ 0, /* max_case_values. */ tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ - (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */ + (AARCH64_EXTRA_TUNE_NO_LDP_COMBINE), /* tune_flags. */ &ampere1_prefetch_tune }; @@ -1471,7 +1471,7 @@ static const struct tune_params ampere1a_tunings = 2, /* min_div_recip_mul_df. */ 0, /* max_case_values. */ tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ - (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */ + (AARCH64_EXTRA_TUNE_NO_LDP_COMBINE), /* tune_flags. */ &ampere1_prefetch_tune }; @@ -22346,6 +22346,12 @@ aarch64_operands_ok_for_ldpstp (rtx *operands, bool load, enum reg_class rclass_1, rclass_2; rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2; + /* Allow the tuning structure to disable LDP instruction formation + from combining instructions (e.g., in peephole2). 
*/ + if (load && (aarch64_tune_params.extra_tuning_flags + & AARCH64_EXTRA_TUNE_NO_LDP_COMBINE)) + return false; + if (load) { mem_1 = operands[1]; diff --git a/gcc/testsuite/gcc.target/aarch64/ampere1-no_ldp_combine.c b/gcc/testsuite/gcc.target/aarch64/ampere1-no_ldp_combine.c new file mode 100644 index 000000000000..bc871f4481d1 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/ampere1-no_ldp_combine.c @@ -0,0 +1,11 @@ +/* { dg-options "-O3 -mtune=ampere1" } */ + +long +foo (long a[]) +{ + return a[0] + a[1]; +} + +/* We should see two ldrs instead of one ldp. */ +/* { dg-final { scan-assembler {\tldr\t} } } */ +/* { dg-final { scan-assembler-not {\tldp\t} } } */