From 392f8c40f0d3d9cbd4ee38d94bde06cf1b845e9f Mon Sep 17 00:00:00 2001 From: WANG Xuerui Date: Sun, 6 Jul 2025 09:06:20 +0800 Subject: [PATCH] LoongArch: Allow relaxing instructions into NOPs after handling alignment Right now, LoongArch linker relaxation is 2-pass, since after alignment is done, byte deletion can no longer happen. However, as the alignment pass also shrinks text sections, new relaxation chances may well be created after alignment is done. Although at this point we can no longer delete unused instructions without disturbing alignment, we can still replace them with NOPs; popular LoongArch micro-architectures can eliminate NOPs during execution, so we can expect a (very) slight performance improvement from those late-created relaxation chances. To achieve this, the number of relax passes is raised to 3 for LoongArch, and every relaxation handler except loongarch_relax_align is migrated to a new helper, loongarch_relax_delete_or_nop, which either deletes bytes or fills the bytes to be "deleted" with NOPs, depending on whether the containing section has already undergone alignment. Also, since no byte can be deleted from a section once it has undergone alignment (i.e. it is "closed" for deletion), the pending_delete_ops structure is no longer allocated while relaxing such a section, and loongarch_calc_relaxed_addr(x) degrades to the trivial "return x" in this case. In addition, previously when calculating distances to symbols, an extra segment alignment had to be considered, because alignment may increase the distance between sites. However, in the newly added 3rd pass, code size can no longer increase for "closed" sections, so we can skip the adjustment for them to allow for a few more relaxation chances. A simple way to roughly measure this change's effectiveness is to check how many pcalau12i + addi.d pairs are relaxed into pcaddi's. 
Taking a Firefox 140.0.2 test build of mine as an example: Before: 47842 pcaddi's in libxul.so After: 48089 This is a 0.5% increase, which is kind of acceptable for a peephole optimization like this; of which 9 are due to the "relax"ed symbol distance treatment. Signed-off-by: WANG Xuerui --- bfd/elfnn-loongarch.c | 218 ++++++++++++------ ld/emultempl/loongarchelf.em | 2 +- .../ld-loongarch-elf/ld-loongarch-elf.exp | 1 + .../ld-loongarch-elf/relax-after-alignment.d | 30 +++ .../ld-loongarch-elf/relax-after-alignment.s | 49 ++++ 5 files changed, 227 insertions(+), 73 deletions(-) create mode 100644 ld/testsuite/ld-loongarch-elf/relax-after-alignment.d create mode 100644 ld/testsuite/ld-loongarch-elf/relax-after-alignment.s diff --git a/bfd/elfnn-loongarch.c b/bfd/elfnn-loongarch.c index a480a709897..c75f5206f2e 100644 --- a/bfd/elfnn-loongarch.c +++ b/bfd/elfnn-loongarch.c @@ -173,6 +173,10 @@ loongarch_elf_new_section_hook (bfd *abfd, asection *sec) #define loongarch_elf_hash_table(p) \ ((struct loongarch_elf_link_hash_table *) ((p)->hash)) \ +/* During linker relaxation, indicates whether the section has already + undergone alignment processing and no more byte deletion is permitted. */ +#define loongarch_sec_closed_for_deletion(sec) ((sec)->sec_flg0) + #define MINUS_ONE ((bfd_vma) 0 - 1) #define sec_addr(sec) ((sec)->output_section->vma + (sec)->output_offset) @@ -4789,7 +4793,10 @@ loongarch_calc_relaxed_addr (struct bfd_link_info *info, bfd_vma offset) struct pending_delete_op *op; splay_tree_node node; - BFD_ASSERT (pdops != NULL); + if (!pdops) + /* Currently this means we are past the stages where byte deletion could + possibly happen. */ + return offset; /* Find the op that starts just before the given address. 
*/ node = splay_tree_predecessor (pdops, (splay_tree_key)offset); @@ -4814,9 +4821,9 @@ loongarch_calc_relaxed_addr (struct bfd_link_info *info, bfd_vma offset) static void loongarch_relax_delete_bytes (bfd *abfd, - bfd_vma addr, - size_t count, - struct bfd_link_info *link_info) + bfd_vma addr, + size_t count, + struct bfd_link_info *link_info) { struct loongarch_elf_link_hash_table *htab = loongarch_elf_hash_table (link_info); @@ -4867,6 +4874,34 @@ loongarch_relax_delete_bytes (bfd *abfd, } } +static void +loongarch_relax_delete_or_nop (bfd *abfd, + asection *sec, + bfd_vma addr, + size_t count, + struct bfd_link_info *link_info) +{ + struct bfd_elf_section_data *data = elf_section_data (sec); + bfd_byte *contents = data->this_hdr.contents; + + BFD_ASSERT (count % 4 == 0); + + if (!loongarch_sec_closed_for_deletion (sec)) + { + /* Deletions are still possible within the section. */ + loongarch_relax_delete_bytes (abfd, addr, count, link_info); + return; + } + + /* We can no longer delete bytes in the section after enforcing alignment. + But as the resulting shrinkage may open up a few more relaxation chances, + allowing unnecessary instructions to be replaced with NOPs instead of + being removed altogether may still benefit performance to a lesser + extent. */ + for (; count; addr += 4, count -= 4) + bfd_put (32, abfd, LARCH_NOP, contents + addr); +} + static void loongarch_relax_perform_deletes (bfd *abfd, asection *sec, struct bfd_link_info *link_info) @@ -5135,7 +5170,7 @@ loongarch_tls_perform_trans (bfd *abfd, asection *sec, bfd_put (32, abfd, LARCH_NOP, contents + rel->r_offset); /* link with -relax option will delete NOP. 
*/ if (!info->disable_target_specific_optimizations) - loongarch_relax_delete_bytes (abfd, rel->r_offset, 4, info); + loongarch_relax_delete_or_nop (abfd, sec, rel->r_offset, 4, info); return true; case R_LARCH_TLS_IE_PC_HI20: @@ -5250,7 +5285,7 @@ loongarch_relax_tls_le (bfd *abfd, asection *sec, asection *sym_sec, if (symval < 0x800) { rel->r_info = ELFNN_R_INFO (0, R_LARCH_NONE); - loongarch_relax_delete_bytes (abfd, rel->r_offset, + loongarch_relax_delete_or_nop (abfd, sec, rel->r_offset, 4, link_info); } break; @@ -5275,8 +5310,8 @@ loongarch_relax_tls_le (bfd *abfd, asection *sec, asection *sym_sec, case R_LARCH_TLS_LE64_LO20: case R_LARCH_TLS_LE64_HI12: rel->r_info = ELFNN_R_INFO (0, R_LARCH_NONE); - loongarch_relax_delete_bytes (abfd, rel->r_offset, - 4, link_info); + loongarch_relax_delete_or_nop (abfd, sec, rel->r_offset, + 4, link_info); break; case R_LARCH_TLS_LE_LO12: @@ -5339,17 +5374,22 @@ loongarch_relax_pcala_addi (bfd *abfd, asection *sec, asection *sym_sec, symval = sec_addr (sec) + loongarch_calc_relaxed_addr (info, symval - sec_addr (sec)); - /* If pc and symbol not in the same segment, add/sub segment alignment. */ - if (!loongarch_two_sections_in_same_segment (info->output_bfd, - sec->output_section, - sym_sec->output_section)) - max_alignment = info->maxpagesize > max_alignment ? info->maxpagesize - : max_alignment; - - if (symval > pc) - pc -= (max_alignment > 4 ? max_alignment : 0); - else if (symval < pc) - pc += (max_alignment > 4 ? max_alignment : 0); + /* If pc and symbol not in the same segment, add/sub segment alignment if the + section has not undergone alignment processing because distances may grow + after alignment. */ + if (!loongarch_sec_closed_for_deletion (sec)) + { + if (!loongarch_two_sections_in_same_segment (info->output_bfd, + sec->output_section, + sym_sec->output_section)) + max_alignment = info->maxpagesize > max_alignment ? info->maxpagesize + : max_alignment; + + if (symval > pc) + pc -= (max_alignment > 4 ? 
max_alignment : 0); + else if (symval < pc) + pc += (max_alignment > 4 ? max_alignment : 0); + } const uint32_t pcaddi = LARCH_OP_PCADDI; @@ -5376,7 +5416,7 @@ loongarch_relax_pcala_addi (bfd *abfd, asection *sec, asection *sym_sec, R_LARCH_PCREL20_S2); rel_lo->r_info = ELFNN_R_INFO (0, R_LARCH_NONE); - loongarch_relax_delete_bytes (abfd, rel_lo->r_offset, 4, info); + loongarch_relax_delete_or_nop (abfd, sec, rel_lo->r_offset, 4, info); return true; } @@ -5404,17 +5444,22 @@ loongarch_relax_call36 (bfd *abfd, asection *sec, asection *sym_sec, symval = sec_addr (sec) + loongarch_calc_relaxed_addr (info, symval - sec_addr (sec)); - /* If pc and symbol not in the same segment, add/sub segment alignment. */ - if (!loongarch_two_sections_in_same_segment (info->output_bfd, - sec->output_section, - sym_sec->output_section)) - max_alignment = info->maxpagesize > max_alignment ? info->maxpagesize - : max_alignment; - - if (symval > pc) - pc -= (max_alignment > 4 ? max_alignment : 0); - else if (symval < pc) - pc += (max_alignment > 4 ? max_alignment : 0); + /* If pc and symbol not in the same segment, add/sub segment alignment if the + section has not undergone alignment processing because distances may grow + after alignment. */ + if (!loongarch_sec_closed_for_deletion (sec)) + { + if (!loongarch_two_sections_in_same_segment (info->output_bfd, + sec->output_section, + sym_sec->output_section)) + max_alignment = info->maxpagesize > max_alignment ? info->maxpagesize + : max_alignment; + + if (symval > pc) + pc -= (max_alignment > 4 ? max_alignment : 0); + else if (symval < pc) + pc += (max_alignment > 4 ? max_alignment : 0); + } /* Is pcalau12i + addi.d insns? */ if (!LARCH_INSN_JIRL (jirl) @@ -5436,7 +5481,7 @@ loongarch_relax_call36 (bfd *abfd, asection *sec, asection *sym_sec, /* Adjust relocations. */ rel->r_info = ELFNN_R_INFO (ELFNN_R_SYM (rel->r_info), R_LARCH_B26); /* Delete jirl instruction. 
*/ - loongarch_relax_delete_bytes (abfd, rel->r_offset + 4, 4, info); + loongarch_relax_delete_or_nop (abfd, sec, rel->r_offset + 4, 4, info); return true; } @@ -5468,17 +5513,22 @@ loongarch_relax_pcala_ld (bfd *abfd, asection *sec, symval = sec_addr (sec) + loongarch_calc_relaxed_addr (info, symval - sec_addr (sec)); - /* If pc and symbol not in the same segment, add/sub segment alignment. */ - if (!loongarch_two_sections_in_same_segment (info->output_bfd, - sec->output_section, - sym_sec->output_section)) - max_alignment = info->maxpagesize > max_alignment ? info->maxpagesize - : max_alignment; - - if (symval > pc) - pc -= (max_alignment > 4 ? max_alignment : 0); - else if (symval < pc) - pc += (max_alignment > 4 ? max_alignment : 0); + /* If pc and symbol not in the same segment, add/sub segment alignment if the + section has not undergone alignment processing because distances may grow + after alignment. */ + if (!loongarch_sec_closed_for_deletion (sec)) + { + if (!loongarch_two_sections_in_same_segment (info->output_bfd, + sec->output_section, + sym_sec->output_section)) + max_alignment = info->maxpagesize > max_alignment ? info->maxpagesize + : max_alignment; + + if (symval > pc) + pc -= (max_alignment > 4 ? max_alignment : 0); + else if (symval < pc) + pc += (max_alignment > 4 ? max_alignment : 0); + } if ((ELFNN_R_TYPE (rel_lo->r_info) != R_LARCH_GOT_PC_LO12) || (LARCH_GET_RD (ld) != rd) @@ -5511,8 +5561,9 @@ bfd_elfNN_loongarch_set_data_segment_info (struct bfd_link_info *info, loongarch_elf_hash_table (info)->data_segment_phase = data_segment_phase; } -/* Implement R_LARCH_ALIGN by deleting excess alignment NOPs. - Once we've handled an R_LARCH_ALIGN, we can't relax anything else. */ +/* Honor R_LARCH_ALIGN requests by deleting excess alignment NOPs. + Once we've handled an R_LARCH_ALIGN, we can't relax anything else by deleting + bytes, or alignment will be disrupted. 
*/ static bool loongarch_relax_align (bfd *abfd, asection *sec, asection *sym_sec, Elf_Internal_Rela *rel, @@ -5553,9 +5604,9 @@ loongarch_relax_align (bfd *abfd, asection *sec, asection *sym_sec, return false; } - /* Once we've handled an R_LARCH_ALIGN in a section, - we can't relax anything else in this section. */ - sec->sec_flg0 = true; + /* Once we've handled an R_LARCH_ALIGN in a section, we can't relax anything + else by deleting bytes, or alignment will be disrupted. */ + loongarch_sec_closed_for_deletion (sec) = true; rel->r_info = ELFNN_R_INFO (0, R_LARCH_NONE); /* If skipping more bytes than the specified maximum, @@ -5600,17 +5651,22 @@ loongarch_relax_tls_ld_gd_desc (bfd *abfd, asection *sec, asection *sym_sec, symval = sec_addr (sec) + loongarch_calc_relaxed_addr (info, symval - sec_addr (sec)); - /* If pc and symbol not in the same segment, add/sub segment alignment. */ - if (!loongarch_two_sections_in_same_segment (info->output_bfd, - sec->output_section, - sym_sec->output_section)) - max_alignment = info->maxpagesize > max_alignment ? info->maxpagesize - : max_alignment; - - if (symval > pc) - pc -= (max_alignment > 4 ? max_alignment : 0); - else if (symval < pc) - pc += (max_alignment > 4 ? max_alignment : 0); + /* If pc and symbol not in the same segment, add/sub segment alignment if the + section has not undergone alignment processing because distances may grow + after alignment. */ + if (!loongarch_sec_closed_for_deletion (sec)) + { + if (!loongarch_two_sections_in_same_segment (info->output_bfd, + sec->output_section, + sym_sec->output_section)) + max_alignment = info->maxpagesize > max_alignment ? info->maxpagesize + : max_alignment; + + if (symval > pc) + pc -= (max_alignment > 4 ? max_alignment : 0); + else if (symval < pc) + pc += (max_alignment > 4 ? 
max_alignment : 0); + } const uint32_t pcaddi = LARCH_OP_PCADDI; @@ -5653,7 +5709,7 @@ loongarch_relax_tls_ld_gd_desc (bfd *abfd, asection *sec, asection *sym_sec, } rel_lo->r_info = ELFNN_R_INFO (0, R_LARCH_NONE); - loongarch_relax_delete_bytes (abfd, rel_lo->r_offset, 4, info); + loongarch_relax_delete_or_nop (abfd, sec, rel_lo->r_offset, 4, info); return true; } @@ -5697,15 +5753,25 @@ loongarch_elf_relax_section (bfd *abfd, asection *sec, if (htab->layout_mutating_for_relr) return true; + /* Definition of LoongArch linker relaxation passes: + + - Pass 0: relaxes everything except R_LARCH_ALIGN, byte deletions are + performed; skipped if disable_target_specific_optimizations. + - Pass 1: handles alignment, byte deletions are performed. Sections with + R_LARCH_ALIGN relocations are marked closed for further byte + deletion in order to not disturb alignment. This pass is NOT + skipped even if disable_target_specific_optimizations is true. + - Pass 2: identical to Pass 0, but replacing relaxed insns with NOP in case + the containing section is closed for deletion; skip condition + also same as Pass 0. */ + bool is_alignment_pass = info->relax_pass == 1; if (bfd_link_relocatable (info) - || sec->sec_flg0 || sec->reloc_count == 0 || (sec->flags & SEC_RELOC) == 0 || (sec->flags & SEC_HAS_CONTENTS) == 0 /* The exp_seg_relro_adjust is enum phase_enum (0x4). 
*/ || *(htab->data_segment_phase) == 4 - || (info->disable_target_specific_optimizations - && info->relax_pass == 0)) + || (info->disable_target_specific_optimizations && !is_alignment_pass)) return true; struct bfd_elf_section_data *data = elf_section_data (sec); @@ -5741,7 +5807,10 @@ loongarch_elf_relax_section (bfd *abfd, asection *sec, htab->max_alignment = max_alignment; } - splay_tree pdops = pending_delete_ops_new (abfd); + splay_tree pdops = NULL; + if (!loongarch_sec_closed_for_deletion (sec)) + pdops = pending_delete_ops_new (abfd); + htab->pending_delete_ops = pdops; for (unsigned int i = 0; i < sec->reloc_count; i++) @@ -5783,7 +5852,13 @@ loongarch_elf_relax_section (bfd *abfd, asection *sec, } relax_func_t relax_func = NULL; - if (info->relax_pass == 0) + if (is_alignment_pass) + { + if (r_type != R_LARCH_ALIGN) + continue; + relax_func = loongarch_relax_align; + } + else { switch (r_type) { @@ -5837,10 +5912,6 @@ loongarch_elf_relax_section (bfd *abfd, asection *sec, continue; } } - else if (info->relax_pass == 1 && r_type == R_LARCH_ALIGN) - relax_func = loongarch_relax_align; - else - continue; /* Four kind of relocations: Normal: symval is the symbol address. 
@@ -5979,9 +6050,12 @@ loongarch_elf_relax_section (bfd *abfd, asection *sec, info, again, max_alignment); } - loongarch_relax_perform_deletes (abfd, sec, info); - htab->pending_delete_ops = NULL; - splay_tree_delete (pdops); + if (pdops) + { + loongarch_relax_perform_deletes (abfd, sec, info); + htab->pending_delete_ops = NULL; + splay_tree_delete (pdops); + } return true; } diff --git a/ld/emultempl/loongarchelf.em b/ld/emultempl/loongarchelf.em index 928fd83ab15..517ece16105 100644 --- a/ld/emultempl/loongarchelf.em +++ b/ld/emultempl/loongarchelf.em @@ -58,7 +58,7 @@ larch_elf_before_allocation (void) ENABLE_RELAXATION; } - link_info.relax_pass = 2; + link_info.relax_pass = 3; } static void diff --git a/ld/testsuite/ld-loongarch-elf/ld-loongarch-elf.exp b/ld/testsuite/ld-loongarch-elf/ld-loongarch-elf.exp index 2f09a69befb..e23cdc87161 100644 --- a/ld/testsuite/ld-loongarch-elf/ld-loongarch-elf.exp +++ b/ld/testsuite/ld-loongarch-elf/ld-loongarch-elf.exp @@ -45,6 +45,7 @@ if [istarget "loongarch64-*-*"] { run_dump_test "underflow_s_5_20" run_dump_test "tls-le-norelax" run_dump_test "tls-le-relax" + run_dump_test "relax-after-alignment" run_dump_test "relax-medium-call" run_dump_test "relax-medium-call-1" run_dump_test "check_got_relax" diff --git a/ld/testsuite/ld-loongarch-elf/relax-after-alignment.d b/ld/testsuite/ld-loongarch-elf/relax-after-alignment.d new file mode 100644 index 00000000000..844c518e9c7 --- /dev/null +++ b/ld/testsuite/ld-loongarch-elf/relax-after-alignment.d @@ -0,0 +1,30 @@ +#name: additional relaxation chances after alignment processing +#as: +#ld: --defsym _start=0 +#objdump: -d --no-show-raw-insn + +.*:\s+file format .* + + +Disassembly of section \.text: + +0000000120000400 : +\s*120000400:\s+pcalau12i\s+\$t0, 512 +\s*[0-9a-f]+:\s+addi\.d\s+\$t0, \$t0, 1024 +\s*[0-9a-f]+:\s+pcaddi\s+\$t0, 524286 +\s*[0-9a-f]+:\s+nop\s* +\s*\.\.\. +\s*120000c00:\s+pcaddi\s+\$t0, 523776 +\s*\.\.\. 
+ +0000000120200400 : +\s*120200400:\s+break\s+0x0 + +0000000120200404 : +\s*\.\.\. +\s*1203ffbfc:\s+pcaddi\s+\$t0, -523775 +\s*\.\.\. +\s*120400400:\s+pcaddi\s+\$t0, -524288 +\s*[0-9a-f]+:\s+nop\s* +\s*[0-9a-f]+:\s+pcalau12i\s+\$t0, -512 +\s*[0-9a-f]+:\s+addi\.d\s+\$t0, \$t0, 1024 diff --git a/ld/testsuite/ld-loongarch-elf/relax-after-alignment.s b/ld/testsuite/ld-loongarch-elf/relax-after-alignment.s new file mode 100644 index 00000000000..24d29ecb257 --- /dev/null +++ b/ld/testsuite/ld-loongarch-elf/relax-after-alignment.s @@ -0,0 +1,49 @@ +# 0x0 pre-relax, 0x400 post-relax +# all addresses are additionally offset by 0x120000000 without `ld -shared` +before: + la.pcrel $t0, target # too far; should stay as pcalau12i + addi.d + la.pcrel $t0, target # furthest reach of relax pass 2 + +# 0x10 pre-relax, 0x410 post-relax +.rept 508 +.word 0 +.endr + +# 0x800 {pre,post}-relax +# 255 nops + R_LARCH_ALIGN before relaxation +# none of the aligning nops should remain after relax pass 1 +.p2align 10 + +# 0xbfc pre-relax, 0xc00 post-relax + la.pcrel $t0, target # should become single pcaddi in relax pass 0 + +# 0xc04 {pre,post}-relax +.rept 523775 +.word 0 +.endr + +# 0x200400 {pre,post}-relax +target: + break 0 + +# 0x200404 {pre,post}-relax +after: +.rept 523774 +.word 0 +.endr + +# 0x3ffbfc {pre,post}-relax + la.pcrel $t0, target # should become single pcaddi in relax pass 0 + +# 255 nops + R_LARCH_ALIGN before relaxation +# none of the aligning nops should remain after relax pass 1 +.p2align 10 + +# 0x400000 pre-relax, 0x3ffc00 post-relax +.rept 512 +.word 0 +.endr + +# 0x400800 pre-relax, 0x400400 post-relax + la.pcrel $t0, target # furthest reach of relax pass 2 + la.pcrel $t0, target # too far; should stay as pcalau12i + addi.d -- 2.47.2