From: Mark Rutland Date: Wed, 3 Jun 2026 11:06:24 +0000 (+0100) Subject: arm64: fpsimd: Move fpsimd save/restore inline X-Git-Url: http://git.ipfire.org/gitweb/?a=commitdiff_plain;h=890712d4507b8950bd5fa005077a9178ddde95e6;p=thirdparty%2Flinux.git arm64: fpsimd: Move fpsimd save/restore inline Currently the FPSIMD register save/restore sequences are written in out-of-line assembly routines. While this works, it's somewhat painful: * As KVM needs to be able to use the sequences in hyp code, separate assembly files are used for the regular kernel and KVM code. While the common logic is shared in assembly macros, this still requires some duplication, and has led to some trivial divergence. * For historical reasons, the assembly macros take some register arguments as numerical indices (e.g. "fpsimd_save x0, 8" uses x0 and x8), which is simply confusing. * For historical reasons, the SVE save/restore code and FPSIMD save/restore code have distinct sequences for FPSR and FPCR. Ideally this logic would be shared. * The assembly sequences can't be instrumented, and so it's harder than necessary to catch memory safety issues. To handle the above, move the FPSIMD register save/restore sequences to inline assembly, and share the FPSR+FPCR save/restore with SVE. Neither GCC nor LLVM instrument memory arguments to inline assembly, so explicit instrumentation is added in the same manner as other assembly routines. This instrumentation is implicitly disabled by Kbuild for nVHE hyp code. I've used the SVE sequence for restoring FPCR, which uses an unconditional write to FPCR, rather than the conditional write used by the FPSIMD assembly sequence. I believe that in practice, this doesn't matter to a real workload, and given it's possible for the mis-predicted branch to cost more than the necessary micro-architectural synchronization, I strongly suspect any performance impact is within the noise. 
Looking at the history, the FPSIMD assembly sequence was changed to use a conditional write to FPCR in 2014, in commit: 5959e25729a5 ("arm64: fpsimd: avoid restoring fpcr if the contents haven't changed") ... as described in the commit message, this was based on an expectation of implementation style, and was not based on benchmarking. Signed-off-by: Mark Rutland Reviewed-by: Mark Brown Reviewed-by: Vladimir Murzin Cc: Catalin Marinas Cc: Fuad Tabba Cc: James Morse Cc: Marc Zyngier Cc: Oliver Upton Cc: Will Deacon Signed-off-by: Will Deacon --- diff --git a/arch/arm64/include/asm/fpsimd.h b/arch/arm64/include/asm/fpsimd.h index 6fd5cdf5e5f17..19b373ad0ebf7 100644 --- a/arch/arm64/include/asm/fpsimd.h +++ b/arch/arm64/include/asm/fpsimd.h @@ -22,6 +22,8 @@ #include #include +#define __FPSIMD_PREAMBLE ".arch_extension fp\n" \ + ".arch_extension simd\n" #define __SVE_PREAMBLE ".arch_extension sve\n" #define __SME_PREAMBLE ".arch_extension sme\n" @@ -86,8 +88,70 @@ static inline void fpsimd_load_common(const struct user_fpsimd_state *state) write_sysreg_s(state->fpcr, SYS_FPCR); } -extern void fpsimd_save_state(struct user_fpsimd_state *state); -extern void fpsimd_load_state(struct user_fpsimd_state *state); +static inline void fpsimd_save_vregs(struct user_fpsimd_state *state) +{ + instrument_write(state->vregs, sizeof(state->vregs)); + asm volatile( + __FPSIMD_PREAMBLE + " stp q0, q1, [%[vregs], #16 * 0]\n" + " stp q2, q3, [%[vregs], #16 * 2]\n" + " stp q4, q5, [%[vregs], #16 * 4]\n" + " stp q6, q7, [%[vregs], #16 * 6]\n" + " stp q8, q9, [%[vregs], #16 * 8]\n" + " stp q10, q11, [%[vregs], #16 * 10]\n" + " stp q12, q13, [%[vregs], #16 * 12]\n" + " stp q14, q15, [%[vregs], #16 * 14]\n" + " stp q16, q17, [%[vregs], #16 * 16]\n" + " stp q18, q19, [%[vregs], #16 * 18]\n" + " stp q20, q21, [%[vregs], #16 * 20]\n" + " stp q22, q23, [%[vregs], #16 * 22]\n" + " stp q24, q25, [%[vregs], #16 * 24]\n" + " stp q26, q27, [%[vregs], #16 * 26]\n" + " stp q28, q29, [%[vregs], #16 * 
28]\n" + " stp q30, q31, [%[vregs], #16 * 30]\n" + : "=Q" (state->vregs) + : [vregs] "r" (state->vregs) + ); +} + +static inline void fpsimd_load_vregs(const struct user_fpsimd_state *state) +{ + instrument_read(state->vregs, sizeof(state->vregs)); + asm volatile( + __FPSIMD_PREAMBLE + " ldp q0, q1, [%[vregs], #16 * 0]\n" + " ldp q2, q3, [%[vregs], #16 * 2]\n" + " ldp q4, q5, [%[vregs], #16 * 4]\n" + " ldp q6, q7, [%[vregs], #16 * 6]\n" + " ldp q8, q9, [%[vregs], #16 * 8]\n" + " ldp q10, q11, [%[vregs], #16 * 10]\n" + " ldp q12, q13, [%[vregs], #16 * 12]\n" + " ldp q14, q15, [%[vregs], #16 * 14]\n" + " ldp q16, q17, [%[vregs], #16 * 16]\n" + " ldp q18, q19, [%[vregs], #16 * 18]\n" + " ldp q20, q21, [%[vregs], #16 * 20]\n" + " ldp q22, q23, [%[vregs], #16 * 22]\n" + " ldp q24, q25, [%[vregs], #16 * 24]\n" + " ldp q26, q27, [%[vregs], #16 * 26]\n" + " ldp q28, q29, [%[vregs], #16 * 28]\n" + " ldp q30, q31, [%[vregs], #16 * 30]\n" + : + : "Q" (state->vregs), + [vregs] "r" (state->vregs) + ); +} + +static inline void fpsimd_save_state(struct user_fpsimd_state *state) +{ + fpsimd_save_vregs(state); + fpsimd_save_common(state); +} + +static inline void fpsimd_load_state(const struct user_fpsimd_state *state) +{ + fpsimd_load_vregs(state); + fpsimd_load_common(state); +} extern void fpsimd_thread_switch(struct task_struct *next); extern void fpsimd_flush_thread(void); diff --git a/arch/arm64/include/asm/fpsimdmacros.h b/arch/arm64/include/asm/fpsimdmacros.h index 1f32e0967dcd3..b486c6399bb4e 100644 --- a/arch/arm64/include/asm/fpsimdmacros.h +++ b/arch/arm64/include/asm/fpsimdmacros.h @@ -8,65 +8,6 @@ #include -.macro fpsimd_save state, tmpnr - stp q0, q1, [\state, #16 * 0] - stp q2, q3, [\state, #16 * 2] - stp q4, q5, [\state, #16 * 4] - stp q6, q7, [\state, #16 * 6] - stp q8, q9, [\state, #16 * 8] - stp q10, q11, [\state, #16 * 10] - stp q12, q13, [\state, #16 * 12] - stp q14, q15, [\state, #16 * 14] - stp q16, q17, [\state, #16 * 16] - stp q18, q19, [\state, #16 * 18] 
- stp q20, q21, [\state, #16 * 20] - stp q22, q23, [\state, #16 * 22] - stp q24, q25, [\state, #16 * 24] - stp q26, q27, [\state, #16 * 26] - stp q28, q29, [\state, #16 * 28] - stp q30, q31, [\state, #16 * 30]! - mrs x\tmpnr, fpsr - str w\tmpnr, [\state, #16 * 2] - mrs x\tmpnr, fpcr - str w\tmpnr, [\state, #16 * 2 + 4] -.endm - -.macro fpsimd_restore_fpcr state, tmp - /* - * Writes to fpcr may be self-synchronising, so avoid restoring - * the register if it hasn't changed. - */ - mrs \tmp, fpcr - cmp \tmp, \state - b.eq 9999f - msr fpcr, \state -9999: -.endm - -/* Clobbers \state */ -.macro fpsimd_restore state, tmpnr - ldp q0, q1, [\state, #16 * 0] - ldp q2, q3, [\state, #16 * 2] - ldp q4, q5, [\state, #16 * 4] - ldp q6, q7, [\state, #16 * 6] - ldp q8, q9, [\state, #16 * 8] - ldp q10, q11, [\state, #16 * 10] - ldp q12, q13, [\state, #16 * 12] - ldp q14, q15, [\state, #16 * 14] - ldp q16, q17, [\state, #16 * 16] - ldp q18, q19, [\state, #16 * 18] - ldp q20, q21, [\state, #16 * 20] - ldp q22, q23, [\state, #16 * 22] - ldp q24, q25, [\state, #16 * 24] - ldp q26, q27, [\state, #16 * 26] - ldp q28, q29, [\state, #16 * 28] - ldp q30, q31, [\state, #16 * 30]! 
- ldr w\tmpnr, [\state, #16 * 2] - msr fpsr, x\tmpnr - ldr w\tmpnr, [\state, #16 * 2 + 4] - fpsimd_restore_fpcr x\tmpnr, \state -.endm - /* Sanity-check macros to help avoid encoding garbage instructions */ .macro _check_general_reg nr diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h index 0030cc1b52197..8c4602c8f4356 100644 --- a/arch/arm64/include/asm/kvm_hyp.h +++ b/arch/arm64/include/asm/kvm_hyp.h @@ -121,8 +121,6 @@ void __debug_save_host_buffers_nvhe(struct kvm_vcpu *vcpu); void __debug_restore_host_buffers_nvhe(struct kvm_vcpu *vcpu); #endif -void __fpsimd_save_state(struct user_fpsimd_state *fp_regs); -void __fpsimd_restore_state(struct user_fpsimd_state *fp_regs); void __sve_save_state(void *sve, int save_ffr); void __sve_restore_state(void *sve, int restore_ffr); diff --git a/arch/arm64/kernel/entry-fpsimd.S b/arch/arm64/kernel/entry-fpsimd.S index 1a581597037ca..66668bfca5ae8 100644 --- a/arch/arm64/kernel/entry-fpsimd.S +++ b/arch/arm64/kernel/entry-fpsimd.S @@ -11,26 +11,6 @@ #include #include -/* - * Save the FP registers. - * - * x0 - pointer to struct fpsimd_state - */ -SYM_FUNC_START(fpsimd_save_state) - fpsimd_save x0, 8 - ret -SYM_FUNC_END(fpsimd_save_state) - -/* - * Load the FP registers. 
- * - * x0 - pointer to struct fpsimd_state - */ -SYM_FUNC_START(fpsimd_load_state) - fpsimd_restore x0, 8 - ret -SYM_FUNC_END(fpsimd_load_state) - #ifdef CONFIG_ARM64_SVE /* diff --git a/arch/arm64/kvm/hyp/fpsimd.S b/arch/arm64/kvm/hyp/fpsimd.S index 3d5b5e93ee5e0..00c56e31484a5 100644 --- a/arch/arm64/kvm/hyp/fpsimd.S +++ b/arch/arm64/kvm/hyp/fpsimd.S @@ -10,16 +10,6 @@ .text -SYM_FUNC_START(__fpsimd_save_state) - fpsimd_save x0, 1 - ret -SYM_FUNC_END(__fpsimd_save_state) - -SYM_FUNC_START(__fpsimd_restore_state) - fpsimd_restore x0, 1 - ret -SYM_FUNC_END(__fpsimd_restore_state) - SYM_FUNC_START(__sve_restore_state) sve_load 0, w1 ret diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index eb76a863ebb84..aaa43554fd8e6 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -565,7 +565,7 @@ static void kvm_hyp_save_fpsimd_host(struct kvm_vcpu *vcpu) if (system_supports_sve()) { __hyp_sve_save_host(); } else { - __fpsimd_save_state(&hctxt->fp_regs); + fpsimd_save_state(&hctxt->fp_regs); } if (kvm_has_fpmr(kern_hyp_va(vcpu->kvm))) @@ -625,7 +625,7 @@ static inline bool kvm_hyp_handle_fpsimd(struct kvm_vcpu *vcpu, u64 *exit_code) if (sve_guest) __hyp_sve_restore_guest(vcpu); else - __fpsimd_restore_state(&vcpu->arch.ctxt.fp_regs); + fpsimd_load_state(&vcpu->arch.ctxt.fp_regs); if (kvm_has_fpmr(kern_hyp_va(vcpu->kvm))) write_sysreg_s(__vcpu_sys_reg(vcpu, FPMR), SYS_FPMR); diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index 2ed4720e4f707..71fcfe8928f04 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -83,7 +83,7 @@ static void fpsimd_sve_sync(struct kvm_vcpu *vcpu) if (vcpu_has_sve(vcpu)) __hyp_sve_save_guest(vcpu); else - __fpsimd_save_state(&vcpu->arch.ctxt.fp_regs); + fpsimd_save_state(&vcpu->arch.ctxt.fp_regs); has_fpmr = kvm_has_fpmr(kern_hyp_va(vcpu->kvm)); if (has_fpmr) @@ -92,7 +92,7 @@ 
static void fpsimd_sve_sync(struct kvm_vcpu *vcpu) if (system_supports_sve()) __hyp_sve_restore_host(); else - __fpsimd_restore_state(&hctxt->fp_regs); + fpsimd_load_state(&hctxt->fp_regs); if (has_fpmr) write_sysreg_s(ctxt_sys_reg(hctxt, FPMR), SYS_FPMR);