From: Greg Kroah-Hartman Date: Thu, 13 Mar 2025 16:09:28 +0000 (+0100) Subject: 6.13-stable patches X-Git-Tag: v6.6.84~63 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=e0e02873ba6a96a11c1a2d9eb315fc1f7c5dbd89;p=thirdparty%2Fkernel%2Fstable-queue.git 6.13-stable patches added patches: kvm-arm64-calculate-cptr_el2-traps-on-activating-traps.patch kvm-arm64-eagerly-switch-zcr_el-1-2.patch kvm-arm64-mark-some-header-functions-as-inline.patch kvm-arm64-refactor-exit-handlers.patch kvm-arm64-remove-host-fpsimd-saving-for-non-protected-kvm.patch kvm-arm64-remove-vhe-host-restore-of-cpacr_el1.smen.patch kvm-arm64-remove-vhe-host-restore-of-cpacr_el1.zen.patch kvm-arm64-unconditionally-save-flush-host-fpsimd-sve-sme-state.patch mm-fix-kernel-bug-when-userfaultfd_move-encounters-swapcache.patch mm-slab-kvfree_rcu-switch-to-wq_mem_reclaim-wq.patch series userfaultfd-fix-pte-unmapping-stack-allocated-pte-copies.patch virt-sev-guest-move-snp-guest-request-data-pages-handling-under-snp_cmd_mutex.patch --- diff --git a/queue-6.13/kvm-arm64-calculate-cptr_el2-traps-on-activating-traps.patch b/queue-6.13/kvm-arm64-calculate-cptr_el2-traps-on-activating-traps.patch new file mode 100644 index 0000000000..a7f71967e6 --- /dev/null +++ b/queue-6.13/kvm-arm64-calculate-cptr_el2-traps-on-activating-traps.patch @@ -0,0 +1,210 @@ +From stable+bounces-124192-greg=kroah.com@vger.kernel.org Thu Mar 13 00:49:47 2025 +From: Mark Brown +Date: Wed, 12 Mar 2025 23:49:09 +0000 +Subject: KVM: arm64: Calculate cptr_el2 traps on activating traps +To: Greg Kroah-Hartman , Marc Zyngier , Oliver Upton , Joey Gouly , Suzuki K Poulose , Catalin Marinas , Will Deacon +Cc: linux-arm-kernel@lists.infradead.org, kvmarm@lists.linux.dev, linux-kernel@vger.kernel.org, stable@vger.kernel.org, Mark Brown , Fuad Tabba , James Clark +Message-ID: <20250312-stable-sve-6-13-v1-1-c7ba07a6f4f7@kernel.org> + +From: Fuad Tabba + +[ Upstream commit 2fd5b4b0e7b440602455b79977bfa64dea101e6c ] + +Similar to VHE, calculate the value of cptr_el2 from scratch on +activate traps. This removes the need to store cptr_el2 in every +vcpu structure. Moreover, some traps, such as whether the guest +owns the fp registers, need to be set on every vcpu run. + +Reported-by: James Clark +Fixes: 5294afdbf45a ("KVM: arm64: Exclude FP ownership from kvm_vcpu_arch") +Signed-off-by: Fuad Tabba +Link: https://lore.kernel.org/r/20241216105057.579031-13-tabba@google.com +Signed-off-by: Marc Zyngier +Signed-off-by: Mark Brown +Signed-off-by: Greg Kroah-Hartman +--- + arch/arm64/include/asm/kvm_host.h | 1 + arch/arm64/kvm/arm.c | 1 + arch/arm64/kvm/hyp/nvhe/pkvm.c | 30 ---------------------- + arch/arm64/kvm/hyp/nvhe/switch.c | 51 +++++++++++++++++++++++--------------- + 4 files changed, 32 insertions(+), 51 deletions(-) + +--- a/arch/arm64/include/asm/kvm_host.h ++++ b/arch/arm64/include/asm/kvm_host.h +@@ -708,7 +708,6 @@ struct kvm_vcpu_arch { + u64 hcr_el2; + u64 hcrx_el2; + u64 mdcr_el2; +- u64 cptr_el2; + + /* Exception Information */ + struct kvm_vcpu_fault_info fault; +--- a/arch/arm64/kvm/arm.c ++++ b/arch/arm64/kvm/arm.c +@@ -1569,7 +1569,6 @@ static int kvm_arch_vcpu_ioctl_vcpu_init + } + + vcpu_reset_hcr(vcpu); +- vcpu->arch.cptr_el2 = kvm_get_reset_cptr_el2(vcpu); + + /* + * Handle the "start in power-off" case. 
+--- a/arch/arm64/kvm/hyp/nvhe/pkvm.c ++++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c +@@ -31,8 +31,6 @@ static void pvm_init_traps_aa64pfr0(stru + const u64 feature_ids = pvm_read_id_reg(vcpu, SYS_ID_AA64PFR0_EL1); + u64 hcr_set = HCR_RW; + u64 hcr_clear = 0; +- u64 cptr_set = 0; +- u64 cptr_clear = 0; + + /* Protected KVM does not support AArch32 guests. */ + BUILD_BUG_ON(FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_EL0), +@@ -62,21 +60,10 @@ static void pvm_init_traps_aa64pfr0(stru + /* Trap AMU */ + if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_AMU), feature_ids)) { + hcr_clear |= HCR_AMVOFFEN; +- cptr_set |= CPTR_EL2_TAM; +- } +- +- /* Trap SVE */ +- if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_SVE), feature_ids)) { +- if (has_hvhe()) +- cptr_clear |= CPACR_ELx_ZEN; +- else +- cptr_set |= CPTR_EL2_TZ; + } + + vcpu->arch.hcr_el2 |= hcr_set; + vcpu->arch.hcr_el2 &= ~hcr_clear; +- vcpu->arch.cptr_el2 |= cptr_set; +- vcpu->arch.cptr_el2 &= ~cptr_clear; + } + + /* +@@ -106,7 +93,6 @@ static void pvm_init_traps_aa64dfr0(stru + const u64 feature_ids = pvm_read_id_reg(vcpu, SYS_ID_AA64DFR0_EL1); + u64 mdcr_set = 0; + u64 mdcr_clear = 0; +- u64 cptr_set = 0; + + /* Trap/constrain PMU */ + if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer), feature_ids)) { +@@ -133,21 +119,12 @@ static void pvm_init_traps_aa64dfr0(stru + if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_TraceFilt), feature_ids)) + mdcr_set |= MDCR_EL2_TTRF; + +- /* Trap Trace */ +- if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_TraceVer), feature_ids)) { +- if (has_hvhe()) +- cptr_set |= CPACR_EL1_TTA; +- else +- cptr_set |= CPTR_EL2_TTA; +- } +- + /* Trap External Trace */ + if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_ExtTrcBuff), feature_ids)) + mdcr_clear |= MDCR_EL2_E2TB_MASK; + + vcpu->arch.mdcr_el2 |= mdcr_set; + vcpu->arch.mdcr_el2 &= ~mdcr_clear; +- vcpu->arch.cptr_el2 |= cptr_set; + } + + /* +@@ -198,10 +175,6 @@ static void pvm_init_trap_regs(struct kv + /* Clear res0 and set res1 bits to trap potential new features. */ + vcpu->arch.hcr_el2 &= ~(HCR_RES0); + vcpu->arch.mdcr_el2 &= ~(MDCR_EL2_RES0); +- if (!has_hvhe()) { +- vcpu->arch.cptr_el2 |= CPTR_NVHE_EL2_RES1; +- vcpu->arch.cptr_el2 &= ~(CPTR_NVHE_EL2_RES0); +- } + } + + static void pkvm_vcpu_reset_hcr(struct kvm_vcpu *vcpu) +@@ -236,7 +209,6 @@ static void pkvm_vcpu_reset_hcr(struct k + */ + static void pkvm_vcpu_init_traps(struct kvm_vcpu *vcpu) + { +- vcpu->arch.cptr_el2 = kvm_get_reset_cptr_el2(vcpu); + vcpu->arch.mdcr_el2 = 0; + + pkvm_vcpu_reset_hcr(vcpu); +@@ -693,8 +665,6 @@ unlock: + return ret; + } + +- hyp_vcpu->vcpu.arch.cptr_el2 = kvm_get_reset_cptr_el2(&hyp_vcpu->vcpu); +- + return 0; + } + +--- a/arch/arm64/kvm/hyp/nvhe/switch.c ++++ b/arch/arm64/kvm/hyp/nvhe/switch.c +@@ -36,33 +36,46 @@ DEFINE_PER_CPU(unsigned long, kvm_hyp_ve + + extern void kvm_nvhe_prepare_backtrace(unsigned long fp, unsigned long pc); + +-static void __activate_traps(struct kvm_vcpu *vcpu) ++static void __activate_cptr_traps(struct kvm_vcpu *vcpu) + { +- u64 val; ++ u64 val = CPTR_EL2_TAM; /* Same bit irrespective of E2H */ + +- ___activate_traps(vcpu, vcpu->arch.hcr_el2); +- __activate_traps_common(vcpu); ++ if (has_hvhe()) { ++ val |= CPACR_ELx_TTA; + +- val = vcpu->arch.cptr_el2; +- val |= CPTR_EL2_TAM; /* Same bit irrespective of E2H */ +- val |= has_hvhe() ? 
CPACR_EL1_TTA : CPTR_EL2_TTA; +- if (cpus_have_final_cap(ARM64_SME)) { +- if (has_hvhe()) +- val &= ~CPACR_ELx_SMEN; +- else +- val |= CPTR_EL2_TSM; +- } ++ if (guest_owns_fp_regs()) { ++ val |= CPACR_ELx_FPEN; ++ if (vcpu_has_sve(vcpu)) ++ val |= CPACR_ELx_ZEN; ++ } ++ } else { ++ val |= CPTR_EL2_TTA | CPTR_NVHE_EL2_RES1; ++ ++ /* ++ * Always trap SME since it's not supported in KVM. ++ * TSM is RES1 if SME isn't implemented. ++ */ ++ val |= CPTR_EL2_TSM; + +- if (!guest_owns_fp_regs()) { +- if (has_hvhe()) +- val &= ~(CPACR_ELx_FPEN | CPACR_ELx_ZEN); +- else +- val |= CPTR_EL2_TFP | CPTR_EL2_TZ; ++ if (!vcpu_has_sve(vcpu) || !guest_owns_fp_regs()) ++ val |= CPTR_EL2_TZ; + +- __activate_traps_fpsimd32(vcpu); ++ if (!guest_owns_fp_regs()) ++ val |= CPTR_EL2_TFP; + } + ++ if (!guest_owns_fp_regs()) ++ __activate_traps_fpsimd32(vcpu); ++ + kvm_write_cptr_el2(val); ++} ++ ++static void __activate_traps(struct kvm_vcpu *vcpu) ++{ ++ ___activate_traps(vcpu, vcpu->arch.hcr_el2); ++ __activate_traps_common(vcpu); ++ __activate_cptr_traps(vcpu); ++ + write_sysreg(__this_cpu_read(kvm_hyp_vector), vbar_el2); + + if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) { diff --git a/queue-6.13/kvm-arm64-eagerly-switch-zcr_el-1-2.patch b/queue-6.13/kvm-arm64-eagerly-switch-zcr_el-1-2.patch new file mode 100644 index 0000000000..b1379f00bb --- /dev/null +++ b/queue-6.13/kvm-arm64-eagerly-switch-zcr_el-1-2.patch @@ -0,0 +1,321 @@ +From broonie@kernel.org Thu Mar 13 00:49:53 2025 +From: Mark Brown +Date: Wed, 12 Mar 2025 23:49:16 +0000 +Subject: KVM: arm64: Eagerly switch ZCR_EL{1,2} +To: Greg Kroah-Hartman , Marc Zyngier , Oliver Upton , Joey Gouly , Suzuki K Poulose , Catalin Marinas , Will Deacon +Cc: linux-arm-kernel@lists.infradead.org, kvmarm@lists.linux.dev, linux-kernel@vger.kernel.org, stable@vger.kernel.org, Mark Brown , Mark Rutland , Fuad Tabba +Message-ID: <20250312-stable-sve-6-13-v1-8-c7ba07a6f4f7@kernel.org> + +From: Mark Rutland + +In non-protected KVM modes, while the guest FPSIMD/SVE/SME state is live on the +CPU, the host's active SVE VL may differ from the guest's maximum SVE VL: + +* For VHE hosts, when a VM uses NV, ZCR_EL2 contains a value constrained + by the guest hypervisor, which may be less than or equal to that + guest's maximum VL. + + Note: in this case the value of ZCR_EL1 is immaterial due to E2H. + +* For nVHE/hVHE hosts, ZCR_EL1 contains a value written by the guest, + which may be less than or greater than the guest's maximum VL. + + Note: in this case hyp code traps host SVE usage and lazily restores + ZCR_EL2 to the host's maximum VL, which may be greater than the + guest's maximum VL. + +This can be the case between exiting a guest and kvm_arch_vcpu_put_fp(). +If a softirq is taken during this period and the softirq handler tries +to use kernel-mode NEON, then the kernel will fail to save the guest's +FPSIMD/SVE state, and will pend a SIGKILL for the current thread. + +This happens because kvm_arch_vcpu_ctxsync_fp() binds the guest's live +FPSIMD/SVE state with the guest's maximum SVE VL, and +fpsimd_save_user_state() verifies that the live SVE VL is as expected +before attempting to save the register state: + +| if (WARN_ON(sve_get_vl() != vl)) { +| force_signal_inject(SIGKILL, SI_KERNEL, 0, 0); +| return; +| } + +Fix this and make this a bit easier to reason about by always eagerly +switching ZCR_EL{1,2} at hyp during guest<->host transitions. 
With this +happening, there's no need to trap host SVE usage, and the nVHE/nVHE +__deactivate_cptr_traps() logic can be simplified to enable host access +to all present FPSIMD/SVE/SME features. + +In protected nVHE/hVHE modes, the host's state is always saved/restored +by hyp, and the guest's state is saved prior to exit to the host, so +from the host's PoV the guest never has live FPSIMD/SVE/SME state, and +the host's ZCR_EL1 is never clobbered by hyp. + +Fixes: 8c8010d69c132273 ("KVM: arm64: Save/restore SVE state for nVHE") +Fixes: 2e3cf82063a00ea0 ("KVM: arm64: nv: Ensure correct VL is loaded before saving SVE state") +Signed-off-by: Mark Rutland +Reviewed-by: Mark Brown +Tested-by: Mark Brown +Cc: Catalin Marinas +Cc: Fuad Tabba +Cc: Marc Zyngier +Cc: Oliver Upton +Cc: Will Deacon +Reviewed-by: Oliver Upton +Link: https://lore.kernel.org/r/20250210195226.1215254-9-mark.rutland@arm.com +Signed-off-by: Marc Zyngier +(cherry picked from commit 59419f10045bc955d2229819c7cf7a8b0b9c5b59) +Signed-off-by: Mark Brown +Signed-off-by: Greg Kroah-Hartman +--- + arch/arm64/kvm/fpsimd.c | 30 ---------------- + arch/arm64/kvm/hyp/entry.S | 5 ++ + arch/arm64/kvm/hyp/include/hyp/switch.h | 59 ++++++++++++++++++++++++++++++++ + arch/arm64/kvm/hyp/nvhe/hyp-main.c | 13 +++---- + arch/arm64/kvm/hyp/nvhe/switch.c | 33 +++++++++++++++-- + arch/arm64/kvm/hyp/vhe/switch.c | 4 ++ + 6 files changed, 103 insertions(+), 41 deletions(-) + +--- a/arch/arm64/kvm/fpsimd.c ++++ b/arch/arm64/kvm/fpsimd.c +@@ -136,36 +136,6 @@ void kvm_arch_vcpu_put_fp(struct kvm_vcp + local_irq_save(flags); + + if (guest_owns_fp_regs()) { +- if (vcpu_has_sve(vcpu)) { +- u64 zcr = read_sysreg_el1(SYS_ZCR); +- +- /* +- * If the vCPU is in the hyp context then ZCR_EL1 is +- * loaded with its vEL2 counterpart. +- */ +- __vcpu_sys_reg(vcpu, vcpu_sve_zcr_elx(vcpu)) = zcr; +- +- /* +- * Restore the VL that was saved when bound to the CPU, +- * which is the maximum VL for the guest. Because the +- * layout of the data when saving the sve state depends +- * on the VL, we need to use a consistent (i.e., the +- * maximum) VL. +- * Note that this means that at guest exit ZCR_EL1 is +- * not necessarily the same as on guest entry. +- * +- * ZCR_EL2 holds the guest hypervisor's VL when running +- * a nested guest, which could be smaller than the +- * max for the vCPU. Similar to above, we first need to +- * switch to a VL consistent with the layout of the +- * vCPU's SVE state. KVM support for NV implies VHE, so +- * using the ZCR_EL1 alias is safe. +- */ +- if (!has_vhe() || (vcpu_has_nv(vcpu) && !is_hyp_ctxt(vcpu))) +- sve_cond_update_zcr_vq(vcpu_sve_max_vq(vcpu) - 1, +- SYS_ZCR_EL1); +- } +- + /* + * Flush (save and invalidate) the fpsimd/sve state so that if + * the host tries to use fpsimd/sve, it's not using stale data +--- a/arch/arm64/kvm/hyp/entry.S ++++ b/arch/arm64/kvm/hyp/entry.S +@@ -44,6 +44,11 @@ alternative_if ARM64_HAS_RAS_EXTN + alternative_else_nop_endif + mrs x1, isr_el1 + cbz x1, 1f ++ ++ // Ensure that __guest_enter() always provides a context ++ // synchronization event so that callers don't need ISBs for anything ++ // that would usually be synchonized by the ERET. 
++ isb + mov x0, #ARM_EXCEPTION_IRQ + ret + +--- a/arch/arm64/kvm/hyp/include/hyp/switch.h ++++ b/arch/arm64/kvm/hyp/include/hyp/switch.h +@@ -375,6 +375,65 @@ static inline void __hyp_sve_save_host(v + true); + } + ++static inline void fpsimd_lazy_switch_to_guest(struct kvm_vcpu *vcpu) ++{ ++ u64 zcr_el1, zcr_el2; ++ ++ if (!guest_owns_fp_regs()) ++ return; ++ ++ if (vcpu_has_sve(vcpu)) { ++ /* A guest hypervisor may restrict the effective max VL. */ ++ if (vcpu_has_nv(vcpu) && !is_hyp_ctxt(vcpu)) ++ zcr_el2 = __vcpu_sys_reg(vcpu, ZCR_EL2); ++ else ++ zcr_el2 = vcpu_sve_max_vq(vcpu) - 1; ++ ++ write_sysreg_el2(zcr_el2, SYS_ZCR); ++ ++ zcr_el1 = __vcpu_sys_reg(vcpu, vcpu_sve_zcr_elx(vcpu)); ++ write_sysreg_el1(zcr_el1, SYS_ZCR); ++ } ++} ++ ++static inline void fpsimd_lazy_switch_to_host(struct kvm_vcpu *vcpu) ++{ ++ u64 zcr_el1, zcr_el2; ++ ++ if (!guest_owns_fp_regs()) ++ return; ++ ++ /* ++ * When the guest owns the FP regs, we know that guest+hyp traps for ++ * any FPSIMD/SVE/SME features exposed to the guest have been disabled ++ * by either fpsimd_lazy_switch_to_guest() or kvm_hyp_handle_fpsimd() ++ * prior to __guest_entry(). As __guest_entry() guarantees a context ++ * synchronization event, we don't need an ISB here to avoid taking ++ * traps for anything that was exposed to the guest. ++ */ ++ if (vcpu_has_sve(vcpu)) { ++ zcr_el1 = read_sysreg_el1(SYS_ZCR); ++ __vcpu_sys_reg(vcpu, vcpu_sve_zcr_elx(vcpu)) = zcr_el1; ++ ++ /* ++ * The guest's state is always saved using the guest's max VL. ++ * Ensure that the host has the guest's max VL active such that ++ * the host can save the guest's state lazily, but don't ++ * artificially restrict the host to the guest's max VL. ++ */ ++ if (has_vhe()) { ++ zcr_el2 = vcpu_sve_max_vq(vcpu) - 1; ++ write_sysreg_el2(zcr_el2, SYS_ZCR); ++ } else { ++ zcr_el2 = sve_vq_from_vl(kvm_host_sve_max_vl) - 1; ++ write_sysreg_el2(zcr_el2, SYS_ZCR); ++ ++ zcr_el1 = vcpu_sve_max_vq(vcpu) - 1; ++ write_sysreg_el1(zcr_el1, SYS_ZCR); ++ } ++ } ++} ++ + static void kvm_hyp_save_fpsimd_host(struct kvm_vcpu *vcpu) + { + /* +--- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c ++++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c +@@ -5,6 +5,7 @@ + */ + + #include ++#include + + #include + #include +@@ -178,8 +179,12 @@ static void handle___kvm_vcpu_run(struct + sync_hyp_vcpu(hyp_vcpu); + pkvm_put_hyp_vcpu(hyp_vcpu); + } else { ++ struct kvm_vcpu *vcpu = kern_hyp_va(host_vcpu); ++ + /* The host is fully trusted, run its vCPU directly. 
*/ +- ret = __kvm_vcpu_run(host_vcpu); ++ fpsimd_lazy_switch_to_guest(vcpu); ++ ret = __kvm_vcpu_run(vcpu); ++ fpsimd_lazy_switch_to_host(vcpu); + } + + out: +@@ -480,12 +485,6 @@ void handle_trap(struct kvm_cpu_context + case ESR_ELx_EC_SMC64: + handle_host_smc(host_ctxt); + break; +- case ESR_ELx_EC_SVE: +- cpacr_clear_set(0, CPACR_ELx_ZEN); +- isb(); +- sve_cond_update_zcr_vq(sve_vq_from_vl(kvm_host_sve_max_vl) - 1, +- SYS_ZCR_EL2); +- break; + case ESR_ELx_EC_IABT_LOW: + case ESR_ELx_EC_DABT_LOW: + handle_host_mem_abort(host_ctxt); +--- a/arch/arm64/kvm/hyp/nvhe/switch.c ++++ b/arch/arm64/kvm/hyp/nvhe/switch.c +@@ -40,6 +40,9 @@ static void __activate_cptr_traps(struct + { + u64 val = CPTR_EL2_TAM; /* Same bit irrespective of E2H */ + ++ if (!guest_owns_fp_regs()) ++ __activate_traps_fpsimd32(vcpu); ++ + if (has_hvhe()) { + val |= CPACR_ELx_TTA; + +@@ -48,6 +51,8 @@ static void __activate_cptr_traps(struct + if (vcpu_has_sve(vcpu)) + val |= CPACR_ELx_ZEN; + } ++ ++ write_sysreg(val, cpacr_el1); + } else { + val |= CPTR_EL2_TTA | CPTR_NVHE_EL2_RES1; + +@@ -62,12 +67,32 @@ static void __activate_cptr_traps(struct + + if (!guest_owns_fp_regs()) + val |= CPTR_EL2_TFP; ++ ++ write_sysreg(val, cptr_el2); + } ++} + +- if (!guest_owns_fp_regs()) +- __activate_traps_fpsimd32(vcpu); ++static void __deactivate_cptr_traps(struct kvm_vcpu *vcpu) ++{ ++ if (has_hvhe()) { ++ u64 val = CPACR_ELx_FPEN; ++ ++ if (cpus_have_final_cap(ARM64_SVE)) ++ val |= CPACR_ELx_ZEN; ++ if (cpus_have_final_cap(ARM64_SME)) ++ val |= CPACR_ELx_SMEN; ++ ++ write_sysreg(val, cpacr_el1); ++ } else { ++ u64 val = CPTR_NVHE_EL2_RES1; ++ ++ if (!cpus_have_final_cap(ARM64_SVE)) ++ val |= CPTR_EL2_TZ; ++ if (!cpus_have_final_cap(ARM64_SME)) ++ val |= CPTR_EL2_TSM; + +- kvm_write_cptr_el2(val); ++ write_sysreg(val, cptr_el2); ++ } + } + + static void __activate_traps(struct kvm_vcpu *vcpu) +@@ -120,7 +145,7 @@ static void __deactivate_traps(struct kv + + write_sysreg(this_cpu_ptr(&kvm_init_params)->hcr_el2, hcr_el2); + +- kvm_reset_cptr_el2(vcpu); ++ __deactivate_cptr_traps(vcpu); + write_sysreg(__kvm_hyp_host_vector, vbar_el2); + } + +--- a/arch/arm64/kvm/hyp/vhe/switch.c ++++ b/arch/arm64/kvm/hyp/vhe/switch.c +@@ -462,6 +462,8 @@ static int __kvm_vcpu_run_vhe(struct kvm + + sysreg_save_host_state_vhe(host_ctxt); + ++ fpsimd_lazy_switch_to_guest(vcpu); ++ + /* + * Note that ARM erratum 1165522 requires us to configure both stage 1 + * and stage 2 translation for the guest context before we clear +@@ -486,6 +488,8 @@ static int __kvm_vcpu_run_vhe(struct kvm + + __deactivate_traps(vcpu); + ++ fpsimd_lazy_switch_to_host(vcpu); ++ + sysreg_restore_host_state_vhe(host_ctxt); + + if (guest_owns_fp_regs()) diff --git a/queue-6.13/kvm-arm64-mark-some-header-functions-as-inline.patch b/queue-6.13/kvm-arm64-mark-some-header-functions-as-inline.patch new file mode 100644 index 0000000000..45cbf53846 --- /dev/null +++ b/queue-6.13/kvm-arm64-mark-some-header-functions-as-inline.patch @@ -0,0 +1,122 @@ +From broonie@kernel.org Thu Mar 13 00:49:50 2025 +From: Mark Brown +Date: Wed, 12 Mar 2025 23:49:15 +0000 +Subject: KVM: arm64: Mark some header functions as inline +To: Greg Kroah-Hartman , Marc Zyngier , Oliver Upton , Joey Gouly , Suzuki K Poulose , Catalin Marinas , Will Deacon +Cc: linux-arm-kernel@lists.infradead.org, kvmarm@lists.linux.dev, linux-kernel@vger.kernel.org, stable@vger.kernel.org, Mark Brown , Mark Rutland , Fuad Tabba +Message-ID: <20250312-stable-sve-6-13-v1-7-c7ba07a6f4f7@kernel.org> + +From: Mark Rutland + +[ Upstream 
commit f9dd00de1e53a47763dfad601635d18542c3836d ] + +The shared hyp switch header has a number of static functions which +might not be used by all files that include the header, and when unused +they will provoke compiler warnings, e.g. + +| In file included from arch/arm64/kvm/hyp/nvhe/hyp-main.c:8: +| ./arch/arm64/kvm/hyp/include/hyp/switch.h:703:13: warning: 'kvm_hyp_handle_dabt_low' defined but not used [-Wunused-function] +| 703 | static bool kvm_hyp_handle_dabt_low(struct kvm_vcpu *vcpu, u64 *exit_code) +| | ^~~~~~~~~~~~~~~~~~~~~~~ +| ./arch/arm64/kvm/hyp/include/hyp/switch.h:682:13: warning: 'kvm_hyp_handle_cp15_32' defined but not used [-Wunused-function] +| 682 | static bool kvm_hyp_handle_cp15_32(struct kvm_vcpu *vcpu, u64 *exit_code) +| | ^~~~~~~~~~~~~~~~~~~~~~ +| ./arch/arm64/kvm/hyp/include/hyp/switch.h:662:13: warning: 'kvm_hyp_handle_sysreg' defined but not used [-Wunused-function] +| 662 | static bool kvm_hyp_handle_sysreg(struct kvm_vcpu *vcpu, u64 *exit_code) +| | ^~~~~~~~~~~~~~~~~~~~~ +| ./arch/arm64/kvm/hyp/include/hyp/switch.h:458:13: warning: 'kvm_hyp_handle_fpsimd' defined but not used [-Wunused-function] +| 458 | static bool kvm_hyp_handle_fpsimd(struct kvm_vcpu *vcpu, u64 *exit_code) +| | ^~~~~~~~~~~~~~~~~~~~~ +| ./arch/arm64/kvm/hyp/include/hyp/switch.h:329:13: warning: 'kvm_hyp_handle_mops' defined but not used [-Wunused-function] +| 329 | static bool kvm_hyp_handle_mops(struct kvm_vcpu *vcpu, u64 *exit_code) +| | ^~~~~~~~~~~~~~~~~~~ + +Mark these functions as 'inline' to suppress this warning. This +shouldn't result in any functional change. + +At the same time, avoid the use of __alias() in the header and alias +kvm_hyp_handle_iabt_low() and kvm_hyp_handle_watchpt_low() to +kvm_hyp_handle_memory_fault() using CPP, matching the style in the rest +of the kernel. For consistency, kvm_hyp_handle_memory_fault() is also +marked as 'inline'. + +Signed-off-by: Mark Rutland +Reviewed-by: Mark Brown +Tested-by: Mark Brown +Acked-by: Will Deacon +Cc: Catalin Marinas +Cc: Fuad Tabba +Cc: Marc Zyngier +Cc: Oliver Upton +Reviewed-by: Oliver Upton +Link: https://lore.kernel.org/r/20250210195226.1215254-8-mark.rutland@arm.com +Signed-off-by: Marc Zyngier +Signed-off-by: Mark Brown +Signed-off-by: Greg Kroah-Hartman +--- + arch/arm64/kvm/hyp/include/hyp/switch.h | 19 +++++++++---------- + 1 file changed, 9 insertions(+), 10 deletions(-) + +--- a/arch/arm64/kvm/hyp/include/hyp/switch.h ++++ b/arch/arm64/kvm/hyp/include/hyp/switch.h +@@ -326,7 +326,7 @@ static inline bool __populate_fault_info + return __get_fault_info(vcpu->arch.fault.esr_el2, &vcpu->arch.fault); + } + +-static bool kvm_hyp_handle_mops(struct kvm_vcpu *vcpu, u64 *exit_code) ++static inline bool kvm_hyp_handle_mops(struct kvm_vcpu *vcpu, u64 *exit_code) + { + *vcpu_pc(vcpu) = read_sysreg_el2(SYS_ELR); + arm64_mops_reset_regs(vcpu_gp_regs(vcpu), vcpu->arch.fault.esr_el2); +@@ -404,7 +404,7 @@ static void kvm_hyp_save_fpsimd_host(str + * If FP/SIMD is not implemented, handle the trap and inject an undefined + * instruction exception to the guest. Similarly for trapped SVE accesses. 
+ */ +-static bool kvm_hyp_handle_fpsimd(struct kvm_vcpu *vcpu, u64 *exit_code) ++static inline bool kvm_hyp_handle_fpsimd(struct kvm_vcpu *vcpu, u64 *exit_code) + { + bool sve_guest; + u8 esr_ec; +@@ -595,7 +595,7 @@ static bool handle_ampere1_tcr(struct kv + return true; + } + +-static bool kvm_hyp_handle_sysreg(struct kvm_vcpu *vcpu, u64 *exit_code) ++static inline bool kvm_hyp_handle_sysreg(struct kvm_vcpu *vcpu, u64 *exit_code) + { + if (cpus_have_final_cap(ARM64_WORKAROUND_CAVIUM_TX2_219_TVM) && + handle_tx2_tvm(vcpu)) +@@ -615,7 +615,7 @@ static bool kvm_hyp_handle_sysreg(struct + return false; + } + +-static bool kvm_hyp_handle_cp15_32(struct kvm_vcpu *vcpu, u64 *exit_code) ++static inline bool kvm_hyp_handle_cp15_32(struct kvm_vcpu *vcpu, u64 *exit_code) + { + if (static_branch_unlikely(&vgic_v3_cpuif_trap) && + __vgic_v3_perform_cpuif_access(vcpu) == 1) +@@ -624,19 +624,18 @@ static bool kvm_hyp_handle_cp15_32(struc + return false; + } + +-static bool kvm_hyp_handle_memory_fault(struct kvm_vcpu *vcpu, u64 *exit_code) ++static inline bool kvm_hyp_handle_memory_fault(struct kvm_vcpu *vcpu, ++ u64 *exit_code) + { + if (!__populate_fault_info(vcpu)) + return true; + + return false; + } +-static bool kvm_hyp_handle_iabt_low(struct kvm_vcpu *vcpu, u64 *exit_code) +- __alias(kvm_hyp_handle_memory_fault); +-static bool kvm_hyp_handle_watchpt_low(struct kvm_vcpu *vcpu, u64 *exit_code) +- __alias(kvm_hyp_handle_memory_fault); ++#define kvm_hyp_handle_iabt_low kvm_hyp_handle_memory_fault ++#define kvm_hyp_handle_watchpt_low kvm_hyp_handle_memory_fault + +-static bool kvm_hyp_handle_dabt_low(struct kvm_vcpu *vcpu, u64 *exit_code) ++static inline bool kvm_hyp_handle_dabt_low(struct kvm_vcpu *vcpu, u64 *exit_code) + { + if (kvm_hyp_handle_memory_fault(vcpu, exit_code)) + return true; diff --git a/queue-6.13/kvm-arm64-refactor-exit-handlers.patch b/queue-6.13/kvm-arm64-refactor-exit-handlers.patch new file mode 100644 index 0000000000..b502a0f6a1 --- /dev/null +++ b/queue-6.13/kvm-arm64-refactor-exit-handlers.patch @@ -0,0 +1,186 @@ +From broonie@kernel.org Thu Mar 13 00:49:47 2025 +From: Mark Brown +Date: Wed, 12 Mar 2025 23:49:14 +0000 +Subject: KVM: arm64: Refactor exit handlers +To: Greg Kroah-Hartman , Marc Zyngier , Oliver Upton , Joey Gouly , Suzuki K Poulose , Catalin Marinas , Will Deacon +Cc: linux-arm-kernel@lists.infradead.org, kvmarm@lists.linux.dev, linux-kernel@vger.kernel.org, stable@vger.kernel.org, Mark Brown , Mark Rutland , Fuad Tabba +Message-ID: <20250312-stable-sve-6-13-v1-6-c7ba07a6f4f7@kernel.org> + +From: Mark Rutland + +[ Upstream commit 9b66195063c5a145843547b1d692bd189be85287 ] + +The hyp exit handling logic is largely shared between VHE and nVHE/hVHE, +with common logic in arch/arm64/kvm/hyp/include/hyp/switch.h. The code +in the header depends on function definitions provided by +arch/arm64/kvm/hyp/vhe/switch.c and arch/arm64/kvm/hyp/nvhe/switch.c +when they include the header. + +This is an unusual header dependency, and prevents the use of +arch/arm64/kvm/hyp/include/hyp/switch.h in other files as this would +result in compiler warnings regarding missing definitions, e.g. 
+ +| In file included from arch/arm64/kvm/hyp/nvhe/hyp-main.c:8: +| ./arch/arm64/kvm/hyp/include/hyp/switch.h:733:31: warning: 'kvm_get_exit_handler_array' used but never defined +| 733 | static const exit_handler_fn *kvm_get_exit_handler_array(struct kvm_vcpu *vcpu); +| | ^~~~~~~~~~~~~~~~~~~~~~~~~~ +| ./arch/arm64/kvm/hyp/include/hyp/switch.h:735:13: warning: 'early_exit_filter' used but never defined +| 735 | static void early_exit_filter(struct kvm_vcpu *vcpu, u64 *exit_code); +| | ^~~~~~~~~~~~~~~~~ + +Refactor the logic such that the header doesn't depend on anything from +the C files. There should be no functional change as a result of this +patch. + +Signed-off-by: Mark Rutland +Reviewed-by: Mark Brown +Tested-by: Mark Brown +Acked-by: Will Deacon +Cc: Catalin Marinas +Cc: Fuad Tabba +Cc: Marc Zyngier +Cc: Oliver Upton +Reviewed-by: Oliver Upton +Link: https://lore.kernel.org/r/20250210195226.1215254-7-mark.rutland@arm.com +Signed-off-by: Marc Zyngier +Signed-off-by: Mark Brown +Signed-off-by: Greg Kroah-Hartman +--- + arch/arm64/kvm/hyp/include/hyp/switch.h | 30 ++++++------------------------ + arch/arm64/kvm/hyp/nvhe/switch.c | 28 ++++++++++++++++------------ + arch/arm64/kvm/hyp/vhe/switch.c | 9 ++++----- + 3 files changed, 26 insertions(+), 41 deletions(-) + +--- a/arch/arm64/kvm/hyp/include/hyp/switch.h ++++ b/arch/arm64/kvm/hyp/include/hyp/switch.h +@@ -666,23 +666,16 @@ static bool kvm_hyp_handle_dabt_low(stru + + typedef bool (*exit_handler_fn)(struct kvm_vcpu *, u64 *); + +-static const exit_handler_fn *kvm_get_exit_handler_array(struct kvm_vcpu *vcpu); +- +-static void early_exit_filter(struct kvm_vcpu *vcpu, u64 *exit_code); +- + /* + * Allow the hypervisor to handle the exit with an exit handler if it has one. + * + * Returns true if the hypervisor handled the exit, and control should go back + * to the guest, or false if it hasn't. + */ +-static inline bool kvm_hyp_handle_exit(struct kvm_vcpu *vcpu, u64 *exit_code) ++static inline bool kvm_hyp_handle_exit(struct kvm_vcpu *vcpu, u64 *exit_code, ++ const exit_handler_fn *handlers) + { +- const exit_handler_fn *handlers = kvm_get_exit_handler_array(vcpu); +- exit_handler_fn fn; +- +- fn = handlers[kvm_vcpu_trap_get_class(vcpu)]; +- ++ exit_handler_fn fn = handlers[kvm_vcpu_trap_get_class(vcpu)]; + if (fn) + return fn(vcpu, exit_code); + +@@ -712,20 +705,9 @@ static inline void synchronize_vcpu_psta + * the guest, false when we should restore the host state and return to the + * main run loop. + */ +-static inline bool fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code) ++static inline bool __fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code, ++ const exit_handler_fn *handlers) + { +- /* +- * Save PSTATE early so that we can evaluate the vcpu mode +- * early on. +- */ +- synchronize_vcpu_pstate(vcpu, exit_code); +- +- /* +- * Check whether we want to repaint the state one way or +- * another. +- */ +- early_exit_filter(vcpu, exit_code); +- + if (ARM_EXCEPTION_CODE(*exit_code) != ARM_EXCEPTION_IRQ) + vcpu->arch.fault.esr_el2 = read_sysreg_el2(SYS_ESR); + +@@ -755,7 +737,7 @@ static inline bool fixup_guest_exit(stru + goto exit; + + /* Check if there's an exit handler and allow it to handle the exit. 
*/ +- if (kvm_hyp_handle_exit(vcpu, exit_code)) ++ if (kvm_hyp_handle_exit(vcpu, exit_code, handlers)) + goto guest; + exit: + /* Return to the host kernel and handle the exit */ +--- a/arch/arm64/kvm/hyp/nvhe/switch.c ++++ b/arch/arm64/kvm/hyp/nvhe/switch.c +@@ -224,19 +224,21 @@ static const exit_handler_fn *kvm_get_ex + return hyp_exit_handlers; + } + +-/* +- * Some guests (e.g., protected VMs) are not be allowed to run in AArch32. +- * The ARMv8 architecture does not give the hypervisor a mechanism to prevent a +- * guest from dropping to AArch32 EL0 if implemented by the CPU. If the +- * hypervisor spots a guest in such a state ensure it is handled, and don't +- * trust the host to spot or fix it. The check below is based on the one in +- * kvm_arch_vcpu_ioctl_run(). +- * +- * Returns false if the guest ran in AArch32 when it shouldn't have, and +- * thus should exit to the host, or true if a the guest run loop can continue. +- */ +-static void early_exit_filter(struct kvm_vcpu *vcpu, u64 *exit_code) ++static inline bool fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code) + { ++ const exit_handler_fn *handlers = kvm_get_exit_handler_array(vcpu); ++ ++ synchronize_vcpu_pstate(vcpu, exit_code); ++ ++ /* ++ * Some guests (e.g., protected VMs) are not be allowed to run in ++ * AArch32. The ARMv8 architecture does not give the hypervisor a ++ * mechanism to prevent a guest from dropping to AArch32 EL0 if ++ * implemented by the CPU. If the hypervisor spots a guest in such a ++ * state ensure it is handled, and don't trust the host to spot or fix ++ * it. The check below is based on the one in ++ * kvm_arch_vcpu_ioctl_run(). ++ */ + if (unlikely(vcpu_is_protected(vcpu) && vcpu_mode_is_32bit(vcpu))) { + /* + * As we have caught the guest red-handed, decide that it isn't +@@ -249,6 +251,8 @@ static void early_exit_filter(struct kvm + *exit_code &= BIT(ARM_EXIT_WITH_SERROR_BIT); + *exit_code |= ARM_EXCEPTION_IL; + } ++ ++ return __fixup_guest_exit(vcpu, exit_code, handlers); + } + + /* Switch to the guest for legacy non-VHE systems */ +--- a/arch/arm64/kvm/hyp/vhe/switch.c ++++ b/arch/arm64/kvm/hyp/vhe/switch.c +@@ -423,13 +423,10 @@ static const exit_handler_fn hyp_exit_ha + [ESR_ELx_EC_MOPS] = kvm_hyp_handle_mops, + }; + +-static const exit_handler_fn *kvm_get_exit_handler_array(struct kvm_vcpu *vcpu) ++static inline bool fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code) + { +- return hyp_exit_handlers; +-} ++ synchronize_vcpu_pstate(vcpu, exit_code); + +-static void early_exit_filter(struct kvm_vcpu *vcpu, u64 *exit_code) +-{ + /* + * If we were in HYP context on entry, adjust the PSTATE view + * so that the usual helpers work correctly. 
+@@ -449,6 +446,8 @@ static void early_exit_filter(struct kvm + *vcpu_cpsr(vcpu) &= ~(PSR_MODE_MASK | PSR_MODE32_BIT); + *vcpu_cpsr(vcpu) |= mode; + } ++ ++ return __fixup_guest_exit(vcpu, exit_code, hyp_exit_handlers); + } + + /* Switch to the guest for VHE systems running in EL2 */ diff --git a/queue-6.13/kvm-arm64-remove-host-fpsimd-saving-for-non-protected-kvm.patch b/queue-6.13/kvm-arm64-remove-host-fpsimd-saving-for-non-protected-kvm.patch new file mode 100644 index 0000000000..fa1af42acf --- /dev/null +++ b/queue-6.13/kvm-arm64-remove-host-fpsimd-saving-for-non-protected-kvm.patch @@ -0,0 +1,213 @@ +From stable+bounces-124194-greg=kroah.com@vger.kernel.org Thu Mar 13 00:50:09 2025 +From: Mark Brown +Date: Wed, 12 Mar 2025 23:49:11 +0000 +Subject: KVM: arm64: Remove host FPSIMD saving for non-protected KVM +To: Greg Kroah-Hartman , Marc Zyngier , Oliver Upton , Joey Gouly , Suzuki K Poulose , Catalin Marinas , Will Deacon +Cc: linux-arm-kernel@lists.infradead.org, kvmarm@lists.linux.dev, linux-kernel@vger.kernel.org, stable@vger.kernel.org, Mark Brown , Mark Rutland , Fuad Tabba +Message-ID: <20250312-stable-sve-6-13-v1-3-c7ba07a6f4f7@kernel.org> + +From: Mark Rutland + +[ Upstream commit 8eca7f6d5100b6997df4f532090bc3f7e0203bef ] + +Now that the host eagerly saves its own FPSIMD/SVE/SME state, +non-protected KVM never needs to save the host FPSIMD/SVE/SME state, +and the code to do this is never used. Protected KVM still needs to +save/restore the host FPSIMD/SVE state to avoid leaking guest state to +the host (and to avoid revealing to the host whether the guest used +FPSIMD/SVE/SME), and that code needs to be retained. + +Remove the unused code and data structures. + +To avoid the need for a stub copy of kvm_hyp_save_fpsimd_host() in the +VHE hyp code, the nVHE/hVHE version is moved into the shared switch +header, where it is only invoked when KVM is in protected mode. + +Signed-off-by: Mark Rutland +Reviewed-by: Mark Brown +Tested-by: Mark Brown +Acked-by: Will Deacon +Cc: Catalin Marinas +Cc: Fuad Tabba +Cc: Marc Zyngier +Cc: Oliver Upton +Reviewed-by: Oliver Upton +Link: https://lore.kernel.org/r/20250210195226.1215254-3-mark.rutland@arm.com +Signed-off-by: Marc Zyngier +[CPACR_EL1_ZEN -> CPACR_ELx_ZEN -- broonie] +Signed-off-by: Mark Brown +Signed-off-by: Greg Kroah-Hartman +--- + arch/arm64/include/asm/kvm_host.h | 18 ++++-------------- + arch/arm64/kvm/arm.c | 8 -------- + arch/arm64/kvm/fpsimd.c | 2 -- + arch/arm64/kvm/hyp/include/hyp/switch.h | 25 +++++++++++++++++++++++-- + arch/arm64/kvm/hyp/nvhe/hyp-main.c | 2 +- + arch/arm64/kvm/hyp/nvhe/switch.c | 28 ---------------------------- + arch/arm64/kvm/hyp/vhe/switch.c | 8 -------- + 7 files changed, 28 insertions(+), 63 deletions(-) + +--- a/arch/arm64/include/asm/kvm_host.h ++++ b/arch/arm64/include/asm/kvm_host.h +@@ -613,23 +613,13 @@ struct kvm_host_data { + struct kvm_cpu_context host_ctxt; + + /* +- * All pointers in this union are hyp VA. ++ * Hyp VA. + * sve_state is only used in pKVM and if system_supports_sve(). + */ +- union { +- struct user_fpsimd_state *fpsimd_state; +- struct cpu_sve_state *sve_state; +- }; ++ struct cpu_sve_state *sve_state; + +- union { +- /* HYP VA pointer to the host storage for FPMR */ +- u64 *fpmr_ptr; +- /* +- * Used by pKVM only, as it needs to provide storage +- * for the host +- */ +- u64 fpmr; +- }; ++ /* Used by pKVM only. 
*/ ++ u64 fpmr; + + /* Ownership of the FP regs */ + enum { +--- a/arch/arm64/kvm/arm.c ++++ b/arch/arm64/kvm/arm.c +@@ -2468,14 +2468,6 @@ static void finalize_init_hyp_mode(void) + per_cpu_ptr_nvhe_sym(kvm_host_data, cpu)->sve_state = + kern_hyp_va(sve_state); + } +- } else { +- for_each_possible_cpu(cpu) { +- struct user_fpsimd_state *fpsimd_state; +- +- fpsimd_state = &per_cpu_ptr_nvhe_sym(kvm_host_data, cpu)->host_ctxt.fp_regs; +- per_cpu_ptr_nvhe_sym(kvm_host_data, cpu)->fpsimd_state = +- kern_hyp_va(fpsimd_state); +- } + } + } + +--- a/arch/arm64/kvm/fpsimd.c ++++ b/arch/arm64/kvm/fpsimd.c +@@ -64,8 +64,6 @@ void kvm_arch_vcpu_load_fp(struct kvm_vc + */ + fpsimd_save_and_flush_cpu_state(); + *host_data_ptr(fp_owner) = FP_STATE_FREE; +- *host_data_ptr(fpsimd_state) = NULL; +- *host_data_ptr(fpmr_ptr) = NULL; + + vcpu_clear_flag(vcpu, HOST_SVE_ENABLED); + if (read_sysreg(cpacr_el1) & CPACR_EL1_ZEN_EL0EN) +--- a/arch/arm64/kvm/hyp/include/hyp/switch.h ++++ b/arch/arm64/kvm/hyp/include/hyp/switch.h +@@ -375,7 +375,28 @@ static inline void __hyp_sve_save_host(v + true); + } + +-static void kvm_hyp_save_fpsimd_host(struct kvm_vcpu *vcpu); ++static void kvm_hyp_save_fpsimd_host(struct kvm_vcpu *vcpu) ++{ ++ /* ++ * Non-protected kvm relies on the host restoring its sve state. ++ * Protected kvm restores the host's sve state as not to reveal that ++ * fpsimd was used by a guest nor leak upper sve bits. ++ */ ++ if (system_supports_sve()) { ++ __hyp_sve_save_host(); ++ ++ /* Re-enable SVE traps if not supported for the guest vcpu. */ ++ if (!vcpu_has_sve(vcpu)) ++ cpacr_clear_set(CPACR_ELx_ZEN, 0); ++ ++ } else { ++ __fpsimd_save_state(host_data_ptr(host_ctxt.fp_regs)); ++ } ++ ++ if (kvm_has_fpmr(kern_hyp_va(vcpu->kvm))) ++ *host_data_ptr(fpmr) = read_sysreg_s(SYS_FPMR); ++} ++ + + /* + * We trap the first access to the FP/SIMD to save the host context and +@@ -425,7 +446,7 @@ static bool kvm_hyp_handle_fpsimd(struct + isb(); + + /* Write out the host state if it's in the registers */ +- if (host_owns_fp_regs()) ++ if (is_protected_kvm_enabled() && host_owns_fp_regs()) + kvm_hyp_save_fpsimd_host(vcpu); + + /* Restore the guest state */ +--- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c ++++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c +@@ -83,7 +83,7 @@ static void fpsimd_sve_sync(struct kvm_v + if (system_supports_sve()) + __hyp_sve_restore_host(); + else +- __fpsimd_restore_state(*host_data_ptr(fpsimd_state)); ++ __fpsimd_restore_state(host_data_ptr(host_ctxt.fp_regs)); + + if (has_fpmr) + write_sysreg_s(*host_data_ptr(fpmr), SYS_FPMR); +--- a/arch/arm64/kvm/hyp/nvhe/switch.c ++++ b/arch/arm64/kvm/hyp/nvhe/switch.c +@@ -193,34 +193,6 @@ static bool kvm_handle_pvm_sys64(struct + kvm_handle_pvm_sysreg(vcpu, exit_code)); + } + +-static void kvm_hyp_save_fpsimd_host(struct kvm_vcpu *vcpu) +-{ +- /* +- * Non-protected kvm relies on the host restoring its sve state. +- * Protected kvm restores the host's sve state as not to reveal that +- * fpsimd was used by a guest nor leak upper sve bits. +- */ +- if (unlikely(is_protected_kvm_enabled() && system_supports_sve())) { +- __hyp_sve_save_host(); +- +- /* Re-enable SVE traps if not supported for the guest vcpu. 
*/ +- if (!vcpu_has_sve(vcpu)) +- cpacr_clear_set(CPACR_ELx_ZEN, 0); +- +- } else { +- __fpsimd_save_state(*host_data_ptr(fpsimd_state)); +- } +- +- if (kvm_has_fpmr(kern_hyp_va(vcpu->kvm))) { +- u64 val = read_sysreg_s(SYS_FPMR); +- +- if (unlikely(is_protected_kvm_enabled())) +- *host_data_ptr(fpmr) = val; +- else +- **host_data_ptr(fpmr_ptr) = val; +- } +-} +- + static const exit_handler_fn hyp_exit_handlers[] = { + [0 ... ESR_ELx_EC_MAX] = NULL, + [ESR_ELx_EC_CP15_32] = kvm_hyp_handle_cp15_32, +--- a/arch/arm64/kvm/hyp/vhe/switch.c ++++ b/arch/arm64/kvm/hyp/vhe/switch.c +@@ -309,14 +309,6 @@ static bool kvm_hyp_handle_eret(struct k + return true; + } + +-static void kvm_hyp_save_fpsimd_host(struct kvm_vcpu *vcpu) +-{ +- __fpsimd_save_state(*host_data_ptr(fpsimd_state)); +- +- if (kvm_has_fpmr(vcpu->kvm)) +- **host_data_ptr(fpmr_ptr) = read_sysreg_s(SYS_FPMR); +-} +- + static bool kvm_hyp_handle_tlbi_el2(struct kvm_vcpu *vcpu, u64 *exit_code) + { + int ret = -EINVAL; diff --git a/queue-6.13/kvm-arm64-remove-vhe-host-restore-of-cpacr_el1.smen.patch b/queue-6.13/kvm-arm64-remove-vhe-host-restore-of-cpacr_el1.smen.patch new file mode 100644 index 0000000000..fc9f8f9c48 --- /dev/null +++ b/queue-6.13/kvm-arm64-remove-vhe-host-restore-of-cpacr_el1.smen.patch @@ -0,0 +1,118 @@ +From broonie@kernel.org Thu Mar 13 00:49:44 2025 +From: Mark Brown +Date: Wed, 12 Mar 2025 23:49:13 +0000 +Subject: KVM: arm64: Remove VHE host restore of CPACR_EL1.SMEN +To: Greg Kroah-Hartman , Marc Zyngier , Oliver Upton , Joey Gouly , Suzuki K Poulose , Catalin Marinas , Will Deacon +Cc: linux-arm-kernel@lists.infradead.org, kvmarm@lists.linux.dev, linux-kernel@vger.kernel.org, stable@vger.kernel.org, Mark Brown , Mark Rutland , Fuad Tabba +Message-ID: <20250312-stable-sve-6-13-v1-5-c7ba07a6f4f7@kernel.org> + +From: Mark Rutland + +[ Upstream commit 407a99c4654e8ea65393f412c421a55cac539f5b ] + +When KVM is in VHE mode, the host kernel tries to save and restore the +configuration of CPACR_EL1.SMEN (i.e. CPTR_EL2.SMEN when HCR_EL2.E2H=1) +across kvm_arch_vcpu_load_fp() and kvm_arch_vcpu_put_fp(), since the +configuration may be clobbered by hyp when running a vCPU. This logic +has historically been broken, and is currently redundant. + +This logic was originally introduced in commit: + + 861262ab86270206 ("KVM: arm64: Handle SME host state when running guests") + +At the time, the VHE hyp code would reset CPTR_EL2.SMEN to 0b00 when +returning to the host, trapping host access to SME state. Unfortunately, +this was unsafe as the host could take a softirq before calling +kvm_arch_vcpu_put_fp(), and if a softirq handler were to use kernel mode +NEON the resulting attempt to save the live FPSIMD/SVE/SME state would +result in a fatal trap. + +That issue was limited to VHE mode. For nVHE/hVHE modes, KVM always +saved/restored the host kernel's CPACR_EL1 value, and configured +CPTR_EL2.TSM to 0b0, ensuring that host usage of SME would not be +trapped. + +The issue above was incidentally fixed by commit: + + 375110ab51dec5dc ("KVM: arm64: Fix resetting SME trap values on reset for (h)VHE") + +That commit changed the VHE hyp code to configure CPTR_EL2.SMEN to 0b01 +when returning to the host, permitting host kernel usage of SME, +avoiding the issue described above. At the time, this was not identified +as a fix for commit 861262ab86270206. + +Now that the host eagerly saves and unbinds its own FPSIMD/SVE/SME +state, there's no need to save/restore the state of the EL0 SME trap. 
+The kernel can safely save/restore state without trapping, as described +above, and will restore userspace state (including trap controls) before +returning to userspace. + +Remove the redundant logic. + +Signed-off-by: Mark Rutland +Reviewed-by: Mark Brown +Tested-by: Mark Brown +Acked-by: Will Deacon +Cc: Catalin Marinas +Cc: Fuad Tabba +Cc: Marc Zyngier +Cc: Oliver Upton +Reviewed-by: Oliver Upton +Link: https://lore.kernel.org/r/20250210195226.1215254-5-mark.rutland@arm.com +Signed-off-by: Marc Zyngier +[Update for rework of flags storage -- broonie] +Signed-off-by: Mark Brown +Signed-off-by: Greg Kroah-Hartman +--- + arch/arm64/include/asm/kvm_host.h | 2 -- + arch/arm64/kvm/fpsimd.c | 21 --------------------- + 2 files changed, 23 deletions(-) + +--- a/arch/arm64/include/asm/kvm_host.h ++++ b/arch/arm64/include/asm/kvm_host.h +@@ -902,8 +902,6 @@ struct kvm_vcpu_arch { + /* Save TRBE context if active */ + #define DEBUG_STATE_SAVE_TRBE __vcpu_single_flag(iflags, BIT(6)) + +-/* SME enabled for EL0 */ +-#define HOST_SME_ENABLED __vcpu_single_flag(sflags, BIT(1)) + /* Physical CPU not in supported_cpus */ + #define ON_UNSUPPORTED_CPU __vcpu_single_flag(sflags, BIT(2)) + /* WFIT instruction trapped */ +--- a/arch/arm64/kvm/fpsimd.c ++++ b/arch/arm64/kvm/fpsimd.c +@@ -65,12 +65,6 @@ void kvm_arch_vcpu_load_fp(struct kvm_vc + fpsimd_save_and_flush_cpu_state(); + *host_data_ptr(fp_owner) = FP_STATE_FREE; + +- if (system_supports_sme()) { +- vcpu_clear_flag(vcpu, HOST_SME_ENABLED); +- if (read_sysreg(cpacr_el1) & CPACR_EL1_SMEN_EL0EN) +- vcpu_set_flag(vcpu, HOST_SME_ENABLED); +- } +- + /* + * If normal guests gain SME support, maintain this behavior for pKVM + * guests, which don't support SME. +@@ -141,21 +135,6 @@ void kvm_arch_vcpu_put_fp(struct kvm_vcp + + local_irq_save(flags); + +- /* +- * If we have VHE then the Hyp code will reset CPACR_EL1 to +- * the default value and we need to reenable SME. +- */ +- if (has_vhe() && system_supports_sme()) { +- /* Also restore EL0 state seen on entry */ +- if (vcpu_get_flag(vcpu, HOST_SME_ENABLED)) +- sysreg_clear_set(CPACR_EL1, 0, CPACR_ELx_SMEN); +- else +- sysreg_clear_set(CPACR_EL1, +- CPACR_EL1_SMEN_EL0EN, +- CPACR_EL1_SMEN_EL1EN); +- isb(); +- } +- + if (guest_owns_fp_regs()) { + if (vcpu_has_sve(vcpu)) { + u64 zcr = read_sysreg_el1(SYS_ZCR); diff --git a/queue-6.13/kvm-arm64-remove-vhe-host-restore-of-cpacr_el1.zen.patch b/queue-6.13/kvm-arm64-remove-vhe-host-restore-of-cpacr_el1.zen.patch new file mode 100644 index 0000000000..08f30a5a7c --- /dev/null +++ b/queue-6.13/kvm-arm64-remove-vhe-host-restore-of-cpacr_el1.zen.patch @@ -0,0 +1,91 @@ +From stable+bounces-124195-greg=kroah.com@vger.kernel.org Thu Mar 13 00:50:19 2025 +From: Mark Brown +Date: Wed, 12 Mar 2025 23:49:12 +0000 +Subject: KVM: arm64: Remove VHE host restore of CPACR_EL1.ZEN +To: Greg Kroah-Hartman , Marc Zyngier , Oliver Upton , Joey Gouly , Suzuki K Poulose , Catalin Marinas , Will Deacon +Cc: linux-arm-kernel@lists.infradead.org, kvmarm@lists.linux.dev, linux-kernel@vger.kernel.org, stable@vger.kernel.org, Mark Brown , Mark Rutland , Fuad Tabba +Message-ID: <20250312-stable-sve-6-13-v1-4-c7ba07a6f4f7@kernel.org> + +From: Mark Rutland + +[ Upstream commit 459f059be702056d91537b99a129994aa6ccdd35 ] + +When KVM is in VHE mode, the host kernel tries to save and restore the +configuration of CPACR_EL1.ZEN (i.e. CPTR_EL2.ZEN when HCR_EL2.E2H=1) +across kvm_arch_vcpu_load_fp() and kvm_arch_vcpu_put_fp(), since the +configuration may be clobbered by hyp when running a vCPU. 
This logic is +currently redundant. + +The VHE hyp code unconditionally configures CPTR_EL2.ZEN to 0b01 when +returning to the host, permitting host kernel usage of SVE. + +Now that the host eagerly saves and unbinds its own FPSIMD/SVE/SME +state, there's no need to save/restore the state of the EL0 SVE trap. +The kernel can safely save/restore state without trapping, as described +above, and will restore userspace state (including trap controls) before +returning to userspace. + +Remove the redundant logic. + +Signed-off-by: Mark Rutland +Reviewed-by: Mark Brown +Tested-by: Mark Brown +Acked-by: Will Deacon +Cc: Catalin Marinas +Cc: Fuad Tabba +Cc: Marc Zyngier +Cc: Oliver Upton +Reviewed-by: Oliver Upton +Link: https://lore.kernel.org/r/20250210195226.1215254-4-mark.rutland@arm.com +Signed-off-by: Marc Zyngier +[Rework for refactoring of where the flags are stored -- broonie] +Signed-off-by: Mark Brown +Signed-off-by: Greg Kroah-Hartman +--- + arch/arm64/include/asm/kvm_host.h | 2 -- + arch/arm64/kvm/fpsimd.c | 16 ---------------- + 2 files changed, 18 deletions(-) + +--- a/arch/arm64/include/asm/kvm_host.h ++++ b/arch/arm64/include/asm/kvm_host.h +@@ -902,8 +902,6 @@ struct kvm_vcpu_arch { + /* Save TRBE context if active */ + #define DEBUG_STATE_SAVE_TRBE __vcpu_single_flag(iflags, BIT(6)) + +-/* SVE enabled for host EL0 */ +-#define HOST_SVE_ENABLED __vcpu_single_flag(sflags, BIT(0)) + /* SME enabled for EL0 */ + #define HOST_SME_ENABLED __vcpu_single_flag(sflags, BIT(1)) + /* Physical CPU not in supported_cpus */ +--- a/arch/arm64/kvm/fpsimd.c ++++ b/arch/arm64/kvm/fpsimd.c +@@ -65,10 +65,6 @@ void kvm_arch_vcpu_load_fp(struct kvm_vc + fpsimd_save_and_flush_cpu_state(); + *host_data_ptr(fp_owner) = FP_STATE_FREE; + +- vcpu_clear_flag(vcpu, HOST_SVE_ENABLED); +- if (read_sysreg(cpacr_el1) & CPACR_EL1_ZEN_EL0EN) +- vcpu_set_flag(vcpu, HOST_SVE_ENABLED); +- + if (system_supports_sme()) { + vcpu_clear_flag(vcpu, HOST_SME_ENABLED); + if (read_sysreg(cpacr_el1) & CPACR_EL1_SMEN_EL0EN) +@@ -202,18 +198,6 @@ void kvm_arch_vcpu_put_fp(struct kvm_vcp + * when needed. + */ + fpsimd_save_and_flush_cpu_state(); +- } else if (has_vhe() && system_supports_sve()) { +- /* +- * The FPSIMD/SVE state in the CPU has not been touched, and we +- * have SVE (and VHE): CPACR_EL1 (alias CPTR_EL2) has been +- * reset by kvm_reset_cptr_el2() in the Hyp code, disabling SVE +- * for EL0. 
To avoid spurious traps, restore the trap state +- * seen by kvm_arch_vcpu_load_fp(): +- */ +- if (vcpu_get_flag(vcpu, HOST_SVE_ENABLED)) +- sysreg_clear_set(CPACR_EL1, 0, CPACR_EL1_ZEN_EL0EN); +- else +- sysreg_clear_set(CPACR_EL1, CPACR_EL1_ZEN_EL0EN, 0); + } + + local_irq_restore(flags); diff --git a/queue-6.13/kvm-arm64-unconditionally-save-flush-host-fpsimd-sve-sme-state.patch b/queue-6.13/kvm-arm64-unconditionally-save-flush-host-fpsimd-sve-sme-state.patch new file mode 100644 index 0000000000..56f0bfa103 --- /dev/null +++ b/queue-6.13/kvm-arm64-unconditionally-save-flush-host-fpsimd-sve-sme-state.patch @@ -0,0 +1,163 @@ +From stable+bounces-124193-greg=kroah.com@vger.kernel.org Thu Mar 13 00:49:59 2025 +From: Mark Brown +Date: Wed, 12 Mar 2025 23:49:10 +0000 +Subject: KVM: arm64: Unconditionally save+flush host FPSIMD/SVE/SME state +To: Greg Kroah-Hartman , Marc Zyngier , Oliver Upton , Joey Gouly , Suzuki K Poulose , Catalin Marinas , Will Deacon +Cc: linux-arm-kernel@lists.infradead.org, kvmarm@lists.linux.dev, linux-kernel@vger.kernel.org, stable@vger.kernel.org, Mark Brown , Mark Rutland , Eric Auger , Wilco Dijkstra , Eric Auger , Florian Weimer , Fuad Tabba , Jeremy Linton , Paolo Bonzini +Message-ID: <20250312-stable-sve-6-13-v1-2-c7ba07a6f4f7@kernel.org> + +From: Mark Rutland + +[ Upstream commit fbc7e61195e23f744814e78524b73b59faa54ab4 ] + +There are several problems with the way hyp code lazily saves the host's +FPSIMD/SVE state, including: + +* Host SVE being discarded unexpectedly due to inconsistent + configuration of TIF_SVE and CPACR_ELx.ZEN. This has been seen to + result in QEMU crashes where SVE is used by memmove(), as reported by + Eric Auger: + + https://issues.redhat.com/browse/RHEL-68997 + +* Host SVE state is discarded *after* modification by ptrace, which was an + unintentional ptrace ABI change introduced with lazy discarding of SVE state. + +* The host FPMR value can be discarded when running a non-protected VM, + where FPMR support is not exposed to a VM, and that VM uses + FPSIMD/SVE. In these cases the hyp code does not save the host's FPMR + before unbinding the host's FPSIMD/SVE/SME state, leaving a stale + value in memory. + +Avoid these by eagerly saving and "flushing" the host's FPSIMD/SVE/SME +state when loading a vCPU such that KVM does not need to save any of the +host's FPSIMD/SVE/SME state. For clarity, fpsimd_kvm_prepare() is +removed and the necessary call to fpsimd_save_and_flush_cpu_state() is +placed in kvm_arch_vcpu_load_fp(). As 'fpsimd_state' and 'fpmr_ptr' +should not be used, they are set to NULL; all uses of these will be +removed in subsequent patches. + +Historical problems go back at least as far as v5.17, e.g. erroneous +assumptions about TIF_SVE being clear in commit: + + 8383741ab2e773a9 ("KVM: arm64: Get rid of host SVE tracking/saving") + +... and so this eager save+flush probably needs to be backported to ALL +stable trees. 
+ +Fixes: 93ae6b01bafee8fa ("KVM: arm64: Discard any SVE state when entering KVM guests") +Fixes: 8c845e2731041f0f ("arm64/sve: Leave SVE enabled on syscall if we don't context switch") +Fixes: ef3be86021c3bdf3 ("KVM: arm64: Add save/restore support for FPMR") +Reported-by: Eric Auger +Reported-by: Wilco Dijkstra +Reviewed-by: Mark Brown +Tested-by: Mark Brown +Tested-by: Eric Auger +Acked-by: Will Deacon +Cc: Catalin Marinas +Cc: Florian Weimer +Cc: Fuad Tabba +Cc: Jeremy Linton +Cc: Marc Zyngier +Cc: Oliver Upton +Cc: Paolo Bonzini +Signed-off-by: Mark Rutland +Reviewed-by: Oliver Upton +Link: https://lore.kernel.org/r/20250210195226.1215254-2-mark.rutland@arm.com +Signed-off-by: Marc Zyngier +[ Mark: Handle vcpu/host flag conflict ] +Signed-off-by: Mark Rutland +Signed-off-by: Mark Brown +Signed-off-by: Greg Kroah-Hartman +--- + arch/arm64/kernel/fpsimd.c | 25 ------------------------- + arch/arm64/kvm/fpsimd.c | 35 ++++++++++------------------------- + 2 files changed, 10 insertions(+), 50 deletions(-) + +--- a/arch/arm64/kernel/fpsimd.c ++++ b/arch/arm64/kernel/fpsimd.c +@@ -1695,31 +1695,6 @@ void fpsimd_signal_preserve_current_stat + } + + /* +- * Called by KVM when entering the guest. +- */ +-void fpsimd_kvm_prepare(void) +-{ +- if (!system_supports_sve()) +- return; +- +- /* +- * KVM does not save host SVE state since we can only enter +- * the guest from a syscall so the ABI means that only the +- * non-saved SVE state needs to be saved. If we have left +- * SVE enabled for performance reasons then update the task +- * state to be FPSIMD only. +- */ +- get_cpu_fpsimd_context(); +- +- if (test_and_clear_thread_flag(TIF_SVE)) { +- sve_to_fpsimd(current); +- current->thread.fp_type = FP_STATE_FPSIMD; +- } +- +- put_cpu_fpsimd_context(); +-} +- +-/* + * Associate current's FPSIMD context with this cpu + * The caller must have ownership of the cpu FPSIMD context before calling + * this function. +--- a/arch/arm64/kvm/fpsimd.c ++++ b/arch/arm64/kvm/fpsimd.c +@@ -54,16 +54,18 @@ void kvm_arch_vcpu_load_fp(struct kvm_vc + if (!system_supports_fpsimd()) + return; + +- fpsimd_kvm_prepare(); +- + /* +- * We will check TIF_FOREIGN_FPSTATE just before entering the +- * guest in kvm_arch_vcpu_ctxflush_fp() and override this to +- * FP_STATE_FREE if the flag set. ++ * Ensure that any host FPSIMD/SVE/SME state is saved and unbound such ++ * that the host kernel is responsible for restoring this state upon ++ * return to userspace, and the hyp code doesn't need to save anything. ++ * ++ * When the host may use SME, fpsimd_save_and_flush_cpu_state() ensures ++ * that PSTATE.{SM,ZA} == {0,0}. + */ +- *host_data_ptr(fp_owner) = FP_STATE_HOST_OWNED; +- *host_data_ptr(fpsimd_state) = kern_hyp_va(¤t->thread.uw.fpsimd_state); +- *host_data_ptr(fpmr_ptr) = kern_hyp_va(¤t->thread.uw.fpmr); ++ fpsimd_save_and_flush_cpu_state(); ++ *host_data_ptr(fp_owner) = FP_STATE_FREE; ++ *host_data_ptr(fpsimd_state) = NULL; ++ *host_data_ptr(fpmr_ptr) = NULL; + + vcpu_clear_flag(vcpu, HOST_SVE_ENABLED); + if (read_sysreg(cpacr_el1) & CPACR_EL1_ZEN_EL0EN) +@@ -73,23 +75,6 @@ void kvm_arch_vcpu_load_fp(struct kvm_vc + vcpu_clear_flag(vcpu, HOST_SME_ENABLED); + if (read_sysreg(cpacr_el1) & CPACR_EL1_SMEN_EL0EN) + vcpu_set_flag(vcpu, HOST_SME_ENABLED); +- +- /* +- * If PSTATE.SM is enabled then save any pending FP +- * state and disable PSTATE.SM. 
If we leave PSTATE.SM +- * enabled and the guest does not enable SME via +- * CPACR_EL1.SMEN then operations that should be valid +- * may generate SME traps from EL1 to EL1 which we +- * can't intercept and which would confuse the guest. +- * +- * Do the same for PSTATE.ZA in the case where there +- * is state in the registers which has not already +- * been saved, this is very unlikely to happen. +- */ +- if (read_sysreg_s(SYS_SVCR) & (SVCR_SM_MASK | SVCR_ZA_MASK)) { +- *host_data_ptr(fp_owner) = FP_STATE_FREE; +- fpsimd_save_and_flush_cpu_state(); +- } + } + + /* diff --git a/queue-6.13/mm-fix-kernel-bug-when-userfaultfd_move-encounters-swapcache.patch b/queue-6.13/mm-fix-kernel-bug-when-userfaultfd_move-encounters-swapcache.patch new file mode 100644 index 0000000000..f4e4f0a3e9 --- /dev/null +++ b/queue-6.13/mm-fix-kernel-bug-when-userfaultfd_move-encounters-swapcache.patch @@ -0,0 +1,276 @@ +From c50f8e6053b0503375c2975bf47f182445aebb4c Mon Sep 17 00:00:00 2001 +From: Barry Song +Date: Wed, 26 Feb 2025 13:14:00 +1300 +Subject: mm: fix kernel BUG when userfaultfd_move encounters swapcache + +From: Barry Song + +commit c50f8e6053b0503375c2975bf47f182445aebb4c upstream. + +userfaultfd_move() checks whether the PTE entry is present or a +swap entry. + +- If the PTE entry is present, move_present_pte() handles folio + migration by setting: + + src_folio->index = linear_page_index(dst_vma, dst_addr); + +- If the PTE entry is a swap entry, move_swap_pte() simply copies + the PTE to the new dst_addr. + +This approach is incorrect because, even if the PTE is a swap entry, +it can still reference a folio that remains in the swap cache. + +This creates a race window between steps 2 and 4. + 1. add_to_swap: The folio is added to the swapcache. + 2. try_to_unmap: PTEs are converted to swap entries. + 3. pageout: The folio is written back. + 4. Swapcache is cleared. +If userfaultfd_move() occurs in the window between steps 2 and 4, +after the swap PTE has been moved to the destination, accessing the +destination triggers do_swap_page(), which may locate the folio in +the swapcache. However, since the folio's index has not been updated +to match the destination VMA, do_swap_page() will detect a mismatch. + +This can result in two critical issues depending on the system +configuration. 
+ +If KSM is disabled, both small and large folios can trigger a BUG +during the add_rmap operation due to: + + page_pgoff(folio, page) != linear_page_index(vma, address) + +[ 13.336953] page: refcount:6 mapcount:1 mapping:00000000f43db19c index:0xffffaf150 pfn:0x4667c +[ 13.337520] head: order:2 mapcount:1 entire_mapcount:0 nr_pages_mapped:1 pincount:0 +[ 13.337716] memcg:ffff00000405f000 +[ 13.337849] anon flags: 0x3fffc0000020459(locked|uptodate|dirty|owner_priv_1|head|swapbacked|node=0|zone=0|lastcpupid=0xffff) +[ 13.338630] raw: 03fffc0000020459 ffff80008507b538 ffff80008507b538 ffff000006260361 +[ 13.338831] raw: 0000000ffffaf150 0000000000004000 0000000600000000 ffff00000405f000 +[ 13.339031] head: 03fffc0000020459 ffff80008507b538 ffff80008507b538 ffff000006260361 +[ 13.339204] head: 0000000ffffaf150 0000000000004000 0000000600000000 ffff00000405f000 +[ 13.339375] head: 03fffc0000000202 fffffdffc0199f01 ffffffff00000000 0000000000000001 +[ 13.339546] head: 0000000000000004 0000000000000000 00000000ffffffff 0000000000000000 +[ 13.339736] page dumped because: VM_BUG_ON_PAGE(page_pgoff(folio, page) != linear_page_index(vma, address)) +[ 13.340190] ------------[ cut here ]------------ +[ 13.340316] kernel BUG at mm/rmap.c:1380! +[ 13.340683] Internal error: Oops - BUG: 00000000f2000800 [#1] PREEMPT SMP +[ 13.340969] Modules linked in: +[ 13.341257] CPU: 1 UID: 0 PID: 107 Comm: a.out Not tainted 6.14.0-rc3-gcf42737e247a-dirty #299 +[ 13.341470] Hardware name: linux,dummy-virt (DT) +[ 13.341671] pstate: 60000005 (nZCv daif -PAN -UAO -TCO -DIT -SSBS BTYPE=--) +[ 13.341815] pc : __page_check_anon_rmap+0xa0/0xb0 +[ 13.341920] lr : __page_check_anon_rmap+0xa0/0xb0 +[ 13.342018] sp : ffff80008752bb20 +[ 13.342093] x29: ffff80008752bb20 x28: fffffdffc0199f00 x27: 0000000000000001 +[ 13.342404] x26: 0000000000000000 x25: 0000000000000001 x24: 0000000000000001 +[ 13.342575] x23: 0000ffffaf0d0000 x22: 0000ffffaf0d0000 x21: fffffdffc0199f00 +[ 13.342731] x20: fffffdffc0199f00 x19: ffff000006210700 x18: 00000000ffffffff +[ 13.342881] x17: 6c203d2120296567 x16: 6170202c6f696c6f x15: 662866666f67705f +[ 13.343033] x14: 6567617028454741 x13: 2929737365726464 x12: ffff800083728ab0 +[ 13.343183] x11: ffff800082996bf8 x10: 0000000000000fd7 x9 : ffff80008011bc40 +[ 13.343351] x8 : 0000000000017fe8 x7 : 00000000fffff000 x6 : ffff8000829eebf8 +[ 13.343498] x5 : c0000000fffff000 x4 : 0000000000000000 x3 : 0000000000000000 +[ 13.343645] x2 : 0000000000000000 x1 : ffff0000062db980 x0 : 000000000000005f +[ 13.343876] Call trace: +[ 13.344045] __page_check_anon_rmap+0xa0/0xb0 (P) +[ 13.344234] folio_add_anon_rmap_ptes+0x22c/0x320 +[ 13.344333] do_swap_page+0x1060/0x1400 +[ 13.344417] __handle_mm_fault+0x61c/0xbc8 +[ 13.344504] handle_mm_fault+0xd8/0x2e8 +[ 13.344586] do_page_fault+0x20c/0x770 +[ 13.344673] do_translation_fault+0xb4/0xf0 +[ 13.344759] do_mem_abort+0x48/0xa0 +[ 13.344842] el0_da+0x58/0x130 +[ 13.344914] el0t_64_sync_handler+0xc4/0x138 +[ 13.345002] el0t_64_sync+0x1ac/0x1b0 +[ 13.345208] Code: aa1503e0 f000f801 910f6021 97ff5779 (d4210000) +[ 13.345504] ---[ end trace 0000000000000000 ]--- +[ 13.345715] note: a.out[107] exited with irqs disabled +[ 13.345954] note: a.out[107] exited with preempt_count 2 + +If KSM is enabled, Peter Xu also discovered that do_swap_page() may +trigger an unexpected CoW operation for small folios because +ksm_might_need_to_copy() allocates a new folio when the folio index +does not match linear_page_index(vma, addr). 
+ +This patch also checks the swapcache when handling swap entries. If a +match is found in the swapcache, it processes it similarly to a present +PTE. +However, there are some differences. For example, the folio is no longer +exclusive because folio_try_share_anon_rmap_pte() is performed during +unmapping. +Furthermore, in the case of swapcache, the folio has already been +unmapped, eliminating the risk of concurrent rmap walks and removing the +need to acquire src_folio's anon_vma or lock. + +Note that for large folios, in the swapcache handling path, we directly +return -EBUSY since split_folio() will return -EBUSY regardless if +the folio is under writeback or unmapped. This is not an urgent issue, +so a follow-up patch may address it separately. + +[v-songbaohua@oppo.com: minor cleanup according to Peter Xu] + Link: https://lkml.kernel.org/r/20250226024411.47092-1-21cnbao@gmail.com +Link: https://lkml.kernel.org/r/20250226001400.9129-1-21cnbao@gmail.com +Fixes: adef440691ba ("userfaultfd: UFFDIO_MOVE uABI") +Signed-off-by: Barry Song +Acked-by: Peter Xu +Reviewed-by: Suren Baghdasaryan +Cc: Andrea Arcangeli +Cc: Al Viro +Cc: Axel Rasmussen +Cc: Brian Geffon +Cc: Christian Brauner +Cc: David Hildenbrand +Cc: Hugh Dickins +Cc: Jann Horn +Cc: Kalesh Singh +Cc: Liam R. Howlett +Cc: Lokesh Gidra +Cc: Matthew Wilcox (Oracle) +Cc: Michal Hocko +Cc: Mike Rapoport (IBM) +Cc: Nicolas Geoffray +Cc: Ryan Roberts +Cc: Shuah Khan +Cc: ZhangPeng +Cc: Tangquan Zheng +Cc: +Signed-off-by: Andrew Morton +[ surenb: resolved merged conflict caused by the difference in + move_swap_pte() arguments] +Signed-off-by: Suren Baghdasaryan +Signed-off-by: Greg Kroah-Hartman +--- + mm/userfaultfd.c | 75 ++++++++++++++++++++++++++++++++++++++++++++++++------- + 1 file changed, 66 insertions(+), 9 deletions(-) + +--- a/mm/userfaultfd.c ++++ b/mm/userfaultfd.c +@@ -18,6 +18,7 @@ + #include + #include + #include "internal.h" ++#include "swap.h" + + static __always_inline + bool validate_dst_vma(struct vm_area_struct *dst_vma, unsigned long dst_end) +@@ -1067,15 +1068,13 @@ out: + return err; + } + +-static int move_swap_pte(struct mm_struct *mm, ++static int move_swap_pte(struct mm_struct *mm, struct vm_area_struct *dst_vma, + unsigned long dst_addr, unsigned long src_addr, + pte_t *dst_pte, pte_t *src_pte, + pte_t orig_dst_pte, pte_t orig_src_pte, +- spinlock_t *dst_ptl, spinlock_t *src_ptl) ++ spinlock_t *dst_ptl, spinlock_t *src_ptl, ++ struct folio *src_folio) + { +- if (!pte_swp_exclusive(orig_src_pte)) +- return -EBUSY; +- + double_pt_lock(dst_ptl, src_ptl); + + if (!pte_same(ptep_get(src_pte), orig_src_pte) || +@@ -1084,6 +1083,16 @@ static int move_swap_pte(struct mm_struc + return -EAGAIN; + } + ++ /* ++ * The src_folio resides in the swapcache, requiring an update to its ++ * index and mapping to align with the dst_vma, where a swap-in may ++ * occur and hit the swapcache after moving the PTE. 
++ */ ++ if (src_folio) { ++ folio_move_anon_rmap(src_folio, dst_vma); ++ src_folio->index = linear_page_index(dst_vma, dst_addr); ++ } ++ + orig_src_pte = ptep_get_and_clear(mm, src_addr, src_pte); + set_pte_at(mm, dst_addr, dst_pte, orig_src_pte); + double_pt_unlock(dst_ptl, src_ptl); +@@ -1130,6 +1139,7 @@ static int move_pages_pte(struct mm_stru + __u64 mode) + { + swp_entry_t entry; ++ struct swap_info_struct *si = NULL; + pte_t orig_src_pte, orig_dst_pte; + pte_t src_folio_pte; + spinlock_t *src_ptl, *dst_ptl; +@@ -1321,6 +1331,8 @@ retry: + orig_dst_pte, orig_src_pte, + dst_ptl, src_ptl, src_folio); + } else { ++ struct folio *folio = NULL; ++ + entry = pte_to_swp_entry(orig_src_pte); + if (non_swap_entry(entry)) { + if (is_migration_entry(entry)) { +@@ -1334,10 +1346,53 @@ retry: + goto out; + } + +- err = move_swap_pte(mm, dst_addr, src_addr, +- dst_pte, src_pte, +- orig_dst_pte, orig_src_pte, +- dst_ptl, src_ptl); ++ if (!pte_swp_exclusive(orig_src_pte)) { ++ err = -EBUSY; ++ goto out; ++ } ++ ++ si = get_swap_device(entry); ++ if (unlikely(!si)) { ++ err = -EAGAIN; ++ goto out; ++ } ++ /* ++ * Verify the existence of the swapcache. If present, the folio's ++ * index and mapping must be updated even when the PTE is a swap ++ * entry. The anon_vma lock is not taken during this process since ++ * the folio has already been unmapped, and the swap entry is ++ * exclusive, preventing rmap walks. ++ * ++ * For large folios, return -EBUSY immediately, as split_folio() ++ * also returns -EBUSY when attempting to split unmapped large ++ * folios in the swapcache. This issue needs to be resolved ++ * separately to allow proper handling. ++ */ ++ if (!src_folio) ++ folio = filemap_get_folio(swap_address_space(entry), ++ swap_cache_index(entry)); ++ if (!IS_ERR_OR_NULL(folio)) { ++ if (folio_test_large(folio)) { ++ err = -EBUSY; ++ folio_put(folio); ++ goto out; ++ } ++ src_folio = folio; ++ src_folio_pte = orig_src_pte; ++ if (!folio_trylock(src_folio)) { ++ pte_unmap(&orig_src_pte); ++ pte_unmap(&orig_dst_pte); ++ src_pte = dst_pte = NULL; ++ put_swap_device(si); ++ si = NULL; ++ /* now we can block and wait */ ++ folio_lock(src_folio); ++ goto retry; ++ } ++ } ++ err = move_swap_pte(mm, dst_vma, dst_addr, src_addr, dst_pte, src_pte, ++ orig_dst_pte, orig_src_pte, ++ dst_ptl, src_ptl, src_folio); + } + + out: +@@ -1354,6 +1409,8 @@ out: + if (src_pte) + pte_unmap(src_pte); + mmu_notifier_invalidate_range_end(&range); ++ if (si) ++ put_swap_device(si); + + return err; + } diff --git a/queue-6.13/mm-slab-kvfree_rcu-switch-to-wq_mem_reclaim-wq.patch b/queue-6.13/mm-slab-kvfree_rcu-switch-to-wq_mem_reclaim-wq.patch new file mode 100644 index 0000000000..d1be1e1c3f --- /dev/null +++ b/queue-6.13/mm-slab-kvfree_rcu-switch-to-wq_mem_reclaim-wq.patch @@ -0,0 +1,142 @@ +From dfd3df31c9db752234d7d2e09bef2aeabb643ce4 Mon Sep 17 00:00:00 2001 +From: "Uladzislau Rezki (Sony)" +Date: Fri, 28 Feb 2025 13:13:56 +0100 +Subject: mm/slab/kvfree_rcu: Switch to WQ_MEM_RECLAIM wq + +From: Uladzislau Rezki (Sony) + +commit dfd3df31c9db752234d7d2e09bef2aeabb643ce4 upstream. + +Currently kvfree_rcu() APIs use a system workqueue which is +"system_unbound_wq" to driver RCU machinery to reclaim a memory. 
+ +Recently, it has been noted that the following kernel warning can +be observed: + + +workqueue: WQ_MEM_RECLAIM nvme-wq:nvme_scan_work is flushing !WQ_MEM_RECLAIM events_unbound:kfree_rcu_work + WARNING: CPU: 21 PID: 330 at kernel/workqueue.c:3719 check_flush_dependency+0x112/0x120 + Modules linked in: intel_uncore_frequency(E) intel_uncore_frequency_common(E) skx_edac(E) ... + CPU: 21 UID: 0 PID: 330 Comm: kworker/u144:6 Tainted: G E 6.13.2-0_g925d379822da #1 + Hardware name: Wiwynn Twin Lakes MP/Twin Lakes Passive MP, BIOS YMM20 02/01/2023 + Workqueue: nvme-wq nvme_scan_work + RIP: 0010:check_flush_dependency+0x112/0x120 + Code: 05 9a 40 14 02 01 48 81 c6 c0 00 00 00 48 8b 50 18 48 81 c7 c0 00 00 00 48 89 f9 48 ... + RSP: 0018:ffffc90000df7bd8 EFLAGS: 00010082 + RAX: 000000000000006a RBX: ffffffff81622390 RCX: 0000000000000027 + RDX: 00000000fffeffff RSI: 000000000057ffa8 RDI: ffff88907f960c88 + RBP: 0000000000000000 R08: ffffffff83068e50 R09: 000000000002fffd + R10: 0000000000000004 R11: 0000000000000000 R12: ffff8881001a4400 + R13: 0000000000000000 R14: ffff88907f420fb8 R15: 0000000000000000 + FS: 0000000000000000(0000) GS:ffff88907f940000(0000) knlGS:0000000000000000 + CR2: 00007f60c3001000 CR3: 000000107d010005 CR4: 00000000007726f0 + PKRU: 55555554 + Call Trace: + + ? __warn+0xa4/0x140 + ? check_flush_dependency+0x112/0x120 + ? report_bug+0xe1/0x140 + ? check_flush_dependency+0x112/0x120 + ? handle_bug+0x5e/0x90 + ? exc_invalid_op+0x16/0x40 + ? asm_exc_invalid_op+0x16/0x20 + ? timer_recalc_next_expiry+0x190/0x190 + ? check_flush_dependency+0x112/0x120 + ? check_flush_dependency+0x112/0x120 + __flush_work.llvm.1643880146586177030+0x174/0x2c0 + flush_rcu_work+0x28/0x30 + kvfree_rcu_barrier+0x12f/0x160 + kmem_cache_destroy+0x18/0x120 + bioset_exit+0x10c/0x150 + disk_release.llvm.6740012984264378178+0x61/0xd0 + device_release+0x4f/0x90 + kobject_put+0x95/0x180 + nvme_put_ns+0x23/0xc0 + nvme_remove_invalid_namespaces+0xb3/0xd0 + nvme_scan_work+0x342/0x490 + process_scheduled_works+0x1a2/0x370 + worker_thread+0x2ff/0x390 + ? pwq_release_workfn+0x1e0/0x1e0 + kthread+0xb1/0xe0 + ? __kthread_parkme+0x70/0x70 + ret_from_fork+0x30/0x40 + ? __kthread_parkme+0x70/0x70 + ret_from_fork_asm+0x11/0x20 + + ---[ end trace 0000000000000000 ]--- + + +To address this switch to use of independent WQ_MEM_RECLAIM +workqueue, so the rules are not violated from workqueue framework +point of view. + +Apart of that, since kvfree_rcu() does reclaim memory it is worth +to go with WQ_MEM_RECLAIM type of wq because it is designed for +this purpose. + +Fixes: 6c6c47b063b5 ("mm, slab: call kvfree_rcu_barrier() from kmem_cache_destroy()"), +Reported-by: Keith Busch +Closes: https://lore.kernel.org/all/Z7iqJtCjHKfo8Kho@kbusch-mbp/ +Cc: stable@vger.kernel.org +Signed-off-by: Uladzislau Rezki (Sony) +Reviewed-by: Joel Fernandes +Signed-off-by: Vlastimil Babka +Signed-off-by: Uladzislau Rezki (Sony) +Signed-off-by: Greg Kroah-Hartman +--- + kernel/rcu/tree.c | 14 ++++++++++---- + 1 file changed, 10 insertions(+), 4 deletions(-) + +--- a/kernel/rcu/tree.c ++++ b/kernel/rcu/tree.c +@@ -3191,6 +3191,8 @@ void call_rcu(struct rcu_head *head, rcu + } + EXPORT_SYMBOL_GPL(call_rcu); + ++static struct workqueue_struct *rcu_reclaim_wq; ++ + /* Maximum number of jiffies to wait before draining a batch. 
*/ + #define KFREE_DRAIN_JIFFIES (5 * HZ) + #define KFREE_N_BATCHES 2 +@@ -3519,10 +3521,10 @@ __schedule_delayed_monitor_work(struct k + if (delayed_work_pending(&krcp->monitor_work)) { + delay_left = krcp->monitor_work.timer.expires - jiffies; + if (delay < delay_left) +- mod_delayed_work(system_unbound_wq, &krcp->monitor_work, delay); ++ mod_delayed_work(rcu_reclaim_wq, &krcp->monitor_work, delay); + return; + } +- queue_delayed_work(system_unbound_wq, &krcp->monitor_work, delay); ++ queue_delayed_work(rcu_reclaim_wq, &krcp->monitor_work, delay); + } + + static void +@@ -3620,7 +3622,7 @@ kvfree_rcu_queue_batch(struct kfree_rcu_ + // "free channels", the batch can handle. Break + // the loop since it is done with this CPU thus + // queuing an RCU work is _always_ success here. +- queued = queue_rcu_work(system_unbound_wq, &krwp->rcu_work); ++ queued = queue_rcu_work(rcu_reclaim_wq, &krwp->rcu_work); + WARN_ON_ONCE(!queued); + break; + } +@@ -3708,7 +3710,7 @@ run_page_cache_worker(struct kfree_rcu_c + if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING && + !atomic_xchg(&krcp->work_in_progress, 1)) { + if (atomic_read(&krcp->backoff_page_cache_fill)) { +- queue_delayed_work(system_unbound_wq, ++ queue_delayed_work(rcu_reclaim_wq, + &krcp->page_cache_work, + msecs_to_jiffies(rcu_delay_page_cache_fill_msec)); + } else { +@@ -5654,6 +5656,10 @@ static void __init kfree_rcu_batch_init( + int i, j; + struct shrinker *kfree_rcu_shrinker; + ++ rcu_reclaim_wq = alloc_workqueue("kvfree_rcu_reclaim", ++ WQ_UNBOUND | WQ_MEM_RECLAIM, 0); ++ WARN_ON(!rcu_reclaim_wq); ++ + /* Clamp it to [0:100] seconds interval. */ + if (rcu_delay_page_cache_fill_msec < 0 || + rcu_delay_page_cache_fill_msec > 100 * MSEC_PER_SEC) { diff --git a/queue-6.13/series b/queue-6.13/series new file mode 100644 index 0000000000..958c55e54f --- /dev/null +++ b/queue-6.13/series @@ -0,0 +1,12 @@ +kvm-arm64-calculate-cptr_el2-traps-on-activating-traps.patch +kvm-arm64-unconditionally-save-flush-host-fpsimd-sve-sme-state.patch +kvm-arm64-remove-host-fpsimd-saving-for-non-protected-kvm.patch +kvm-arm64-remove-vhe-host-restore-of-cpacr_el1.zen.patch +kvm-arm64-remove-vhe-host-restore-of-cpacr_el1.smen.patch +kvm-arm64-refactor-exit-handlers.patch +kvm-arm64-mark-some-header-functions-as-inline.patch +kvm-arm64-eagerly-switch-zcr_el-1-2.patch +mm-fix-kernel-bug-when-userfaultfd_move-encounters-swapcache.patch +userfaultfd-fix-pte-unmapping-stack-allocated-pte-copies.patch +mm-slab-kvfree_rcu-switch-to-wq_mem_reclaim-wq.patch +virt-sev-guest-move-snp-guest-request-data-pages-handling-under-snp_cmd_mutex.patch diff --git a/queue-6.13/userfaultfd-fix-pte-unmapping-stack-allocated-pte-copies.patch b/queue-6.13/userfaultfd-fix-pte-unmapping-stack-allocated-pte-copies.patch new file mode 100644 index 0000000000..5616de0a40 --- /dev/null +++ b/queue-6.13/userfaultfd-fix-pte-unmapping-stack-allocated-pte-copies.patch @@ -0,0 +1,95 @@ +From 927e926d72d9155fde3264459fe9bfd7b5e40d28 Mon Sep 17 00:00:00 2001 +From: Suren Baghdasaryan +Date: Wed, 26 Feb 2025 10:55:09 -0800 +Subject: userfaultfd: fix PTE unmapping stack-allocated PTE copies + +From: Suren Baghdasaryan + +commit 927e926d72d9155fde3264459fe9bfd7b5e40d28 upstream. + +Current implementation of move_pages_pte() copies source and destination +PTEs in order to detect concurrent changes to PTEs involved in the move. +However these copies are also used to unmap the PTEs, which will fail if +CONFIG_HIGHPTE is enabled because the copies are allocated on the stack. 
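+
+To illustrate the constraint (a simplified sketch of the general pattern, not
+the exact code in move_pages_pte()): with CONFIG_HIGHPTE the PTE page may sit
+in highmem, so pte_offset_map() hands back a temporary kmap of it, and
+pte_unmap() has to be given that mapped pointer rather than the address of a
+local copy of the PTE value:
+
+	pte_t *src_pte = pte_offset_map(src_pmd, src_addr); /* kmap'ed mapping    */
+	pte_t orig_src_pte = ptep_get(src_pte);             /* value copy (stack) */
+	/* ... */
+	pte_unmap(src_pte);        /* correct: tears down the temporary mapping   */
+	/* pte_unmap(&orig_src_pte) would hand a stack address to the unmap path  */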
+Fix this by using the actual PTEs which were kmap()ed. + +Link: https://lkml.kernel.org/r/20250226185510.2732648-3-surenb@google.com +Fixes: adef440691ba ("userfaultfd: UFFDIO_MOVE uABI") +Signed-off-by: Suren Baghdasaryan +Reported-by: Peter Xu +Reviewed-by: Peter Xu +Cc: Andrea Arcangeli +Cc: Barry Song <21cnbao@gmail.com> +Cc: Barry Song +Cc: David Hildenbrand +Cc: Hugh Dickins +Cc: Jann Horn +Cc: Kalesh Singh +Cc: Liam R. Howlett +Cc: Lokesh Gidra +Cc: Lorenzo Stoakes +Cc: Matthew Wilcow (Oracle) +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Greg Kroah-Hartman +--- + mm/userfaultfd.c | 20 ++++++++++---------- + 1 file changed, 10 insertions(+), 10 deletions(-) + +--- a/mm/userfaultfd.c ++++ b/mm/userfaultfd.c +@@ -1274,8 +1274,8 @@ retry: + spin_unlock(src_ptl); + + if (!locked) { +- pte_unmap(&orig_src_pte); +- pte_unmap(&orig_dst_pte); ++ pte_unmap(src_pte); ++ pte_unmap(dst_pte); + src_pte = dst_pte = NULL; + /* now we can block and wait */ + folio_lock(src_folio); +@@ -1291,8 +1291,8 @@ retry: + /* at this point we have src_folio locked */ + if (folio_test_large(src_folio)) { + /* split_folio() can block */ +- pte_unmap(&orig_src_pte); +- pte_unmap(&orig_dst_pte); ++ pte_unmap(src_pte); ++ pte_unmap(dst_pte); + src_pte = dst_pte = NULL; + err = split_folio(src_folio); + if (err) +@@ -1317,8 +1317,8 @@ retry: + goto out; + } + if (!anon_vma_trylock_write(src_anon_vma)) { +- pte_unmap(&orig_src_pte); +- pte_unmap(&orig_dst_pte); ++ pte_unmap(src_pte); ++ pte_unmap(dst_pte); + src_pte = dst_pte = NULL; + /* now we can block and wait */ + anon_vma_lock_write(src_anon_vma); +@@ -1336,8 +1336,8 @@ retry: + entry = pte_to_swp_entry(orig_src_pte); + if (non_swap_entry(entry)) { + if (is_migration_entry(entry)) { +- pte_unmap(&orig_src_pte); +- pte_unmap(&orig_dst_pte); ++ pte_unmap(src_pte); ++ pte_unmap(dst_pte); + src_pte = dst_pte = NULL; + migration_entry_wait(mm, src_pmd, src_addr); + err = -EAGAIN; +@@ -1380,8 +1380,8 @@ retry: + src_folio = folio; + src_folio_pte = orig_src_pte; + if (!folio_trylock(src_folio)) { +- pte_unmap(&orig_src_pte); +- pte_unmap(&orig_dst_pte); ++ pte_unmap(src_pte); ++ pte_unmap(dst_pte); + src_pte = dst_pte = NULL; + put_swap_device(si); + si = NULL; diff --git a/queue-6.13/virt-sev-guest-move-snp-guest-request-data-pages-handling-under-snp_cmd_mutex.patch b/queue-6.13/virt-sev-guest-move-snp-guest-request-data-pages-handling-under-snp_cmd_mutex.patch new file mode 100644 index 0000000000..59b62029d4 --- /dev/null +++ b/queue-6.13/virt-sev-guest-move-snp-guest-request-data-pages-handling-under-snp_cmd_mutex.patch @@ -0,0 +1,242 @@ +From 3e385c0d6ce88ac9916dcf84267bd5855d830748 Mon Sep 17 00:00:00 2001 +From: Alexey Kardashevskiy +Date: Fri, 7 Mar 2025 12:37:00 +1100 +Subject: virt: sev-guest: Move SNP Guest Request data pages handling under snp_cmd_mutex + +From: Alexey Kardashevskiy + +commit 3e385c0d6ce88ac9916dcf84267bd5855d830748 upstream. + +Compared to the SNP Guest Request, the "Extended" version adds data pages for +receiving certificates. If not enough pages provided, the HV can report to the +VM how much is needed so the VM can reallocate and repeat. + +Commit + + ae596615d93d ("virt: sev-guest: Reduce the scope of SNP command mutex") + +moved handling of the allocated/desired pages number out of scope of said +mutex and create a possibility for a race (multiple instances trying to +trigger Extended request in a VM) as there is just one instance of +snp_msg_desc per /dev/sev-guest and no locking other than snp_cmd_mutex. 
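+
+The fix that follows boils down to making the mutable per-request state part
+of an on-stack snp_guest_req rather than the single shared snp_msg_desc, e.g.
+(simplified excerpt of the diff below):
+
+	struct snp_guest_req req = {};        /* on the ioctl handler's stack      */
+	/* ... */
+	req.certs_data = page_address(page);  /* was mdesc->certs_data             */
+	req.input.data_npages = npages;       /* was mdesc->input.data_npages      */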
+ +Fix the issue by moving the data blob/size and the GHCB input struct +(snp_req_data) into snp_guest_req which is allocated on stack now and accessed +by the GHCB caller under that mutex. + +Stop allocating SEV_FW_BLOB_MAX_SIZE in snp_msg_alloc() as only one of four +callers needs it. Free the received blob in get_ext_report() right after it is +copied to the userspace. Possible future users of snp_send_guest_request() are +likely to have different ideas about the buffer size anyways. + +Fixes: ae596615d93d ("virt: sev-guest: Reduce the scope of SNP command mutex") +Signed-off-by: Alexey Kardashevskiy +Signed-off-by: Borislav Petkov (AMD) +Reviewed-by: Nikunj A Dadhania +Cc: stable@vger.kernel.org +Link: https://lore.kernel.org/r/20250307013700.437505-3-aik@amd.com +Signed-off-by: Greg Kroah-Hartman +--- + +--- + arch/x86/include/asm/sev.h | 6 +-- + drivers/virt/coco/sev-guest/sev-guest.c | 63 +++++++++++++++++++------------- + 2 files changed, 42 insertions(+), 27 deletions(-) + +--- a/arch/x86/include/asm/sev.h ++++ b/arch/x86/include/asm/sev.h +@@ -185,6 +185,9 @@ struct snp_guest_req { + unsigned int vmpck_id; + u8 msg_version; + u8 msg_type; ++ ++ struct snp_req_data input; ++ void *certs_data; + }; + + /* +@@ -245,9 +248,6 @@ struct snp_msg_desc { + struct snp_guest_msg secret_request, secret_response; + + struct snp_secrets_page *secrets; +- struct snp_req_data input; +- +- void *certs_data; + + struct aesgcm_ctx *ctx; + +--- a/drivers/virt/coco/sev-guest/sev-guest.c ++++ b/drivers/virt/coco/sev-guest/sev-guest.c +@@ -249,7 +249,7 @@ retry_request: + * sequence number must be incremented or the VMPCK must be deleted to + * prevent reuse of the IV. + */ +- rc = snp_issue_guest_request(req, &mdesc->input, rio); ++ rc = snp_issue_guest_request(req, &req->input, rio); + switch (rc) { + case -ENOSPC: + /* +@@ -259,7 +259,7 @@ retry_request: + * order to increment the sequence number and thus avoid + * IV reuse. + */ +- override_npages = mdesc->input.data_npages; ++ override_npages = req->input.data_npages; + req->exit_code = SVM_VMGEXIT_GUEST_REQUEST; + + /* +@@ -315,7 +315,7 @@ retry_request: + } + + if (override_npages) +- mdesc->input.data_npages = override_npages; ++ req->input.data_npages = override_npages; + + return rc; + } +@@ -354,6 +354,11 @@ static int snp_send_guest_request(struct + memcpy(mdesc->request, &mdesc->secret_request, + sizeof(mdesc->secret_request)); + ++ /* initial the input address for guest request */ ++ req->input.req_gpa = __pa(mdesc->request); ++ req->input.resp_gpa = __pa(mdesc->response); ++ req->input.data_gpa = req->certs_data ? __pa(req->certs_data) : 0; ++ + rc = __handle_guest_request(mdesc, req, rio); + if (rc) { + if (rc == -EIO && +@@ -495,6 +500,7 @@ static int get_ext_report(struct snp_gue + struct snp_guest_req req = {}; + int ret, npages = 0, resp_len; + sockptr_t certs_address; ++ struct page *page; + + if (sockptr_is_null(io->req_data) || sockptr_is_null(io->resp_data)) + return -EINVAL; +@@ -528,8 +534,20 @@ static int get_ext_report(struct snp_gue + * the host. If host does not supply any certs in it, then copy + * zeros to indicate that certificate data was not provided. 
+ */ +- memset(mdesc->certs_data, 0, report_req->certs_len); + npages = report_req->certs_len >> PAGE_SHIFT; ++ page = alloc_pages(GFP_KERNEL_ACCOUNT | __GFP_ZERO, ++ get_order(report_req->certs_len)); ++ if (!page) ++ return -ENOMEM; ++ ++ req.certs_data = page_address(page); ++ ret = set_memory_decrypted((unsigned long)req.certs_data, npages); ++ if (ret) { ++ pr_err("failed to mark page shared, ret=%d\n", ret); ++ __free_pages(page, get_order(report_req->certs_len)); ++ return -EFAULT; ++ } ++ + cmd: + /* + * The intermediate response buffer is used while decrypting the +@@ -538,10 +556,12 @@ cmd: + */ + resp_len = sizeof(report_resp->data) + mdesc->ctx->authsize; + report_resp = kzalloc(resp_len, GFP_KERNEL_ACCOUNT); +- if (!report_resp) +- return -ENOMEM; ++ if (!report_resp) { ++ ret = -ENOMEM; ++ goto e_free_data; ++ } + +- mdesc->input.data_npages = npages; ++ req.input.data_npages = npages; + + req.msg_version = arg->msg_version; + req.msg_type = SNP_MSG_REPORT_REQ; +@@ -556,7 +576,7 @@ cmd: + + /* If certs length is invalid then copy the returned length */ + if (arg->vmm_error == SNP_GUEST_VMM_ERR_INVALID_LEN) { +- report_req->certs_len = mdesc->input.data_npages << PAGE_SHIFT; ++ report_req->certs_len = req.input.data_npages << PAGE_SHIFT; + + if (copy_to_sockptr(io->req_data, report_req, sizeof(*report_req))) + ret = -EFAULT; +@@ -565,7 +585,7 @@ cmd: + if (ret) + goto e_free; + +- if (npages && copy_to_sockptr(certs_address, mdesc->certs_data, report_req->certs_len)) { ++ if (npages && copy_to_sockptr(certs_address, req.certs_data, report_req->certs_len)) { + ret = -EFAULT; + goto e_free; + } +@@ -575,6 +595,13 @@ cmd: + + e_free: + kfree(report_resp); ++e_free_data: ++ if (npages) { ++ if (set_memory_encrypted((unsigned long)req.certs_data, npages)) ++ WARN_ONCE(ret, "failed to restore encryption mask (leak it)\n"); ++ else ++ __free_pages(page, get_order(report_req->certs_len)); ++ } + return ret; + } + +@@ -1048,35 +1075,26 @@ static int __init sev_guest_probe(struct + if (!mdesc->response) + goto e_free_request; + +- mdesc->certs_data = alloc_shared_pages(dev, SEV_FW_BLOB_MAX_SIZE); +- if (!mdesc->certs_data) +- goto e_free_response; +- + ret = -EIO; + mdesc->ctx = snp_init_crypto(mdesc->vmpck, VMPCK_KEY_LEN); + if (!mdesc->ctx) +- goto e_free_cert_data; ++ goto e_free_response; + + misc = &snp_dev->misc; + misc->minor = MISC_DYNAMIC_MINOR; + misc->name = DEVICE_NAME; + misc->fops = &snp_guest_fops; + +- /* Initialize the input addresses for guest request */ +- mdesc->input.req_gpa = __pa(mdesc->request); +- mdesc->input.resp_gpa = __pa(mdesc->response); +- mdesc->input.data_gpa = __pa(mdesc->certs_data); +- + /* Set the privlevel_floor attribute based on the vmpck_id */ + sev_tsm_ops.privlevel_floor = vmpck_id; + + ret = tsm_register(&sev_tsm_ops, snp_dev); + if (ret) +- goto e_free_cert_data; ++ goto e_free_response; + + ret = devm_add_action_or_reset(&pdev->dev, unregister_sev_tsm, NULL); + if (ret) +- goto e_free_cert_data; ++ goto e_free_response; + + ret = misc_register(misc); + if (ret) +@@ -1088,8 +1106,6 @@ static int __init sev_guest_probe(struct + + e_free_ctx: + kfree(mdesc->ctx); +-e_free_cert_data: +- free_shared_pages(mdesc->certs_data, SEV_FW_BLOB_MAX_SIZE); + e_free_response: + free_shared_pages(mdesc->response, sizeof(struct snp_guest_msg)); + e_free_request: +@@ -1104,7 +1120,6 @@ static void __exit sev_guest_remove(stru + struct snp_guest_dev *snp_dev = platform_get_drvdata(pdev); + struct snp_msg_desc *mdesc = snp_dev->msg_desc; + +- 
free_shared_pages(mdesc->certs_data, SEV_FW_BLOB_MAX_SIZE); + free_shared_pages(mdesc->response, sizeof(struct snp_guest_msg)); + free_shared_pages(mdesc->request, sizeof(struct snp_guest_msg)); + kfree(mdesc->ctx);