git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
6.13-stable patches
author Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Thu, 13 Mar 2025 16:09:28 +0000 (17:09 +0100)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Thu, 13 Mar 2025 16:09:28 +0000 (17:09 +0100)
added patches:
kvm-arm64-calculate-cptr_el2-traps-on-activating-traps.patch
kvm-arm64-eagerly-switch-zcr_el-1-2.patch
kvm-arm64-mark-some-header-functions-as-inline.patch
kvm-arm64-refactor-exit-handlers.patch
kvm-arm64-remove-host-fpsimd-saving-for-non-protected-kvm.patch
kvm-arm64-remove-vhe-host-restore-of-cpacr_el1.smen.patch
kvm-arm64-remove-vhe-host-restore-of-cpacr_el1.zen.patch
kvm-arm64-unconditionally-save-flush-host-fpsimd-sve-sme-state.patch
mm-fix-kernel-bug-when-userfaultfd_move-encounters-swapcache.patch
mm-slab-kvfree_rcu-switch-to-wq_mem_reclaim-wq.patch
series
userfaultfd-fix-pte-unmapping-stack-allocated-pte-copies.patch
virt-sev-guest-move-snp-guest-request-data-pages-handling-under-snp_cmd_mutex.patch

13 files changed:
queue-6.13/kvm-arm64-calculate-cptr_el2-traps-on-activating-traps.patch [new file with mode: 0644]
queue-6.13/kvm-arm64-eagerly-switch-zcr_el-1-2.patch [new file with mode: 0644]
queue-6.13/kvm-arm64-mark-some-header-functions-as-inline.patch [new file with mode: 0644]
queue-6.13/kvm-arm64-refactor-exit-handlers.patch [new file with mode: 0644]
queue-6.13/kvm-arm64-remove-host-fpsimd-saving-for-non-protected-kvm.patch [new file with mode: 0644]
queue-6.13/kvm-arm64-remove-vhe-host-restore-of-cpacr_el1.smen.patch [new file with mode: 0644]
queue-6.13/kvm-arm64-remove-vhe-host-restore-of-cpacr_el1.zen.patch [new file with mode: 0644]
queue-6.13/kvm-arm64-unconditionally-save-flush-host-fpsimd-sve-sme-state.patch [new file with mode: 0644]
queue-6.13/mm-fix-kernel-bug-when-userfaultfd_move-encounters-swapcache.patch [new file with mode: 0644]
queue-6.13/mm-slab-kvfree_rcu-switch-to-wq_mem_reclaim-wq.patch [new file with mode: 0644]
queue-6.13/series [new file with mode: 0644]
queue-6.13/userfaultfd-fix-pte-unmapping-stack-allocated-pte-copies.patch [new file with mode: 0644]
queue-6.13/virt-sev-guest-move-snp-guest-request-data-pages-handling-under-snp_cmd_mutex.patch [new file with mode: 0644]

diff --git a/queue-6.13/kvm-arm64-calculate-cptr_el2-traps-on-activating-traps.patch b/queue-6.13/kvm-arm64-calculate-cptr_el2-traps-on-activating-traps.patch
new file mode 100644 (file)
index 0000000..a7f7196
--- /dev/null
@@ -0,0 +1,210 @@
+From stable+bounces-124192-greg=kroah.com@vger.kernel.org Thu Mar 13 00:49:47 2025
+From: Mark Brown <broonie@kernel.org>
+Date: Wed, 12 Mar 2025 23:49:09 +0000
+Subject: KVM: arm64: Calculate cptr_el2 traps on activating traps
+To: Greg Kroah-Hartman <gregkh@linuxfoundation.org>,  Marc Zyngier <maz@kernel.org>, Oliver Upton <oliver.upton@linux.dev>,  Joey Gouly <joey.gouly@arm.com>, Suzuki K Poulose <suzuki.poulose@arm.com>,  Catalin Marinas <catalin.marinas@arm.com>, Will Deacon <will@kernel.org>
+Cc: linux-arm-kernel@lists.infradead.org, kvmarm@lists.linux.dev,  linux-kernel@vger.kernel.org, stable@vger.kernel.org,  Mark Brown <broonie@kernel.org>, Fuad Tabba <tabba@google.com>,  James Clark <james.clark@linaro.org>
+Message-ID: <20250312-stable-sve-6-13-v1-1-c7ba07a6f4f7@kernel.org>
+
+From: Fuad Tabba <tabba@google.com>
+
+[ Upstream commit 2fd5b4b0e7b440602455b79977bfa64dea101e6c ]
+
+Similar to VHE, calculate the value of cptr_el2 from scratch on
+activate traps. This removes the need to store cptr_el2 in every
+vcpu structure. Moreover, some traps, such as whether the guest
+owns the fp registers, need to be set on every vcpu run.
+
+Reported-by: James Clark <james.clark@linaro.org>
+Fixes: 5294afdbf45a ("KVM: arm64: Exclude FP ownership from kvm_vcpu_arch")
+Signed-off-by: Fuad Tabba <tabba@google.com>
+Link: https://lore.kernel.org/r/20241216105057.579031-13-tabba@google.com
+Signed-off-by: Marc Zyngier <maz@kernel.org>
+Signed-off-by: Mark Brown <broonie@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/arm64/include/asm/kvm_host.h |    1 
+ arch/arm64/kvm/arm.c              |    1 
+ arch/arm64/kvm/hyp/nvhe/pkvm.c    |   30 ----------------------
+ arch/arm64/kvm/hyp/nvhe/switch.c  |   51 +++++++++++++++++++++++---------------
+ 4 files changed, 32 insertions(+), 51 deletions(-)
+
+--- a/arch/arm64/include/asm/kvm_host.h
++++ b/arch/arm64/include/asm/kvm_host.h
+@@ -708,7 +708,6 @@ struct kvm_vcpu_arch {
+       u64 hcr_el2;
+       u64 hcrx_el2;
+       u64 mdcr_el2;
+-      u64 cptr_el2;
+       /* Exception Information */
+       struct kvm_vcpu_fault_info fault;
+--- a/arch/arm64/kvm/arm.c
++++ b/arch/arm64/kvm/arm.c
+@@ -1569,7 +1569,6 @@ static int kvm_arch_vcpu_ioctl_vcpu_init
+       }
+       vcpu_reset_hcr(vcpu);
+-      vcpu->arch.cptr_el2 = kvm_get_reset_cptr_el2(vcpu);
+       /*
+        * Handle the "start in power-off" case.
+--- a/arch/arm64/kvm/hyp/nvhe/pkvm.c
++++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c
+@@ -31,8 +31,6 @@ static void pvm_init_traps_aa64pfr0(stru
+       const u64 feature_ids = pvm_read_id_reg(vcpu, SYS_ID_AA64PFR0_EL1);
+       u64 hcr_set = HCR_RW;
+       u64 hcr_clear = 0;
+-      u64 cptr_set = 0;
+-      u64 cptr_clear = 0;
+       /* Protected KVM does not support AArch32 guests. */
+       BUILD_BUG_ON(FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_EL0),
+@@ -62,21 +60,10 @@ static void pvm_init_traps_aa64pfr0(stru
+       /* Trap AMU */
+       if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_AMU), feature_ids)) {
+               hcr_clear |= HCR_AMVOFFEN;
+-              cptr_set |= CPTR_EL2_TAM;
+-      }
+-
+-      /* Trap SVE */
+-      if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_SVE), feature_ids)) {
+-              if (has_hvhe())
+-                      cptr_clear |= CPACR_ELx_ZEN;
+-              else
+-                      cptr_set |= CPTR_EL2_TZ;
+       }
+       vcpu->arch.hcr_el2 |= hcr_set;
+       vcpu->arch.hcr_el2 &= ~hcr_clear;
+-      vcpu->arch.cptr_el2 |= cptr_set;
+-      vcpu->arch.cptr_el2 &= ~cptr_clear;
+ }
+ /*
+@@ -106,7 +93,6 @@ static void pvm_init_traps_aa64dfr0(stru
+       const u64 feature_ids = pvm_read_id_reg(vcpu, SYS_ID_AA64DFR0_EL1);
+       u64 mdcr_set = 0;
+       u64 mdcr_clear = 0;
+-      u64 cptr_set = 0;
+       /* Trap/constrain PMU */
+       if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer), feature_ids)) {
+@@ -133,21 +119,12 @@ static void pvm_init_traps_aa64dfr0(stru
+       if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_TraceFilt), feature_ids))
+               mdcr_set |= MDCR_EL2_TTRF;
+-      /* Trap Trace */
+-      if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_TraceVer), feature_ids)) {
+-              if (has_hvhe())
+-                      cptr_set |= CPACR_EL1_TTA;
+-              else
+-                      cptr_set |= CPTR_EL2_TTA;
+-      }
+-
+       /* Trap External Trace */
+       if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_ExtTrcBuff), feature_ids))
+               mdcr_clear |= MDCR_EL2_E2TB_MASK;
+       vcpu->arch.mdcr_el2 |= mdcr_set;
+       vcpu->arch.mdcr_el2 &= ~mdcr_clear;
+-      vcpu->arch.cptr_el2 |= cptr_set;
+ }
+ /*
+@@ -198,10 +175,6 @@ static void pvm_init_trap_regs(struct kv
+       /* Clear res0 and set res1 bits to trap potential new features. */
+       vcpu->arch.hcr_el2 &= ~(HCR_RES0);
+       vcpu->arch.mdcr_el2 &= ~(MDCR_EL2_RES0);
+-      if (!has_hvhe()) {
+-              vcpu->arch.cptr_el2 |= CPTR_NVHE_EL2_RES1;
+-              vcpu->arch.cptr_el2 &= ~(CPTR_NVHE_EL2_RES0);
+-      }
+ }
+ static void pkvm_vcpu_reset_hcr(struct kvm_vcpu *vcpu)
+@@ -236,7 +209,6 @@ static void pkvm_vcpu_reset_hcr(struct k
+  */
+ static void pkvm_vcpu_init_traps(struct kvm_vcpu *vcpu)
+ {
+-      vcpu->arch.cptr_el2 = kvm_get_reset_cptr_el2(vcpu);
+       vcpu->arch.mdcr_el2 = 0;
+       pkvm_vcpu_reset_hcr(vcpu);
+@@ -693,8 +665,6 @@ unlock:
+               return ret;
+       }
+-      hyp_vcpu->vcpu.arch.cptr_el2 = kvm_get_reset_cptr_el2(&hyp_vcpu->vcpu);
+-
+       return 0;
+ }
+--- a/arch/arm64/kvm/hyp/nvhe/switch.c
++++ b/arch/arm64/kvm/hyp/nvhe/switch.c
+@@ -36,33 +36,46 @@ DEFINE_PER_CPU(unsigned long, kvm_hyp_ve
+ extern void kvm_nvhe_prepare_backtrace(unsigned long fp, unsigned long pc);
+-static void __activate_traps(struct kvm_vcpu *vcpu)
++static void __activate_cptr_traps(struct kvm_vcpu *vcpu)
+ {
+-      u64 val;
++      u64 val = CPTR_EL2_TAM; /* Same bit irrespective of E2H */
+-      ___activate_traps(vcpu, vcpu->arch.hcr_el2);
+-      __activate_traps_common(vcpu);
++      if (has_hvhe()) {
++              val |= CPACR_ELx_TTA;
+-      val = vcpu->arch.cptr_el2;
+-      val |= CPTR_EL2_TAM;    /* Same bit irrespective of E2H */
+-      val |= has_hvhe() ? CPACR_EL1_TTA : CPTR_EL2_TTA;
+-      if (cpus_have_final_cap(ARM64_SME)) {
+-              if (has_hvhe())
+-                      val &= ~CPACR_ELx_SMEN;
+-              else
+-                      val |= CPTR_EL2_TSM;
+-      }
++              if (guest_owns_fp_regs()) {
++                      val |= CPACR_ELx_FPEN;
++                      if (vcpu_has_sve(vcpu))
++                              val |= CPACR_ELx_ZEN;
++              }
++      } else {
++              val |= CPTR_EL2_TTA | CPTR_NVHE_EL2_RES1;
++
++              /*
++               * Always trap SME since it's not supported in KVM.
++               * TSM is RES1 if SME isn't implemented.
++               */
++              val |= CPTR_EL2_TSM;
+-      if (!guest_owns_fp_regs()) {
+-              if (has_hvhe())
+-                      val &= ~(CPACR_ELx_FPEN | CPACR_ELx_ZEN);
+-              else
+-                      val |= CPTR_EL2_TFP | CPTR_EL2_TZ;
++              if (!vcpu_has_sve(vcpu) || !guest_owns_fp_regs())
++                      val |= CPTR_EL2_TZ;
+-              __activate_traps_fpsimd32(vcpu);
++              if (!guest_owns_fp_regs())
++                      val |= CPTR_EL2_TFP;
+       }
++      if (!guest_owns_fp_regs())
++              __activate_traps_fpsimd32(vcpu);
++
+       kvm_write_cptr_el2(val);
++}
++
++static void __activate_traps(struct kvm_vcpu *vcpu)
++{
++      ___activate_traps(vcpu, vcpu->arch.hcr_el2);
++      __activate_traps_common(vcpu);
++      __activate_cptr_traps(vcpu);
++
+       write_sysreg(__this_cpu_read(kvm_hyp_vector), vbar_el2);
+       if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) {
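In short, the patch above makes nVHE hyp compute the CPTR_EL2 value from scratch each time traps are activated, rather than carrying it around in vcpu->arch.cptr_el2. A condensed sketch of the resulting __activate_cptr_traps(), assembled from the hunks above (all helper and constant names are taken from the patch; the surrounding kernel context is assumed):

static void __activate_cptr_traps(struct kvm_vcpu *vcpu)
{
        /* Start from a fixed base rather than a stored per-vcpu value. */
        u64 val = CPTR_EL2_TAM;         /* same bit irrespective of E2H */

        if (has_hvhe()) {               /* hVHE: CPACR_EL1 bit layout */
                val |= CPACR_ELx_TTA;
                if (guest_owns_fp_regs()) {
                        val |= CPACR_ELx_FPEN;
                        if (vcpu_has_sve(vcpu))
                                val |= CPACR_ELx_ZEN;
                }
        } else {                        /* nVHE: CPTR_EL2 bit layout */
                val |= CPTR_EL2_TTA | CPTR_NVHE_EL2_RES1;
                /* Always trap SME; TSM is RES1 if SME isn't implemented. */
                val |= CPTR_EL2_TSM;
                if (!vcpu_has_sve(vcpu) || !guest_owns_fp_regs())
                        val |= CPTR_EL2_TZ;
                if (!guest_owns_fp_regs())
                        val |= CPTR_EL2_TFP;
        }

        if (!guest_owns_fp_regs())
                __activate_traps_fpsimd32(vcpu);

        kvm_write_cptr_el2(val);
}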
diff --git a/queue-6.13/kvm-arm64-eagerly-switch-zcr_el-1-2.patch b/queue-6.13/kvm-arm64-eagerly-switch-zcr_el-1-2.patch
new file mode 100644 (file)
index 0000000..b1379f0
--- /dev/null
@@ -0,0 +1,321 @@
+From broonie@kernel.org Thu Mar 13 00:49:53 2025
+From: Mark Brown <broonie@kernel.org>
+Date: Wed, 12 Mar 2025 23:49:16 +0000
+Subject: KVM: arm64: Eagerly switch ZCR_EL{1,2}
+To: Greg Kroah-Hartman <gregkh@linuxfoundation.org>,  Marc Zyngier <maz@kernel.org>, Oliver Upton <oliver.upton@linux.dev>,  Joey Gouly <joey.gouly@arm.com>, Suzuki K Poulose <suzuki.poulose@arm.com>,  Catalin Marinas <catalin.marinas@arm.com>, Will Deacon <will@kernel.org>
+Cc: linux-arm-kernel@lists.infradead.org, kvmarm@lists.linux.dev,  linux-kernel@vger.kernel.org, stable@vger.kernel.org,  Mark Brown <broonie@kernel.org>, Mark Rutland <mark.rutland@arm.com>,  Fuad Tabba <tabba@google.com>
+Message-ID: <20250312-stable-sve-6-13-v1-8-c7ba07a6f4f7@kernel.org>
+
+From: Mark Rutland <mark.rutland@arm.com>
+
+In non-protected KVM modes, while the guest FPSIMD/SVE/SME state is live on the
+CPU, the host's active SVE VL may differ from the guest's maximum SVE VL:
+
+* For VHE hosts, when a VM uses NV, ZCR_EL2 contains a value constrained
+  by the guest hypervisor, which may be less than or equal to that
+  guest's maximum VL.
+
+  Note: in this case the value of ZCR_EL1 is immaterial due to E2H.
+
+* For nVHE/hVHE hosts, ZCR_EL1 contains a value written by the guest,
+  which may be less than or greater than the guest's maximum VL.
+
+  Note: in this case hyp code traps host SVE usage and lazily restores
+  ZCR_EL2 to the host's maximum VL, which may be greater than the
+  guest's maximum VL.
+
+This can be the case between exiting a guest and kvm_arch_vcpu_put_fp().
+If a softirq is taken during this period and the softirq handler tries
+to use kernel-mode NEON, then the kernel will fail to save the guest's
+FPSIMD/SVE state, and will pend a SIGKILL for the current thread.
+
+This happens because kvm_arch_vcpu_ctxsync_fp() binds the guest's live
+FPSIMD/SVE state with the guest's maximum SVE VL, and
+fpsimd_save_user_state() verifies that the live SVE VL is as expected
+before attempting to save the register state:
+
+| if (WARN_ON(sve_get_vl() != vl)) {
+|         force_signal_inject(SIGKILL, SI_KERNEL, 0, 0);
+|         return;
+| }
+
+Fix this and make this a bit easier to reason about by always eagerly
+switching ZCR_EL{1,2} at hyp during guest<->host transitions. With this
+happening, there's no need to trap host SVE usage, and the nVHE/hVHE
+__deactivate_cptr_traps() logic can be simplified to enable host access
+to all present FPSIMD/SVE/SME features.
+
+In protected nVHE/hVHE modes, the host's state is always saved/restored
+by hyp, and the guest's state is saved prior to exit to the host, so
+from the host's PoV the guest never has live FPSIMD/SVE/SME state, and
+the host's ZCR_EL1 is never clobbered by hyp.
+
+Fixes: 8c8010d69c132273 ("KVM: arm64: Save/restore SVE state for nVHE")
+Fixes: 2e3cf82063a00ea0 ("KVM: arm64: nv: Ensure correct VL is loaded before saving SVE state")
+Signed-off-by: Mark Rutland <mark.rutland@arm.com>
+Reviewed-by: Mark Brown <broonie@kernel.org>
+Tested-by: Mark Brown <broonie@kernel.org>
+Cc: Catalin Marinas <catalin.marinas@arm.com>
+Cc: Fuad Tabba <tabba@google.com>
+Cc: Marc Zyngier <maz@kernel.org>
+Cc: Oliver Upton <oliver.upton@linux.dev>
+Cc: Will Deacon <will@kernel.org>
+Reviewed-by: Oliver Upton <oliver.upton@linux.dev>
+Link: https://lore.kernel.org/r/20250210195226.1215254-9-mark.rutland@arm.com
+Signed-off-by: Marc Zyngier <maz@kernel.org>
+(cherry picked from commit 59419f10045bc955d2229819c7cf7a8b0b9c5b59)
+Signed-off-by: Mark Brown <broonie@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/arm64/kvm/fpsimd.c                 |   30 ----------------
+ arch/arm64/kvm/hyp/entry.S              |    5 ++
+ arch/arm64/kvm/hyp/include/hyp/switch.h |   59 ++++++++++++++++++++++++++++++++
+ arch/arm64/kvm/hyp/nvhe/hyp-main.c      |   13 +++----
+ arch/arm64/kvm/hyp/nvhe/switch.c        |   33 +++++++++++++++--
+ arch/arm64/kvm/hyp/vhe/switch.c         |    4 ++
+ 6 files changed, 103 insertions(+), 41 deletions(-)
+
+--- a/arch/arm64/kvm/fpsimd.c
++++ b/arch/arm64/kvm/fpsimd.c
+@@ -136,36 +136,6 @@ void kvm_arch_vcpu_put_fp(struct kvm_vcp
+       local_irq_save(flags);
+       if (guest_owns_fp_regs()) {
+-              if (vcpu_has_sve(vcpu)) {
+-                      u64 zcr = read_sysreg_el1(SYS_ZCR);
+-
+-                      /*
+-                       * If the vCPU is in the hyp context then ZCR_EL1 is
+-                       * loaded with its vEL2 counterpart.
+-                       */
+-                      __vcpu_sys_reg(vcpu, vcpu_sve_zcr_elx(vcpu)) = zcr;
+-
+-                      /*
+-                       * Restore the VL that was saved when bound to the CPU,
+-                       * which is the maximum VL for the guest. Because the
+-                       * layout of the data when saving the sve state depends
+-                       * on the VL, we need to use a consistent (i.e., the
+-                       * maximum) VL.
+-                       * Note that this means that at guest exit ZCR_EL1 is
+-                       * not necessarily the same as on guest entry.
+-                       *
+-                       * ZCR_EL2 holds the guest hypervisor's VL when running
+-                       * a nested guest, which could be smaller than the
+-                       * max for the vCPU. Similar to above, we first need to
+-                       * switch to a VL consistent with the layout of the
+-                       * vCPU's SVE state. KVM support for NV implies VHE, so
+-                       * using the ZCR_EL1 alias is safe.
+-                       */
+-                      if (!has_vhe() || (vcpu_has_nv(vcpu) && !is_hyp_ctxt(vcpu)))
+-                              sve_cond_update_zcr_vq(vcpu_sve_max_vq(vcpu) - 1,
+-                                                     SYS_ZCR_EL1);
+-              }
+-
+               /*
+                * Flush (save and invalidate) the fpsimd/sve state so that if
+                * the host tries to use fpsimd/sve, it's not using stale data
+--- a/arch/arm64/kvm/hyp/entry.S
++++ b/arch/arm64/kvm/hyp/entry.S
+@@ -44,6 +44,11 @@ alternative_if ARM64_HAS_RAS_EXTN
+ alternative_else_nop_endif
+       mrs     x1, isr_el1
+       cbz     x1,  1f
++
++      // Ensure that __guest_enter() always provides a context
++      // synchronization event so that callers don't need ISBs for anything
++      // that would usually be synchonized by the ERET.
++      isb
+       mov     x0, #ARM_EXCEPTION_IRQ
+       ret
+--- a/arch/arm64/kvm/hyp/include/hyp/switch.h
++++ b/arch/arm64/kvm/hyp/include/hyp/switch.h
+@@ -375,6 +375,65 @@ static inline void __hyp_sve_save_host(v
+                        true);
+ }
++static inline void fpsimd_lazy_switch_to_guest(struct kvm_vcpu *vcpu)
++{
++      u64 zcr_el1, zcr_el2;
++
++      if (!guest_owns_fp_regs())
++              return;
++
++      if (vcpu_has_sve(vcpu)) {
++              /* A guest hypervisor may restrict the effective max VL. */
++              if (vcpu_has_nv(vcpu) && !is_hyp_ctxt(vcpu))
++                      zcr_el2 = __vcpu_sys_reg(vcpu, ZCR_EL2);
++              else
++                      zcr_el2 = vcpu_sve_max_vq(vcpu) - 1;
++
++              write_sysreg_el2(zcr_el2, SYS_ZCR);
++
++              zcr_el1 = __vcpu_sys_reg(vcpu, vcpu_sve_zcr_elx(vcpu));
++              write_sysreg_el1(zcr_el1, SYS_ZCR);
++      }
++}
++
++static inline void fpsimd_lazy_switch_to_host(struct kvm_vcpu *vcpu)
++{
++      u64 zcr_el1, zcr_el2;
++
++      if (!guest_owns_fp_regs())
++              return;
++
++      /*
++       * When the guest owns the FP regs, we know that guest+hyp traps for
++       * any FPSIMD/SVE/SME features exposed to the guest have been disabled
++       * by either fpsimd_lazy_switch_to_guest() or kvm_hyp_handle_fpsimd()
++       * prior to __guest_entry(). As __guest_entry() guarantees a context
++       * synchronization event, we don't need an ISB here to avoid taking
++       * traps for anything that was exposed to the guest.
++       */
++      if (vcpu_has_sve(vcpu)) {
++              zcr_el1 = read_sysreg_el1(SYS_ZCR);
++              __vcpu_sys_reg(vcpu, vcpu_sve_zcr_elx(vcpu)) = zcr_el1;
++
++              /*
++               * The guest's state is always saved using the guest's max VL.
++               * Ensure that the host has the guest's max VL active such that
++               * the host can save the guest's state lazily, but don't
++               * artificially restrict the host to the guest's max VL.
++               */
++              if (has_vhe()) {
++                      zcr_el2 = vcpu_sve_max_vq(vcpu) - 1;
++                      write_sysreg_el2(zcr_el2, SYS_ZCR);
++              } else {
++                      zcr_el2 = sve_vq_from_vl(kvm_host_sve_max_vl) - 1;
++                      write_sysreg_el2(zcr_el2, SYS_ZCR);
++
++                      zcr_el1 = vcpu_sve_max_vq(vcpu) - 1;
++                      write_sysreg_el1(zcr_el1, SYS_ZCR);
++              }
++      }
++}
++
+ static void kvm_hyp_save_fpsimd_host(struct kvm_vcpu *vcpu)
+ {
+       /*
+--- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c
++++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
+@@ -5,6 +5,7 @@
+  */
+ #include <hyp/adjust_pc.h>
++#include <hyp/switch.h>
+ #include <asm/pgtable-types.h>
+ #include <asm/kvm_asm.h>
+@@ -178,8 +179,12 @@ static void handle___kvm_vcpu_run(struct
+               sync_hyp_vcpu(hyp_vcpu);
+               pkvm_put_hyp_vcpu(hyp_vcpu);
+       } else {
++              struct kvm_vcpu *vcpu = kern_hyp_va(host_vcpu);
++
+               /* The host is fully trusted, run its vCPU directly. */
+-              ret = __kvm_vcpu_run(host_vcpu);
++              fpsimd_lazy_switch_to_guest(vcpu);
++              ret = __kvm_vcpu_run(vcpu);
++              fpsimd_lazy_switch_to_host(vcpu);
+       }
+ out:
+@@ -480,12 +485,6 @@ void handle_trap(struct kvm_cpu_context
+       case ESR_ELx_EC_SMC64:
+               handle_host_smc(host_ctxt);
+               break;
+-      case ESR_ELx_EC_SVE:
+-              cpacr_clear_set(0, CPACR_ELx_ZEN);
+-              isb();
+-              sve_cond_update_zcr_vq(sve_vq_from_vl(kvm_host_sve_max_vl) - 1,
+-                                     SYS_ZCR_EL2);
+-              break;
+       case ESR_ELx_EC_IABT_LOW:
+       case ESR_ELx_EC_DABT_LOW:
+               handle_host_mem_abort(host_ctxt);
+--- a/arch/arm64/kvm/hyp/nvhe/switch.c
++++ b/arch/arm64/kvm/hyp/nvhe/switch.c
+@@ -40,6 +40,9 @@ static void __activate_cptr_traps(struct
+ {
+       u64 val = CPTR_EL2_TAM; /* Same bit irrespective of E2H */
++      if (!guest_owns_fp_regs())
++              __activate_traps_fpsimd32(vcpu);
++
+       if (has_hvhe()) {
+               val |= CPACR_ELx_TTA;
+@@ -48,6 +51,8 @@ static void __activate_cptr_traps(struct
+                       if (vcpu_has_sve(vcpu))
+                               val |= CPACR_ELx_ZEN;
+               }
++
++              write_sysreg(val, cpacr_el1);
+       } else {
+               val |= CPTR_EL2_TTA | CPTR_NVHE_EL2_RES1;
+@@ -62,12 +67,32 @@ static void __activate_cptr_traps(struct
+               if (!guest_owns_fp_regs())
+                       val |= CPTR_EL2_TFP;
++
++              write_sysreg(val, cptr_el2);
+       }
++}
+-      if (!guest_owns_fp_regs())
+-              __activate_traps_fpsimd32(vcpu);
++static void __deactivate_cptr_traps(struct kvm_vcpu *vcpu)
++{
++      if (has_hvhe()) {
++              u64 val = CPACR_ELx_FPEN;
++
++              if (cpus_have_final_cap(ARM64_SVE))
++                      val |= CPACR_ELx_ZEN;
++              if (cpus_have_final_cap(ARM64_SME))
++                      val |= CPACR_ELx_SMEN;
++
++              write_sysreg(val, cpacr_el1);
++      } else {
++              u64 val = CPTR_NVHE_EL2_RES1;
++
++              if (!cpus_have_final_cap(ARM64_SVE))
++                      val |= CPTR_EL2_TZ;
++              if (!cpus_have_final_cap(ARM64_SME))
++                      val |= CPTR_EL2_TSM;
+-      kvm_write_cptr_el2(val);
++              write_sysreg(val, cptr_el2);
++      }
+ }
+ static void __activate_traps(struct kvm_vcpu *vcpu)
+@@ -120,7 +145,7 @@ static void __deactivate_traps(struct kv
+       write_sysreg(this_cpu_ptr(&kvm_init_params)->hcr_el2, hcr_el2);
+-      kvm_reset_cptr_el2(vcpu);
++      __deactivate_cptr_traps(vcpu);
+       write_sysreg(__kvm_hyp_host_vector, vbar_el2);
+ }
+--- a/arch/arm64/kvm/hyp/vhe/switch.c
++++ b/arch/arm64/kvm/hyp/vhe/switch.c
+@@ -462,6 +462,8 @@ static int __kvm_vcpu_run_vhe(struct kvm
+       sysreg_save_host_state_vhe(host_ctxt);
++      fpsimd_lazy_switch_to_guest(vcpu);
++
+       /*
+        * Note that ARM erratum 1165522 requires us to configure both stage 1
+        * and stage 2 translation for the guest context before we clear
+@@ -486,6 +488,8 @@ static int __kvm_vcpu_run_vhe(struct kvm
+       __deactivate_traps(vcpu);
++      fpsimd_lazy_switch_to_host(vcpu);
++
+       sysreg_restore_host_state_vhe(host_ctxt);
+       if (guest_owns_fp_regs())
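Taken together, the hunks above make the ZCR handling symmetric on both the VHE and nVHE paths: the guest's ZCR_EL{1,2} are programmed before entering the guest, and the guest's ZCR_EL1 is captured (and a host-usable VL restored) as soon as traps are deactivated, before the host can take a softirq that uses kernel-mode NEON. An outline distilled from the __kvm_vcpu_run_vhe() and handle___kvm_vcpu_run() changes shown above, with unrelated steps reduced to comments:

/* VHE run loop (vhe/switch.c) after this patch */
static int __kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu)
{
        int exit_code;                          /* set in the elided run loop */

        /* ... save host sysregs ... */
        fpsimd_lazy_switch_to_guest(vcpu);      /* program guest ZCR_EL{1,2} */

        /* ... activate traps, enter the guest (sets exit_code), handle exits ... */

        __deactivate_traps(vcpu);
        fpsimd_lazy_switch_to_host(vcpu);       /* save guest ZCR_EL1, restore
                                                 * a VL the host can use */

        /* ... restore host sysregs ... */
        return exit_code;
}

/* nVHE (hyp-main.c): the same bracketing around the non-protected run path */
fpsimd_lazy_switch_to_guest(vcpu);
ret = __kvm_vcpu_run(vcpu);
fpsimd_lazy_switch_to_host(vcpu);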
diff --git a/queue-6.13/kvm-arm64-mark-some-header-functions-as-inline.patch b/queue-6.13/kvm-arm64-mark-some-header-functions-as-inline.patch
new file mode 100644 (file)
index 0000000..45cbf53
--- /dev/null
@@ -0,0 +1,122 @@
+From broonie@kernel.org Thu Mar 13 00:49:50 2025
+From: Mark Brown <broonie@kernel.org>
+Date: Wed, 12 Mar 2025 23:49:15 +0000
+Subject: KVM: arm64: Mark some header functions as inline
+To: Greg Kroah-Hartman <gregkh@linuxfoundation.org>,  Marc Zyngier <maz@kernel.org>, Oliver Upton <oliver.upton@linux.dev>,  Joey Gouly <joey.gouly@arm.com>, Suzuki K Poulose <suzuki.poulose@arm.com>,  Catalin Marinas <catalin.marinas@arm.com>, Will Deacon <will@kernel.org>
+Cc: linux-arm-kernel@lists.infradead.org, kvmarm@lists.linux.dev,  linux-kernel@vger.kernel.org, stable@vger.kernel.org,  Mark Brown <broonie@kernel.org>, Mark Rutland <mark.rutland@arm.com>,  Fuad Tabba <tabba@google.com>
+Message-ID: <20250312-stable-sve-6-13-v1-7-c7ba07a6f4f7@kernel.org>
+
+From: Mark Rutland <mark.rutland@arm.com>
+
+[ Upstream commit f9dd00de1e53a47763dfad601635d18542c3836d ]
+
+The shared hyp switch header has a number of static functions which
+might not be used by all files that include the header, and when unused
+they will provoke compiler warnings, e.g.
+
+| In file included from arch/arm64/kvm/hyp/nvhe/hyp-main.c:8:
+| ./arch/arm64/kvm/hyp/include/hyp/switch.h:703:13: warning: 'kvm_hyp_handle_dabt_low' defined but not used [-Wunused-function]
+|   703 | static bool kvm_hyp_handle_dabt_low(struct kvm_vcpu *vcpu, u64 *exit_code)
+|       |             ^~~~~~~~~~~~~~~~~~~~~~~
+| ./arch/arm64/kvm/hyp/include/hyp/switch.h:682:13: warning: 'kvm_hyp_handle_cp15_32' defined but not used [-Wunused-function]
+|   682 | static bool kvm_hyp_handle_cp15_32(struct kvm_vcpu *vcpu, u64 *exit_code)
+|       |             ^~~~~~~~~~~~~~~~~~~~~~
+| ./arch/arm64/kvm/hyp/include/hyp/switch.h:662:13: warning: 'kvm_hyp_handle_sysreg' defined but not used [-Wunused-function]
+|   662 | static bool kvm_hyp_handle_sysreg(struct kvm_vcpu *vcpu, u64 *exit_code)
+|       |             ^~~~~~~~~~~~~~~~~~~~~
+| ./arch/arm64/kvm/hyp/include/hyp/switch.h:458:13: warning: 'kvm_hyp_handle_fpsimd' defined but not used [-Wunused-function]
+|   458 | static bool kvm_hyp_handle_fpsimd(struct kvm_vcpu *vcpu, u64 *exit_code)
+|       |             ^~~~~~~~~~~~~~~~~~~~~
+| ./arch/arm64/kvm/hyp/include/hyp/switch.h:329:13: warning: 'kvm_hyp_handle_mops' defined but not used [-Wunused-function]
+|   329 | static bool kvm_hyp_handle_mops(struct kvm_vcpu *vcpu, u64 *exit_code)
+|       |             ^~~~~~~~~~~~~~~~~~~
+
+Mark these functions as 'inline' to suppress this warning. This
+shouldn't result in any functional change.
+
+At the same time, avoid the use of __alias() in the header and alias
+kvm_hyp_handle_iabt_low() and kvm_hyp_handle_watchpt_low() to
+kvm_hyp_handle_memory_fault() using CPP, matching the style in the rest
+of the kernel. For consistency, kvm_hyp_handle_memory_fault() is also
+marked as 'inline'.
+
+Signed-off-by: Mark Rutland <mark.rutland@arm.com>
+Reviewed-by: Mark Brown <broonie@kernel.org>
+Tested-by: Mark Brown <broonie@kernel.org>
+Acked-by: Will Deacon <will@kernel.org>
+Cc: Catalin Marinas <catalin.marinas@arm.com>
+Cc: Fuad Tabba <tabba@google.com>
+Cc: Marc Zyngier <maz@kernel.org>
+Cc: Oliver Upton <oliver.upton@linux.dev>
+Reviewed-by: Oliver Upton <oliver.upton@linux.dev>
+Link: https://lore.kernel.org/r/20250210195226.1215254-8-mark.rutland@arm.com
+Signed-off-by: Marc Zyngier <maz@kernel.org>
+Signed-off-by: Mark Brown <broonie@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/arm64/kvm/hyp/include/hyp/switch.h |   19 +++++++++----------
+ 1 file changed, 9 insertions(+), 10 deletions(-)
+
+--- a/arch/arm64/kvm/hyp/include/hyp/switch.h
++++ b/arch/arm64/kvm/hyp/include/hyp/switch.h
+@@ -326,7 +326,7 @@ static inline bool __populate_fault_info
+       return __get_fault_info(vcpu->arch.fault.esr_el2, &vcpu->arch.fault);
+ }
+-static bool kvm_hyp_handle_mops(struct kvm_vcpu *vcpu, u64 *exit_code)
++static inline bool kvm_hyp_handle_mops(struct kvm_vcpu *vcpu, u64 *exit_code)
+ {
+       *vcpu_pc(vcpu) = read_sysreg_el2(SYS_ELR);
+       arm64_mops_reset_regs(vcpu_gp_regs(vcpu), vcpu->arch.fault.esr_el2);
+@@ -404,7 +404,7 @@ static void kvm_hyp_save_fpsimd_host(str
+  * If FP/SIMD is not implemented, handle the trap and inject an undefined
+  * instruction exception to the guest. Similarly for trapped SVE accesses.
+  */
+-static bool kvm_hyp_handle_fpsimd(struct kvm_vcpu *vcpu, u64 *exit_code)
++static inline bool kvm_hyp_handle_fpsimd(struct kvm_vcpu *vcpu, u64 *exit_code)
+ {
+       bool sve_guest;
+       u8 esr_ec;
+@@ -595,7 +595,7 @@ static bool handle_ampere1_tcr(struct kv
+       return true;
+ }
+-static bool kvm_hyp_handle_sysreg(struct kvm_vcpu *vcpu, u64 *exit_code)
++static inline bool kvm_hyp_handle_sysreg(struct kvm_vcpu *vcpu, u64 *exit_code)
+ {
+       if (cpus_have_final_cap(ARM64_WORKAROUND_CAVIUM_TX2_219_TVM) &&
+           handle_tx2_tvm(vcpu))
+@@ -615,7 +615,7 @@ static bool kvm_hyp_handle_sysreg(struct
+       return false;
+ }
+-static bool kvm_hyp_handle_cp15_32(struct kvm_vcpu *vcpu, u64 *exit_code)
++static inline bool kvm_hyp_handle_cp15_32(struct kvm_vcpu *vcpu, u64 *exit_code)
+ {
+       if (static_branch_unlikely(&vgic_v3_cpuif_trap) &&
+           __vgic_v3_perform_cpuif_access(vcpu) == 1)
+@@ -624,19 +624,18 @@ static bool kvm_hyp_handle_cp15_32(struc
+       return false;
+ }
+-static bool kvm_hyp_handle_memory_fault(struct kvm_vcpu *vcpu, u64 *exit_code)
++static inline bool kvm_hyp_handle_memory_fault(struct kvm_vcpu *vcpu,
++                                             u64 *exit_code)
+ {
+       if (!__populate_fault_info(vcpu))
+               return true;
+       return false;
+ }
+-static bool kvm_hyp_handle_iabt_low(struct kvm_vcpu *vcpu, u64 *exit_code)
+-      __alias(kvm_hyp_handle_memory_fault);
+-static bool kvm_hyp_handle_watchpt_low(struct kvm_vcpu *vcpu, u64 *exit_code)
+-      __alias(kvm_hyp_handle_memory_fault);
++#define kvm_hyp_handle_iabt_low               kvm_hyp_handle_memory_fault
++#define kvm_hyp_handle_watchpt_low    kvm_hyp_handle_memory_fault
+-static bool kvm_hyp_handle_dabt_low(struct kvm_vcpu *vcpu, u64 *exit_code)
++static inline bool kvm_hyp_handle_dabt_low(struct kvm_vcpu *vcpu, u64 *exit_code)
+ {
+       if (kvm_hyp_handle_memory_fault(vcpu, exit_code))
+               return true;
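The change above boils down to two mechanical patterns, shown in isolation below with the function names used in the patch (declarations only, bodies elided): header-defined static helpers gain 'inline' so that files which include switch.h but never call them do not trigger -Wunused-function, and the two __alias() users become plain preprocessor aliases.

/* Before: a plain 'static' definition in switch.h warns when unused. */
static bool kvm_hyp_handle_mops(struct kvm_vcpu *vcpu, u64 *exit_code);

/* After: 'inline' lets the compiler drop unused copies silently. */
static inline bool kvm_hyp_handle_mops(struct kvm_vcpu *vcpu, u64 *exit_code);

/* __alias() attributes replaced with CPP aliases, matching kernel style. */
#define kvm_hyp_handle_iabt_low         kvm_hyp_handle_memory_fault
#define kvm_hyp_handle_watchpt_low      kvm_hyp_handle_memory_fault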
diff --git a/queue-6.13/kvm-arm64-refactor-exit-handlers.patch b/queue-6.13/kvm-arm64-refactor-exit-handlers.patch
new file mode 100644 (file)
index 0000000..b502a0f
--- /dev/null
@@ -0,0 +1,186 @@
+From broonie@kernel.org Thu Mar 13 00:49:47 2025
+From: Mark Brown <broonie@kernel.org>
+Date: Wed, 12 Mar 2025 23:49:14 +0000
+Subject: KVM: arm64: Refactor exit handlers
+To: Greg Kroah-Hartman <gregkh@linuxfoundation.org>,  Marc Zyngier <maz@kernel.org>, Oliver Upton <oliver.upton@linux.dev>,  Joey Gouly <joey.gouly@arm.com>, Suzuki K Poulose <suzuki.poulose@arm.com>,  Catalin Marinas <catalin.marinas@arm.com>, Will Deacon <will@kernel.org>
+Cc: linux-arm-kernel@lists.infradead.org, kvmarm@lists.linux.dev,  linux-kernel@vger.kernel.org, stable@vger.kernel.org,  Mark Brown <broonie@kernel.org>, Mark Rutland <mark.rutland@arm.com>,  Fuad Tabba <tabba@google.com>
+Message-ID: <20250312-stable-sve-6-13-v1-6-c7ba07a6f4f7@kernel.org>
+
+From: Mark Rutland <mark.rutland@arm.com>
+
+[ Upstream commit 9b66195063c5a145843547b1d692bd189be85287 ]
+
+The hyp exit handling logic is largely shared between VHE and nVHE/hVHE,
+with common logic in arch/arm64/kvm/hyp/include/hyp/switch.h. The code
+in the header depends on function definitions provided by
+arch/arm64/kvm/hyp/vhe/switch.c and arch/arm64/kvm/hyp/nvhe/switch.c
+when they include the header.
+
+This is an unusual header dependency, and prevents the use of
+arch/arm64/kvm/hyp/include/hyp/switch.h in other files as this would
+result in compiler warnings regarding missing definitions, e.g.
+
+| In file included from arch/arm64/kvm/hyp/nvhe/hyp-main.c:8:
+| ./arch/arm64/kvm/hyp/include/hyp/switch.h:733:31: warning: 'kvm_get_exit_handler_array' used but never defined
+|   733 | static const exit_handler_fn *kvm_get_exit_handler_array(struct kvm_vcpu *vcpu);
+|       |                               ^~~~~~~~~~~~~~~~~~~~~~~~~~
+| ./arch/arm64/kvm/hyp/include/hyp/switch.h:735:13: warning: 'early_exit_filter' used but never defined
+|   735 | static void early_exit_filter(struct kvm_vcpu *vcpu, u64 *exit_code);
+|       |             ^~~~~~~~~~~~~~~~~
+
+Refactor the logic such that the header doesn't depend on anything from
+the C files. There should be no functional change as a result of this
+patch.
+
+Signed-off-by: Mark Rutland <mark.rutland@arm.com>
+Reviewed-by: Mark Brown <broonie@kernel.org>
+Tested-by: Mark Brown <broonie@kernel.org>
+Acked-by: Will Deacon <will@kernel.org>
+Cc: Catalin Marinas <catalin.marinas@arm.com>
+Cc: Fuad Tabba <tabba@google.com>
+Cc: Marc Zyngier <maz@kernel.org>
+Cc: Oliver Upton <oliver.upton@linux.dev>
+Reviewed-by: Oliver Upton <oliver.upton@linux.dev>
+Link: https://lore.kernel.org/r/20250210195226.1215254-7-mark.rutland@arm.com
+Signed-off-by: Marc Zyngier <maz@kernel.org>
+Signed-off-by: Mark Brown <broonie@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/arm64/kvm/hyp/include/hyp/switch.h |   30 ++++++------------------------
+ arch/arm64/kvm/hyp/nvhe/switch.c        |   28 ++++++++++++++++------------
+ arch/arm64/kvm/hyp/vhe/switch.c         |    9 ++++-----
+ 3 files changed, 26 insertions(+), 41 deletions(-)
+
+--- a/arch/arm64/kvm/hyp/include/hyp/switch.h
++++ b/arch/arm64/kvm/hyp/include/hyp/switch.h
+@@ -666,23 +666,16 @@ static bool kvm_hyp_handle_dabt_low(stru
+ typedef bool (*exit_handler_fn)(struct kvm_vcpu *, u64 *);
+-static const exit_handler_fn *kvm_get_exit_handler_array(struct kvm_vcpu *vcpu);
+-
+-static void early_exit_filter(struct kvm_vcpu *vcpu, u64 *exit_code);
+-
+ /*
+  * Allow the hypervisor to handle the exit with an exit handler if it has one.
+  *
+  * Returns true if the hypervisor handled the exit, and control should go back
+  * to the guest, or false if it hasn't.
+  */
+-static inline bool kvm_hyp_handle_exit(struct kvm_vcpu *vcpu, u64 *exit_code)
++static inline bool kvm_hyp_handle_exit(struct kvm_vcpu *vcpu, u64 *exit_code,
++                                     const exit_handler_fn *handlers)
+ {
+-      const exit_handler_fn *handlers = kvm_get_exit_handler_array(vcpu);
+-      exit_handler_fn fn;
+-
+-      fn = handlers[kvm_vcpu_trap_get_class(vcpu)];
+-
++      exit_handler_fn fn = handlers[kvm_vcpu_trap_get_class(vcpu)];
+       if (fn)
+               return fn(vcpu, exit_code);
+@@ -712,20 +705,9 @@ static inline void synchronize_vcpu_psta
+  * the guest, false when we should restore the host state and return to the
+  * main run loop.
+  */
+-static inline bool fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code)
++static inline bool __fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code,
++                                    const exit_handler_fn *handlers)
+ {
+-      /*
+-       * Save PSTATE early so that we can evaluate the vcpu mode
+-       * early on.
+-       */
+-      synchronize_vcpu_pstate(vcpu, exit_code);
+-
+-      /*
+-       * Check whether we want to repaint the state one way or
+-       * another.
+-       */
+-      early_exit_filter(vcpu, exit_code);
+-
+       if (ARM_EXCEPTION_CODE(*exit_code) != ARM_EXCEPTION_IRQ)
+               vcpu->arch.fault.esr_el2 = read_sysreg_el2(SYS_ESR);
+@@ -755,7 +737,7 @@ static inline bool fixup_guest_exit(stru
+               goto exit;
+       /* Check if there's an exit handler and allow it to handle the exit. */
+-      if (kvm_hyp_handle_exit(vcpu, exit_code))
++      if (kvm_hyp_handle_exit(vcpu, exit_code, handlers))
+               goto guest;
+ exit:
+       /* Return to the host kernel and handle the exit */
+--- a/arch/arm64/kvm/hyp/nvhe/switch.c
++++ b/arch/arm64/kvm/hyp/nvhe/switch.c
+@@ -224,19 +224,21 @@ static const exit_handler_fn *kvm_get_ex
+       return hyp_exit_handlers;
+ }
+-/*
+- * Some guests (e.g., protected VMs) are not be allowed to run in AArch32.
+- * The ARMv8 architecture does not give the hypervisor a mechanism to prevent a
+- * guest from dropping to AArch32 EL0 if implemented by the CPU. If the
+- * hypervisor spots a guest in such a state ensure it is handled, and don't
+- * trust the host to spot or fix it.  The check below is based on the one in
+- * kvm_arch_vcpu_ioctl_run().
+- *
+- * Returns false if the guest ran in AArch32 when it shouldn't have, and
+- * thus should exit to the host, or true if a the guest run loop can continue.
+- */
+-static void early_exit_filter(struct kvm_vcpu *vcpu, u64 *exit_code)
++static inline bool fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code)
+ {
++      const exit_handler_fn *handlers = kvm_get_exit_handler_array(vcpu);
++
++      synchronize_vcpu_pstate(vcpu, exit_code);
++
++      /*
++       * Some guests (e.g., protected VMs) are not be allowed to run in
++       * AArch32.  The ARMv8 architecture does not give the hypervisor a
++       * mechanism to prevent a guest from dropping to AArch32 EL0 if
++       * implemented by the CPU. If the hypervisor spots a guest in such a
++       * state ensure it is handled, and don't trust the host to spot or fix
++       * it.  The check below is based on the one in
++       * kvm_arch_vcpu_ioctl_run().
++       */
+       if (unlikely(vcpu_is_protected(vcpu) && vcpu_mode_is_32bit(vcpu))) {
+               /*
+                * As we have caught the guest red-handed, decide that it isn't
+@@ -249,6 +251,8 @@ static void early_exit_filter(struct kvm
+               *exit_code &= BIT(ARM_EXIT_WITH_SERROR_BIT);
+               *exit_code |= ARM_EXCEPTION_IL;
+       }
++
++      return __fixup_guest_exit(vcpu, exit_code, handlers);
+ }
+ /* Switch to the guest for legacy non-VHE systems */
+--- a/arch/arm64/kvm/hyp/vhe/switch.c
++++ b/arch/arm64/kvm/hyp/vhe/switch.c
+@@ -423,13 +423,10 @@ static const exit_handler_fn hyp_exit_ha
+       [ESR_ELx_EC_MOPS]               = kvm_hyp_handle_mops,
+ };
+-static const exit_handler_fn *kvm_get_exit_handler_array(struct kvm_vcpu *vcpu)
++static inline bool fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code)
+ {
+-      return hyp_exit_handlers;
+-}
++      synchronize_vcpu_pstate(vcpu, exit_code);
+-static void early_exit_filter(struct kvm_vcpu *vcpu, u64 *exit_code)
+-{
+       /*
+        * If we were in HYP context on entry, adjust the PSTATE view
+        * so that the usual helpers work correctly.
+@@ -449,6 +446,8 @@ static void early_exit_filter(struct kvm
+               *vcpu_cpsr(vcpu) &= ~(PSR_MODE_MASK | PSR_MODE32_BIT);
+               *vcpu_cpsr(vcpu) |= mode;
+       }
++
++      return __fixup_guest_exit(vcpu, exit_code, hyp_exit_handlers);
+ }
+ /* Switch to the guest for VHE systems running in EL2 */
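In outline, the refactoring parameterises the shared fixup path with the handler table and gives each mode its own thin wrapper, so switch.h no longer needs forward declarations that only the C files can satisfy. Condensed from the hunks above (bodies reduced to the relevant calls):

/* Shared header: no per-mode definitions required any more. */
static inline bool __fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code,
                                      const exit_handler_fn *handlers);

/* nVHE/hVHE wrapper (nvhe/switch.c) */
static inline bool fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code)
{
        const exit_handler_fn *handlers = kvm_get_exit_handler_array(vcpu);

        synchronize_vcpu_pstate(vcpu, exit_code);
        /* ... protected-VM AArch32 check, as in the hunk above ... */
        return __fixup_guest_exit(vcpu, exit_code, handlers);
}

/* VHE wrapper (vhe/switch.c) passes its static table directly. */
static inline bool fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code)
{
        synchronize_vcpu_pstate(vcpu, exit_code);
        /* ... hyp-context PSTATE repaint, as in the hunk above ... */
        return __fixup_guest_exit(vcpu, exit_code, hyp_exit_handlers);
}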
diff --git a/queue-6.13/kvm-arm64-remove-host-fpsimd-saving-for-non-protected-kvm.patch b/queue-6.13/kvm-arm64-remove-host-fpsimd-saving-for-non-protected-kvm.patch
new file mode 100644 (file)
index 0000000..fa1af42
--- /dev/null
@@ -0,0 +1,213 @@
+From stable+bounces-124194-greg=kroah.com@vger.kernel.org Thu Mar 13 00:50:09 2025
+From: Mark Brown <broonie@kernel.org>
+Date: Wed, 12 Mar 2025 23:49:11 +0000
+Subject: KVM: arm64: Remove host FPSIMD saving for non-protected KVM
+To: Greg Kroah-Hartman <gregkh@linuxfoundation.org>,  Marc Zyngier <maz@kernel.org>, Oliver Upton <oliver.upton@linux.dev>,  Joey Gouly <joey.gouly@arm.com>, Suzuki K Poulose <suzuki.poulose@arm.com>,  Catalin Marinas <catalin.marinas@arm.com>, Will Deacon <will@kernel.org>
+Cc: linux-arm-kernel@lists.infradead.org, kvmarm@lists.linux.dev,  linux-kernel@vger.kernel.org, stable@vger.kernel.org,  Mark Brown <broonie@kernel.org>, Mark Rutland <mark.rutland@arm.com>,  Fuad Tabba <tabba@google.com>
+Message-ID: <20250312-stable-sve-6-13-v1-3-c7ba07a6f4f7@kernel.org>
+
+From: Mark Rutland <mark.rutland@arm.com>
+
+[ Upstream commit 8eca7f6d5100b6997df4f532090bc3f7e0203bef ]
+
+Now that the host eagerly saves its own FPSIMD/SVE/SME state,
+non-protected KVM never needs to save the host FPSIMD/SVE/SME state,
+and the code to do this is never used. Protected KVM still needs to
+save/restore the host FPSIMD/SVE state to avoid leaking guest state to
+the host (and to avoid revealing to the host whether the guest used
+FPSIMD/SVE/SME), and that code needs to be retained.
+
+Remove the unused code and data structures.
+
+To avoid the need for a stub copy of kvm_hyp_save_fpsimd_host() in the
+VHE hyp code, the nVHE/hVHE version is moved into the shared switch
+header, where it is only invoked when KVM is in protected mode.
+
+Signed-off-by: Mark Rutland <mark.rutland@arm.com>
+Reviewed-by: Mark Brown <broonie@kernel.org>
+Tested-by: Mark Brown <broonie@kernel.org>
+Acked-by: Will Deacon <will@kernel.org>
+Cc: Catalin Marinas <catalin.marinas@arm.com>
+Cc: Fuad Tabba <tabba@google.com>
+Cc: Marc Zyngier <maz@kernel.org>
+Cc: Oliver Upton <oliver.upton@linux.dev>
+Reviewed-by: Oliver Upton <oliver.upton@linux.dev>
+Link: https://lore.kernel.org/r/20250210195226.1215254-3-mark.rutland@arm.com
+Signed-off-by: Marc Zyngier <maz@kernel.org>
+[CPACR_EL1_ZEN -> CPACR_ELx_ZEN -- broonie]
+Signed-off-by: Mark Brown <broonie@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/arm64/include/asm/kvm_host.h       |   18 ++++--------------
+ arch/arm64/kvm/arm.c                    |    8 --------
+ arch/arm64/kvm/fpsimd.c                 |    2 --
+ arch/arm64/kvm/hyp/include/hyp/switch.h |   25 +++++++++++++++++++++++--
+ arch/arm64/kvm/hyp/nvhe/hyp-main.c      |    2 +-
+ arch/arm64/kvm/hyp/nvhe/switch.c        |   28 ----------------------------
+ arch/arm64/kvm/hyp/vhe/switch.c         |    8 --------
+ 7 files changed, 28 insertions(+), 63 deletions(-)
+
+--- a/arch/arm64/include/asm/kvm_host.h
++++ b/arch/arm64/include/asm/kvm_host.h
+@@ -613,23 +613,13 @@ struct kvm_host_data {
+       struct kvm_cpu_context host_ctxt;
+       /*
+-       * All pointers in this union are hyp VA.
++       * Hyp VA.
+        * sve_state is only used in pKVM and if system_supports_sve().
+        */
+-      union {
+-              struct user_fpsimd_state *fpsimd_state;
+-              struct cpu_sve_state *sve_state;
+-      };
++      struct cpu_sve_state *sve_state;
+-      union {
+-              /* HYP VA pointer to the host storage for FPMR */
+-              u64     *fpmr_ptr;
+-              /*
+-               * Used by pKVM only, as it needs to provide storage
+-               * for the host
+-               */
+-              u64     fpmr;
+-      };
++      /* Used by pKVM only. */
++      u64     fpmr;
+       /* Ownership of the FP regs */
+       enum {
+--- a/arch/arm64/kvm/arm.c
++++ b/arch/arm64/kvm/arm.c
+@@ -2468,14 +2468,6 @@ static void finalize_init_hyp_mode(void)
+                       per_cpu_ptr_nvhe_sym(kvm_host_data, cpu)->sve_state =
+                               kern_hyp_va(sve_state);
+               }
+-      } else {
+-              for_each_possible_cpu(cpu) {
+-                      struct user_fpsimd_state *fpsimd_state;
+-
+-                      fpsimd_state = &per_cpu_ptr_nvhe_sym(kvm_host_data, cpu)->host_ctxt.fp_regs;
+-                      per_cpu_ptr_nvhe_sym(kvm_host_data, cpu)->fpsimd_state =
+-                              kern_hyp_va(fpsimd_state);
+-              }
+       }
+ }
+--- a/arch/arm64/kvm/fpsimd.c
++++ b/arch/arm64/kvm/fpsimd.c
+@@ -64,8 +64,6 @@ void kvm_arch_vcpu_load_fp(struct kvm_vc
+        */
+       fpsimd_save_and_flush_cpu_state();
+       *host_data_ptr(fp_owner) = FP_STATE_FREE;
+-      *host_data_ptr(fpsimd_state) = NULL;
+-      *host_data_ptr(fpmr_ptr) = NULL;
+       vcpu_clear_flag(vcpu, HOST_SVE_ENABLED);
+       if (read_sysreg(cpacr_el1) & CPACR_EL1_ZEN_EL0EN)
+--- a/arch/arm64/kvm/hyp/include/hyp/switch.h
++++ b/arch/arm64/kvm/hyp/include/hyp/switch.h
+@@ -375,7 +375,28 @@ static inline void __hyp_sve_save_host(v
+                        true);
+ }
+-static void kvm_hyp_save_fpsimd_host(struct kvm_vcpu *vcpu);
++static void kvm_hyp_save_fpsimd_host(struct kvm_vcpu *vcpu)
++{
++      /*
++       * Non-protected kvm relies on the host restoring its sve state.
++       * Protected kvm restores the host's sve state as not to reveal that
++       * fpsimd was used by a guest nor leak upper sve bits.
++       */
++      if (system_supports_sve()) {
++              __hyp_sve_save_host();
++
++              /* Re-enable SVE traps if not supported for the guest vcpu. */
++              if (!vcpu_has_sve(vcpu))
++                      cpacr_clear_set(CPACR_ELx_ZEN, 0);
++
++      } else {
++              __fpsimd_save_state(host_data_ptr(host_ctxt.fp_regs));
++      }
++
++      if (kvm_has_fpmr(kern_hyp_va(vcpu->kvm)))
++              *host_data_ptr(fpmr) = read_sysreg_s(SYS_FPMR);
++}
++
+ /*
+  * We trap the first access to the FP/SIMD to save the host context and
+@@ -425,7 +446,7 @@ static bool kvm_hyp_handle_fpsimd(struct
+       isb();
+       /* Write out the host state if it's in the registers */
+-      if (host_owns_fp_regs())
++      if (is_protected_kvm_enabled() && host_owns_fp_regs())
+               kvm_hyp_save_fpsimd_host(vcpu);
+       /* Restore the guest state */
+--- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c
++++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
+@@ -83,7 +83,7 @@ static void fpsimd_sve_sync(struct kvm_v
+       if (system_supports_sve())
+               __hyp_sve_restore_host();
+       else
+-              __fpsimd_restore_state(*host_data_ptr(fpsimd_state));
++              __fpsimd_restore_state(host_data_ptr(host_ctxt.fp_regs));
+       if (has_fpmr)
+               write_sysreg_s(*host_data_ptr(fpmr), SYS_FPMR);
+--- a/arch/arm64/kvm/hyp/nvhe/switch.c
++++ b/arch/arm64/kvm/hyp/nvhe/switch.c
+@@ -193,34 +193,6 @@ static bool kvm_handle_pvm_sys64(struct
+               kvm_handle_pvm_sysreg(vcpu, exit_code));
+ }
+-static void kvm_hyp_save_fpsimd_host(struct kvm_vcpu *vcpu)
+-{
+-      /*
+-       * Non-protected kvm relies on the host restoring its sve state.
+-       * Protected kvm restores the host's sve state as not to reveal that
+-       * fpsimd was used by a guest nor leak upper sve bits.
+-       */
+-      if (unlikely(is_protected_kvm_enabled() && system_supports_sve())) {
+-              __hyp_sve_save_host();
+-
+-              /* Re-enable SVE traps if not supported for the guest vcpu. */
+-              if (!vcpu_has_sve(vcpu))
+-                      cpacr_clear_set(CPACR_ELx_ZEN, 0);
+-
+-      } else {
+-              __fpsimd_save_state(*host_data_ptr(fpsimd_state));
+-      }
+-
+-      if (kvm_has_fpmr(kern_hyp_va(vcpu->kvm))) {
+-              u64 val = read_sysreg_s(SYS_FPMR);
+-
+-              if (unlikely(is_protected_kvm_enabled()))
+-                      *host_data_ptr(fpmr) = val;
+-              else
+-                      **host_data_ptr(fpmr_ptr) = val;
+-      }
+-}
+-
+ static const exit_handler_fn hyp_exit_handlers[] = {
+       [0 ... ESR_ELx_EC_MAX]          = NULL,
+       [ESR_ELx_EC_CP15_32]            = kvm_hyp_handle_cp15_32,
+--- a/arch/arm64/kvm/hyp/vhe/switch.c
++++ b/arch/arm64/kvm/hyp/vhe/switch.c
+@@ -309,14 +309,6 @@ static bool kvm_hyp_handle_eret(struct k
+       return true;
+ }
+-static void kvm_hyp_save_fpsimd_host(struct kvm_vcpu *vcpu)
+-{
+-      __fpsimd_save_state(*host_data_ptr(fpsimd_state));
+-
+-      if (kvm_has_fpmr(vcpu->kvm))
+-              **host_data_ptr(fpmr_ptr) = read_sysreg_s(SYS_FPMR);
+-}
+-
+ static bool kvm_hyp_handle_tlbi_el2(struct kvm_vcpu *vcpu, u64 *exit_code)
+ {
+       int ret = -EINVAL;
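The behavioural point of the patch above is that the lazy host save is now reached only under protected KVM; for non-protected KVM the host state was already saved and flushed when the vCPU was loaded. A condensed view of the FP/SIMD trap handler after the change, taken from the hunks above with unrelated steps elided:

/* Shared switch header, after this patch */
static bool kvm_hyp_handle_fpsimd(struct kvm_vcpu *vcpu, u64 *exit_code)
{
        /* ... validate the trap, enable FP/SVE access for the guest, isb() ... */

        /* Only pKVM still needs hyp to write out the host's FP/SVE state. */
        if (is_protected_kvm_enabled() && host_owns_fp_regs())
                kvm_hyp_save_fpsimd_host(vcpu);

        /* ... restore the guest's state and mark the guest as FP owner ... */
        return true;
}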
diff --git a/queue-6.13/kvm-arm64-remove-vhe-host-restore-of-cpacr_el1.smen.patch b/queue-6.13/kvm-arm64-remove-vhe-host-restore-of-cpacr_el1.smen.patch
new file mode 100644 (file)
index 0000000..fc9f8f9
--- /dev/null
@@ -0,0 +1,118 @@
+From broonie@kernel.org Thu Mar 13 00:49:44 2025
+From: Mark Brown <broonie@kernel.org>
+Date: Wed, 12 Mar 2025 23:49:13 +0000
+Subject: KVM: arm64: Remove VHE host restore of CPACR_EL1.SMEN
+To: Greg Kroah-Hartman <gregkh@linuxfoundation.org>,  Marc Zyngier <maz@kernel.org>, Oliver Upton <oliver.upton@linux.dev>,  Joey Gouly <joey.gouly@arm.com>, Suzuki K Poulose <suzuki.poulose@arm.com>,  Catalin Marinas <catalin.marinas@arm.com>, Will Deacon <will@kernel.org>
+Cc: linux-arm-kernel@lists.infradead.org, kvmarm@lists.linux.dev,  linux-kernel@vger.kernel.org, stable@vger.kernel.org,  Mark Brown <broonie@kernel.org>, Mark Rutland <mark.rutland@arm.com>,  Fuad Tabba <tabba@google.com>
+Message-ID: <20250312-stable-sve-6-13-v1-5-c7ba07a6f4f7@kernel.org>
+
+From: Mark Rutland <mark.rutland@arm.com>
+
+[ Upstream commit 407a99c4654e8ea65393f412c421a55cac539f5b ]
+
+When KVM is in VHE mode, the host kernel tries to save and restore the
+configuration of CPACR_EL1.SMEN (i.e. CPTR_EL2.SMEN when HCR_EL2.E2H=1)
+across kvm_arch_vcpu_load_fp() and kvm_arch_vcpu_put_fp(), since the
+configuration may be clobbered by hyp when running a vCPU. This logic
+has historically been broken, and is currently redundant.
+
+This logic was originally introduced in commit:
+
+  861262ab86270206 ("KVM: arm64: Handle SME host state when running guests")
+
+At the time, the VHE hyp code would reset CPTR_EL2.SMEN to 0b00 when
+returning to the host, trapping host access to SME state. Unfortunately,
+this was unsafe as the host could take a softirq before calling
+kvm_arch_vcpu_put_fp(), and if a softirq handler were to use kernel mode
+NEON the resulting attempt to save the live FPSIMD/SVE/SME state would
+result in a fatal trap.
+
+That issue was limited to VHE mode. For nVHE/hVHE modes, KVM always
+saved/restored the host kernel's CPACR_EL1 value, and configured
+CPTR_EL2.TSM to 0b0, ensuring that host usage of SME would not be
+trapped.
+
+The issue above was incidentally fixed by commit:
+
+  375110ab51dec5dc ("KVM: arm64: Fix resetting SME trap values on reset for (h)VHE")
+
+That commit changed the VHE hyp code to configure CPTR_EL2.SMEN to 0b01
+when returning to the host, permitting host kernel usage of SME,
+avoiding the issue described above. At the time, this was not identified
+as a fix for commit 861262ab86270206.
+
+Now that the host eagerly saves and unbinds its own FPSIMD/SVE/SME
+state, there's no need to save/restore the state of the EL0 SME trap.
+The kernel can safely save/restore state without trapping, as described
+above, and will restore userspace state (including trap controls) before
+returning to userspace.
+
+Remove the redundant logic.
+
+Signed-off-by: Mark Rutland <mark.rutland@arm.com>
+Reviewed-by: Mark Brown <broonie@kernel.org>
+Tested-by: Mark Brown <broonie@kernel.org>
+Acked-by: Will Deacon <will@kernel.org>
+Cc: Catalin Marinas <catalin.marinas@arm.com>
+Cc: Fuad Tabba <tabba@google.com>
+Cc: Marc Zyngier <maz@kernel.org>
+Cc: Oliver Upton <oliver.upton@linux.dev>
+Reviewed-by: Oliver Upton <oliver.upton@linux.dev>
+Link: https://lore.kernel.org/r/20250210195226.1215254-5-mark.rutland@arm.com
+Signed-off-by: Marc Zyngier <maz@kernel.org>
+[Update for rework of flags storage -- broonie]
+Signed-off-by: Mark Brown <broonie@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/arm64/include/asm/kvm_host.h |    2 --
+ arch/arm64/kvm/fpsimd.c           |   21 ---------------------
+ 2 files changed, 23 deletions(-)
+
+--- a/arch/arm64/include/asm/kvm_host.h
++++ b/arch/arm64/include/asm/kvm_host.h
+@@ -902,8 +902,6 @@ struct kvm_vcpu_arch {
+ /* Save TRBE context if active  */
+ #define DEBUG_STATE_SAVE_TRBE __vcpu_single_flag(iflags, BIT(6))
+-/* SME enabled for EL0 */
+-#define HOST_SME_ENABLED      __vcpu_single_flag(sflags, BIT(1))
+ /* Physical CPU not in supported_cpus */
+ #define ON_UNSUPPORTED_CPU    __vcpu_single_flag(sflags, BIT(2))
+ /* WFIT instruction trapped */
+--- a/arch/arm64/kvm/fpsimd.c
++++ b/arch/arm64/kvm/fpsimd.c
+@@ -65,12 +65,6 @@ void kvm_arch_vcpu_load_fp(struct kvm_vc
+       fpsimd_save_and_flush_cpu_state();
+       *host_data_ptr(fp_owner) = FP_STATE_FREE;
+-      if (system_supports_sme()) {
+-              vcpu_clear_flag(vcpu, HOST_SME_ENABLED);
+-              if (read_sysreg(cpacr_el1) & CPACR_EL1_SMEN_EL0EN)
+-                      vcpu_set_flag(vcpu, HOST_SME_ENABLED);
+-      }
+-
+       /*
+        * If normal guests gain SME support, maintain this behavior for pKVM
+        * guests, which don't support SME.
+@@ -141,21 +135,6 @@ void kvm_arch_vcpu_put_fp(struct kvm_vcp
+       local_irq_save(flags);
+-      /*
+-       * If we have VHE then the Hyp code will reset CPACR_EL1 to
+-       * the default value and we need to reenable SME.
+-       */
+-      if (has_vhe() && system_supports_sme()) {
+-              /* Also restore EL0 state seen on entry */
+-              if (vcpu_get_flag(vcpu, HOST_SME_ENABLED))
+-                      sysreg_clear_set(CPACR_EL1, 0, CPACR_ELx_SMEN);
+-              else
+-                      sysreg_clear_set(CPACR_EL1,
+-                                       CPACR_EL1_SMEN_EL0EN,
+-                                       CPACR_EL1_SMEN_EL1EN);
+-              isb();
+-      }
+-
+       if (guest_owns_fp_regs()) {
+               if (vcpu_has_sve(vcpu)) {
+                       u64 zcr = read_sysreg_el1(SYS_ZCR);
diff --git a/queue-6.13/kvm-arm64-remove-vhe-host-restore-of-cpacr_el1.zen.patch b/queue-6.13/kvm-arm64-remove-vhe-host-restore-of-cpacr_el1.zen.patch
new file mode 100644 (file)
index 0000000..08f30a5
--- /dev/null
@@ -0,0 +1,91 @@
+From stable+bounces-124195-greg=kroah.com@vger.kernel.org Thu Mar 13 00:50:19 2025
+From: Mark Brown <broonie@kernel.org>
+Date: Wed, 12 Mar 2025 23:49:12 +0000
+Subject: KVM: arm64: Remove VHE host restore of CPACR_EL1.ZEN
+To: Greg Kroah-Hartman <gregkh@linuxfoundation.org>,  Marc Zyngier <maz@kernel.org>, Oliver Upton <oliver.upton@linux.dev>,  Joey Gouly <joey.gouly@arm.com>, Suzuki K Poulose <suzuki.poulose@arm.com>,  Catalin Marinas <catalin.marinas@arm.com>, Will Deacon <will@kernel.org>
+Cc: linux-arm-kernel@lists.infradead.org, kvmarm@lists.linux.dev,  linux-kernel@vger.kernel.org, stable@vger.kernel.org,  Mark Brown <broonie@kernel.org>, Mark Rutland <mark.rutland@arm.com>,  Fuad Tabba <tabba@google.com>
+Message-ID: <20250312-stable-sve-6-13-v1-4-c7ba07a6f4f7@kernel.org>
+
+From: Mark Rutland <mark.rutland@arm.com>
+
+[ Upstream commit 459f059be702056d91537b99a129994aa6ccdd35 ]
+
+When KVM is in VHE mode, the host kernel tries to save and restore the
+configuration of CPACR_EL1.ZEN (i.e. CPTR_EL2.ZEN when HCR_EL2.E2H=1)
+across kvm_arch_vcpu_load_fp() and kvm_arch_vcpu_put_fp(), since the
+configuration may be clobbered by hyp when running a vCPU. This logic is
+currently redundant.
+
+The VHE hyp code unconditionally configures CPTR_EL2.ZEN to 0b01 when
+returning to the host, permitting host kernel usage of SVE.
+
+Now that the host eagerly saves and unbinds its own FPSIMD/SVE/SME
+state, there's no need to save/restore the state of the EL0 SVE trap.
+The kernel can safely save/restore state without trapping, as described
+above, and will restore userspace state (including trap controls) before
+returning to userspace.
+
+Remove the redundant logic.
+
+Signed-off-by: Mark Rutland <mark.rutland@arm.com>
+Reviewed-by: Mark Brown <broonie@kernel.org>
+Tested-by: Mark Brown <broonie@kernel.org>
+Acked-by: Will Deacon <will@kernel.org>
+Cc: Catalin Marinas <catalin.marinas@arm.com>
+Cc: Fuad Tabba <tabba@google.com>
+Cc: Marc Zyngier <maz@kernel.org>
+Cc: Oliver Upton <oliver.upton@linux.dev>
+Reviewed-by: Oliver Upton <oliver.upton@linux.dev>
+Link: https://lore.kernel.org/r/20250210195226.1215254-4-mark.rutland@arm.com
+Signed-off-by: Marc Zyngier <maz@kernel.org>
+[Rework for refactoring of where the flags are stored -- broonie]
+Signed-off-by: Mark Brown <broonie@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/arm64/include/asm/kvm_host.h |    2 --
+ arch/arm64/kvm/fpsimd.c           |   16 ----------------
+ 2 files changed, 18 deletions(-)
+
+--- a/arch/arm64/include/asm/kvm_host.h
++++ b/arch/arm64/include/asm/kvm_host.h
+@@ -902,8 +902,6 @@ struct kvm_vcpu_arch {
+ /* Save TRBE context if active  */
+ #define DEBUG_STATE_SAVE_TRBE __vcpu_single_flag(iflags, BIT(6))
+-/* SVE enabled for host EL0 */
+-#define HOST_SVE_ENABLED      __vcpu_single_flag(sflags, BIT(0))
+ /* SME enabled for EL0 */
+ #define HOST_SME_ENABLED      __vcpu_single_flag(sflags, BIT(1))
+ /* Physical CPU not in supported_cpus */
+--- a/arch/arm64/kvm/fpsimd.c
++++ b/arch/arm64/kvm/fpsimd.c
+@@ -65,10 +65,6 @@ void kvm_arch_vcpu_load_fp(struct kvm_vc
+       fpsimd_save_and_flush_cpu_state();
+       *host_data_ptr(fp_owner) = FP_STATE_FREE;
+-      vcpu_clear_flag(vcpu, HOST_SVE_ENABLED);
+-      if (read_sysreg(cpacr_el1) & CPACR_EL1_ZEN_EL0EN)
+-              vcpu_set_flag(vcpu, HOST_SVE_ENABLED);
+-
+       if (system_supports_sme()) {
+               vcpu_clear_flag(vcpu, HOST_SME_ENABLED);
+               if (read_sysreg(cpacr_el1) & CPACR_EL1_SMEN_EL0EN)
+@@ -202,18 +198,6 @@ void kvm_arch_vcpu_put_fp(struct kvm_vcp
+                * when needed.
+                */
+               fpsimd_save_and_flush_cpu_state();
+-      } else if (has_vhe() && system_supports_sve()) {
+-              /*
+-               * The FPSIMD/SVE state in the CPU has not been touched, and we
+-               * have SVE (and VHE): CPACR_EL1 (alias CPTR_EL2) has been
+-               * reset by kvm_reset_cptr_el2() in the Hyp code, disabling SVE
+-               * for EL0.  To avoid spurious traps, restore the trap state
+-               * seen by kvm_arch_vcpu_load_fp():
+-               */
+-              if (vcpu_get_flag(vcpu, HOST_SVE_ENABLED))
+-                      sysreg_clear_set(CPACR_EL1, 0, CPACR_EL1_ZEN_EL0EN);
+-              else
+-                      sysreg_clear_set(CPACR_EL1, CPACR_EL1_ZEN_EL0EN, 0);
+       }
+       local_irq_restore(flags);
diff --git a/queue-6.13/kvm-arm64-unconditionally-save-flush-host-fpsimd-sve-sme-state.patch b/queue-6.13/kvm-arm64-unconditionally-save-flush-host-fpsimd-sve-sme-state.patch
new file mode 100644 (file)
index 0000000..56f0bfa
--- /dev/null
@@ -0,0 +1,163 @@
+From stable+bounces-124193-greg=kroah.com@vger.kernel.org Thu Mar 13 00:49:59 2025
+From: Mark Brown <broonie@kernel.org>
+Date: Wed, 12 Mar 2025 23:49:10 +0000
+Subject: KVM: arm64: Unconditionally save+flush host FPSIMD/SVE/SME state
+To: Greg Kroah-Hartman <gregkh@linuxfoundation.org>,  Marc Zyngier <maz@kernel.org>, Oliver Upton <oliver.upton@linux.dev>,  Joey Gouly <joey.gouly@arm.com>, Suzuki K Poulose <suzuki.poulose@arm.com>,  Catalin Marinas <catalin.marinas@arm.com>, Will Deacon <will@kernel.org>
+Cc: linux-arm-kernel@lists.infradead.org, kvmarm@lists.linux.dev,  linux-kernel@vger.kernel.org, stable@vger.kernel.org,  Mark Brown <broonie@kernel.org>, Mark Rutland <mark.rutland@arm.com>,  Eric Auger <eauger@redhat.com>, Wilco Dijkstra <wilco.dijkstra@arm.com>,  Eric Auger <eric.auger@redhat.com>, Florian Weimer <fweimer@redhat.com>,  Fuad Tabba <tabba@google.com>, Jeremy Linton <jeremy.linton@arm.com>,  Paolo Bonzini <pbonzini@redhat.com>
+Message-ID: <20250312-stable-sve-6-13-v1-2-c7ba07a6f4f7@kernel.org>
+
+From: Mark Rutland <mark.rutland@arm.com>
+
+[ Upstream commit fbc7e61195e23f744814e78524b73b59faa54ab4 ]
+
+There are several problems with the way hyp code lazily saves the host's
+FPSIMD/SVE state, including:
+
+* Host SVE being discarded unexpectedly due to inconsistent
+  configuration of TIF_SVE and CPACR_ELx.ZEN. This has been seen to
+  result in QEMU crashes where SVE is used by memmove(), as reported by
+  Eric Auger:
+
+  https://issues.redhat.com/browse/RHEL-68997
+
+* Host SVE state is discarded *after* modification by ptrace, which was an
+  unintentional ptrace ABI change introduced with lazy discarding of SVE state.
+
+* The host FPMR value can be discarded when running a non-protected VM,
+  where FPMR support is not exposed to a VM, and that VM uses
+  FPSIMD/SVE. In these cases the hyp code does not save the host's FPMR
+  before unbinding the host's FPSIMD/SVE/SME state, leaving a stale
+  value in memory.
+
+Avoid these by eagerly saving and "flushing" the host's FPSIMD/SVE/SME
+state when loading a vCPU such that KVM does not need to save any of the
+host's FPSIMD/SVE/SME state. For clarity, fpsimd_kvm_prepare() is
+removed and the necessary call to fpsimd_save_and_flush_cpu_state() is
+placed in kvm_arch_vcpu_load_fp(). As 'fpsimd_state' and 'fpmr_ptr'
+should not be used, they are set to NULL; all uses of these will be
+removed in subsequent patches.
+
+Historical problems go back at least as far as v5.17, e.g. erroneous
+assumptions about TIF_SVE being clear in commit:
+
+  8383741ab2e773a9 ("KVM: arm64: Get rid of host SVE tracking/saving")
+
+... and so this eager save+flush probably needs to be backported to ALL
+stable trees.
+
+Fixes: 93ae6b01bafee8fa ("KVM: arm64: Discard any SVE state when entering KVM guests")
+Fixes: 8c845e2731041f0f ("arm64/sve: Leave SVE enabled on syscall if we don't context switch")
+Fixes: ef3be86021c3bdf3 ("KVM: arm64: Add save/restore support for FPMR")
+Reported-by: Eric Auger <eauger@redhat.com>
+Reported-by: Wilco Dijkstra <wilco.dijkstra@arm.com>
+Reviewed-by: Mark Brown <broonie@kernel.org>
+Tested-by: Mark Brown <broonie@kernel.org>
+Tested-by: Eric Auger <eric.auger@redhat.com>
+Acked-by: Will Deacon <will@kernel.org>
+Cc: Catalin Marinas <catalin.marinas@arm.com>
+Cc: Florian Weimer <fweimer@redhat.com>
+Cc: Fuad Tabba <tabba@google.com>
+Cc: Jeremy Linton <jeremy.linton@arm.com>
+Cc: Marc Zyngier <maz@kernel.org>
+Cc: Oliver Upton <oliver.upton@linux.dev>
+Cc: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Mark Rutland <mark.rutland@arm.com>
+Reviewed-by: Oliver Upton <oliver.upton@linux.dev>
+Link: https://lore.kernel.org/r/20250210195226.1215254-2-mark.rutland@arm.com
+Signed-off-by: Marc Zyngier <maz@kernel.org>
+[ Mark: Handle vcpu/host flag conflict ]
+Signed-off-by: Mark Rutland <mark.rutland@arm.com>
+Signed-off-by: Mark Brown <broonie@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/arm64/kernel/fpsimd.c |   25 -------------------------
+ arch/arm64/kvm/fpsimd.c    |   35 ++++++++++-------------------------
+ 2 files changed, 10 insertions(+), 50 deletions(-)
+
+--- a/arch/arm64/kernel/fpsimd.c
++++ b/arch/arm64/kernel/fpsimd.c
+@@ -1695,31 +1695,6 @@ void fpsimd_signal_preserve_current_stat
+ }
+ /*
+- * Called by KVM when entering the guest.
+- */
+-void fpsimd_kvm_prepare(void)
+-{
+-      if (!system_supports_sve())
+-              return;
+-
+-      /*
+-       * KVM does not save host SVE state since we can only enter
+-       * the guest from a syscall so the ABI means that only the
+-       * non-saved SVE state needs to be saved.  If we have left
+-       * SVE enabled for performance reasons then update the task
+-       * state to be FPSIMD only.
+-       */
+-      get_cpu_fpsimd_context();
+-
+-      if (test_and_clear_thread_flag(TIF_SVE)) {
+-              sve_to_fpsimd(current);
+-              current->thread.fp_type = FP_STATE_FPSIMD;
+-      }
+-
+-      put_cpu_fpsimd_context();
+-}
+-
+-/*
+  * Associate current's FPSIMD context with this cpu
+  * The caller must have ownership of the cpu FPSIMD context before calling
+  * this function.
+--- a/arch/arm64/kvm/fpsimd.c
++++ b/arch/arm64/kvm/fpsimd.c
+@@ -54,16 +54,18 @@ void kvm_arch_vcpu_load_fp(struct kvm_vc
+       if (!system_supports_fpsimd())
+               return;
+-      fpsimd_kvm_prepare();
+-
+       /*
+-       * We will check TIF_FOREIGN_FPSTATE just before entering the
+-       * guest in kvm_arch_vcpu_ctxflush_fp() and override this to
+-       * FP_STATE_FREE if the flag set.
++       * Ensure that any host FPSIMD/SVE/SME state is saved and unbound such
++       * that the host kernel is responsible for restoring this state upon
++       * return to userspace, and the hyp code doesn't need to save anything.
++       *
++       * When the host may use SME, fpsimd_save_and_flush_cpu_state() ensures
++       * that PSTATE.{SM,ZA} == {0,0}.
+        */
+-      *host_data_ptr(fp_owner) = FP_STATE_HOST_OWNED;
+-      *host_data_ptr(fpsimd_state) = kern_hyp_va(&current->thread.uw.fpsimd_state);
+-      *host_data_ptr(fpmr_ptr) = kern_hyp_va(&current->thread.uw.fpmr);
++      fpsimd_save_and_flush_cpu_state();
++      *host_data_ptr(fp_owner) = FP_STATE_FREE;
++      *host_data_ptr(fpsimd_state) = NULL;
++      *host_data_ptr(fpmr_ptr) = NULL;
+       vcpu_clear_flag(vcpu, HOST_SVE_ENABLED);
+       if (read_sysreg(cpacr_el1) & CPACR_EL1_ZEN_EL0EN)
+@@ -73,23 +75,6 @@ void kvm_arch_vcpu_load_fp(struct kvm_vc
+               vcpu_clear_flag(vcpu, HOST_SME_ENABLED);
+               if (read_sysreg(cpacr_el1) & CPACR_EL1_SMEN_EL0EN)
+                       vcpu_set_flag(vcpu, HOST_SME_ENABLED);
+-
+-              /*
+-               * If PSTATE.SM is enabled then save any pending FP
+-               * state and disable PSTATE.SM. If we leave PSTATE.SM
+-               * enabled and the guest does not enable SME via
+-               * CPACR_EL1.SMEN then operations that should be valid
+-               * may generate SME traps from EL1 to EL1 which we
+-               * can't intercept and which would confuse the guest.
+-               *
+-               * Do the same for PSTATE.ZA in the case where there
+-               * is state in the registers which has not already
+-               * been saved, this is very unlikely to happen.
+-               */
+-              if (read_sysreg_s(SYS_SVCR) & (SVCR_SM_MASK | SVCR_ZA_MASK)) {
+-                      *host_data_ptr(fp_owner) = FP_STATE_FREE;
+-                      fpsimd_save_and_flush_cpu_state();
+-              }
+       }
+       /*
diff --git a/queue-6.13/mm-fix-kernel-bug-when-userfaultfd_move-encounters-swapcache.patch b/queue-6.13/mm-fix-kernel-bug-when-userfaultfd_move-encounters-swapcache.patch
new file mode 100644 (file)
index 0000000..f4e4f0a
--- /dev/null
@@ -0,0 +1,276 @@
+From c50f8e6053b0503375c2975bf47f182445aebb4c Mon Sep 17 00:00:00 2001
+From: Barry Song <v-songbaohua@oppo.com>
+Date: Wed, 26 Feb 2025 13:14:00 +1300
+Subject: mm: fix kernel BUG when userfaultfd_move encounters swapcache
+
+From: Barry Song <v-songbaohua@oppo.com>
+
+commit c50f8e6053b0503375c2975bf47f182445aebb4c upstream.
+
+userfaultfd_move() checks whether the PTE entry is present or a
+swap entry.
+
+- If the PTE entry is present, move_present_pte() handles folio
+  migration by setting:
+
+  src_folio->index = linear_page_index(dst_vma, dst_addr);
+
+- If the PTE entry is a swap entry, move_swap_pte() simply copies
+  the PTE to the new dst_addr.
+
+This approach is incorrect because, even if the PTE is a swap entry,
+it can still reference a folio that remains in the swap cache.
+
+This creates a race window between steps 2 and 4.
+ 1. add_to_swap: The folio is added to the swapcache.
+ 2. try_to_unmap: PTEs are converted to swap entries.
+ 3. pageout: The folio is written back.
+ 4. Swapcache is cleared.
+If userfaultfd_move() occurs in the window between steps 2 and 4,
+after the swap PTE has been moved to the destination, accessing the
+destination triggers do_swap_page(), which may locate the folio in
+the swapcache. However, since the folio's index has not been updated
+to match the destination VMA, do_swap_page() will detect a mismatch.
+
+This can result in two critical issues depending on the system
+configuration.
+
+If KSM is disabled, both small and large folios can trigger a BUG
+during the add_rmap operation due to:
+
+ page_pgoff(folio, page) != linear_page_index(vma, address)
+
+[   13.336953] page: refcount:6 mapcount:1 mapping:00000000f43db19c index:0xffffaf150 pfn:0x4667c
+[   13.337520] head: order:2 mapcount:1 entire_mapcount:0 nr_pages_mapped:1 pincount:0
+[   13.337716] memcg:ffff00000405f000
+[   13.337849] anon flags: 0x3fffc0000020459(locked|uptodate|dirty|owner_priv_1|head|swapbacked|node=0|zone=0|lastcpupid=0xffff)
+[   13.338630] raw: 03fffc0000020459 ffff80008507b538 ffff80008507b538 ffff000006260361
+[   13.338831] raw: 0000000ffffaf150 0000000000004000 0000000600000000 ffff00000405f000
+[   13.339031] head: 03fffc0000020459 ffff80008507b538 ffff80008507b538 ffff000006260361
+[   13.339204] head: 0000000ffffaf150 0000000000004000 0000000600000000 ffff00000405f000
+[   13.339375] head: 03fffc0000000202 fffffdffc0199f01 ffffffff00000000 0000000000000001
+[   13.339546] head: 0000000000000004 0000000000000000 00000000ffffffff 0000000000000000
+[   13.339736] page dumped because: VM_BUG_ON_PAGE(page_pgoff(folio, page) != linear_page_index(vma, address))
+[   13.340190] ------------[ cut here ]------------
+[   13.340316] kernel BUG at mm/rmap.c:1380!
+[   13.340683] Internal error: Oops - BUG: 00000000f2000800 [#1] PREEMPT SMP
+[   13.340969] Modules linked in:
+[   13.341257] CPU: 1 UID: 0 PID: 107 Comm: a.out Not tainted 6.14.0-rc3-gcf42737e247a-dirty #299
+[   13.341470] Hardware name: linux,dummy-virt (DT)
+[   13.341671] pstate: 60000005 (nZCv daif -PAN -UAO -TCO -DIT -SSBS BTYPE=--)
+[   13.341815] pc : __page_check_anon_rmap+0xa0/0xb0
+[   13.341920] lr : __page_check_anon_rmap+0xa0/0xb0
+[   13.342018] sp : ffff80008752bb20
+[   13.342093] x29: ffff80008752bb20 x28: fffffdffc0199f00 x27: 0000000000000001
+[   13.342404] x26: 0000000000000000 x25: 0000000000000001 x24: 0000000000000001
+[   13.342575] x23: 0000ffffaf0d0000 x22: 0000ffffaf0d0000 x21: fffffdffc0199f00
+[   13.342731] x20: fffffdffc0199f00 x19: ffff000006210700 x18: 00000000ffffffff
+[   13.342881] x17: 6c203d2120296567 x16: 6170202c6f696c6f x15: 662866666f67705f
+[   13.343033] x14: 6567617028454741 x13: 2929737365726464 x12: ffff800083728ab0
+[   13.343183] x11: ffff800082996bf8 x10: 0000000000000fd7 x9 : ffff80008011bc40
+[   13.343351] x8 : 0000000000017fe8 x7 : 00000000fffff000 x6 : ffff8000829eebf8
+[   13.343498] x5 : c0000000fffff000 x4 : 0000000000000000 x3 : 0000000000000000
+[   13.343645] x2 : 0000000000000000 x1 : ffff0000062db980 x0 : 000000000000005f
+[   13.343876] Call trace:
+[   13.344045]  __page_check_anon_rmap+0xa0/0xb0 (P)
+[   13.344234]  folio_add_anon_rmap_ptes+0x22c/0x320
+[   13.344333]  do_swap_page+0x1060/0x1400
+[   13.344417]  __handle_mm_fault+0x61c/0xbc8
+[   13.344504]  handle_mm_fault+0xd8/0x2e8
+[   13.344586]  do_page_fault+0x20c/0x770
+[   13.344673]  do_translation_fault+0xb4/0xf0
+[   13.344759]  do_mem_abort+0x48/0xa0
+[   13.344842]  el0_da+0x58/0x130
+[   13.344914]  el0t_64_sync_handler+0xc4/0x138
+[   13.345002]  el0t_64_sync+0x1ac/0x1b0
+[   13.345208] Code: aa1503e0 f000f801 910f6021 97ff5779 (d4210000)
+[   13.345504] ---[ end trace 0000000000000000 ]---
+[   13.345715] note: a.out[107] exited with irqs disabled
+[   13.345954] note: a.out[107] exited with preempt_count 2
+
+If KSM is enabled, Peter Xu also discovered that do_swap_page() may
+trigger an unexpected CoW operation for small folios because
+ksm_might_need_to_copy() allocates a new folio when the folio index
+does not match linear_page_index(vma, addr).
+
+This patch also checks the swapcache when handling swap entries. If a
+match is found in the swapcache, it processes it similarly to a present
+PTE.
+However, there are some differences. For example, the folio is no longer
+exclusive because folio_try_share_anon_rmap_pte() is performed during
+unmapping.
+Furthermore, in the case of swapcache, the folio has already been
+unmapped, eliminating the risk of concurrent rmap walks and removing the
+need to acquire src_folio's anon_vma or lock.
+
+Note that for large folios, in the swapcache handling path, we directly
+return -EBUSY since split_folio() will return -EBUSY regardless of whether
+the folio is under writeback or unmapped. This is not an urgent issue,
+so a follow-up patch may address it separately.
+
+[v-songbaohua@oppo.com: minor cleanup according to Peter Xu]
+  Link: https://lkml.kernel.org/r/20250226024411.47092-1-21cnbao@gmail.com
+Link: https://lkml.kernel.org/r/20250226001400.9129-1-21cnbao@gmail.com
+Fixes: adef440691ba ("userfaultfd: UFFDIO_MOVE uABI")
+Signed-off-by: Barry Song <v-songbaohua@oppo.com>
+Acked-by: Peter Xu <peterx@redhat.com>
+Reviewed-by: Suren Baghdasaryan <surenb@google.com>
+Cc: Andrea Arcangeli <aarcange@redhat.com>
+Cc: Al Viro <viro@zeniv.linux.org.uk>
+Cc: Axel Rasmussen <axelrasmussen@google.com>
+Cc: Brian Geffon <bgeffon@google.com>
+Cc: Christian Brauner <brauner@kernel.org>
+Cc: David Hildenbrand <david@redhat.com>
+Cc: Hugh Dickins <hughd@google.com>
+Cc: Jann Horn <jannh@google.com>
+Cc: Kalesh Singh <kaleshsingh@google.com>
+Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
+Cc: Lokesh Gidra <lokeshgidra@google.com>
+Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
+Cc: Michal Hocko <mhocko@suse.com>
+Cc: Mike Rapoport (IBM) <rppt@kernel.org>
+Cc: Nicolas Geoffray <ngeoffray@google.com>
+Cc: Ryan Roberts <ryan.roberts@arm.com>
+Cc: Shuah Khan <shuah@kernel.org>
+Cc: ZhangPeng <zhangpeng362@huawei.com>
+Cc: Tangquan Zheng <zhengtangquan@oppo.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+[ surenb: resolved merge conflict caused by the difference in
+  move_swap_pte() arguments]
+Signed-off-by: Suren Baghdasaryan <surenb@google.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/userfaultfd.c |   75 ++++++++++++++++++++++++++++++++++++++++++++++++-------
+ 1 file changed, 66 insertions(+), 9 deletions(-)
+
+--- a/mm/userfaultfd.c
++++ b/mm/userfaultfd.c
+@@ -18,6 +18,7 @@
+ #include <asm/tlbflush.h>
+ #include <asm/tlb.h>
+ #include "internal.h"
++#include "swap.h"
+ static __always_inline
+ bool validate_dst_vma(struct vm_area_struct *dst_vma, unsigned long dst_end)
+@@ -1067,15 +1068,13 @@ out:
+       return err;
+ }
+-static int move_swap_pte(struct mm_struct *mm,
++static int move_swap_pte(struct mm_struct *mm, struct vm_area_struct *dst_vma,
+                        unsigned long dst_addr, unsigned long src_addr,
+                        pte_t *dst_pte, pte_t *src_pte,
+                        pte_t orig_dst_pte, pte_t orig_src_pte,
+-                       spinlock_t *dst_ptl, spinlock_t *src_ptl)
++                       spinlock_t *dst_ptl, spinlock_t *src_ptl,
++                       struct folio *src_folio)
+ {
+-      if (!pte_swp_exclusive(orig_src_pte))
+-              return -EBUSY;
+-
+       double_pt_lock(dst_ptl, src_ptl);
+       if (!pte_same(ptep_get(src_pte), orig_src_pte) ||
+@@ -1084,6 +1083,16 @@ static int move_swap_pte(struct mm_struc
+               return -EAGAIN;
+       }
++      /*
++       * The src_folio resides in the swapcache, requiring an update to its
++       * index and mapping to align with the dst_vma, where a swap-in may
++       * occur and hit the swapcache after moving the PTE.
++       */
++      if (src_folio) {
++              folio_move_anon_rmap(src_folio, dst_vma);
++              src_folio->index = linear_page_index(dst_vma, dst_addr);
++      }
++
+       orig_src_pte = ptep_get_and_clear(mm, src_addr, src_pte);
+       set_pte_at(mm, dst_addr, dst_pte, orig_src_pte);
+       double_pt_unlock(dst_ptl, src_ptl);
+@@ -1130,6 +1139,7 @@ static int move_pages_pte(struct mm_stru
+                         __u64 mode)
+ {
+       swp_entry_t entry;
++      struct swap_info_struct *si = NULL;
+       pte_t orig_src_pte, orig_dst_pte;
+       pte_t src_folio_pte;
+       spinlock_t *src_ptl, *dst_ptl;
+@@ -1321,6 +1331,8 @@ retry:
+                                      orig_dst_pte, orig_src_pte,
+                                      dst_ptl, src_ptl, src_folio);
+       } else {
++              struct folio *folio = NULL;
++
+               entry = pte_to_swp_entry(orig_src_pte);
+               if (non_swap_entry(entry)) {
+                       if (is_migration_entry(entry)) {
+@@ -1334,10 +1346,53 @@ retry:
+                       goto out;
+               }
+-              err = move_swap_pte(mm, dst_addr, src_addr,
+-                                  dst_pte, src_pte,
+-                                  orig_dst_pte, orig_src_pte,
+-                                  dst_ptl, src_ptl);
++              if (!pte_swp_exclusive(orig_src_pte)) {
++                      err = -EBUSY;
++                      goto out;
++              }
++
++              si = get_swap_device(entry);
++              if (unlikely(!si)) {
++                      err = -EAGAIN;
++                      goto out;
++              }
++              /*
++               * Verify the existence of the swapcache. If present, the folio's
++               * index and mapping must be updated even when the PTE is a swap
++               * entry. The anon_vma lock is not taken during this process since
++               * the folio has already been unmapped, and the swap entry is
++               * exclusive, preventing rmap walks.
++               *
++               * For large folios, return -EBUSY immediately, as split_folio()
++               * also returns -EBUSY when attempting to split unmapped large
++               * folios in the swapcache. This issue needs to be resolved
++               * separately to allow proper handling.
++               */
++              if (!src_folio)
++                      folio = filemap_get_folio(swap_address_space(entry),
++                                      swap_cache_index(entry));
++              if (!IS_ERR_OR_NULL(folio)) {
++                      if (folio_test_large(folio)) {
++                              err = -EBUSY;
++                              folio_put(folio);
++                              goto out;
++                      }
++                      src_folio = folio;
++                      src_folio_pte = orig_src_pte;
++                      if (!folio_trylock(src_folio)) {
++                              pte_unmap(&orig_src_pte);
++                              pte_unmap(&orig_dst_pte);
++                              src_pte = dst_pte = NULL;
++                              put_swap_device(si);
++                              si = NULL;
++                              /* now we can block and wait */
++                              folio_lock(src_folio);
++                              goto retry;
++                      }
++              }
++              err = move_swap_pte(mm, dst_vma, dst_addr, src_addr, dst_pte, src_pte,
++                              orig_dst_pte, orig_src_pte,
++                              dst_ptl, src_ptl, src_folio);
+       }
+ out:
+@@ -1354,6 +1409,8 @@ out:
+       if (src_pte)
+               pte_unmap(src_pte);
+       mmu_notifier_invalidate_range_end(&range);
++      if (si)
++              put_swap_device(si);
+       return err;
+ }
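
The patch above hardens the swap-entry path of the UFFDIO_MOVE ioctl. For
context, the sketch below shows a minimal userspace invocation of that ioctl.
It is an illustrative sketch only, not part of the queued patch: it assumes a
kernel and uapi headers recent enough to expose UFFD_FEATURE_MOVE and struct
uffdio_move, trims most error handling, and uses arbitrary page contents.

#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/userfaultfd.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	size_t len = (size_t)sysconf(_SC_PAGESIZE);
	/* May need CAP_SYS_PTRACE or vm.unprivileged_userfaultfd=1. */
	int uffd = (int)syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
	struct uffdio_api api = { .api = UFFD_API, .features = UFFD_FEATURE_MOVE };

	if (uffd < 0 || ioctl(uffd, UFFDIO_API, &api) < 0) {
		perror("userfaultfd/UFFDIO_API");
		return 1;
	}

	/* Two private anonymous mappings; MAP_FAILED checks trimmed. */
	char *src = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	char *dst = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	memset(src, 0xaa, len);		/* populate the anon page at src */

	/* The destination range must be registered with this userfaultfd. */
	struct uffdio_register reg = {
		.range = { .start = (unsigned long)dst, .len = len },
		.mode  = UFFDIO_REGISTER_MODE_MISSING,
	};
	if (ioctl(uffd, UFFDIO_REGISTER, &reg) < 0) {
		perror("UFFDIO_REGISTER");
		return 1;
	}

	/* Move the populated page from src to dst instead of copying it. */
	struct uffdio_move mv = {
		.dst  = (unsigned long)dst,
		.src  = (unsigned long)src,
		.len  = len,
		.mode = 0,
	};
	if (ioctl(uffd, UFFDIO_MOVE, &mv) < 0)
		perror("UFFDIO_MOVE");
	else
		printf("moved %lld bytes, dst[0]=0x%02x\n",
		       (long long)mv.move, (unsigned char)dst[0]);
	return 0;
}

If the destination range is not registered, or the source pages are not
exclusively mapped anonymous pages, the ioctl is expected to fail rather than
move anything; the fix above addresses the case where the source PTE is a swap
entry whose folio still sits in the swapcache.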
diff --git a/queue-6.13/mm-slab-kvfree_rcu-switch-to-wq_mem_reclaim-wq.patch b/queue-6.13/mm-slab-kvfree_rcu-switch-to-wq_mem_reclaim-wq.patch
new file mode 100644 (file)
index 0000000..d1be1e1
--- /dev/null
@@ -0,0 +1,142 @@
+From dfd3df31c9db752234d7d2e09bef2aeabb643ce4 Mon Sep 17 00:00:00 2001
+From: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
+Date: Fri, 28 Feb 2025 13:13:56 +0100
+Subject: mm/slab/kvfree_rcu: Switch to WQ_MEM_RECLAIM wq
+
+From: Uladzislau Rezki (Sony) <urezki@gmail.com>
+
+commit dfd3df31c9db752234d7d2e09bef2aeabb643ce4 upstream.
+
+Currently kvfree_rcu() APIs use a system workqueue, which is
+"system_unbound_wq", to drive RCU machinery to reclaim memory.
+
+Recently, it has been noted that the following kernel warning can
+be observed:
+
+<snip>
+workqueue: WQ_MEM_RECLAIM nvme-wq:nvme_scan_work is flushing !WQ_MEM_RECLAIM events_unbound:kfree_rcu_work
+  WARNING: CPU: 21 PID: 330 at kernel/workqueue.c:3719 check_flush_dependency+0x112/0x120
+  Modules linked in: intel_uncore_frequency(E) intel_uncore_frequency_common(E) skx_edac(E) ...
+  CPU: 21 UID: 0 PID: 330 Comm: kworker/u144:6 Tainted: G            E      6.13.2-0_g925d379822da #1
+  Hardware name: Wiwynn Twin Lakes MP/Twin Lakes Passive MP, BIOS YMM20 02/01/2023
+  Workqueue: nvme-wq nvme_scan_work
+  RIP: 0010:check_flush_dependency+0x112/0x120
+  Code: 05 9a 40 14 02 01 48 81 c6 c0 00 00 00 48 8b 50 18 48 81 c7 c0 00 00 00 48 89 f9 48 ...
+  RSP: 0018:ffffc90000df7bd8 EFLAGS: 00010082
+  RAX: 000000000000006a RBX: ffffffff81622390 RCX: 0000000000000027
+  RDX: 00000000fffeffff RSI: 000000000057ffa8 RDI: ffff88907f960c88
+  RBP: 0000000000000000 R08: ffffffff83068e50 R09: 000000000002fffd
+  R10: 0000000000000004 R11: 0000000000000000 R12: ffff8881001a4400
+  R13: 0000000000000000 R14: ffff88907f420fb8 R15: 0000000000000000
+  FS:  0000000000000000(0000) GS:ffff88907f940000(0000) knlGS:0000000000000000
+  CR2: 00007f60c3001000 CR3: 000000107d010005 CR4: 00000000007726f0
+  PKRU: 55555554
+  Call Trace:
+   <TASK>
+   ? __warn+0xa4/0x140
+   ? check_flush_dependency+0x112/0x120
+   ? report_bug+0xe1/0x140
+   ? check_flush_dependency+0x112/0x120
+   ? handle_bug+0x5e/0x90
+   ? exc_invalid_op+0x16/0x40
+   ? asm_exc_invalid_op+0x16/0x20
+   ? timer_recalc_next_expiry+0x190/0x190
+   ? check_flush_dependency+0x112/0x120
+   ? check_flush_dependency+0x112/0x120
+   __flush_work.llvm.1643880146586177030+0x174/0x2c0
+   flush_rcu_work+0x28/0x30
+   kvfree_rcu_barrier+0x12f/0x160
+   kmem_cache_destroy+0x18/0x120
+   bioset_exit+0x10c/0x150
+   disk_release.llvm.6740012984264378178+0x61/0xd0
+   device_release+0x4f/0x90
+   kobject_put+0x95/0x180
+   nvme_put_ns+0x23/0xc0
+   nvme_remove_invalid_namespaces+0xb3/0xd0
+   nvme_scan_work+0x342/0x490
+   process_scheduled_works+0x1a2/0x370
+   worker_thread+0x2ff/0x390
+   ? pwq_release_workfn+0x1e0/0x1e0
+   kthread+0xb1/0xe0
+   ? __kthread_parkme+0x70/0x70
+   ret_from_fork+0x30/0x40
+   ? __kthread_parkme+0x70/0x70
+   ret_from_fork_asm+0x11/0x20
+   </TASK>
+  ---[ end trace 0000000000000000 ]---
+<snip>
+
+To address this, switch to an independent WQ_MEM_RECLAIM
+workqueue, so the rules are not violated from the workqueue
+framework's point of view.
+
+Apart from that, since kvfree_rcu() does reclaim memory, it is
+worth going with a WQ_MEM_RECLAIM type of wq because it is designed
+for this purpose.
+
+Fixes: 6c6c47b063b5 ("mm, slab: call kvfree_rcu_barrier() from kmem_cache_destroy()")
+Reported-by: Keith Busch <kbusch@kernel.org>
+Closes: https://lore.kernel.org/all/Z7iqJtCjHKfo8Kho@kbusch-mbp/
+Cc: stable@vger.kernel.org
+Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
+Reviewed-by: Joel Fernandes <joelagnelf@nvidia.com>
+Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
+Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/rcu/tree.c |   14 ++++++++++----
+ 1 file changed, 10 insertions(+), 4 deletions(-)
+
+--- a/kernel/rcu/tree.c
++++ b/kernel/rcu/tree.c
+@@ -3191,6 +3191,8 @@ void call_rcu(struct rcu_head *head, rcu
+ }
+ EXPORT_SYMBOL_GPL(call_rcu);
++static struct workqueue_struct *rcu_reclaim_wq;
++
+ /* Maximum number of jiffies to wait before draining a batch. */
+ #define KFREE_DRAIN_JIFFIES (5 * HZ)
+ #define KFREE_N_BATCHES 2
+@@ -3519,10 +3521,10 @@ __schedule_delayed_monitor_work(struct k
+       if (delayed_work_pending(&krcp->monitor_work)) {
+               delay_left = krcp->monitor_work.timer.expires - jiffies;
+               if (delay < delay_left)
+-                      mod_delayed_work(system_unbound_wq, &krcp->monitor_work, delay);
++                      mod_delayed_work(rcu_reclaim_wq, &krcp->monitor_work, delay);
+               return;
+       }
+-      queue_delayed_work(system_unbound_wq, &krcp->monitor_work, delay);
++      queue_delayed_work(rcu_reclaim_wq, &krcp->monitor_work, delay);
+ }
+ static void
+@@ -3620,7 +3622,7 @@ kvfree_rcu_queue_batch(struct kfree_rcu_
+                       // "free channels", the batch can handle. Break
+                       // the loop since it is done with this CPU thus
+                       // queuing an RCU work is _always_ success here.
+-                      queued = queue_rcu_work(system_unbound_wq, &krwp->rcu_work);
++                      queued = queue_rcu_work(rcu_reclaim_wq, &krwp->rcu_work);
+                       WARN_ON_ONCE(!queued);
+                       break;
+               }
+@@ -3708,7 +3710,7 @@ run_page_cache_worker(struct kfree_rcu_c
+       if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING &&
+                       !atomic_xchg(&krcp->work_in_progress, 1)) {
+               if (atomic_read(&krcp->backoff_page_cache_fill)) {
+-                      queue_delayed_work(system_unbound_wq,
++                      queue_delayed_work(rcu_reclaim_wq,
+                               &krcp->page_cache_work,
+                                       msecs_to_jiffies(rcu_delay_page_cache_fill_msec));
+               } else {
+@@ -5654,6 +5656,10 @@ static void __init kfree_rcu_batch_init(
+       int i, j;
+       struct shrinker *kfree_rcu_shrinker;
++      rcu_reclaim_wq = alloc_workqueue("kvfree_rcu_reclaim",
++                      WQ_UNBOUND | WQ_MEM_RECLAIM, 0);
++      WARN_ON(!rcu_reclaim_wq);
++
+       /* Clamp it to [0:100] seconds interval. */
+       if (rcu_delay_page_cache_fill_msec < 0 ||
+               rcu_delay_page_cache_fill_msec > 100 * MSEC_PER_SEC) {
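
To make the flush-dependency rule behind the warning above concrete, here is a
minimal, hypothetical kernel-module sketch; it is editorial illustration only,
and names such as wqdep_reclaim, plain_wq and reclaim_fn are invented and do
not appear in the patch. A worker running on a WQ_MEM_RECLAIM workqueue that
flushes work queued on a plain workqueue reproduces exactly the dependency
check_flush_dependency() complains about, which is why the kvfree_rcu() work
is moved onto its own WQ_MEM_RECLAIM workqueue.

#include <linux/module.h>
#include <linux/workqueue.h>

static struct workqueue_struct *reclaim_wq;	/* WQ_MEM_RECLAIM */
static struct workqueue_struct *plain_wq;	/* no reclaim guarantee */

static void plain_fn(struct work_struct *w) { }
static DECLARE_WORK(plain_work, plain_fn);

static void reclaim_fn(struct work_struct *w)
{
	queue_work(plain_wq, &plain_work);
	/*
	 * A reclaim-capable worker waiting on a !WQ_MEM_RECLAIM queue:
	 * this intentionally trips the check_flush_dependency() warning.
	 */
	flush_work(&plain_work);
}
static DECLARE_WORK(reclaim_work, reclaim_fn);

static int __init wqdep_init(void)
{
	reclaim_wq = alloc_workqueue("wqdep_reclaim",
				     WQ_UNBOUND | WQ_MEM_RECLAIM, 0);
	plain_wq = alloc_workqueue("wqdep_plain", WQ_UNBOUND, 0);
	if (!reclaim_wq || !plain_wq) {
		if (reclaim_wq)
			destroy_workqueue(reclaim_wq);
		if (plain_wq)
			destroy_workqueue(plain_wq);
		return -ENOMEM;
	}

	queue_work(reclaim_wq, &reclaim_work);
	return 0;
}

static void __exit wqdep_exit(void)
{
	flush_work(&reclaim_work);
	destroy_workqueue(plain_wq);
	destroy_workqueue(reclaim_wq);
}

module_init(wqdep_init);
module_exit(wqdep_exit);
MODULE_LICENSE("GPL");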
diff --git a/queue-6.13/series b/queue-6.13/series
new file mode 100644 (file)
index 0000000..958c55e
--- /dev/null
@@ -0,0 +1,12 @@
+kvm-arm64-calculate-cptr_el2-traps-on-activating-traps.patch
+kvm-arm64-unconditionally-save-flush-host-fpsimd-sve-sme-state.patch
+kvm-arm64-remove-host-fpsimd-saving-for-non-protected-kvm.patch
+kvm-arm64-remove-vhe-host-restore-of-cpacr_el1.zen.patch
+kvm-arm64-remove-vhe-host-restore-of-cpacr_el1.smen.patch
+kvm-arm64-refactor-exit-handlers.patch
+kvm-arm64-mark-some-header-functions-as-inline.patch
+kvm-arm64-eagerly-switch-zcr_el-1-2.patch
+mm-fix-kernel-bug-when-userfaultfd_move-encounters-swapcache.patch
+userfaultfd-fix-pte-unmapping-stack-allocated-pte-copies.patch
+mm-slab-kvfree_rcu-switch-to-wq_mem_reclaim-wq.patch
+virt-sev-guest-move-snp-guest-request-data-pages-handling-under-snp_cmd_mutex.patch
diff --git a/queue-6.13/userfaultfd-fix-pte-unmapping-stack-allocated-pte-copies.patch b/queue-6.13/userfaultfd-fix-pte-unmapping-stack-allocated-pte-copies.patch
new file mode 100644 (file)
index 0000000..5616de0
--- /dev/null
@@ -0,0 +1,95 @@
+From 927e926d72d9155fde3264459fe9bfd7b5e40d28 Mon Sep 17 00:00:00 2001
+From: Suren Baghdasaryan <surenb@google.com>
+Date: Wed, 26 Feb 2025 10:55:09 -0800
+Subject: userfaultfd: fix PTE unmapping stack-allocated PTE copies
+
+From: Suren Baghdasaryan <surenb@google.com>
+
+commit 927e926d72d9155fde3264459fe9bfd7b5e40d28 upstream.
+
+Current implementation of move_pages_pte() copies source and destination
+PTEs in order to detect concurrent changes to PTEs involved in the move.
+However these copies are also used to unmap the PTEs, which will fail if
+CONFIG_HIGHPTE is enabled because the copies are allocated on the stack.
+Fix this by using the actual PTEs which were kmap()ed.
+
+Link: https://lkml.kernel.org/r/20250226185510.2732648-3-surenb@google.com
+Fixes: adef440691ba ("userfaultfd: UFFDIO_MOVE uABI")
+Signed-off-by: Suren Baghdasaryan <surenb@google.com>
+Reported-by: Peter Xu <peterx@redhat.com>
+Reviewed-by: Peter Xu <peterx@redhat.com>
+Cc: Andrea Arcangeli <aarcange@redhat.com>
+Cc: Barry Song <21cnbao@gmail.com>
+Cc: Barry Song <v-songbaohua@oppo.com>
+Cc: David Hildenbrand <david@redhat.com>
+Cc: Hugh Dickins <hughd@google.com>
+Cc: Jann Horn <jannh@google.com>
+Cc: Kalesh Singh <kaleshsingh@google.com>
+Cc: Liam R. Howlett <Liam.Howlett@Oracle.com>
+Cc: Lokesh Gidra <lokeshgidra@google.com>
+Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
+Cc: Matthew Wilcow (Oracle) <willy@infradead.org>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/userfaultfd.c |   20 ++++++++++----------
+ 1 file changed, 10 insertions(+), 10 deletions(-)
+
+--- a/mm/userfaultfd.c
++++ b/mm/userfaultfd.c
+@@ -1274,8 +1274,8 @@ retry:
+                       spin_unlock(src_ptl);
+                       if (!locked) {
+-                              pte_unmap(&orig_src_pte);
+-                              pte_unmap(&orig_dst_pte);
++                              pte_unmap(src_pte);
++                              pte_unmap(dst_pte);
+                               src_pte = dst_pte = NULL;
+                               /* now we can block and wait */
+                               folio_lock(src_folio);
+@@ -1291,8 +1291,8 @@ retry:
+               /* at this point we have src_folio locked */
+               if (folio_test_large(src_folio)) {
+                       /* split_folio() can block */
+-                      pte_unmap(&orig_src_pte);
+-                      pte_unmap(&orig_dst_pte);
++                      pte_unmap(src_pte);
++                      pte_unmap(dst_pte);
+                       src_pte = dst_pte = NULL;
+                       err = split_folio(src_folio);
+                       if (err)
+@@ -1317,8 +1317,8 @@ retry:
+                               goto out;
+                       }
+                       if (!anon_vma_trylock_write(src_anon_vma)) {
+-                              pte_unmap(&orig_src_pte);
+-                              pte_unmap(&orig_dst_pte);
++                              pte_unmap(src_pte);
++                              pte_unmap(dst_pte);
+                               src_pte = dst_pte = NULL;
+                               /* now we can block and wait */
+                               anon_vma_lock_write(src_anon_vma);
+@@ -1336,8 +1336,8 @@ retry:
+               entry = pte_to_swp_entry(orig_src_pte);
+               if (non_swap_entry(entry)) {
+                       if (is_migration_entry(entry)) {
+-                              pte_unmap(&orig_src_pte);
+-                              pte_unmap(&orig_dst_pte);
++                              pte_unmap(src_pte);
++                              pte_unmap(dst_pte);
+                               src_pte = dst_pte = NULL;
+                               migration_entry_wait(mm, src_pmd, src_addr);
+                               err = -EAGAIN;
+@@ -1380,8 +1380,8 @@ retry:
+                       src_folio = folio;
+                       src_folio_pte = orig_src_pte;
+                       if (!folio_trylock(src_folio)) {
+-                              pte_unmap(&orig_src_pte);
+-                              pte_unmap(&orig_dst_pte);
++                              pte_unmap(src_pte);
++                              pte_unmap(dst_pte);
+                               src_pte = dst_pte = NULL;
+                               put_swap_device(si);
+                               si = NULL;
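
As a reminder of why the stack copies could not be handed to pte_unmap(), the
following schematic kernel-style fragment is editorial illustration only (it
is not buildable on its own, and the helper name inspect_one_pte is invented):
with CONFIG_HIGHPTE the PTE page may be temporarily kmapped, so only the
pointer returned by pte_offset_map_lock() identifies that mapping.

static int inspect_one_pte(struct mm_struct *mm, pmd_t *pmd, unsigned long addr)
{
	spinlock_t *ptl;
	pte_t *pte, orig;

	pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
	if (!pte)
		return -EAGAIN;

	orig = ptep_get(pte);		/* value copy; fine to keep on the stack */
	/* ... decide what to do based on orig ... */

	pte_unmap_unlock(pte, ptl);	/* unmap the mapped pointer, not &orig */
	return 0;
}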
diff --git a/queue-6.13/virt-sev-guest-move-snp-guest-request-data-pages-handling-under-snp_cmd_mutex.patch b/queue-6.13/virt-sev-guest-move-snp-guest-request-data-pages-handling-under-snp_cmd_mutex.patch
new file mode 100644 (file)
index 0000000..59b6202
--- /dev/null
@@ -0,0 +1,242 @@
+From 3e385c0d6ce88ac9916dcf84267bd5855d830748 Mon Sep 17 00:00:00 2001
+From: Alexey Kardashevskiy <aik@amd.com>
+Date: Fri, 7 Mar 2025 12:37:00 +1100
+Subject: virt: sev-guest: Move SNP Guest Request data pages handling under snp_cmd_mutex
+
+From: Alexey Kardashevskiy <aik@amd.com>
+
+commit 3e385c0d6ce88ac9916dcf84267bd5855d830748 upstream.
+
+Compared to the SNP Guest Request, the "Extended" version adds data pages for
+receiving certificates. If not enough pages are provided, the HV can report to
+the VM how many are needed so the VM can reallocate and repeat.
+
+Commit
+
+  ae596615d93d ("virt: sev-guest: Reduce the scope of SNP command mutex")
+
+moved handling of the allocated/desired number of pages out of the scope of
+said mutex and created the possibility of a race (multiple instances trying
+to trigger an Extended request in a VM), as there is just one instance of
+snp_msg_desc per /dev/sev-guest and no locking other than snp_cmd_mutex.
+
+Fix the issue by moving the data blob/size and the GHCB input struct
+(snp_req_data) into snp_guest_req which is allocated on stack now and accessed
+by the GHCB caller under that mutex.
+
+Stop allocating SEV_FW_BLOB_MAX_SIZE in snp_msg_alloc() as only one of four
+callers needs it. Free the received blob in get_ext_report() right after it is
+copied to userspace. Possible future users of snp_send_guest_request() are
+likely to have different ideas about the buffer size anyway.
+
+Fixes: ae596615d93d ("virt: sev-guest: Reduce the scope of SNP command mutex")
+Signed-off-by: Alexey Kardashevskiy <aik@amd.com>
+Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
+Reviewed-by: Nikunj A Dadhania <nikunj@amd.com>
+Cc: stable@vger.kernel.org
+Link: https://lore.kernel.org/r/20250307013700.437505-3-aik@amd.com
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+
+---
+ arch/x86/include/asm/sev.h              |    6 +--
+ drivers/virt/coco/sev-guest/sev-guest.c |   63 +++++++++++++++++++-------------
+ 2 files changed, 42 insertions(+), 27 deletions(-)
+
+--- a/arch/x86/include/asm/sev.h
++++ b/arch/x86/include/asm/sev.h
+@@ -185,6 +185,9 @@ struct snp_guest_req {
+       unsigned int vmpck_id;
+       u8 msg_version;
+       u8 msg_type;
++
++      struct snp_req_data input;
++      void *certs_data;
+ };
+ /*
+@@ -245,9 +248,6 @@ struct snp_msg_desc {
+       struct snp_guest_msg secret_request, secret_response;
+       struct snp_secrets_page *secrets;
+-      struct snp_req_data input;
+-
+-      void *certs_data;
+       struct aesgcm_ctx *ctx;
+--- a/drivers/virt/coco/sev-guest/sev-guest.c
++++ b/drivers/virt/coco/sev-guest/sev-guest.c
+@@ -249,7 +249,7 @@ retry_request:
+        * sequence number must be incremented or the VMPCK must be deleted to
+        * prevent reuse of the IV.
+        */
+-      rc = snp_issue_guest_request(req, &mdesc->input, rio);
++      rc = snp_issue_guest_request(req, &req->input, rio);
+       switch (rc) {
+       case -ENOSPC:
+               /*
+@@ -259,7 +259,7 @@ retry_request:
+                * order to increment the sequence number and thus avoid
+                * IV reuse.
+                */
+-              override_npages = mdesc->input.data_npages;
++              override_npages = req->input.data_npages;
+               req->exit_code  = SVM_VMGEXIT_GUEST_REQUEST;
+               /*
+@@ -315,7 +315,7 @@ retry_request:
+       }
+       if (override_npages)
+-              mdesc->input.data_npages = override_npages;
++              req->input.data_npages = override_npages;
+       return rc;
+ }
+@@ -354,6 +354,11 @@ static int snp_send_guest_request(struct
+       memcpy(mdesc->request, &mdesc->secret_request,
+              sizeof(mdesc->secret_request));
++      /* initial the input address for guest request */
++      req->input.req_gpa = __pa(mdesc->request);
++      req->input.resp_gpa = __pa(mdesc->response);
++      req->input.data_gpa = req->certs_data ? __pa(req->certs_data) : 0;
++
+       rc = __handle_guest_request(mdesc, req, rio);
+       if (rc) {
+               if (rc == -EIO &&
+@@ -495,6 +500,7 @@ static int get_ext_report(struct snp_gue
+       struct snp_guest_req req = {};
+       int ret, npages = 0, resp_len;
+       sockptr_t certs_address;
++      struct page *page;
+       if (sockptr_is_null(io->req_data) || sockptr_is_null(io->resp_data))
+               return -EINVAL;
+@@ -528,8 +534,20 @@ static int get_ext_report(struct snp_gue
+        * the host. If host does not supply any certs in it, then copy
+        * zeros to indicate that certificate data was not provided.
+        */
+-      memset(mdesc->certs_data, 0, report_req->certs_len);
+       npages = report_req->certs_len >> PAGE_SHIFT;
++      page = alloc_pages(GFP_KERNEL_ACCOUNT | __GFP_ZERO,
++                         get_order(report_req->certs_len));
++      if (!page)
++              return -ENOMEM;
++
++      req.certs_data = page_address(page);
++      ret = set_memory_decrypted((unsigned long)req.certs_data, npages);
++      if (ret) {
++              pr_err("failed to mark page shared, ret=%d\n", ret);
++              __free_pages(page, get_order(report_req->certs_len));
++              return -EFAULT;
++      }
++
+ cmd:
+       /*
+        * The intermediate response buffer is used while decrypting the
+@@ -538,10 +556,12 @@ cmd:
+        */
+       resp_len = sizeof(report_resp->data) + mdesc->ctx->authsize;
+       report_resp = kzalloc(resp_len, GFP_KERNEL_ACCOUNT);
+-      if (!report_resp)
+-              return -ENOMEM;
++      if (!report_resp) {
++              ret = -ENOMEM;
++              goto e_free_data;
++      }
+-      mdesc->input.data_npages = npages;
++      req.input.data_npages = npages;
+       req.msg_version = arg->msg_version;
+       req.msg_type = SNP_MSG_REPORT_REQ;
+@@ -556,7 +576,7 @@ cmd:
+       /* If certs length is invalid then copy the returned length */
+       if (arg->vmm_error == SNP_GUEST_VMM_ERR_INVALID_LEN) {
+-              report_req->certs_len = mdesc->input.data_npages << PAGE_SHIFT;
++              report_req->certs_len = req.input.data_npages << PAGE_SHIFT;
+               if (copy_to_sockptr(io->req_data, report_req, sizeof(*report_req)))
+                       ret = -EFAULT;
+@@ -565,7 +585,7 @@ cmd:
+       if (ret)
+               goto e_free;
+-      if (npages && copy_to_sockptr(certs_address, mdesc->certs_data, report_req->certs_len)) {
++      if (npages && copy_to_sockptr(certs_address, req.certs_data, report_req->certs_len)) {
+               ret = -EFAULT;
+               goto e_free;
+       }
+@@ -575,6 +595,13 @@ cmd:
+ e_free:
+       kfree(report_resp);
++e_free_data:
++      if (npages) {
++              if (set_memory_encrypted((unsigned long)req.certs_data, npages))
++                      WARN_ONCE(ret, "failed to restore encryption mask (leak it)\n");
++              else
++                      __free_pages(page, get_order(report_req->certs_len));
++      }
+       return ret;
+ }
+@@ -1048,35 +1075,26 @@ static int __init sev_guest_probe(struct
+       if (!mdesc->response)
+               goto e_free_request;
+-      mdesc->certs_data = alloc_shared_pages(dev, SEV_FW_BLOB_MAX_SIZE);
+-      if (!mdesc->certs_data)
+-              goto e_free_response;
+-
+       ret = -EIO;
+       mdesc->ctx = snp_init_crypto(mdesc->vmpck, VMPCK_KEY_LEN);
+       if (!mdesc->ctx)
+-              goto e_free_cert_data;
++              goto e_free_response;
+       misc = &snp_dev->misc;
+       misc->minor = MISC_DYNAMIC_MINOR;
+       misc->name = DEVICE_NAME;
+       misc->fops = &snp_guest_fops;
+-      /* Initialize the input addresses for guest request */
+-      mdesc->input.req_gpa = __pa(mdesc->request);
+-      mdesc->input.resp_gpa = __pa(mdesc->response);
+-      mdesc->input.data_gpa = __pa(mdesc->certs_data);
+-
+       /* Set the privlevel_floor attribute based on the vmpck_id */
+       sev_tsm_ops.privlevel_floor = vmpck_id;
+       ret = tsm_register(&sev_tsm_ops, snp_dev);
+       if (ret)
+-              goto e_free_cert_data;
++              goto e_free_response;
+       ret = devm_add_action_or_reset(&pdev->dev, unregister_sev_tsm, NULL);
+       if (ret)
+-              goto e_free_cert_data;
++              goto e_free_response;
+       ret =  misc_register(misc);
+       if (ret)
+@@ -1088,8 +1106,6 @@ static int __init sev_guest_probe(struct
+ e_free_ctx:
+       kfree(mdesc->ctx);
+-e_free_cert_data:
+-      free_shared_pages(mdesc->certs_data, SEV_FW_BLOB_MAX_SIZE);
+ e_free_response:
+       free_shared_pages(mdesc->response, sizeof(struct snp_guest_msg));
+ e_free_request:
+@@ -1104,7 +1120,6 @@ static void __exit sev_guest_remove(stru
+       struct snp_guest_dev *snp_dev = platform_get_drvdata(pdev);
+       struct snp_msg_desc *mdesc = snp_dev->msg_desc;
+-      free_shared_pages(mdesc->certs_data, SEV_FW_BLOB_MAX_SIZE);
+       free_shared_pages(mdesc->response, sizeof(struct snp_guest_msg));
+       free_shared_pages(mdesc->request, sizeof(struct snp_guest_msg));
+       kfree(mdesc->ctx);