--- /dev/null
+From c32ac11da3f83bb42b986702a9b92f0a14ed4182 Mon Sep 17 00:00:00 2001
+From: Ard Biesheuvel <ardb@kernel.org>
+Date: Mon, 26 Jul 2021 16:31:44 +0200
+Subject: efi/libstub: arm64: Double check image alignment at entry
+
+From: Ard Biesheuvel <ardb@kernel.org>
+
+commit c32ac11da3f83bb42b986702a9b92f0a14ed4182 upstream.
+
+On arm64, the stub only moves the kernel image around in memory if
+needed, which is typically only for KASLR, given that relocatable
+kernels (the default) can run from any 64k-aligned address,
+which is also the minimum alignment communicated to EFI via the PE/COFF
+header.
+
+Unfortunately, some loaders appear to ignore this header, and load the
+kernel at some arbitrary offset in memory. We can deal with this, but
+let's check for this condition anyway, so non-compliant code can be
+spotted and fixed.
+
+Cc: <stable@vger.kernel.org> # v5.10+
+Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
+Tested-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/firmware/efi/libstub/arm64-stub.c | 4 ++++
+ 1 file changed, 4 insertions(+)
+
+--- a/drivers/firmware/efi/libstub/arm64-stub.c
++++ b/drivers/firmware/efi/libstub/arm64-stub.c
+@@ -119,6 +119,10 @@ efi_status_t handle_kernel_image(unsigne
+ if (image->image_base != _text)
+ efi_err("FIRMWARE BUG: efi_loaded_image_t::image_base has bogus value\n");
+
++ if (!IS_ALIGNED((u64)_text, EFI_KIMG_ALIGN))
++ efi_err("FIRMWARE BUG: kernel image not aligned on %ldk boundary\n",
++ EFI_KIMG_ALIGN >> 10);
++
+ kernel_size = _edata - _text;
+ kernel_memsize = kernel_size + (_end - _edata);
+ *reserve_size = kernel_memsize;
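
The check added above is a plain power-of-two alignment test. A minimal
user-space sketch of the same arithmetic, with EFI_KIMG_ALIGN and
IS_ALIGNED() re-created locally (64k assumed, as in the changelog) rather
than taken from the kernel headers:

  #include <stdint.h>
  #include <stdio.h>

  /* Local stand-ins for the kernel definitions; 64k alignment assumed. */
  #define EFI_KIMG_ALIGN   (64 * 1024UL)
  #define IS_ALIGNED(x, a) (((x) & ((a) - 1)) == 0)

  int main(void)
  {
          uint64_t good = 0x40200000; /* 64k-aligned load address */
          uint64_t bad  = 0x40201200; /* offset a non-compliant loader might pick */

          printf("%#llx aligned: %d\n", (unsigned long long)good,
                 (int)IS_ALIGNED(good, EFI_KIMG_ALIGN));
          printf("%#llx aligned: %d  <- would trigger the FIRMWARE BUG message\n",
                 (unsigned long long)bad, (int)IS_ALIGNED(bad, EFI_KIMG_ALIGN));
          return 0;
  }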
--- /dev/null
+From 18712c13709d2de9516c5d3414f707c4f0a9c190 Mon Sep 17 00:00:00 2001
+From: Sean Christopherson <seanjc@google.com>
+Date: Wed, 11 Aug 2021 21:56:15 -0700
+Subject: KVM: nVMX: Use vmx_need_pf_intercept() when deciding if L0 wants a #PF
+
+From: Sean Christopherson <seanjc@google.com>
+
+commit 18712c13709d2de9516c5d3414f707c4f0a9c190 upstream.
+
+Use vmx_need_pf_intercept() when determining if L0 wants to handle a #PF
+in L2 or if the VM-Exit should be forwarded to L1. The current logic fails
+to account for the case where #PF is intercepted to handle
+guest.MAXPHYADDR < host.MAXPHYADDR and ends up reflecting all #PFs into
+L1. At best, L1 will complain and inject the #PF back into L2. At
+worst, L1 will eat the unexpected fault and cause L2 to hang on infinite
+page faults.
+
+Note, while the bug was technically introduced by the commit that added
+support for the MAXPHYADDR madness, the shame is all on commit
+a0c134347baf ("KVM: VMX: introduce vmx_need_pf_intercept").
+
+Fixes: 1dbf5d68af6f ("KVM: VMX: Add guest physical address check in EPT violation and misconfig")
+Cc: stable@vger.kernel.org
+Cc: Peter Shier <pshier@google.com>
+Cc: Oliver Upton <oupton@google.com>
+Cc: Jim Mattson <jmattson@google.com>
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Message-Id: <20210812045615.3167686-1-seanjc@google.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/vmx/nested.c | 3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+--- a/arch/x86/kvm/vmx/nested.c
++++ b/arch/x86/kvm/vmx/nested.c
+@@ -5798,7 +5798,8 @@ static bool nested_vmx_l0_wants_exit(str
+ if (is_nmi(intr_info))
+ return true;
+ else if (is_page_fault(intr_info))
+- return vcpu->arch.apf.host_apf_flags || !enable_ept;
++ return vcpu->arch.apf.host_apf_flags ||
++ vmx_need_pf_intercept(vcpu);
+ else if (is_debug(intr_info) &&
+ vcpu->guest_debug &
+ (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
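
To make the change in the forwarding decision concrete, here is a hedged,
self-contained sketch of the predicate. The struct and both helpers are
hypothetical stand-ins for the KVM internals, reduced to the conditions
named in the changelog:

  #include <stdbool.h>
  #include <stdio.h>

  /* Hypothetical, flattened view of the vCPU state relevant here. */
  struct pf_state {
          bool host_apf_flags;         /* host async #PF pending */
          bool enable_ept;             /* EPT in use */
          bool guest_maxphyaddr_small; /* guest.MAXPHYADDR < host.MAXPHYADDR */
  };

  /* Mirrors the idea of vmx_need_pf_intercept(): #PF must be intercepted
   * either without EPT or to emulate the smaller guest MAXPHYADDR. */
  static bool need_pf_intercept(const struct pf_state *s)
  {
          return !s->enable_ept || s->guest_maxphyaddr_small;
  }

  /* Old check: only "!enable_ept", so the MAXPHYADDR case was reflected
   * into L1 even though L0 wanted it. */
  static bool l0_wants_pf_old(const struct pf_state *s)
  {
          return s->host_apf_flags || !s->enable_ept;
  }

  static bool l0_wants_pf_new(const struct pf_state *s)
  {
          return s->host_apf_flags || need_pf_intercept(s);
  }

  int main(void)
  {
          /* EPT enabled, guest MAXPHYADDR smaller than the host's. */
          struct pf_state s = { false, true, true };

          printf("old: %d, new: %d\n", l0_wants_pf_old(&s), l0_wants_pf_new(&s));
          return 0;
  }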
--- /dev/null
+From 7b9cae027ba3aaac295ae23a62f47876ed97da73 Mon Sep 17 00:00:00 2001
+From: Sean Christopherson <seanjc@google.com>
+Date: Tue, 10 Aug 2021 10:19:49 -0700
+Subject: KVM: VMX: Use current VMCS to query WAITPKG support for MSR emulation
+
+From: Sean Christopherson <seanjc@google.com>
+
+commit 7b9cae027ba3aaac295ae23a62f47876ed97da73 upstream.
+
+Use the secondary_exec_controls_get() accessor in vmx_has_waitpkg() to
+effectively get the controls for the current VMCS, as opposed to using
+vmx->secondary_exec_controls, which is the cached value of KVM's desired
+controls for vmcs01 and truly not reflective of any particular VMCS.
+
+While the waitpkg control is not dynamic, i.e. vmcs01 will always hold
+the same waitpkg configuration as vmx->secondary_exec_controls, the same
+does not hold true for vmcs02 if the L1 VMM hides the feature from L2.
+If L1 hides the feature _and_ does not intercept MSR_IA32_UMWAIT_CONTROL,
+L2 could incorrectly read/write L1's virtual MSR instead of taking a #GP.
+
+Fixes: 6e3ba4abcea5 ("KVM: vmx: Emulate MSR IA32_UMWAIT_CONTROL")
+Cc: stable@vger.kernel.org
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Message-Id: <20210810171952.2758100-2-seanjc@google.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/vmx/vmx.h | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/arch/x86/kvm/vmx/vmx.h
++++ b/arch/x86/kvm/vmx/vmx.h
+@@ -538,7 +538,7 @@ static inline void decache_tsc_multiplie
+
+ static inline bool vmx_has_waitpkg(struct vcpu_vmx *vmx)
+ {
+- return vmx->secondary_exec_control &
++ return secondary_exec_controls_get(vmx) &
+ SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE;
+ }
+
--- /dev/null
+From 524a1e4e381fc5e7781008d5bd420fd1357c0113 Mon Sep 17 00:00:00 2001
+From: Sean Christopherson <seanjc@google.com>
+Date: Thu, 12 Aug 2021 11:14:13 -0700
+Subject: KVM: x86/mmu: Don't leak non-leaf SPTEs when zapping all SPTEs
+
+From: Sean Christopherson <seanjc@google.com>
+
+commit 524a1e4e381fc5e7781008d5bd420fd1357c0113 upstream.
+
+Pass "all ones" as the end GFN to signal "zap all" for the TDP MMU and
+really zap all SPTEs in this case. As is, zap_gfn_range() skips non-leaf
+SPTEs whose range exceeds the range to be zapped. If shadow_phys_bits is
+not aligned to the range size of top-level SPTEs, e.g. 512gb with 4-level
+paging, the "zap all" flows will skip top-level SPTEs whose range extends
+beyond shadow_phys_bits and leak their SPs when the VM is destroyed.
+
+Use the current upper bound (based on host.MAXPHYADDR) to detect that the
+caller wants to zap all SPTEs, e.g. instead of using the max theoretical
+gfn, 1 << (52 - 12). The more precise upper bound allows the TDP iterator
+to terminate its walk earlier when running on hosts with MAXPHYADDR < 52.
+
+Add a WARN on kvm->arch.tdp_mmu_pages when the TDP MMU is destroyed to
+help future debuggers should KVM decide to leak SPTEs again.
+
+The bug is most easily reproduced by running (and unloading!) KVM in a
+VM whose host.MAXPHYADDR < 39, as the SPTE for gfn=0 will be skipped.
+
+ =============================================================================
+ BUG kvm_mmu_page_header (Not tainted): Objects remaining in kvm_mmu_page_header on __kmem_cache_shutdown()
+ -----------------------------------------------------------------------------
+ Slab 0x000000004d8f7af1 objects=22 used=2 fp=0x00000000624d29ac flags=0x4000000000000200(slab|zone=1)
+ CPU: 0 PID: 1582 Comm: rmmod Not tainted 5.14.0-rc2+ #420
+ Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015
+ Call Trace:
+ dump_stack_lvl+0x45/0x59
+ slab_err+0x95/0xc9
+ __kmem_cache_shutdown.cold+0x3c/0x158
+ kmem_cache_destroy+0x3d/0xf0
+ kvm_mmu_module_exit+0xa/0x30 [kvm]
+ kvm_arch_exit+0x5d/0x90 [kvm]
+ kvm_exit+0x78/0x90 [kvm]
+ vmx_exit+0x1a/0x50 [kvm_intel]
+ __x64_sys_delete_module+0x13f/0x220
+ do_syscall_64+0x3b/0xc0
+ entry_SYSCALL_64_after_hwframe+0x44/0xae
+
+Fixes: faaf05b00aec ("kvm: x86/mmu: Support zapping SPTEs in the TDP MMU")
+Cc: stable@vger.kernel.org
+Cc: Ben Gardon <bgardon@google.com>
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Message-Id: <20210812181414.3376143-2-seanjc@google.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/mmu/tdp_mmu.c | 26 ++++++++++++++++----------
+ 1 file changed, 16 insertions(+), 10 deletions(-)
+
+--- a/arch/x86/kvm/mmu/tdp_mmu.c
++++ b/arch/x86/kvm/mmu/tdp_mmu.c
+@@ -41,6 +41,7 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *
+ if (!kvm->arch.tdp_mmu_enabled)
+ return;
+
++ WARN_ON(!list_empty(&kvm->arch.tdp_mmu_pages));
+ WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
+
+ /*
+@@ -79,8 +80,6 @@ static void tdp_mmu_free_sp_rcu_callback
+ void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
+ bool shared)
+ {
+- gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
+-
+ kvm_lockdep_assert_mmu_lock_held(kvm, shared);
+
+ if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
+@@ -92,7 +91,7 @@ void kvm_tdp_mmu_put_root(struct kvm *kv
+ list_del_rcu(&root->link);
+ spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
+
+- zap_gfn_range(kvm, root, 0, max_gfn, false, false, shared);
++ zap_gfn_range(kvm, root, 0, -1ull, false, false, shared);
+
+ call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
+ }
+@@ -722,8 +721,17 @@ static bool zap_gfn_range(struct kvm *kv
+ gfn_t start, gfn_t end, bool can_yield, bool flush,
+ bool shared)
+ {
++ gfn_t max_gfn_host = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
++ bool zap_all = (start == 0 && end >= max_gfn_host);
+ struct tdp_iter iter;
+
++ /*
++ * Bound the walk at host.MAXPHYADDR, guest accesses beyond that will
++ * hit a #PF(RSVD) and never get to an EPT Violation/Misconfig / #NPF,
++ * and so KVM will never install a SPTE for such addresses.
++ */
++ end = min(end, max_gfn_host);
++
+ kvm_lockdep_assert_mmu_lock_held(kvm, shared);
+
+ rcu_read_lock();
+@@ -742,9 +750,10 @@ retry:
+ /*
+ * If this is a non-last-level SPTE that covers a larger range
+ * than should be zapped, continue, and zap the mappings at a
+- * lower level.
++ * lower level, except when zapping all SPTEs.
+ */
+- if ((iter.gfn < start ||
++ if (!zap_all &&
++ (iter.gfn < start ||
+ iter.gfn + KVM_PAGES_PER_HPAGE(iter.level) > end) &&
+ !is_last_spte(iter.old_spte, iter.level))
+ continue;
+@@ -792,12 +801,11 @@ bool __kvm_tdp_mmu_zap_gfn_range(struct
+
+ void kvm_tdp_mmu_zap_all(struct kvm *kvm)
+ {
+- gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
+ bool flush = false;
+ int i;
+
+ for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
+- flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, 0, max_gfn,
++ flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, 0, -1ull,
+ flush, false);
+
+ if (flush)
+@@ -836,7 +844,6 @@ static struct kvm_mmu_page *next_invalid
+ */
+ void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
+ {
+- gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
+ struct kvm_mmu_page *next_root;
+ struct kvm_mmu_page *root;
+ bool flush = false;
+@@ -852,8 +859,7 @@ void kvm_tdp_mmu_zap_invalidated_roots(s
+
+ rcu_read_unlock();
+
+- flush = zap_gfn_range(kvm, root, 0, max_gfn, true, flush,
+- true);
++ flush = zap_gfn_range(kvm, root, 0, -1ull, true, flush, true);
+
+ /*
+ * Put the reference acquired in
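
The gfn arithmetic behind the fix is easy to show in isolation. A small
user-space sketch, with PAGE_SHIFT and shadow_phys_bits as assumed example
values rather than values read from a live host:

  #include <stdint.h>
  #include <stdio.h>

  #define PAGE_SHIFT 12
  typedef uint64_t gfn_t;

  /* Assumed example value; on real hardware this comes from CPUID. */
  static const unsigned int shadow_phys_bits = 46;

  static void show_zap_range(gfn_t start, gfn_t end)
  {
          gfn_t max_gfn_host = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
          int zap_all = (start == 0 && end >= max_gfn_host);

          /* Clamp the walk at host.MAXPHYADDR; no SPTE can exist above it. */
          if (end > max_gfn_host)
                  end = max_gfn_host;

          printf("start=%#llx end=%#llx zap_all=%d\n",
                 (unsigned long long)start, (unsigned long long)end, zap_all);
  }

  int main(void)
  {
          show_zap_range(0, (gfn_t)-1ull);      /* new "zap all" callers */
          show_zap_range(0, 1ULL << (52 - 12)); /* old max theoretical gfn */
          show_zap_range(0x1000, 0x2000);       /* ordinary partial zap */
          return 0;
  }

With -1ull as the end, zap_all is detected and the non-leaf skip in the
iterator loop is bypassed, which is what keeps the out-of-range top-level
SPTEs from being leaked.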
--- /dev/null
+From ce25681d59ffc4303321e555a2d71b1946af07da Mon Sep 17 00:00:00 2001
+From: Sean Christopherson <seanjc@google.com>
+Date: Thu, 12 Aug 2021 11:18:15 -0700
+Subject: KVM: x86/mmu: Protect marking SPs unsync when using TDP MMU with spinlock
+
+From: Sean Christopherson <seanjc@google.com>
+
+commit ce25681d59ffc4303321e555a2d71b1946af07da upstream.
+
+Add yet another spinlock for the TDP MMU and take it when marking indirect
+shadow pages unsync. When using the TDP MMU and L1 is running L2(s) with
+nested TDP, KVM may encounter shadow pages for the TDP entries managed by
+L1 (controlling L2) when handling a TDP MMU page fault. The unsync logic
+is not thread safe, e.g. the kvm_mmu_page fields are not atomic, and
+misbehaves when a shadow page is marked unsync via a TDP MMU page fault,
+which runs with mmu_lock held for read, not write.
+
+Lack of a critical section manifests most visibly as an underflow of
+unsync_children in clear_unsync_child_bit() due to unsync_children being
+corrupted when multiple CPUs write it without a critical section and
+without atomic operations. But underflow is the best case scenario. The
+worst case scenario is that unsync_children prematurely hits '0' and
+leads to guest memory corruption due to KVM neglecting to properly sync
+shadow pages.
+
+Use an entirely new spinlock even though piggybacking tdp_mmu_pages_lock
+would functionally be ok. Usurping the lock could degrade performance when
+building upper level page tables on different vCPUs, especially since the
+unsync flow could hold the lock for a comparatively long time depending on
+the number of indirect shadow pages and the depth of the paging tree.
+
+For simplicity, take the lock for all MMUs, even though KVM could fairly
+easily know that mmu_lock is held for write. If mmu_lock is held for
+write, there cannot be contention for the inner spinlock, and marking
+shadow pages unsync across multiple vCPUs will be slow enough that
+bouncing the kvm_arch cacheline should be in the noise.
+
+Note, even though L2 could theoretically be given access to its own EPT
+entries, a nested MMU must hold mmu_lock for write and thus cannot race
+against a TDP MMU page fault. I.e. the additional spinlock only _needs_ to
+be taken by the TDP MMU, as opposed to being taken by any MMU for a VM
+that is running with the TDP MMU enabled. Holding mmu_lock for read also
+prevents the indirect shadow page from being freed. But as above, keep
+it simple and always take the lock.
+
+Alternative #1, the TDP MMU could simply pass "false" for can_unsync and
+effectively disable unsync behavior for nested TDP. Write protecting leaf
+shadow pages is unlikely to noticeably impact traditional L1 VMMs, as such
+VMMs typically don't modify TDP entries, but the same may not hold true for
+non-standard use cases and/or VMMs that are migrating physical pages (from
+L1's perspective).
+
+Alternative #2, the unsync logic could be made thread safe. In theory,
+simply converting all relevant kvm_mmu_page fields to atomics and using
+atomic bitops for the bitmap would suffice. However, (a) an in-depth audit
+would be required, (b) the code churn would be substantial, and (c) legacy
+shadow paging would incur additional atomic operations in performance
+sensitive paths for no benefit (to legacy shadow paging).
+
+Fixes: a2855afc7ee8 ("KVM: x86/mmu: Allow parallel page faults for the TDP MMU")
+Cc: stable@vger.kernel.org
+Cc: Ben Gardon <bgardon@google.com>
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Message-Id: <20210812181815.3378104-1-seanjc@google.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ Documentation/virt/kvm/locking.rst | 8 ++++----
+ arch/x86/include/asm/kvm_host.h | 7 +++++++
+ arch/x86/kvm/mmu/mmu.c | 28 ++++++++++++++++++++++++++++
+ 3 files changed, 39 insertions(+), 4 deletions(-)
+
+--- a/Documentation/virt/kvm/locking.rst
++++ b/Documentation/virt/kvm/locking.rst
+@@ -20,10 +20,10 @@ On x86:
+
+ - vcpu->mutex is taken outside kvm->arch.hyperv.hv_lock
+
+-- kvm->arch.mmu_lock is an rwlock. kvm->arch.tdp_mmu_pages_lock is
+- taken inside kvm->arch.mmu_lock, and cannot be taken without already
+- holding kvm->arch.mmu_lock (typically with ``read_lock``, otherwise
+- there's no need to take kvm->arch.tdp_mmu_pages_lock at all).
++- kvm->arch.mmu_lock is an rwlock. kvm->arch.tdp_mmu_pages_lock and
++ kvm->arch.mmu_unsync_pages_lock are taken inside kvm->arch.mmu_lock, and
++ cannot be taken without already holding kvm->arch.mmu_lock (typically with
++ ``read_lock`` for the TDP MMU, thus the need for additional spinlocks).
+
+ Everything else is a leaf: no other lock is taken inside the critical
+ sections.
+--- a/arch/x86/include/asm/kvm_host.h
++++ b/arch/x86/include/asm/kvm_host.h
+@@ -987,6 +987,13 @@ struct kvm_arch {
+ struct list_head lpage_disallowed_mmu_pages;
+ struct kvm_page_track_notifier_node mmu_sp_tracker;
+ struct kvm_page_track_notifier_head track_notifier_head;
++ /*
++ * Protects marking pages unsync during page faults, as TDP MMU page
++ * faults only take mmu_lock for read. For simplicity, the unsync
++ * pages lock is always taken when marking pages unsync regardless of
++ * whether mmu_lock is held for read or write.
++ */
++ spinlock_t mmu_unsync_pages_lock;
+
+ struct list_head assigned_dev_head;
+ struct iommu_domain *iommu_domain;
+--- a/arch/x86/kvm/mmu/mmu.c
++++ b/arch/x86/kvm/mmu/mmu.c
+@@ -2454,6 +2454,7 @@ bool mmu_need_write_protect(struct kvm_v
+ bool can_unsync)
+ {
+ struct kvm_mmu_page *sp;
++ bool locked = false;
+
+ if (kvm_page_track_is_active(vcpu, gfn, KVM_PAGE_TRACK_WRITE))
+ return true;
+@@ -2465,9 +2466,34 @@ bool mmu_need_write_protect(struct kvm_v
+ if (sp->unsync)
+ continue;
+
++ /*
++ * TDP MMU page faults require an additional spinlock as they
++ * run with mmu_lock held for read, not write, and the unsync
++		 * logic is not thread safe. Take the spinlock regardless of
++ * the MMU type to avoid extra conditionals/parameters, there's
++ * no meaningful penalty if mmu_lock is held for write.
++ */
++ if (!locked) {
++ locked = true;
++ spin_lock(&vcpu->kvm->arch.mmu_unsync_pages_lock);
++
++ /*
++ * Recheck after taking the spinlock, a different vCPU
++ * may have since marked the page unsync. A false
++ * positive on the unprotected check above is not
++ * possible as clearing sp->unsync _must_ hold mmu_lock
++ * for write, i.e. unsync cannot transition from 0->1
++ * while this CPU holds mmu_lock for read (or write).
++ */
++ if (READ_ONCE(sp->unsync))
++ continue;
++ }
++
+ WARN_ON(sp->role.level != PG_LEVEL_4K);
+ kvm_unsync_page(vcpu, sp);
+ }
++ if (locked)
++ spin_unlock(&vcpu->kvm->arch.mmu_unsync_pages_lock);
+
+ /*
+ * We need to ensure that the marking of unsync pages is visible
+@@ -5514,6 +5540,8 @@ void kvm_mmu_init_vm(struct kvm *kvm)
+ {
+ struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
+
++ spin_lock_init(&kvm->arch.mmu_unsync_pages_lock);
++
+ kvm_mmu_init_tdp_mmu(kvm);
+
+ node->track_write = kvm_mmu_pte_write;
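
The locking added here is a check / lock / re-check pattern. A hedged,
stand-alone sketch using pthreads; kvm_mmu_page is reduced to a hypothetical
struct with just the one field that matters, and the lock is taken per page
here for brevity, whereas the kernel hunk takes it once and holds it across
the whole loop:

  #include <pthread.h>
  #include <stdbool.h>
  #include <stdio.h>

  /* Hypothetical reduction of kvm_mmu_page to the relevant state. */
  struct shadow_page {
          volatile bool unsync;
  };

  static pthread_spinlock_t unsync_lock;

  static void mark_unsync(struct shadow_page *sp)
  {
          /* Cheap unlocked check first, as in mmu_need_write_protect(). */
          if (sp->unsync)
                  return;

          pthread_spin_lock(&unsync_lock);

          /* Re-check under the lock: another thread may have won the race. */
          if (!sp->unsync) {
                  /* ...the kvm_unsync_page() work would go here... */
                  sp->unsync = true;
          }

          pthread_spin_unlock(&unsync_lock);
  }

  int main(void)
  {
          struct shadow_page sp = { .unsync = false };

          pthread_spin_init(&unsync_lock, PTHREAD_PROCESS_PRIVATE);
          mark_unsync(&sp);
          printf("unsync=%d\n", sp.unsync);
          pthread_spin_destroy(&unsync_lock);
          return 0;
  }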
--- /dev/null
+From 07d25971b220e477eb019fcb520a9f2e3ac966af Mon Sep 17 00:00:00 2001
+From: Zhen Lei <thunder.leizhen@huawei.com>
+Date: Sat, 31 Jul 2021 20:30:11 +0800
+Subject: locking/rtmutex: Use the correct rtmutex debugging config option
+
+From: Zhen Lei <thunder.leizhen@huawei.com>
+
+commit 07d25971b220e477eb019fcb520a9f2e3ac966af upstream.
+
+It's CONFIG_DEBUG_RT_MUTEXES not CONFIG_DEBUG_RT_MUTEX.
+
+Fixes: f7efc4799f81 ("locking/rtmutex: Inline chainwalk depth check")
+Signed-off-by: Zhen Lei <thunder.leizhen@huawei.com>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Acked-by: Will Deacon <will@kernel.org>
+Acked-by: Boqun Feng <boqun.feng@gmail.com>
+Cc: stable@vger.kernel.org
+Link: https://lore.kernel.org/r/20210731123011.4555-1-thunder.leizhen@huawei.com
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/locking/rtmutex.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/kernel/locking/rtmutex.c
++++ b/kernel/locking/rtmutex.c
+@@ -343,7 +343,7 @@ static __always_inline bool
+ rt_mutex_cond_detect_deadlock(struct rt_mutex_waiter *waiter,
+ enum rtmutex_chainwalk chwalk)
+ {
+- if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEX))
++ if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES))
+ return waiter != NULL;
+ return chwalk == RT_MUTEX_FULL_CHAINWALK;
+ }
--- /dev/null
+From 689e6b5351573c38ccf92a0dd8b3e2c2241e4aff Mon Sep 17 00:00:00 2001
+From: Thomas Gleixner <tglx@linutronix.de>
+Date: Thu, 29 Jul 2021 23:51:45 +0200
+Subject: PCI/MSI: Correct misleading comments
+
+From: Thomas Gleixner <tglx@linutronix.de>
+
+commit 689e6b5351573c38ccf92a0dd8b3e2c2241e4aff upstream.
+
+The comments about preserving the cached state in pci_msi[x]_shutdown() are
+misleading as the MSI descriptors are freed right after those functions
+return. So there is nothing to restore. Preparatory change.
+
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Tested-by: Marc Zyngier <maz@kernel.org>
+Reviewed-by: Marc Zyngier <maz@kernel.org>
+Cc: stable@vger.kernel.org
+Link: https://lore.kernel.org/r/20210729222542.621609423@linutronix.de
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/pci/msi.c | 5 +----
+ 1 file changed, 1 insertion(+), 4 deletions(-)
+
+--- a/drivers/pci/msi.c
++++ b/drivers/pci/msi.c
+@@ -961,7 +961,6 @@ static void pci_msi_shutdown(struct pci_
+
+ /* Return the device with MSI unmasked as initial states */
+ mask = msi_mask(desc->msi_attrib.multi_cap);
+- /* Keep cached state to be restored */
+ __pci_msi_desc_mask_irq(desc, mask, 0);
+
+ /* Restore dev->irq to its default pin-assertion IRQ */
+@@ -1047,10 +1046,8 @@ static void pci_msix_shutdown(struct pci
+ }
+
+ /* Return the device with MSI-X masked as initial states */
+- for_each_pci_msi_entry(entry, dev) {
+- /* Keep cached states to be restored */
++ for_each_pci_msi_entry(entry, dev)
+ __pci_msix_desc_mask_irq(entry, 1);
+- }
+
+ pci_msix_clear_and_set_ctrl(dev, PCI_MSIX_FLAGS_ENABLE, 0);
+ pci_intx_for_msi(dev, 1);
--- /dev/null
+From 361fd37397f77578735907341579397d5bed0a2d Mon Sep 17 00:00:00 2001
+From: Thomas Gleixner <tglx@linutronix.de>
+Date: Thu, 29 Jul 2021 23:51:44 +0200
+Subject: PCI/MSI: Do not set invalid bits in MSI mask
+
+From: Thomas Gleixner <tglx@linutronix.de>
+
+commit 361fd37397f77578735907341579397d5bed0a2d upstream.
+
+msi_mask_irq() takes a mask and a flags argument. The mask argument is used
+to mask out bits from the cached mask and the flags argument to set bits.
+
+Some places invoke it with a flags argument that sets bits which are not
+used by the device, e.g. when the device supports up to 8 vectors a full
+unmask ends up setting the mask to 0xFFFFFF00. While devices probably
+do not care, it's still bad practice.
+
+Fixes: 7ba1930db02f ("PCI MSI: Unmask MSI if setup failed")
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Tested-by: Marc Zyngier <maz@kernel.org>
+Reviewed-by: Marc Zyngier <maz@kernel.org>
+Cc: stable@vger.kernel.org
+Link: https://lore.kernel.org/r/20210729222542.568173099@linutronix.de
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/pci/msi.c | 8 ++++----
+ 1 file changed, 4 insertions(+), 4 deletions(-)
+
+--- a/drivers/pci/msi.c
++++ b/drivers/pci/msi.c
+@@ -656,21 +656,21 @@ static int msi_capability_init(struct pc
+ /* Configure MSI capability structure */
+ ret = pci_msi_setup_msi_irqs(dev, nvec, PCI_CAP_ID_MSI);
+ if (ret) {
+- msi_mask_irq(entry, mask, ~mask);
++ msi_mask_irq(entry, mask, 0);
+ free_msi_irqs(dev);
+ return ret;
+ }
+
+ ret = msi_verify_entries(dev);
+ if (ret) {
+- msi_mask_irq(entry, mask, ~mask);
++ msi_mask_irq(entry, mask, 0);
+ free_msi_irqs(dev);
+ return ret;
+ }
+
+ ret = populate_msi_sysfs(dev);
+ if (ret) {
+- msi_mask_irq(entry, mask, ~mask);
++ msi_mask_irq(entry, mask, 0);
+ free_msi_irqs(dev);
+ return ret;
+ }
+@@ -962,7 +962,7 @@ static void pci_msi_shutdown(struct pci_
+ /* Return the device with MSI unmasked as initial states */
+ mask = msi_mask(desc->msi_attrib.multi_cap);
+ /* Keep cached state to be restored */
+- __pci_msi_desc_mask_irq(desc, mask, ~mask);
++ __pci_msi_desc_mask_irq(desc, mask, 0);
+
+ /* Restore dev->irq to its default pin-assertion IRQ */
+ dev->irq = desc->msi_attrib.default_irq;
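
The 0xFFFFFF00 value in the changelog falls straight out of the cached-mask
update rule. A quick stand-alone sketch, with the update rule re-created
locally for illustration:

  #include <stdint.h>
  #include <stdio.h>

  /* Local copy of the rule: clear the "mask" bits, then set the "flag" bits. */
  static uint32_t update_masked(uint32_t cached, uint32_t mask, uint32_t flag)
  {
          cached &= ~mask;
          cached |= flag;
          return cached;
  }

  int main(void)
  {
          /* Device supporting 8 vectors: the valid mask bits are 0x000000FF. */
          uint32_t mask = 0xFF;
          uint32_t cached = 0;

          printf("old full unmask: %#010x\n", update_masked(cached, mask, ~mask));
          printf("new full unmask: %#010x\n", update_masked(cached, mask, 0));
          return 0;
  }

The old calls passed ~mask as the flag, which sets the 24 bits the device
does not implement; passing 0 leaves them alone.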
--- /dev/null
+From b9255a7cb51754e8d2645b65dd31805e282b4f3e Mon Sep 17 00:00:00 2001
+From: Thomas Gleixner <tglx@linutronix.de>
+Date: Thu, 29 Jul 2021 23:51:43 +0200
+Subject: PCI/MSI: Enforce MSI[X] entry updates to be visible
+
+From: Thomas Gleixner <tglx@linutronix.de>
+
+commit b9255a7cb51754e8d2645b65dd31805e282b4f3e upstream.
+
+Nothing enforces the posted writes to be visible when the function
+returns. Flush them even if the flush might be redundant when the entry is
+already masked, as the unmask will flush as well. This is either setup or a
+rare affinity change event, so the extra flush is not the end of the world.
+
+While this is mostly a theoretical issue, the logic in the x86-specific
+msi_set_affinity() function in particular relies on the assumption that the
+update has reached the hardware when the function returns.
+
+Again, as this never has been enforced the Fixes tag refers to a commit in:
+ git://git.kernel.org/pub/scm/linux/kernel/git/tglx/history.git
+
+Fixes: f036d4ea5fa7 ("[PATCH] ia32 Message Signalled Interrupt support")
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Tested-by: Marc Zyngier <maz@kernel.org>
+Reviewed-by: Marc Zyngier <maz@kernel.org>
+Acked-by: Bjorn Helgaas <bhelgaas@google.com>
+Cc: stable@vger.kernel.org
+Link: https://lore.kernel.org/r/20210729222542.515188147@linutronix.de
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/pci/msi.c | 5 +++++
+ 1 file changed, 5 insertions(+)
+
+--- a/drivers/pci/msi.c
++++ b/drivers/pci/msi.c
+@@ -311,6 +311,9 @@ void __pci_write_msi_msg(struct msi_desc
+
+ if (unmasked)
+ __pci_msix_desc_mask_irq(entry, 0);
++
++ /* Ensure that the writes are visible in the device */
++ readl(base + PCI_MSIX_ENTRY_DATA);
+ } else {
+ int pos = dev->msi_cap;
+ u16 msgctl;
+@@ -331,6 +334,8 @@ void __pci_write_msi_msg(struct msi_desc
+ pci_write_config_word(dev, pos + PCI_MSI_DATA_32,
+ msg->data);
+ }
++ /* Ensure that the writes are visible in the device */
++ pci_read_config_word(dev, pos + PCI_MSI_FLAGS, &msgctl);
+ }
+
+ skip:
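
The read-back added in both branches is the usual way to flush posted MMIO
writes. A hedged sketch of the pattern only, against plain memory instead of
a mapped BAR, so the volatile accesses merely stand in for readl()/writel():

  #include <stdint.h>
  #include <stdio.h>

  /* Stand-in for one MSI-X table entry; real code maps this from a BAR. */
  struct msix_entry_regs {
          uint32_t addr_lo;
          uint32_t addr_hi;
          uint32_t data;
          uint32_t vector_ctrl;
  };

  static void write_msg(volatile struct msix_entry_regs *e,
                        uint32_t lo, uint32_t hi, uint32_t data)
  {
          e->addr_lo = lo;   /* writel(msg->address_lo, ...) */
          e->addr_hi = hi;   /* writel(msg->address_hi, ...) */
          e->data    = data; /* writel(msg->data, ...) */

          /* Read something back so the posted writes are known to have
           * reached the device before the caller relies on them. */
          (void)e->data;     /* readl(base + PCI_MSIX_ENTRY_DATA) */
  }

  int main(void)
  {
          struct msix_entry_regs entry = { 0 };

          write_msg(&entry, 0xfee00000, 0, 0x4041);
          printf("data=%#x\n", entry.data);
          return 0;
  }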
--- /dev/null
+From da181dc974ad667579baece33c2c8d2d1e4558d5 Mon Sep 17 00:00:00 2001
+From: Thomas Gleixner <tglx@linutronix.de>
+Date: Thu, 29 Jul 2021 23:51:42 +0200
+Subject: PCI/MSI: Enforce that MSI-X table entry is masked for update
+
+From: Thomas Gleixner <tglx@linutronix.de>
+
+commit da181dc974ad667579baece33c2c8d2d1e4558d5 upstream.
+
+The specification (PCIe r5.0, sec 6.1.4.5) states:
+
+ For MSI-X, a function is permitted to cache Address and Data values
+ from unmasked MSI-X Table entries. However, anytime software unmasks a
+ currently masked MSI-X Table entry either by clearing its Mask bit or
+ by clearing the Function Mask bit, the function must update any Address
+ or Data values that it cached from that entry. If software changes the
+ Address or Data value of an entry while the entry is unmasked, the
+ result is undefined.
+
+The Linux kernel's MSI-X support never enforced that the entry is masked
+before the entry is modified hence the Fixes tag refers to a commit in:
+ git://git.kernel.org/pub/scm/linux/kernel/git/tglx/history.git
+
+Enforce the entry to be masked across the update.
+
+There is no point in requiring this to be handled at all possible call
+sites, as that would just duplicate code; the common update
+function is the obvious place to enforce it.
+
+Fixes: f036d4ea5fa7 ("[PATCH] ia32 Message Signalled Interrupt support")
+Reported-by: Kevin Tian <kevin.tian@intel.com>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Tested-by: Marc Zyngier <maz@kernel.org>
+Reviewed-by: Marc Zyngier <maz@kernel.org>
+Acked-by: Bjorn Helgaas <bhelgaas@google.com>
+Cc: stable@vger.kernel.org
+Link: https://lore.kernel.org/r/20210729222542.462096385@linutronix.de
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/pci/msi.c | 15 +++++++++++++++
+ 1 file changed, 15 insertions(+)
+
+--- a/drivers/pci/msi.c
++++ b/drivers/pci/msi.c
+@@ -289,13 +289,28 @@ void __pci_write_msi_msg(struct msi_desc
+ /* Don't touch the hardware now */
+ } else if (entry->msi_attrib.is_msix) {
+ void __iomem *base = pci_msix_desc_addr(entry);
++ bool unmasked = !(entry->masked & PCI_MSIX_ENTRY_CTRL_MASKBIT);
+
+ if (!base)
+ goto skip;
+
++ /*
++ * The specification mandates that the entry is masked
++ * when the message is modified:
++ *
++ * "If software changes the Address or Data value of an
++ * entry while the entry is unmasked, the result is
++ * undefined."
++ */
++ if (unmasked)
++ __pci_msix_desc_mask_irq(entry, PCI_MSIX_ENTRY_CTRL_MASKBIT);
++
+ writel(msg->address_lo, base + PCI_MSIX_ENTRY_LOWER_ADDR);
+ writel(msg->address_hi, base + PCI_MSIX_ENTRY_UPPER_ADDR);
+ writel(msg->data, base + PCI_MSIX_ENTRY_DATA);
++
++ if (unmasked)
++ __pci_msix_desc_mask_irq(entry, 0);
+ } else {
+ int pos = dev->msi_cap;
+ u16 msgctl;
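
A compact sketch of the mask-around-update sequence the hunk implements,
against a hypothetical in-memory entry: PCI_MSIX_ENTRY_CTRL_MASKBIT is
copied locally, and the real code checks the cached entry->masked value and
uses readl()/writel() on the mapped table:

  #include <stdbool.h>
  #include <stdint.h>
  #include <stdio.h>

  #define PCI_MSIX_ENTRY_CTRL_MASKBIT 0x1u /* local copy of the constant */

  struct fake_entry {
          uint32_t vector_ctrl;
          uint32_t addr_lo, addr_hi, data;
  };

  static void update_entry(struct fake_entry *e, uint32_t lo, uint32_t hi,
                           uint32_t data)
  {
          bool unmasked = !(e->vector_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT);

          /* Spec: changing address/data of an unmasked entry is undefined,
           * so mask it for the duration of the update. */
          if (unmasked)
                  e->vector_ctrl |= PCI_MSIX_ENTRY_CTRL_MASKBIT;

          e->addr_lo = lo;
          e->addr_hi = hi;
          e->data    = data;

          if (unmasked)
                  e->vector_ctrl &= ~PCI_MSIX_ENTRY_CTRL_MASKBIT;
  }

  int main(void)
  {
          struct fake_entry e = { .vector_ctrl = 0 }; /* starts unmasked */

          update_entry(&e, 0xfee00000, 0, 0x4042);
          printf("ctrl=%#x data=%#x\n", e.vector_ctrl, e.data);
          return 0;
  }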
--- /dev/null
+From 7d5ec3d3612396dc6d4b76366d20ab9fc06f399f Mon Sep 17 00:00:00 2001
+From: Thomas Gleixner <tglx@linutronix.de>
+Date: Thu, 29 Jul 2021 23:51:41 +0200
+Subject: PCI/MSI: Mask all unused MSI-X entries
+
+From: Thomas Gleixner <tglx@linutronix.de>
+
+commit 7d5ec3d3612396dc6d4b76366d20ab9fc06f399f upstream.
+
+When MSI-X is enabled the ordering of calls is:
+
+ msix_map_region();
+ msix_setup_entries();
+ pci_msi_setup_msi_irqs();
+ msix_program_entries();
+
+This has a few interesting issues:
+
+ 1) msix_setup_entries() allocates the MSI descriptors and initializes them
+ except for the msi_desc:masked member which is left zero initialized.
+
+ 2) pci_msi_setup_msi_irqs() allocates the interrupt descriptors and sets
+ up the MSI interrupts which ends up in pci_write_msi_msg() unless the
+ interrupt chip provides its own irq_write_msi_msg() function.
+
+ 3) msix_program_entries() does not do what the name suggests. It solely
+ updates the entries array (if not NULL) and initializes the masked
+ member for each MSI descriptor by reading the hardware state and then
+ masks the entry.
+
+Obviously this has some issues:
+
+ 1) The uninitialized masked member of msi_desc prevents the enforcement
+    of masking the entry in pci_write_msi_msg() depending on the cached
+    masked bit. Aside from that, half-initialized data is a no-no in general.
+
+ 2) msix_program_entries() only ensures that the actually allocated entries
+ are masked. This is wrong as experimentation with crash testing and
+ crash kernel kexec has shown.
+
+    This limited testing unearthed that when the production kernel had more
+    entries in use (and unmasked) when it crashed, and the crash kernel
+    allocated a smaller number of entries, a full scan of all entries
+    found unmasked entries which were in use in the production kernel.
+
+    This is obviously a device or emulation issue, as a device reset
+    should mask all MSI-X table entries, but that is apparently just part
+    of the paper specification.
+
+Cure this by:
+
+ 1) Masking all table entries in hardware
+ 2) Initializing msi_desc::masked in msix_setup_entries()
+ 3) Removing the mask dance in msix_program_entries()
+ 4) Renaming msix_program_entries() to msix_update_entries() to
+ reflect the purpose of that function.
+
+As the masking of unused entries has never been done the Fixes tag refers
+to a commit in:
+ git://git.kernel.org/pub/scm/linux/kernel/git/tglx/history.git
+
+Fixes: f036d4ea5fa7 ("[PATCH] ia32 Message Signalled Interrupt support")
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Tested-by: Marc Zyngier <maz@kernel.org>
+Reviewed-by: Marc Zyngier <maz@kernel.org>
+Acked-by: Bjorn Helgaas <bhelgaas@google.com>
+Cc: stable@vger.kernel.org
+Link: https://lore.kernel.org/r/20210729222542.403833459@linutronix.de
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/pci/msi.c | 45 +++++++++++++++++++++++++++------------------
+ 1 file changed, 27 insertions(+), 18 deletions(-)
+
+--- a/drivers/pci/msi.c
++++ b/drivers/pci/msi.c
+@@ -691,6 +691,7 @@ static int msix_setup_entries(struct pci
+ {
+ struct irq_affinity_desc *curmsk, *masks = NULL;
+ struct msi_desc *entry;
++ void __iomem *addr;
+ int ret, i;
+ int vec_count = pci_msix_vec_count(dev);
+
+@@ -711,6 +712,7 @@ static int msix_setup_entries(struct pci
+
+ entry->msi_attrib.is_msix = 1;
+ entry->msi_attrib.is_64 = 1;
++
+ if (entries)
+ entry->msi_attrib.entry_nr = entries[i].entry;
+ else
+@@ -722,6 +724,10 @@ static int msix_setup_entries(struct pci
+ entry->msi_attrib.default_irq = dev->irq;
+ entry->mask_base = base;
+
++ addr = pci_msix_desc_addr(entry);
++ if (addr)
++ entry->masked = readl(addr + PCI_MSIX_ENTRY_VECTOR_CTRL);
++
+ list_add_tail(&entry->list, dev_to_msi_list(&dev->dev));
+ if (masks)
+ curmsk++;
+@@ -732,26 +738,25 @@ out:
+ return ret;
+ }
+
+-static void msix_program_entries(struct pci_dev *dev,
+- struct msix_entry *entries)
++static void msix_update_entries(struct pci_dev *dev, struct msix_entry *entries)
+ {
+ struct msi_desc *entry;
+- int i = 0;
+- void __iomem *desc_addr;
+
+ for_each_pci_msi_entry(entry, dev) {
+- if (entries)
+- entries[i++].vector = entry->irq;
++ if (entries) {
++ entries->vector = entry->irq;
++ entries++;
++ }
++ }
++}
+
+- desc_addr = pci_msix_desc_addr(entry);
+- if (desc_addr)
+- entry->masked = readl(desc_addr +
+- PCI_MSIX_ENTRY_VECTOR_CTRL);
+- else
+- entry->masked = 0;
++static void msix_mask_all(void __iomem *base, int tsize)
++{
++ u32 ctrl = PCI_MSIX_ENTRY_CTRL_MASKBIT;
++ int i;
+
+- msix_mask_irq(entry, 1);
+- }
++ for (i = 0; i < tsize; i++, base += PCI_MSIX_ENTRY_SIZE)
++ writel(ctrl, base + PCI_MSIX_ENTRY_VECTOR_CTRL);
+ }
+
+ /**
+@@ -768,9 +773,9 @@ static void msix_program_entries(struct
+ static int msix_capability_init(struct pci_dev *dev, struct msix_entry *entries,
+ int nvec, struct irq_affinity *affd)
+ {
+- int ret;
+- u16 control;
+ void __iomem *base;
++ int ret, tsize;
++ u16 control;
+
+ /*
+ * Some devices require MSI-X to be enabled before the MSI-X
+@@ -782,12 +787,16 @@ static int msix_capability_init(struct p
+
+ pci_read_config_word(dev, dev->msix_cap + PCI_MSIX_FLAGS, &control);
+ /* Request & Map MSI-X table region */
+- base = msix_map_region(dev, msix_table_size(control));
++ tsize = msix_table_size(control);
++ base = msix_map_region(dev, tsize);
+ if (!base) {
+ ret = -ENOMEM;
+ goto out_disable;
+ }
+
++ /* Ensure that all table entries are masked. */
++ msix_mask_all(base, tsize);
++
+ ret = msix_setup_entries(dev, base, entries, nvec, affd);
+ if (ret)
+ goto out_disable;
+@@ -801,7 +810,7 @@ static int msix_capability_init(struct p
+ if (ret)
+ goto out_free;
+
+- msix_program_entries(dev, entries);
++ msix_update_entries(dev, entries);
+
+ ret = populate_msi_sysfs(dev);
+ if (ret)
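
The new msix_mask_all() is a straight walk over the table. A stand-alone
sketch of the same walk, over a plain buffer standing in for the mapped
table and with a direct store in place of writel():

  #include <stdint.h>
  #include <stdio.h>

  #define PCI_MSIX_ENTRY_SIZE          16
  #define PCI_MSIX_ENTRY_VECTOR_CTRL   12
  #define PCI_MSIX_ENTRY_CTRL_MASKBIT  0x1u

  static void mask_all(uint8_t *base, int tsize)
  {
          int i;

          /* Mask every entry the table provides, not just the ones in use. */
          for (i = 0; i < tsize; i++, base += PCI_MSIX_ENTRY_SIZE)
                  *(uint32_t *)(base + PCI_MSIX_ENTRY_VECTOR_CTRL) =
                          PCI_MSIX_ENTRY_CTRL_MASKBIT;
  }

  int main(void)
  {
          /* Fake 4-entry table, kept 32-bit aligned. */
          uint32_t storage[4 * PCI_MSIX_ENTRY_SIZE / sizeof(uint32_t)] = { 0 };
          uint8_t *table = (uint8_t *)storage;

          mask_all(table, 4);
          printf("entry 0 ctrl = %#x\n",
                 *(uint32_t *)(table + PCI_MSIX_ENTRY_VECTOR_CTRL));
          return 0;
  }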
--- /dev/null
+From 77e89afc25f30abd56e76a809ee2884d7c1b63ce Mon Sep 17 00:00:00 2001
+From: Thomas Gleixner <tglx@linutronix.de>
+Date: Thu, 29 Jul 2021 23:51:47 +0200
+Subject: PCI/MSI: Protect msi_desc::masked for multi-MSI
+
+From: Thomas Gleixner <tglx@linutronix.de>
+
+commit 77e89afc25f30abd56e76a809ee2884d7c1b63ce upstream.
+
+Multi-MSI uses a single MSI descriptor and there is a single mask register
+when the device supports per vector masking. To avoid reading back the mask
+register the value is cached in the MSI descriptor and updates are done by
+clearing and setting bits in the cache and writing it to the device.
+
+But nothing protects msi_desc::masked and the mask register from being
+modified concurrently on two different CPUs for two different Linux
+interrupts which belong to the same multi-MSI descriptor.
+
+Add a lock to struct device and protect any operation on the mask and the
+mask register with it.
+
+This makes the update of msi_desc::masked unconditional, but there is no
+place which requires a modification of the hardware register without
+updating the masked cache.
+
+msi_mask_irq() is now an empty wrapper which will be cleaned up in follow
+up changes.
+
+The problem goes way back to the initial support of multi-MSI, but picking
+the commit which introduced the mask cache is a valid cut off point
+(2.6.30).
+
+Fixes: f2440d9acbe8 ("PCI MSI: Refactor interrupt masking code")
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Tested-by: Marc Zyngier <maz@kernel.org>
+Reviewed-by: Marc Zyngier <maz@kernel.org>
+Cc: stable@vger.kernel.org
+Link: https://lore.kernel.org/r/20210729222542.726833414@linutronix.de
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/base/core.c | 1 +
+ drivers/pci/msi.c | 19 ++++++++++---------
+ include/linux/device.h | 1 +
+ include/linux/msi.h | 2 +-
+ 4 files changed, 13 insertions(+), 10 deletions(-)
+
+--- a/drivers/base/core.c
++++ b/drivers/base/core.c
+@@ -2809,6 +2809,7 @@ void device_initialize(struct device *de
+ device_pm_init(dev);
+ set_dev_node(dev, -1);
+ #ifdef CONFIG_GENERIC_MSI_IRQ
++ raw_spin_lock_init(&dev->msi_lock);
+ INIT_LIST_HEAD(&dev->msi_list);
+ #endif
+ INIT_LIST_HEAD(&dev->links.consumers);
+--- a/drivers/pci/msi.c
++++ b/drivers/pci/msi.c
+@@ -143,24 +143,25 @@ static inline __attribute_const__ u32 ms
+ * reliably as devices without an INTx disable bit will then generate a
+ * level IRQ which will never be cleared.
+ */
+-u32 __pci_msi_desc_mask_irq(struct msi_desc *desc, u32 mask, u32 flag)
++void __pci_msi_desc_mask_irq(struct msi_desc *desc, u32 mask, u32 flag)
+ {
+- u32 mask_bits = desc->masked;
++ raw_spinlock_t *lock = &desc->dev->msi_lock;
++ unsigned long flags;
+
+ if (pci_msi_ignore_mask || !desc->msi_attrib.maskbit)
+- return 0;
++ return;
+
+- mask_bits &= ~mask;
+- mask_bits |= flag;
++ raw_spin_lock_irqsave(lock, flags);
++ desc->masked &= ~mask;
++ desc->masked |= flag;
+ pci_write_config_dword(msi_desc_to_pci_dev(desc), desc->mask_pos,
+- mask_bits);
+-
+- return mask_bits;
++ desc->masked);
++ raw_spin_unlock_irqrestore(lock, flags);
+ }
+
+ static void msi_mask_irq(struct msi_desc *desc, u32 mask, u32 flag)
+ {
+- desc->masked = __pci_msi_desc_mask_irq(desc, mask, flag);
++ __pci_msi_desc_mask_irq(desc, mask, flag);
+ }
+
+ static void __iomem *pci_msix_desc_addr(struct msi_desc *desc)
+--- a/include/linux/device.h
++++ b/include/linux/device.h
+@@ -496,6 +496,7 @@ struct device {
+ struct dev_pin_info *pins;
+ #endif
+ #ifdef CONFIG_GENERIC_MSI_IRQ
++ raw_spinlock_t msi_lock;
+ struct list_head msi_list;
+ #endif
+ #ifdef CONFIG_DMA_OPS
+--- a/include/linux/msi.h
++++ b/include/linux/msi.h
+@@ -233,7 +233,7 @@ void __pci_read_msi_msg(struct msi_desc
+ void __pci_write_msi_msg(struct msi_desc *entry, struct msi_msg *msg);
+
+ u32 __pci_msix_desc_mask_irq(struct msi_desc *desc, u32 flag);
+-u32 __pci_msi_desc_mask_irq(struct msi_desc *desc, u32 mask, u32 flag);
++void __pci_msi_desc_mask_irq(struct msi_desc *desc, u32 mask, u32 flag);
+ void pci_msi_mask_irq(struct irq_data *data);
+ void pci_msi_unmask_irq(struct irq_data *data);
+
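
The race being closed is a read-modify-write of msi_desc::masked shared by
all vectors of one multi-MSI block. A hedged pthread sketch of the same
serialization (a mutex here in place of the new per-device raw spinlock, and
the config-space write reduced to a comment):

  #include <pthread.h>
  #include <stdint.h>
  #include <stdio.h>

  /* Hypothetical per-device state: one cached mask for the whole block. */
  static uint32_t masked;
  static pthread_mutex_t msi_lock = PTHREAD_MUTEX_INITIALIZER;

  /* Same update rule as __pci_msi_desc_mask_irq(), now fully serialized. */
  static void desc_mask_irq(uint32_t mask, uint32_t flag)
  {
          pthread_mutex_lock(&msi_lock);
          masked &= ~mask;
          masked |= flag;
          /* pci_write_config_dword(..., masked) would go here */
          pthread_mutex_unlock(&msi_lock);
  }

  static void *mask_vector(void *arg)
  {
          uint32_t bit = 1u << (uintptr_t)arg;

          desc_mask_irq(bit, bit); /* mask this vector */
          return NULL;
  }

  int main(void)
  {
          pthread_t t[4];
          uintptr_t i;

          for (i = 0; i < 4; i++)
                  pthread_create(&t[i], NULL, mask_vector, (void *)i);
          for (i = 0; i < 4; i++)
                  pthread_join(t[i], NULL);

          /* With the lock in place no update is lost: masked == 0xf. */
          printf("masked = %#x\n", masked);
          return 0;
  }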
--- /dev/null
+From d28d4ad2a1aef27458b3383725bb179beb8d015c Mon Sep 17 00:00:00 2001
+From: Thomas Gleixner <tglx@linutronix.de>
+Date: Thu, 29 Jul 2021 23:51:46 +0200
+Subject: PCI/MSI: Use msi_mask_irq() in pci_msi_shutdown()
+
+From: Thomas Gleixner <tglx@linutronix.de>
+
+commit d28d4ad2a1aef27458b3383725bb179beb8d015c upstream.
+
+No point in using the raw write function from shutdown. Preparatory change
+to introduce proper serialization for the msi_desc::masked cache.
+
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Tested-by: Marc Zyngier <maz@kernel.org>
+Reviewed-by: Marc Zyngier <maz@kernel.org>
+Cc: stable@vger.kernel.org
+Link: https://lore.kernel.org/r/20210729222542.674391354@linutronix.de
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/pci/msi.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/drivers/pci/msi.c
++++ b/drivers/pci/msi.c
+@@ -961,7 +961,7 @@ static void pci_msi_shutdown(struct pci_
+
+ /* Return the device with MSI unmasked as initial states */
+ mask = msi_mask(desc->msi_attrib.multi_cap);
+- __pci_msi_desc_mask_irq(desc, mask, 0);
++ msi_mask_irq(desc, mask, 0);
+
+ /* Restore dev->irq to its default pin-assertion IRQ */
+ dev->irq = desc->msi_attrib.default_irq;
--- /dev/null
+From b5cfc9cd7b0426e94ffd9e9ed79d1b00ace7780a Mon Sep 17 00:00:00 2001
+From: Christophe Leroy <christophe.leroy@csgroup.eu>
+Date: Wed, 7 Jul 2021 05:55:07 +0000
+Subject: powerpc/32: Fix critical and debug interrupts on BOOKE
+
+From: Christophe Leroy <christophe.leroy@csgroup.eu>
+
+commit b5cfc9cd7b0426e94ffd9e9ed79d1b00ace7780a upstream.
+
+32-bit BOOKE has special interrupts for debug and other
+critical events.
+
+When handling those interrupts, dedicated registers are saved
+in the stack frame in addition to the standard registers, leading
+to a shift of the pt_regs struct.
+
+Since commit db297c3b07af ("powerpc/32: Don't save thread.regs on
+interrupt entry"), the pt_regs struct is expected to be at the
+same place all the time.
+
+Instead of handling a special struct in addition to pt_regs, just
+add those special registers to struct pt_regs.
+
+Fixes: db297c3b07af ("powerpc/32: Don't save thread.regs on interrupt entry")
+Cc: stable@vger.kernel.org
+Reported-by: Radu Rendec <radu.rendec@gmail.com>
+Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
+Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
+Link: https://lore.kernel.org/r/028d5483b4851b01ea4334d0751e7f260419092b.1625637264.git.christophe.leroy@csgroup.eu
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/powerpc/include/asm/ptrace.h | 16 ++++++++++++++++
+ arch/powerpc/kernel/asm-offsets.c | 31 ++++++++++++++-----------------
+ arch/powerpc/kernel/head_booke.h | 27 +++------------------------
+ 3 files changed, 33 insertions(+), 41 deletions(-)
+
+--- a/arch/powerpc/include/asm/ptrace.h
++++ b/arch/powerpc/include/asm/ptrace.h
+@@ -68,6 +68,22 @@ struct pt_regs
+ };
+ unsigned long __pad[4]; /* Maintain 16 byte interrupt stack alignment */
+ };
++#if defined(CONFIG_PPC32) && defined(CONFIG_BOOKE)
++ struct { /* Must be a multiple of 16 bytes */
++ unsigned long mas0;
++ unsigned long mas1;
++ unsigned long mas2;
++ unsigned long mas3;
++ unsigned long mas6;
++ unsigned long mas7;
++ unsigned long srr0;
++ unsigned long srr1;
++ unsigned long csrr0;
++ unsigned long csrr1;
++ unsigned long dsrr0;
++ unsigned long dsrr1;
++ };
++#endif
+ };
+ #endif
+
+--- a/arch/powerpc/kernel/asm-offsets.c
++++ b/arch/powerpc/kernel/asm-offsets.c
+@@ -348,24 +348,21 @@ int main(void)
+ #endif
+
+
+-#if defined(CONFIG_PPC32)
+-#if defined(CONFIG_BOOKE) || defined(CONFIG_40x)
+- DEFINE(EXC_LVL_SIZE, STACK_EXC_LVL_FRAME_SIZE);
+- DEFINE(MAS0, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, mas0));
++#if defined(CONFIG_PPC32) && defined(CONFIG_BOOKE)
++ STACK_PT_REGS_OFFSET(MAS0, mas0);
+ /* we overload MMUCR for 44x on MAS0 since they are mutually exclusive */
+- DEFINE(MMUCR, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, mas0));
+- DEFINE(MAS1, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, mas1));
+- DEFINE(MAS2, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, mas2));
+- DEFINE(MAS3, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, mas3));
+- DEFINE(MAS6, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, mas6));
+- DEFINE(MAS7, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, mas7));
+- DEFINE(_SRR0, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, srr0));
+- DEFINE(_SRR1, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, srr1));
+- DEFINE(_CSRR0, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, csrr0));
+- DEFINE(_CSRR1, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, csrr1));
+- DEFINE(_DSRR0, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, dsrr0));
+- DEFINE(_DSRR1, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, dsrr1));
+-#endif
++ STACK_PT_REGS_OFFSET(MMUCR, mas0);
++ STACK_PT_REGS_OFFSET(MAS1, mas1);
++ STACK_PT_REGS_OFFSET(MAS2, mas2);
++ STACK_PT_REGS_OFFSET(MAS3, mas3);
++ STACK_PT_REGS_OFFSET(MAS6, mas6);
++ STACK_PT_REGS_OFFSET(MAS7, mas7);
++ STACK_PT_REGS_OFFSET(_SRR0, srr0);
++ STACK_PT_REGS_OFFSET(_SRR1, srr1);
++ STACK_PT_REGS_OFFSET(_CSRR0, csrr0);
++ STACK_PT_REGS_OFFSET(_CSRR1, csrr1);
++ STACK_PT_REGS_OFFSET(_DSRR0, dsrr0);
++ STACK_PT_REGS_OFFSET(_DSRR1, dsrr1);
+ #endif
+
+ #ifndef CONFIG_PPC64
+--- a/arch/powerpc/kernel/head_booke.h
++++ b/arch/powerpc/kernel/head_booke.h
+@@ -185,20 +185,18 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_EMB_HV
+ /* only on e500mc */
+ #define DBG_STACK_BASE dbgirq_ctx
+
+-#define EXC_LVL_FRAME_OVERHEAD (THREAD_SIZE - INT_FRAME_SIZE - EXC_LVL_SIZE)
+-
+ #ifdef CONFIG_SMP
+ #define BOOKE_LOAD_EXC_LEVEL_STACK(level) \
+ mfspr r8,SPRN_PIR; \
+ slwi r8,r8,2; \
+ addis r8,r8,level##_STACK_BASE@ha; \
+ lwz r8,level##_STACK_BASE@l(r8); \
+- addi r8,r8,EXC_LVL_FRAME_OVERHEAD;
++ addi r8,r8,THREAD_SIZE - INT_FRAME_SIZE;
+ #else
+ #define BOOKE_LOAD_EXC_LEVEL_STACK(level) \
+ lis r8,level##_STACK_BASE@ha; \
+ lwz r8,level##_STACK_BASE@l(r8); \
+- addi r8,r8,EXC_LVL_FRAME_OVERHEAD;
++ addi r8,r8,THREAD_SIZE - INT_FRAME_SIZE;
+ #endif
+
+ /*
+@@ -225,7 +223,7 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_EMB_HV
+ mtmsr r11; \
+ mfspr r11,SPRN_SPRG_THREAD; /* if from user, start at top of */\
+ lwz r11, TASK_STACK - THREAD(r11); /* this thread's kernel stack */\
+- addi r11,r11,EXC_LVL_FRAME_OVERHEAD; /* allocate stack frame */\
++ addi r11,r11,THREAD_SIZE - INT_FRAME_SIZE; /* allocate stack frame */\
+ beq 1f; \
+ /* COMING FROM USER MODE */ \
+ stw r9,_CCR(r11); /* save CR */\
+@@ -533,24 +531,5 @@ label:
+ bl kernel_fp_unavailable_exception; \
+ b interrupt_return
+
+-#else /* __ASSEMBLY__ */
+-struct exception_regs {
+- unsigned long mas0;
+- unsigned long mas1;
+- unsigned long mas2;
+- unsigned long mas3;
+- unsigned long mas6;
+- unsigned long mas7;
+- unsigned long srr0;
+- unsigned long srr1;
+- unsigned long csrr0;
+- unsigned long csrr1;
+- unsigned long dsrr0;
+- unsigned long dsrr1;
+-};
+-
+-/* ensure this structure is always sized to a multiple of the stack alignment */
+-#define STACK_EXC_LVL_FRAME_SIZE ALIGN(sizeof (struct exception_regs), 16)
+-
+ #endif /* __ASSEMBLY__ */
+ #endif /* __HEAD_BOOKE_H__ */
--- /dev/null
+From 62376365048878f770d8b7d11b89b8b3e18018f1 Mon Sep 17 00:00:00 2001
+From: Christophe Leroy <christophe.leroy@csgroup.eu>
+Date: Tue, 3 Aug 2021 15:14:27 +0000
+Subject: powerpc/32s: Fix napping restore in data storage interrupt (DSI)
+
+From: Christophe Leroy <christophe.leroy@csgroup.eu>
+
+commit 62376365048878f770d8b7d11b89b8b3e18018f1 upstream.
+
+When a DSI (Data Storage Interrupt) is taken while in NAP mode,
+r11 doesn't survive the call to power_save_ppc32_restore().
+
+So use r1 instead of r11 as they both contain the virtual stack
+pointer at that point.
+
+Fixes: 4c0104a83fc3 ("powerpc/32: Dismantle EXC_XFER_STD/LITE/TEMPLATE")
+Cc: stable@vger.kernel.org # v5.13+
+Reported-by: Finn Thain <fthain@linux-m68k.org>
+Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
+Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
+Link: https://lore.kernel.org/r/731694e0885271f6ee9ffc179eb4bcee78313682.1628003562.git.christophe.leroy@csgroup.eu
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/powerpc/kernel/head_book3s_32.S | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/arch/powerpc/kernel/head_book3s_32.S
++++ b/arch/powerpc/kernel/head_book3s_32.S
+@@ -300,7 +300,7 @@ ALT_MMU_FTR_SECTION_END_IFSET(MMU_FTR_HP
+ EXCEPTION_PROLOG_1
+ EXCEPTION_PROLOG_2 INTERRUPT_DATA_STORAGE DataAccess handle_dar_dsisr=1
+ prepare_transfer_to_handler
+- lwz r5, _DSISR(r11)
++ lwz r5, _DSISR(r1)
+ andis. r0, r5, DSISR_DABRMATCH@h
+ bne- 1f
+ bl do_page_fault
--- /dev/null
+From 01fcac8e4dfc112f420dcaeb70056a74e326cacf Mon Sep 17 00:00:00 2001
+From: Christophe Leroy <christophe.leroy@csgroup.eu>
+Date: Tue, 10 Aug 2021 16:13:17 +0000
+Subject: powerpc/interrupt: Do not call single_step_exception() from other exceptions
+
+From: Christophe Leroy <christophe.leroy@csgroup.eu>
+
+commit 01fcac8e4dfc112f420dcaeb70056a74e326cacf upstream.
+
+single_step_exception() is called by emulate_single_step(), which
+is called from (at least) the alignment_exception() handler and the
+program_check_exception() handler.
+
+Redefine it as a regular __single_step_exception() which is called
+by both single_step_exception() handler and emulate_single_step()
+function.
+
+Fixes: 3a96570ffceb ("powerpc: convert interrupt handlers to use wrappers")
+Cc: stable@vger.kernel.org # v5.12+
+Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
+Reviewed-by: Nicholas Piggin <npiggin@gmail.com>
+Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
+Link: https://lore.kernel.org/r/aed174f5cbc06f2cf95233c071d8aac948e46043.1628611921.git.christophe.leroy@csgroup.eu
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/powerpc/kernel/traps.c | 9 +++++++--
+ 1 file changed, 7 insertions(+), 2 deletions(-)
+
+--- a/arch/powerpc/kernel/traps.c
++++ b/arch/powerpc/kernel/traps.c
+@@ -1103,7 +1103,7 @@ DEFINE_INTERRUPT_HANDLER(RunModeExceptio
+ _exception(SIGTRAP, regs, TRAP_UNK, 0);
+ }
+
+-DEFINE_INTERRUPT_HANDLER(single_step_exception)
++static void __single_step_exception(struct pt_regs *regs)
+ {
+ clear_single_step(regs);
+ clear_br_trace(regs);
+@@ -1120,6 +1120,11 @@ DEFINE_INTERRUPT_HANDLER(single_step_exc
+ _exception(SIGTRAP, regs, TRAP_TRACE, regs->nip);
+ }
+
++DEFINE_INTERRUPT_HANDLER(single_step_exception)
++{
++ __single_step_exception(regs);
++}
++
+ /*
+ * After we have successfully emulated an instruction, we have to
+ * check if the instruction was being single-stepped, and if so,
+@@ -1129,7 +1134,7 @@ DEFINE_INTERRUPT_HANDLER(single_step_exc
+ static void emulate_single_step(struct pt_regs *regs)
+ {
+ if (single_stepping(regs))
+- single_step_exception(regs);
++ __single_step_exception(regs);
+ }
+
+ static inline int __parse_fpscr(unsigned long fpscr)
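
The shape of the fix is a plain wrapper split: the interrupt-entry
bookkeeping stays in the handler generated by DEFINE_INTERRUPT_HANDLER(),
while the body moves to a helper that other handlers may call. A hedged
stand-alone sketch of that pattern, with the bookkeeping reduced to printfs:

  #include <stdio.h>

  struct pt_regs { unsigned long nip; };

  /* Stand-ins for the entry/exit bookkeeping the wrapper macro adds. */
  static void interrupt_enter(const char *who) { printf("enter %s\n", who); }
  static void interrupt_exit(const char *who)  { printf("exit  %s\n", who); }

  /* The actual work, callable from anywhere. */
  static void __single_step_exception(struct pt_regs *regs)
  {
          printf("handle single step at %#lx\n", regs->nip);
  }

  /* Roughly what the wrapper expands to: bookkeeping around the body. */
  static void single_step_exception(struct pt_regs *regs)
  {
          interrupt_enter("single_step_exception");
          __single_step_exception(regs);
          interrupt_exit("single_step_exception");
  }

  /* Another exception's emulation path calls the helper, not the handler,
   * so the interrupt entry/exit code does not run a second time. */
  static void emulate_single_step(struct pt_regs *regs)
  {
          __single_step_exception(regs);
  }

  int main(void)
  {
          struct pt_regs regs = { .nip = 0xc0001234UL };

          single_step_exception(&regs); /* a real trap */
          emulate_single_step(&regs);   /* called from another handler */
          return 0;
  }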
--- /dev/null
+From c18956e6e0b95f78dad2773ecc8c61a9e41f6405 Mon Sep 17 00:00:00 2001
+From: Laurent Dufour <ldufour@linux.ibm.com>
+Date: Thu, 5 Aug 2021 17:23:08 +0200
+Subject: powerpc/pseries: Fix update of LPAR security flavor after LPM
+
+From: Laurent Dufour <ldufour@linux.ibm.com>
+
+commit c18956e6e0b95f78dad2773ecc8c61a9e41f6405 upstream.
+
+After LPM, when migrating from a system with security mitigation enabled
+to a system with mitigation disabled, the security flavor exposed in
+/proc is not correctly set back to 0.
+
+Do not assume the value of the security flavor is set to 0 when entering
+init_cpu_char_feature_flags(), so that when it is called after an LPM, the
+value is set correctly even if the mitigations are not turned off.
+
+Fixes: 6ce56e1ac380 ("powerpc/pseries: export LPAR security flavor in lparcfg")
+Cc: stable@vger.kernel.org # v5.13+
+Signed-off-by: Laurent Dufour <ldufour@linux.ibm.com>
+Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
+Link: https://lore.kernel.org/r/20210805152308.33988-1-ldufour@linux.ibm.com
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/powerpc/platforms/pseries/setup.c | 5 +++--
+ 1 file changed, 3 insertions(+), 2 deletions(-)
+
+--- a/arch/powerpc/platforms/pseries/setup.c
++++ b/arch/powerpc/platforms/pseries/setup.c
+@@ -539,9 +539,10 @@ static void init_cpu_char_feature_flags(
+ * H_CPU_BEHAV_FAVOUR_SECURITY_H could be set only if
+ * H_CPU_BEHAV_FAVOUR_SECURITY is.
+ */
+- if (!(result->behaviour & H_CPU_BEHAV_FAVOUR_SECURITY))
++ if (!(result->behaviour & H_CPU_BEHAV_FAVOUR_SECURITY)) {
+ security_ftr_clear(SEC_FTR_FAVOUR_SECURITY);
+- else if (result->behaviour & H_CPU_BEHAV_FAVOUR_SECURITY_H)
++ pseries_security_flavor = 0;
++ } else if (result->behaviour & H_CPU_BEHAV_FAVOUR_SECURITY_H)
+ pseries_security_flavor = 1;
+ else
+ pseries_security_flavor = 2;
--- /dev/null
+From 8241461536f21bbe51308a6916d1c9fb2e6b75a7 Mon Sep 17 00:00:00 2001
+From: Christophe Leroy <christophe.leroy@csgroup.eu>
+Date: Wed, 4 Aug 2021 18:24:10 +0000
+Subject: powerpc/smp: Fix OOPS in topology_init()
+
+From: Christophe Leroy <christophe.leroy@csgroup.eu>
+
+commit 8241461536f21bbe51308a6916d1c9fb2e6b75a7 upstream.
+
+Running an SMP kernel on a UP platform that is not prepared for it,
+I encountered the following OOPS:
+
+ BUG: Kernel NULL pointer dereference on read at 0x00000034
+ Faulting instruction address: 0xc0a04110
+ Oops: Kernel access of bad area, sig: 11 [#1]
+ BE PAGE_SIZE=4K SMP NR_CPUS=2 CMPCPRO
+ Modules linked in:
+ CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.13.0-pmac-00001-g230fedfaad21 #5234
+ NIP: c0a04110 LR: c0a040d8 CTR: c0a04084
+ REGS: e100dda0 TRAP: 0300 Not tainted (5.13.0-pmac-00001-g230fedfaad21)
+ MSR: 00009032 <EE,ME,IR,DR,RI> CR: 84000284 XER: 00000000
+ DAR: 00000034 DSISR: 20000000
+ GPR00: c0006bd4 e100de60 c1033320 00000000 00000000 c0942274 00000000 00000000
+ GPR08: 00000000 00000000 00000001 00000063 00000007 00000000 c0006f30 00000000
+ GPR16: 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000005
+ GPR24: c0c67d74 c0c67f1c c0c60000 c0c67d70 c0c0c558 1efdf000 c0c00020 00000000
+ NIP [c0a04110] topology_init+0x8c/0x138
+ LR [c0a040d8] topology_init+0x54/0x138
+ Call Trace:
+ [e100de60] [80808080] 0x80808080 (unreliable)
+ [e100de90] [c0006bd4] do_one_initcall+0x48/0x1bc
+ [e100def0] [c0a0150c] kernel_init_freeable+0x1c8/0x278
+ [e100df20] [c0006f44] kernel_init+0x14/0x10c
+ [e100df30] [c00190fc] ret_from_kernel_thread+0x14/0x1c
+ Instruction dump:
+ 7c692e70 7d290194 7c035040 7c7f1b78 5529103a 546706fe 5468103a 39400001
+ 7c641b78 40800054 80c690b4 7fb9402e <81060034> 7fbeea14 2c080000 7fa3eb78
+ ---[ end trace b246ffbc6bbbb6fb ]---
+
+Fix it by checking smp_ops before using it, as is already done in
+several other places in arch/powerpc/kernel/smp.c.
+
+Fixes: 39f87561454d ("powerpc/smp: Move ppc_md.cpu_die() to smp_ops.cpu_offline_self()")
+Cc: stable@vger.kernel.org
+Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
+Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
+Link: https://lore.kernel.org/r/75287841cbb8740edd44880fe60be66d489160d9.1628097995.git.christophe.leroy@csgroup.eu
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/powerpc/kernel/sysfs.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/arch/powerpc/kernel/sysfs.c
++++ b/arch/powerpc/kernel/sysfs.c
+@@ -1167,7 +1167,7 @@ static int __init topology_init(void)
+ * CPU. For instance, the boot cpu might never be valid
+ * for hotplugging.
+ */
+- if (smp_ops->cpu_offline_self)
++ if (smp_ops && smp_ops->cpu_offline_self)
+ c->hotpluggable = 1;
+ #endif
+
--- /dev/null
+From cbc06f051c524dcfe52ef0d1f30647828e226d30 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= <clg@kaod.org>
+Date: Sat, 7 Aug 2021 09:20:57 +0200
+Subject: powerpc/xive: Do not skip CPU-less nodes when creating the IPIs
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Cédric Le Goater <clg@kaod.org>
+
+commit cbc06f051c524dcfe52ef0d1f30647828e226d30 upstream.
+
+On PowerVM, CPU-less nodes can be populated with hot-plugged CPUs at
+runtime. Today, the IPI is not created for such nodes, and hot-plugged
+CPUs use a bogus IPI, which leads to soft lockups.
+
+We cannot directly allocate and request the IPI on demand because
+bringup_up() is called under the IRQ sparse lock. The alternative is
+to allocate the IPIs for all possible nodes at startup and to request
+the mapping on demand when the first CPU of a node is brought up.
+
+Fixes: 7dcc37b3eff9 ("powerpc/xive: Map one IPI interrupt per node")
+Cc: stable@vger.kernel.org # v5.13
+Reported-by: Geetika Moolchandani <Geetika.Moolchandani1@ibm.com>
+Signed-off-by: Cédric Le Goater <clg@kaod.org>
+Tested-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
+Tested-by: Laurent Vivier <lvivier@redhat.com>
+Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
+Link: https://lore.kernel.org/r/20210807072057.184698-1-clg@kaod.org
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/powerpc/sysdev/xive/common.c | 35 ++++++++++++++++++++++++-----------
+ 1 file changed, 24 insertions(+), 11 deletions(-)
+
+--- a/arch/powerpc/sysdev/xive/common.c
++++ b/arch/powerpc/sysdev/xive/common.c
+@@ -67,6 +67,7 @@ static struct irq_domain *xive_irq_domai
+ static struct xive_ipi_desc {
+ unsigned int irq;
+ char name[16];
++ atomic_t started;
+ } *xive_ipis;
+
+ /*
+@@ -1120,7 +1121,7 @@ static const struct irq_domain_ops xive_
+ .alloc = xive_ipi_irq_domain_alloc,
+ };
+
+-static int __init xive_request_ipi(void)
++static int __init xive_init_ipis(void)
+ {
+ struct fwnode_handle *fwnode;
+ struct irq_domain *ipi_domain;
+@@ -1144,10 +1145,6 @@ static int __init xive_request_ipi(void)
+ struct xive_ipi_desc *xid = &xive_ipis[node];
+ struct xive_ipi_alloc_info info = { node };
+
+- /* Skip nodes without CPUs */
+- if (cpumask_empty(cpumask_of_node(node)))
+- continue;
+-
+ /*
+ * Map one IPI interrupt per node for all cpus of that node.
+ * Since the HW interrupt number doesn't have any meaning,
+@@ -1159,11 +1156,6 @@ static int __init xive_request_ipi(void)
+ xid->irq = ret;
+
+ snprintf(xid->name, sizeof(xid->name), "IPI-%d", node);
+-
+- ret = request_irq(xid->irq, xive_muxed_ipi_action,
+- IRQF_PERCPU | IRQF_NO_THREAD, xid->name, NULL);
+-
+- WARN(ret < 0, "Failed to request IPI %d: %d\n", xid->irq, ret);
+ }
+
+ return ret;
+@@ -1178,6 +1170,22 @@ out:
+ return ret;
+ }
+
++static int __init xive_request_ipi(unsigned int cpu)
++{
++ struct xive_ipi_desc *xid = &xive_ipis[early_cpu_to_node(cpu)];
++ int ret;
++
++ if (atomic_inc_return(&xid->started) > 1)
++ return 0;
++
++ ret = request_irq(xid->irq, xive_muxed_ipi_action,
++ IRQF_PERCPU | IRQF_NO_THREAD,
++ xid->name, NULL);
++
++ WARN(ret < 0, "Failed to request IPI %d: %d\n", xid->irq, ret);
++ return ret;
++}
++
+ static int xive_setup_cpu_ipi(unsigned int cpu)
+ {
+ unsigned int xive_ipi_irq = xive_ipi_cpu_to_irq(cpu);
+@@ -1192,6 +1200,9 @@ static int xive_setup_cpu_ipi(unsigned i
+ if (xc->hw_ipi != XIVE_BAD_IRQ)
+ return 0;
+
++ /* Register the IPI */
++ xive_request_ipi(cpu);
++
+ /* Grab an IPI from the backend, this will populate xc->hw_ipi */
+ if (xive_ops->get_ipi(cpu, xc))
+ return -EIO;
+@@ -1231,6 +1242,8 @@ static void xive_cleanup_cpu_ipi(unsigne
+ if (xc->hw_ipi == XIVE_BAD_IRQ)
+ return;
+
++ /* TODO: clear IPI mapping */
++
+ /* Mask the IPI */
+ xive_do_source_set_mask(&xc->ipi_data, true);
+
+@@ -1253,7 +1266,7 @@ void __init xive_smp_probe(void)
+ smp_ops->cause_ipi = xive_cause_ipi;
+
+ /* Register the IPI */
+- xive_request_ipi();
++ xive_init_ipis();
+
+ /* Allocate and setup IPI for the boot CPU */
+ xive_setup_cpu_ipi(smp_processor_id());
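
The once-per-node request that replaces the CPU-mask check can be sketched
with a plain atomic counter. Everything below is a hypothetical reduction
(node count, topology and IRQ numbers are made up); the point is only that
atomic_fetch_add() lets exactly one CPU of each node do the request_irq():

  #include <stdatomic.h>
  #include <stdio.h>

  #define NR_NODES 4

  /* Hypothetical reduction of xive_ipi_desc: one IPI per possible node. */
  struct ipi_desc {
          int irq;
          atomic_int started;
  } ipis[NR_NODES];

  static int cpu_to_node(int cpu) { return cpu / 2; } /* assumed topology */

  static void request_node_ipi(int cpu)
  {
          struct ipi_desc *xid = &ipis[cpu_to_node(cpu)];

          /* Only the first CPU of a node to come up requests the IRQ;
           * later CPUs (including hot-plugged ones) reuse it. */
          if (atomic_fetch_add(&xid->started, 1) > 0)
                  return;

          printf("cpu %d requests IPI for node %d (irq %d)\n",
                 cpu, cpu_to_node(cpu), xid->irq);
  }

  int main(void)
  {
          int node, cpu;

          /* "Startup": allocate an IPI number for every possible node,
           * whether it currently has CPUs or not. */
          for (node = 0; node < NR_NODES; node++)
                  ipis[node].irq = 100 + node;

          /* CPUs come up later, one node at a time or via hot-plug. */
          for (cpu = 0; cpu < 6; cpu++)
                  request_node_ipi(cpu);
          return 0;
  }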
genirq-timings-prevent-potential-array-overflow-in-__irq_timings_store.patch
powerpc-interrupt-fix-oops-by-not-calling-do_irq-from-timer_interrupt.patch
pci-msi-enable-and-mask-msi-x-early.patch
+pci-msi-mask-all-unused-msi-x-entries.patch
+pci-msi-enforce-that-msi-x-table-entry-is-masked-for-update.patch
+pci-msi-enforce-msi-entry-updates-to-be-visible.patch
+pci-msi-do-not-set-invalid-bits-in-msi-mask.patch
+pci-msi-correct-misleading-comments.patch
+pci-msi-use-msi_mask_irq-in-pci_msi_shutdown.patch
+pci-msi-protect-msi_desc-masked-for-multi-msi.patch
+powerpc-interrupt-do-not-call-single_step_exception-from-other-exceptions.patch
+powerpc-pseries-fix-update-of-lpar-security-flavor-after-lpm.patch
+powerpc-32s-fix-napping-restore-in-data-storage-interrupt-dsi.patch
+powerpc-smp-fix-oops-in-topology_init.patch
+powerpc-xive-do-not-skip-cpu-less-nodes-when-creating-the-ipis.patch
+powerpc-32-fix-critical-and-debug-interrupts-on-booke.patch
+efi-libstub-arm64-double-check-image-alignment-at-entry.patch
+locking-rtmutex-use-the-correct-rtmutex-debugging-config-option.patch
+kvm-vmx-use-current-vmcs-to-query-waitpkg-support-for-msr-emulation.patch
+kvm-nvmx-use-vmx_need_pf_intercept-when-deciding-if-l0-wants-a-pf.patch
+kvm-x86-mmu-don-t-leak-non-leaf-sptes-when-zapping-all-sptes.patch
+kvm-x86-mmu-protect-marking-sps-unsync-when-using-tdp-mmu-with-spinlock.patch