--- /dev/null
+From c32ac11da3f83bb42b986702a9b92f0a14ed4182 Mon Sep 17 00:00:00 2001
+From: Ard Biesheuvel <ardb@kernel.org>
+Date: Mon, 26 Jul 2021 16:31:44 +0200
+Subject: efi/libstub: arm64: Double check image alignment at entry
+
+From: Ard Biesheuvel <ardb@kernel.org>
+
+commit c32ac11da3f83bb42b986702a9b92f0a14ed4182 upstream.
+
+On arm64, the stub only moves the kernel image around in memory if
+needed, which is typically only for KASLR, given that relocatable
+kernels (the default) can run from any 64k-aligned address,
+which is also the minimum alignment communicated to EFI via the PE/COFF
+header.
+
+Unfortunately, some loaders appear to ignore this header, and load the
+kernel at some arbitrary offset in memory. We can deal with this, but
+let's check for this condition anyway, so non-compliant code can be
+spotted and fixed.
+
+Cc: <stable@vger.kernel.org> # v5.10+
+Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
+Tested-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/firmware/efi/libstub/arm64-stub.c | 4 ++++
+ 1 file changed, 4 insertions(+)
+
+--- a/drivers/firmware/efi/libstub/arm64-stub.c
++++ b/drivers/firmware/efi/libstub/arm64-stub.c
+@@ -119,6 +119,10 @@ efi_status_t handle_kernel_image(unsigne
+ if (image->image_base != _text)
+ efi_err("FIRMWARE BUG: efi_loaded_image_t::image_base has bogus value\n");
+
++ if (!IS_ALIGNED((u64)_text, EFI_KIMG_ALIGN))
++ efi_err("FIRMWARE BUG: kernel image not aligned on %ldk boundary\n",
++ EFI_KIMG_ALIGN >> 10);
++
+ kernel_size = _edata - _text;
+ kernel_memsize = kernel_size + (_end - _edata);
+ *reserve_size = kernel_memsize;
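
The check added above is a plain power-of-two alignment test. A minimal
user-space sketch of the same arithmetic, with EFI_KIMG_ALIGN and
IS_ALIGNED() re-created locally (64k assumed, as in the changelog) rather
than taken from the kernel headers:

  #include <stdint.h>
  #include <stdio.h>

  /* Local stand-ins for the kernel definitions; 64k alignment assumed. */
  #define EFI_KIMG_ALIGN   (64 * 1024UL)
  #define IS_ALIGNED(x, a) (((x) & ((a) - 1)) == 0)

  int main(void)
  {
          uint64_t good = 0x40200000; /* 64k-aligned load address */
          uint64_t bad  = 0x40201200; /* offset a non-compliant loader might pick */

          printf("%#llx aligned: %d\n", (unsigned long long)good,
                 (int)IS_ALIGNED(good, EFI_KIMG_ALIGN));
          printf("%#llx aligned: %d  <- would trigger the FIRMWARE BUG message\n",
                 (unsigned long long)bad, (int)IS_ALIGNED(bad, EFI_KIMG_ALIGN));
          return 0;
  }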
--- /dev/null
+From 18712c13709d2de9516c5d3414f707c4f0a9c190 Mon Sep 17 00:00:00 2001
+From: Sean Christopherson <seanjc@google.com>
+Date: Wed, 11 Aug 2021 21:56:15 -0700
+Subject: KVM: nVMX: Use vmx_need_pf_intercept() when deciding if L0 wants a #PF
+
+From: Sean Christopherson <seanjc@google.com>
+
+commit 18712c13709d2de9516c5d3414f707c4f0a9c190 upstream.
+
+Use vmx_need_pf_intercept() when determining if L0 wants to handle a #PF
+in L2 or if the VM-Exit should be forwarded to L1. The current logic fails
+to account for the case where #PF is intercepted to handle
+guest.MAXPHYADDR < host.MAXPHYADDR and ends up reflecting all #PFs into
+L1. At best, L1 will complain and inject the #PF back into L2. At
+worst, L1 will eat the unexpected fault and cause L2 to hang on infinite
+page faults.
+
+Note, while the bug was technically introduced by the commit that added
+support for the MAXPHYADDR madness, the shame is all on commit
+a0c134347baf ("KVM: VMX: introduce vmx_need_pf_intercept").
+
+Fixes: 1dbf5d68af6f ("KVM: VMX: Add guest physical address check in EPT violation and misconfig")
+Cc: stable@vger.kernel.org
+Cc: Peter Shier <pshier@google.com>
+Cc: Oliver Upton <oupton@google.com>
+Cc: Jim Mattson <jmattson@google.com>
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Message-Id: <20210812045615.3167686-1-seanjc@google.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/vmx/nested.c | 3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+--- a/arch/x86/kvm/vmx/nested.c
++++ b/arch/x86/kvm/vmx/nested.c
+@@ -5798,7 +5798,8 @@ static bool nested_vmx_l0_wants_exit(str
+ if (is_nmi(intr_info))
+ return true;
+ else if (is_page_fault(intr_info))
+- return vcpu->arch.apf.host_apf_flags || !enable_ept;
++ return vcpu->arch.apf.host_apf_flags ||
++ vmx_need_pf_intercept(vcpu);
+ else if (is_debug(intr_info) &&
+ vcpu->guest_debug &
+ (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
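
To make the change in the forwarding decision concrete, here is a hedged,
self-contained sketch of the predicate. The struct and both helpers are
hypothetical stand-ins for the KVM internals, reduced to the conditions
named in the changelog:

  #include <stdbool.h>
  #include <stdio.h>

  /* Hypothetical, flattened view of the vCPU state relevant here. */
  struct pf_state {
          bool host_apf_flags;         /* host async #PF pending */
          bool enable_ept;             /* EPT in use */
          bool guest_maxphyaddr_small; /* guest.MAXPHYADDR < host.MAXPHYADDR */
  };

  /* Mirrors the idea of vmx_need_pf_intercept(): #PF must be intercepted
   * either without EPT or to emulate the smaller guest MAXPHYADDR. */
  static bool need_pf_intercept(const struct pf_state *s)
  {
          return !s->enable_ept || s->guest_maxphyaddr_small;
  }

  /* Old check: only "!enable_ept", so the MAXPHYADDR case was reflected
   * into L1 even though L0 wanted it. */
  static bool l0_wants_pf_old(const struct pf_state *s)
  {
          return s->host_apf_flags || !s->enable_ept;
  }

  static bool l0_wants_pf_new(const struct pf_state *s)
  {
          return s->host_apf_flags || need_pf_intercept(s);
  }

  int main(void)
  {
          /* EPT enabled, guest MAXPHYADDR smaller than the host's. */
          struct pf_state s = { false, true, true };

          printf("old: %d, new: %d\n", l0_wants_pf_old(&s), l0_wants_pf_new(&s));
          return 0;
  }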
--- /dev/null
+From 7b9cae027ba3aaac295ae23a62f47876ed97da73 Mon Sep 17 00:00:00 2001
+From: Sean Christopherson <seanjc@google.com>
+Date: Tue, 10 Aug 2021 10:19:49 -0700
+Subject: KVM: VMX: Use current VMCS to query WAITPKG support for MSR emulation
+
+From: Sean Christopherson <seanjc@google.com>
+
+commit 7b9cae027ba3aaac295ae23a62f47876ed97da73 upstream.
+
+Use the secondary_exec_controls_get() accessor in vmx_has_waitpkg() to
+effectively get the controls for the current VMCS, as opposed to using
+vmx->secondary_exec_controls, which is the cached value of KVM's desired
+controls for vmcs01 and truly not reflective of any particular VMCS.
+
+While the waitpkg control is not dynamic, i.e. vmcs01 will always hold
+the same waitpkg configuration as vmx->secondary_exec_controls, the same
+does not hold true for vmcs02 if the L1 VMM hides the feature from L2.
+If L1 hides the feature _and_ does not intercept MSR_IA32_UMWAIT_CONTROL,
+L2 could incorrectly read/write L1's virtual MSR instead of taking a #GP.
+
+Fixes: 6e3ba4abcea5 ("KVM: vmx: Emulate MSR IA32_UMWAIT_CONTROL")
+Cc: stable@vger.kernel.org
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Message-Id: <20210810171952.2758100-2-seanjc@google.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/vmx/vmx.h | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/arch/x86/kvm/vmx/vmx.h
++++ b/arch/x86/kvm/vmx/vmx.h
+@@ -538,7 +538,7 @@ static inline void decache_tsc_multiplie
+
+ static inline bool vmx_has_waitpkg(struct vcpu_vmx *vmx)
+ {
+- return vmx->secondary_exec_control &
++ return secondary_exec_controls_get(vmx) &
+ SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE;
+ }
+
--- /dev/null
+From 524a1e4e381fc5e7781008d5bd420fd1357c0113 Mon Sep 17 00:00:00 2001
+From: Sean Christopherson <seanjc@google.com>
+Date: Thu, 12 Aug 2021 11:14:13 -0700
+Subject: KVM: x86/mmu: Don't leak non-leaf SPTEs when zapping all SPTEs
+
+From: Sean Christopherson <seanjc@google.com>
+
+commit 524a1e4e381fc5e7781008d5bd420fd1357c0113 upstream.
+
+Pass "all ones" as the end GFN to signal "zap all" for the TDP MMU and
+really zap all SPTEs in this case. As is, zap_gfn_range() skips non-leaf
+SPTEs whose range exceeds the range to be zapped. If shadow_phys_bits is
+not aligned to the range size of top-level SPTEs, e.g. 512gb with 4-level
+paging, the "zap all" flows will skip top-level SPTEs whose range extends
+beyond shadow_phys_bits and leak their SPs when the VM is destroyed.
+
+Use the current upper bound (based on host.MAXPHYADDR) to detect that the
+caller wants to zap all SPTEs, e.g. instead of using the max theoretical
+gfn, 1 << (52 - 12). The more precise upper bound allows the TDP iterator
+to terminate its walk earlier when running on hosts with MAXPHYADDR < 52.
+
+Add a WARN on kvm->arch.tdp_mmu_pages when the TDP MMU is destroyed to
+help future debuggers should KVM decide to leak SPTEs again.
+
+The bug is most easily reproduced by running (and unloading!) KVM in a
+VM whose host.MAXPHYADDR < 39, as the SPTE for gfn=0 will be skipped.
+
+ =============================================================================
+ BUG kvm_mmu_page_header (Not tainted): Objects remaining in kvm_mmu_page_header on __kmem_cache_shutdown()
+ -----------------------------------------------------------------------------
+ Slab 0x000000004d8f7af1 objects=22 used=2 fp=0x00000000624d29ac flags=0x4000000000000200(slab|zone=1)
+ CPU: 0 PID: 1582 Comm: rmmod Not tainted 5.14.0-rc2+ #420
+ Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015
+ Call Trace:
+ dump_stack_lvl+0x45/0x59
+ slab_err+0x95/0xc9
+ __kmem_cache_shutdown.cold+0x3c/0x158
+ kmem_cache_destroy+0x3d/0xf0
+ kvm_mmu_module_exit+0xa/0x30 [kvm]
+ kvm_arch_exit+0x5d/0x90 [kvm]
+ kvm_exit+0x78/0x90 [kvm]
+ vmx_exit+0x1a/0x50 [kvm_intel]
+ __x64_sys_delete_module+0x13f/0x220
+ do_syscall_64+0x3b/0xc0
+ entry_SYSCALL_64_after_hwframe+0x44/0xae
+
+Fixes: faaf05b00aec ("kvm: x86/mmu: Support zapping SPTEs in the TDP MMU")
+Cc: stable@vger.kernel.org
+Cc: Ben Gardon <bgardon@google.com>
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Message-Id: <20210812181414.3376143-2-seanjc@google.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/mmu/tdp_mmu.c | 26 ++++++++++++++++----------
+ 1 file changed, 16 insertions(+), 10 deletions(-)
+
+--- a/arch/x86/kvm/mmu/tdp_mmu.c
++++ b/arch/x86/kvm/mmu/tdp_mmu.c
+@@ -41,6 +41,7 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *
+ if (!kvm->arch.tdp_mmu_enabled)
+ return;
+
++ WARN_ON(!list_empty(&kvm->arch.tdp_mmu_pages));
+ WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
+
+ /*
+@@ -79,8 +80,6 @@ static void tdp_mmu_free_sp_rcu_callback
+ void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
+ bool shared)
+ {
+- gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
+-
+ kvm_lockdep_assert_mmu_lock_held(kvm, shared);
+
+ if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
+@@ -92,7 +91,7 @@ void kvm_tdp_mmu_put_root(struct kvm *kv
+ list_del_rcu(&root->link);
+ spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
+
+- zap_gfn_range(kvm, root, 0, max_gfn, false, false, shared);
++ zap_gfn_range(kvm, root, 0, -1ull, false, false, shared);
+
+ call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
+ }
+@@ -722,8 +721,17 @@ static bool zap_gfn_range(struct kvm *kv
+ gfn_t start, gfn_t end, bool can_yield, bool flush,
+ bool shared)
+ {
++ gfn_t max_gfn_host = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
++ bool zap_all = (start == 0 && end >= max_gfn_host);
+ struct tdp_iter iter;
+
++ /*
++ * Bound the walk at host.MAXPHYADDR, guest accesses beyond that will
++ * hit a #PF(RSVD) and never get to an EPT Violation/Misconfig / #NPF,
++ * and so KVM will never install a SPTE for such addresses.
++ */
++ end = min(end, max_gfn_host);
++
+ kvm_lockdep_assert_mmu_lock_held(kvm, shared);
+
+ rcu_read_lock();
+@@ -742,9 +750,10 @@ retry:
+ /*
+ * If this is a non-last-level SPTE that covers a larger range
+ * than should be zapped, continue, and zap the mappings at a
+- * lower level.
++ * lower level, except when zapping all SPTEs.
+ */
+- if ((iter.gfn < start ||
++ if (!zap_all &&
++ (iter.gfn < start ||
+ iter.gfn + KVM_PAGES_PER_HPAGE(iter.level) > end) &&
+ !is_last_spte(iter.old_spte, iter.level))
+ continue;
+@@ -792,12 +801,11 @@ bool __kvm_tdp_mmu_zap_gfn_range(struct
+
+ void kvm_tdp_mmu_zap_all(struct kvm *kvm)
+ {
+- gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
+ bool flush = false;
+ int i;
+
+ for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
+- flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, 0, max_gfn,
++ flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, 0, -1ull,
+ flush, false);
+
+ if (flush)
+@@ -836,7 +844,6 @@ static struct kvm_mmu_page *next_invalid
+ */
+ void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
+ {
+- gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
+ struct kvm_mmu_page *next_root;
+ struct kvm_mmu_page *root;
+ bool flush = false;
+@@ -852,8 +859,7 @@ void kvm_tdp_mmu_zap_invalidated_roots(s
+
+ rcu_read_unlock();
+
+- flush = zap_gfn_range(kvm, root, 0, max_gfn, true, flush,
+- true);
++ flush = zap_gfn_range(kvm, root, 0, -1ull, true, flush, true);
+
+ /*
+ * Put the reference acquired in
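
The gfn arithmetic behind the fix is easy to show in isolation. A small
user-space sketch, with PAGE_SHIFT and shadow_phys_bits as assumed example
values rather than values read from a live host:

  #include <stdint.h>
  #include <stdio.h>

  #define PAGE_SHIFT 12
  typedef uint64_t gfn_t;

  /* Assumed example value; on real hardware this comes from CPUID. */
  static const unsigned int shadow_phys_bits = 46;

  static void show_zap_range(gfn_t start, gfn_t end)
  {
          gfn_t max_gfn_host = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
          int zap_all = (start == 0 && end >= max_gfn_host);

          /* Clamp the walk at host.MAXPHYADDR; no SPTE can exist above it. */
          if (end > max_gfn_host)
                  end = max_gfn_host;

          printf("start=%#llx end=%#llx zap_all=%d\n",
                 (unsigned long long)start, (unsigned long long)end, zap_all);
  }

  int main(void)
  {
          show_zap_range(0, (gfn_t)-1ull);      /* new "zap all" callers */
          show_zap_range(0, 1ULL << (52 - 12)); /* old max theoretical gfn */
          show_zap_range(0x1000, 0x2000);       /* ordinary partial zap */
          return 0;
  }

With -1ull as the end, zap_all is detected and the non-leaf skip in the
iterator loop is bypassed, which is what keeps the out-of-range top-level
SPTEs from being leaked.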
--- /dev/null
+From ce25681d59ffc4303321e555a2d71b1946af07da Mon Sep 17 00:00:00 2001
+From: Sean Christopherson <seanjc@google.com>
+Date: Thu, 12 Aug 2021 11:18:15 -0700
+Subject: KVM: x86/mmu: Protect marking SPs unsync when using TDP MMU with spinlock
+
+From: Sean Christopherson <seanjc@google.com>
+
+commit ce25681d59ffc4303321e555a2d71b1946af07da upstream.
+
+Add yet another spinlock for the TDP MMU and take it when marking indirect
+shadow pages unsync. When using the TDP MMU and L1 is running L2(s) with
+nested TDP, KVM may encounter shadow pages for the TDP entries managed by
+L1 (controlling L2) when handling a TDP MMU page fault. The unsync logic
+is not thread safe, e.g. the kvm_mmu_page fields are not atomic, and
+misbehaves when a shadow page is marked unsync via a TDP MMU page fault,
+which runs with mmu_lock held for read, not write.
+
+Lack of a critical section manifests most visibly as an underflow of
+unsync_children in clear_unsync_child_bit() due to unsync_children being
+corrupted when multiple CPUs write it without a critical section and
+without atomic operations. But underflow is the best case scenario. The
+worst case scenario is that unsync_children prematurely hits '0' and
+leads to guest memory corruption due to KVM neglecting to properly sync
+shadow pages.
+
+Use an entirely new spinlock even though piggybacking tdp_mmu_pages_lock
+would functionally be ok. Usurping the lock could degrade performance when
+building upper level page tables on different vCPUs, especially since the
+unsync flow could hold the lock for a comparatively long time depending on
+the number of indirect shadow pages and the depth of the paging tree.
+
+For simplicity, take the lock for all MMUs, even though KVM could fairly
+easily know that mmu_lock is held for write. If mmu_lock is held for
+write, there cannot be contention for the inner spinlock, and marking
+shadow pages unsync across multiple vCPUs will be slow enough that
+bouncing the kvm_arch cacheline should be in the noise.
+
+Note, even though L2 could theoretically be given access to its own EPT
+entries, a nested MMU must hold mmu_lock for write and thus cannot race
+against a TDP MMU page fault. I.e. the additional spinlock only _needs_ to
+be taken by the TDP MMU, as opposed to being taken by any MMU for a VM
+that is running with the TDP MMU enabled. Holding mmu_lock for read also
+prevents the indirect shadow page from being freed. But as above, keep
+it simple and always take the lock.
+
+Alternative #1, the TDP MMU could simply pass "false" for can_unsync and
+effectively disable unsync behavior for nested TDP. Write protecting leaf
+shadow pages is unlikely to noticeably impact traditional L1 VMMs, as such
+VMMs typically don't modify TDP entries, but the same may not hold true for
+non-standard use cases and/or VMMs that are migrating physical pages (from
+L1's perspective).
+
+Alternative #2, the unsync logic could be made thread safe. In theory,
+simply converting all relevant kvm_mmu_page fields to atomics and using
+atomic bitops for the bitmap would suffice. However, (a) an in-depth audit
+would be required, (b) the code churn would be substantial, and (c) legacy
+shadow paging would incur additional atomic operations in performance
+sensitive paths for no benefit (to legacy shadow paging).
+
+Fixes: a2855afc7ee8 ("KVM: x86/mmu: Allow parallel page faults for the TDP MMU")
+Cc: stable@vger.kernel.org
+Cc: Ben Gardon <bgardon@google.com>
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Message-Id: <20210812181815.3378104-1-seanjc@google.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ Documentation/virt/kvm/locking.rst | 8 ++++----
+ arch/x86/include/asm/kvm_host.h | 7 +++++++
+ arch/x86/kvm/mmu/mmu.c | 28 ++++++++++++++++++++++++++++
+ 3 files changed, 39 insertions(+), 4 deletions(-)
+
+--- a/Documentation/virt/kvm/locking.rst
++++ b/Documentation/virt/kvm/locking.rst
+@@ -20,10 +20,10 @@ On x86:
+
+ - vcpu->mutex is taken outside kvm->arch.hyperv.hv_lock
+
+-- kvm->arch.mmu_lock is an rwlock. kvm->arch.tdp_mmu_pages_lock is
+- taken inside kvm->arch.mmu_lock, and cannot be taken without already
+- holding kvm->arch.mmu_lock (typically with ``read_lock``, otherwise
+- there's no need to take kvm->arch.tdp_mmu_pages_lock at all).
++- kvm->arch.mmu_lock is an rwlock. kvm->arch.tdp_mmu_pages_lock and
++ kvm->arch.mmu_unsync_pages_lock are taken inside kvm->arch.mmu_lock, and
++ cannot be taken without already holding kvm->arch.mmu_lock (typically with
++ ``read_lock`` for the TDP MMU, thus the need for additional spinlocks).
+
+ Everything else is a leaf: no other lock is taken inside the critical
+ sections.
+--- a/arch/x86/include/asm/kvm_host.h
++++ b/arch/x86/include/asm/kvm_host.h
+@@ -987,6 +987,13 @@ struct kvm_arch {
+ struct list_head lpage_disallowed_mmu_pages;
+ struct kvm_page_track_notifier_node mmu_sp_tracker;
+ struct kvm_page_track_notifier_head track_notifier_head;
++ /*
++ * Protects marking pages unsync during page faults, as TDP MMU page
++ * faults only take mmu_lock for read. For simplicity, the unsync
++ * pages lock is always taken when marking pages unsync regardless of
++ * whether mmu_lock is held for read or write.
++ */
++ spinlock_t mmu_unsync_pages_lock;
+
+ struct list_head assigned_dev_head;
+ struct iommu_domain *iommu_domain;
+--- a/arch/x86/kvm/mmu/mmu.c
++++ b/arch/x86/kvm/mmu/mmu.c
+@@ -2454,6 +2454,7 @@ bool mmu_need_write_protect(struct kvm_v
+ bool can_unsync)
+ {
+ struct kvm_mmu_page *sp;
++ bool locked = false;
+
+ if (kvm_page_track_is_active(vcpu, gfn, KVM_PAGE_TRACK_WRITE))
+ return true;
+@@ -2465,9 +2466,34 @@ bool mmu_need_write_protect(struct kvm_v
+ if (sp->unsync)
+ continue;
+
++ /*
++ * TDP MMU page faults require an additional spinlock as they
++ * run with mmu_lock held for read, not write, and the unsync
++		 * logic is not thread safe. Take the spinlock regardless of
++ * the MMU type to avoid extra conditionals/parameters, there's
++ * no meaningful penalty if mmu_lock is held for write.
++ */
++ if (!locked) {
++ locked = true;
++ spin_lock(&vcpu->kvm->arch.mmu_unsync_pages_lock);
++
++ /*
++ * Recheck after taking the spinlock, a different vCPU
++ * may have since marked the page unsync. A false
++ * positive on the unprotected check above is not
++ * possible as clearing sp->unsync _must_ hold mmu_lock
++ * for write, i.e. unsync cannot transition from 0->1
++ * while this CPU holds mmu_lock for read (or write).
++ */
++ if (READ_ONCE(sp->unsync))
++ continue;
++ }
++
+ WARN_ON(sp->role.level != PG_LEVEL_4K);
+ kvm_unsync_page(vcpu, sp);
+ }
++ if (locked)
++ spin_unlock(&vcpu->kvm->arch.mmu_unsync_pages_lock);
+
+ /*
+ * We need to ensure that the marking of unsync pages is visible
+@@ -5514,6 +5540,8 @@ void kvm_mmu_init_vm(struct kvm *kvm)
+ {
+ struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
+
++ spin_lock_init(&kvm->arch.mmu_unsync_pages_lock);
++
+ kvm_mmu_init_tdp_mmu(kvm);
+
+ node->track_write = kvm_mmu_pte_write;
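
The locking added here is a check / lock / re-check pattern. A hedged,
stand-alone sketch using pthreads; kvm_mmu_page is reduced to a hypothetical
struct with just the one field that matters, and the lock is taken per page
here for brevity, whereas the kernel hunk takes it once and holds it across
the whole loop:

  #include <pthread.h>
  #include <stdbool.h>
  #include <stdio.h>

  /* Hypothetical reduction of kvm_mmu_page to the relevant state. */
  struct shadow_page {
          volatile bool unsync;
  };

  static pthread_spinlock_t unsync_lock;

  static void mark_unsync(struct shadow_page *sp)
  {
          /* Cheap unlocked check first, as in mmu_need_write_protect(). */
          if (sp->unsync)
                  return;

          pthread_spin_lock(&unsync_lock);

          /* Re-check under the lock: another thread may have won the race. */
          if (!sp->unsync) {
                  /* ...the kvm_unsync_page() work would go here... */
                  sp->unsync = true;
          }

          pthread_spin_unlock(&unsync_lock);
  }

  int main(void)
  {
          struct shadow_page sp = { .unsync = false };

          pthread_spin_init(&unsync_lock, PTHREAD_PROCESS_PRIVATE);
          mark_unsync(&sp);
          printf("unsync=%d\n", sp.unsync);
          pthread_spin_destroy(&unsync_lock);
          return 0;
  }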
--- /dev/null
+From 07d25971b220e477eb019fcb520a9f2e3ac966af Mon Sep 17 00:00:00 2001
+From: Zhen Lei <thunder.leizhen@huawei.com>
+Date: Sat, 31 Jul 2021 20:30:11 +0800
+Subject: locking/rtmutex: Use the correct rtmutex debugging config option
+
+From: Zhen Lei <thunder.leizhen@huawei.com>
+
+commit 07d25971b220e477eb019fcb520a9f2e3ac966af upstream.
+
+It's CONFIG_DEBUG_RT_MUTEXES not CONFIG_DEBUG_RT_MUTEX.
+
+Fixes: f7efc4799f81 ("locking/rtmutex: Inline chainwalk depth check")
+Signed-off-by: Zhen Lei <thunder.leizhen@huawei.com>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Acked-by: Will Deacon <will@kernel.org>
+Acked-by: Boqun Feng <boqun.feng@gmail.com>
+Cc: stable@vger.kernel.org
+Link: https://lore.kernel.org/r/20210731123011.4555-1-thunder.leizhen@huawei.com
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/locking/rtmutex.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/kernel/locking/rtmutex.c
++++ b/kernel/locking/rtmutex.c
+@@ -343,7 +343,7 @@ static __always_inline bool
+ rt_mutex_cond_detect_deadlock(struct rt_mutex_waiter *waiter,
+ enum rtmutex_chainwalk chwalk)
+ {
+- if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEX))
++ if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES))
+ return waiter != NULL;
+ return chwalk == RT_MUTEX_FULL_CHAINWALK;
+ }
--- /dev/null
+From 689e6b5351573c38ccf92a0dd8b3e2c2241e4aff Mon Sep 17 00:00:00 2001
+From: Thomas Gleixner <tglx@linutronix.de>
+Date: Thu, 29 Jul 2021 23:51:45 +0200
+Subject: PCI/MSI: Correct misleading comments
+
+From: Thomas Gleixner <tglx@linutronix.de>
+
+commit 689e6b5351573c38ccf92a0dd8b3e2c2241e4aff upstream.
+
+The comments about preserving the cached state in pci_msi[x]_shutdown() are
+misleading as the MSI descriptors are freed right after those functions
+return. So there is nothing to restore. Preparatory change.
+
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Tested-by: Marc Zyngier <maz@kernel.org>
+Reviewed-by: Marc Zyngier <maz@kernel.org>
+Cc: stable@vger.kernel.org
+Link: https://lore.kernel.org/r/20210729222542.621609423@linutronix.de
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/pci/msi.c | 5 +----
+ 1 file changed, 1 insertion(+), 4 deletions(-)
+
+--- a/drivers/pci/msi.c
++++ b/drivers/pci/msi.c
+@@ -961,7 +961,6 @@ static void pci_msi_shutdown(struct pci_
+
+ /* Return the device with MSI unmasked as initial states */
+ mask = msi_mask(desc->msi_attrib.multi_cap);
+- /* Keep cached state to be restored */
+ __pci_msi_desc_mask_irq(desc, mask, 0);
+
+ /* Restore dev->irq to its default pin-assertion IRQ */
+@@ -1047,10 +1046,8 @@ static void pci_msix_shutdown(struct pci
+ }
+
+ /* Return the device with MSI-X masked as initial states */
+- for_each_pci_msi_entry(entry, dev) {
+- /* Keep cached states to be restored */
++ for_each_pci_msi_entry(entry, dev)
+ __pci_msix_desc_mask_irq(entry, 1);
+- }
+
+ pci_msix_clear_and_set_ctrl(dev, PCI_MSIX_FLAGS_ENABLE, 0);
+ pci_intx_for_msi(dev, 1);
--- /dev/null
+From 361fd37397f77578735907341579397d5bed0a2d Mon Sep 17 00:00:00 2001
+From: Thomas Gleixner <tglx@linutronix.de>
+Date: Thu, 29 Jul 2021 23:51:44 +0200
+Subject: PCI/MSI: Do not set invalid bits in MSI mask
+
+From: Thomas Gleixner <tglx@linutronix.de>
+
+commit 361fd37397f77578735907341579397d5bed0a2d upstream.
+
+msi_mask_irq() takes a mask and a flags argument. The mask argument is used
+to mask out bits from the cached mask and the flags argument to set bits.
+
+Some places invoke it with a flags argument that sets bits which are not
+used by the device, e.g. when the device supports up to 8 vectors a full
+unmask ends up setting the mask to 0xFFFFFF00. While devices probably
+do not care, it's still bad practice.
+
+Fixes: 7ba1930db02f ("PCI MSI: Unmask MSI if setup failed")
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Tested-by: Marc Zyngier <maz@kernel.org>
+Reviewed-by: Marc Zyngier <maz@kernel.org>
+Cc: stable@vger.kernel.org
+Link: https://lore.kernel.org/r/20210729222542.568173099@linutronix.de
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/pci/msi.c | 8 ++++----
+ 1 file changed, 4 insertions(+), 4 deletions(-)
+
+--- a/drivers/pci/msi.c
++++ b/drivers/pci/msi.c
+@@ -656,21 +656,21 @@ static int msi_capability_init(struct pc
+ /* Configure MSI capability structure */
+ ret = pci_msi_setup_msi_irqs(dev, nvec, PCI_CAP_ID_MSI);
+ if (ret) {
+- msi_mask_irq(entry, mask, ~mask);
++ msi_mask_irq(entry, mask, 0);
+ free_msi_irqs(dev);
+ return ret;
+ }
+
+ ret = msi_verify_entries(dev);
+ if (ret) {
+- msi_mask_irq(entry, mask, ~mask);
++ msi_mask_irq(entry, mask, 0);
+ free_msi_irqs(dev);
+ return ret;
+ }
+
+ ret = populate_msi_sysfs(dev);
+ if (ret) {
+- msi_mask_irq(entry, mask, ~mask);
++ msi_mask_irq(entry, mask, 0);
+ free_msi_irqs(dev);
+ return ret;
+ }
+@@ -962,7 +962,7 @@ static void pci_msi_shutdown(struct pci_
+ /* Return the device with MSI unmasked as initial states */
+ mask = msi_mask(desc->msi_attrib.multi_cap);
+ /* Keep cached state to be restored */
+- __pci_msi_desc_mask_irq(desc, mask, ~mask);
++ __pci_msi_desc_mask_irq(desc, mask, 0);
+
+ /* Restore dev->irq to its default pin-assertion IRQ */
+ dev->irq = desc->msi_attrib.default_irq;
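
The 0xFFFFFF00 value in the changelog falls straight out of the cached-mask
update rule. A quick stand-alone sketch, with the update rule re-created
locally for illustration:

  #include <stdint.h>
  #include <stdio.h>

  /* Local copy of the rule: clear the "mask" bits, then set the "flag" bits. */
  static uint32_t update_masked(uint32_t cached, uint32_t mask, uint32_t flag)
  {
          cached &= ~mask;
          cached |= flag;
          return cached;
  }

  int main(void)
  {
          /* Device supporting 8 vectors: the valid mask bits are 0x000000FF. */
          uint32_t mask = 0xFF;
          uint32_t cached = 0;

          printf("old full unmask: %#010x\n", update_masked(cached, mask, ~mask));
          printf("new full unmask: %#010x\n", update_masked(cached, mask, 0));
          return 0;
  }

The old calls passed ~mask as the flag, which sets the 24 bits the device
does not implement; passing 0 leaves them alone.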
--- /dev/null
+From b9255a7cb51754e8d2645b65dd31805e282b4f3e Mon Sep 17 00:00:00 2001
+From: Thomas Gleixner <tglx@linutronix.de>
+Date: Thu, 29 Jul 2021 23:51:43 +0200
+Subject: PCI/MSI: Enforce MSI[X] entry updates to be visible
+
+From: Thomas Gleixner <tglx@linutronix.de>
+
+commit b9255a7cb51754e8d2645b65dd31805e282b4f3e upstream.
+
+Nothing enforces the posted writes to be visible when the function
+returns. Flush them even if the flush might be redundant when the entry is
+already masked, as the unmask will flush as well. This is either setup or a
+rare affinity change event, so the extra flush is not the end of the world.
+
+While this is mostly a theoretical issue, the logic in the x86-specific
+msi_set_affinity() function in particular relies on the assumption that the
+update has reached the hardware when the function returns.
+
+Again, as this never has been enforced the Fixes tag refers to a commit in:
+ git://git.kernel.org/pub/scm/linux/kernel/git/tglx/history.git
+
+Fixes: f036d4ea5fa7 ("[PATCH] ia32 Message Signalled Interrupt support")
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Tested-by: Marc Zyngier <maz@kernel.org>
+Reviewed-by: Marc Zyngier <maz@kernel.org>
+Acked-by: Bjorn Helgaas <bhelgaas@google.com>
+Cc: stable@vger.kernel.org
+Link: https://lore.kernel.org/r/20210729222542.515188147@linutronix.de
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/pci/msi.c | 5 +++++
+ 1 file changed, 5 insertions(+)
+
+--- a/drivers/pci/msi.c
++++ b/drivers/pci/msi.c
+@@ -311,6 +311,9 @@ void __pci_write_msi_msg(struct msi_desc
+
+ if (unmasked)
+ __pci_msix_desc_mask_irq(entry, 0);
++
++ /* Ensure that the writes are visible in the device */
++ readl(base + PCI_MSIX_ENTRY_DATA);
+ } else {
+ int pos = dev->msi_cap;
+ u16 msgctl;
+@@ -331,6 +334,8 @@ void __pci_write_msi_msg(struct msi_desc
+ pci_write_config_word(dev, pos + PCI_MSI_DATA_32,
+ msg->data);
+ }
++ /* Ensure that the writes are visible in the device */
++ pci_read_config_word(dev, pos + PCI_MSI_FLAGS, &msgctl);
+ }
+
+ skip:
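
The read-back added in both branches is the usual way to flush posted MMIO
writes. A hedged sketch of the pattern only, against plain memory instead of
a mapped BAR, so the volatile accesses merely stand in for readl()/writel():

  #include <stdint.h>
  #include <stdio.h>

  /* Stand-in for one MSI-X table entry; real code maps this from a BAR. */
  struct msix_entry_regs {
          uint32_t addr_lo;
          uint32_t addr_hi;
          uint32_t data;
          uint32_t vector_ctrl;
  };

  static void write_msg(volatile struct msix_entry_regs *e,
                        uint32_t lo, uint32_t hi, uint32_t data)
  {
          e->addr_lo = lo;   /* writel(msg->address_lo, ...) */
          e->addr_hi = hi;   /* writel(msg->address_hi, ...) */
          e->data    = data; /* writel(msg->data, ...) */

          /* Read something back so the posted writes are known to have
           * reached the device before the caller relies on them. */
          (void)e->data;     /* readl(base + PCI_MSIX_ENTRY_DATA) */
  }

  int main(void)
  {
          struct msix_entry_regs entry = { 0 };

          write_msg(&entry, 0xfee00000, 0, 0x4041);
          printf("data=%#x\n", entry.data);
          return 0;
  }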
--- /dev/null
+From da181dc974ad667579baece33c2c8d2d1e4558d5 Mon Sep 17 00:00:00 2001
+From: Thomas Gleixner <tglx@linutronix.de>
+Date: Thu, 29 Jul 2021 23:51:42 +0200
+Subject: PCI/MSI: Enforce that MSI-X table entry is masked for update
+
+From: Thomas Gleixner <tglx@linutronix.de>
+
+commit da181dc974ad667579baece33c2c8d2d1e4558d5 upstream.
+
+The specification (PCIe r5.0, sec 6.1.4.5) states:
+
+ For MSI-X, a function is permitted to cache Address and Data values
+ from unmasked MSI-X Table entries. However, anytime software unmasks a
+ currently masked MSI-X Table entry either by clearing its Mask bit or
+ by clearing the Function Mask bit, the function must update any Address
+ or Data values that it cached from that entry. If software changes the
+ Address or Data value of an entry while the entry is unmasked, the
+ result is undefined.
+
+The Linux kernel's MSI-X support never enforced that the entry is masked
+before the entry is modified hence the Fixes tag refers to a commit in:
+ git://git.kernel.org/pub/scm/linux/kernel/git/tglx/history.git
+
+Enforce the entry to be masked across the update.
+
+There is no point in requiring this to be handled at all possible call
+sites, as that would just duplicate code; the common update
+function is the obvious place to enforce it.
+
+Fixes: f036d4ea5fa7 ("[PATCH] ia32 Message Signalled Interrupt support")
+Reported-by: Kevin Tian <kevin.tian@intel.com>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Tested-by: Marc Zyngier <maz@kernel.org>
+Reviewed-by: Marc Zyngier <maz@kernel.org>
+Acked-by: Bjorn Helgaas <bhelgaas@google.com>
+Cc: stable@vger.kernel.org
+Link: https://lore.kernel.org/r/20210729222542.462096385@linutronix.de
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/pci/msi.c | 15 +++++++++++++++
+ 1 file changed, 15 insertions(+)
+
+--- a/drivers/pci/msi.c
++++ b/drivers/pci/msi.c
+@@ -289,13 +289,28 @@ void __pci_write_msi_msg(struct msi_desc
+ /* Don't touch the hardware now */
+ } else if (entry->msi_attrib.is_msix) {
+ void __iomem *base = pci_msix_desc_addr(entry);
++ bool unmasked = !(entry->masked & PCI_MSIX_ENTRY_CTRL_MASKBIT);
+
+ if (!base)
+ goto skip;
+
++ /*
++ * The specification mandates that the entry is masked
++ * when the message is modified:
++ *
++ * "If software changes the Address or Data value of an
++ * entry while the entry is unmasked, the result is
++ * undefined."
++ */
++ if (unmasked)
++ __pci_msix_desc_mask_irq(entry, PCI_MSIX_ENTRY_CTRL_MASKBIT);
++
+ writel(msg->address_lo, base + PCI_MSIX_ENTRY_LOWER_ADDR);
+ writel(msg->address_hi, base + PCI_MSIX_ENTRY_UPPER_ADDR);
+ writel(msg->data, base + PCI_MSIX_ENTRY_DATA);
++
++ if (unmasked)
++ __pci_msix_desc_mask_irq(entry, 0);
+ } else {
+ int pos = dev->msi_cap;
+ u16 msgctl;
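
A compact sketch of the mask-around-update sequence the hunk implements,
against a hypothetical in-memory entry: PCI_MSIX_ENTRY_CTRL_MASKBIT is
copied locally, and the real code checks the cached entry->masked value and
uses readl()/writel() on the mapped table:

  #include <stdbool.h>
  #include <stdint.h>
  #include <stdio.h>

  #define PCI_MSIX_ENTRY_CTRL_MASKBIT 0x1u /* local copy of the constant */

  struct fake_entry {
          uint32_t vector_ctrl;
          uint32_t addr_lo, addr_hi, data;
  };

  static void update_entry(struct fake_entry *e, uint32_t lo, uint32_t hi,
                           uint32_t data)
  {
          bool unmasked = !(e->vector_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT);

          /* Spec: changing address/data of an unmasked entry is undefined,
           * so mask it for the duration of the update. */
          if (unmasked)
                  e->vector_ctrl |= PCI_MSIX_ENTRY_CTRL_MASKBIT;

          e->addr_lo = lo;
          e->addr_hi = hi;
          e->data    = data;

          if (unmasked)
                  e->vector_ctrl &= ~PCI_MSIX_ENTRY_CTRL_MASKBIT;
  }

  int main(void)
  {
          struct fake_entry e = { .vector_ctrl = 0 }; /* starts unmasked */

          update_entry(&e, 0xfee00000, 0, 0x4042);
          printf("ctrl=%#x data=%#x\n", e.vector_ctrl, e.data);
          return 0;
  }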
--- /dev/null
+From 7d5ec3d3612396dc6d4b76366d20ab9fc06f399f Mon Sep 17 00:00:00 2001
+From: Thomas Gleixner <tglx@linutronix.de>
+Date: Thu, 29 Jul 2021 23:51:41 +0200
+Subject: PCI/MSI: Mask all unused MSI-X entries
+
+From: Thomas Gleixner <tglx@linutronix.de>
+
+commit 7d5ec3d3612396dc6d4b76366d20ab9fc06f399f upstream.
+
+When MSI-X is enabled the ordering of calls is:
+
+ msix_map_region();
+ msix_setup_entries();
+ pci_msi_setup_msi_irqs();
+ msix_program_entries();
+
+This has a few interesting issues:
+
+ 1) msix_setup_entries() allocates the MSI descriptors and initializes them
+ except for the msi_desc:masked member which is left zero initialized.
+
+ 2) pci_msi_setup_msi_irqs() allocates the interrupt descriptors and sets
+ up the MSI interrupts which ends up in pci_write_msi_msg() unless the
+ interrupt chip provides its own irq_write_msi_msg() function.
+
+ 3) msix_program_entries() does not do what the name suggests. It solely
+ updates the entries array (if not NULL) and initializes the masked
+ member for each MSI descriptor by reading the hardware state and then
+ masks the entry.
+
+Obviously this has some issues:
+
+ 1) The uninitialized masked member of msi_desc prevents the enforcement
+    of masking the entry in pci_write_msi_msg() depending on the cached
+    masked bit. Aside from that, half-initialized data is a no-no in general.
+
+ 2) msix_program_entries() only ensures that the actually allocated entries
+ are masked. This is wrong as experimentation with crash testing and
+ crash kernel kexec has shown.
+
+    This limited testing unearthed that when the production kernel had more
+    entries in use (and unmasked) when it crashed, and the crash kernel
+    allocated a smaller number of entries, a full scan of all entries
+    found unmasked entries which were in use in the production kernel.
+
+    This is obviously a device or emulation issue, as a device reset
+    should mask all MSI-X table entries, but that is apparently just part
+    of the paper specification.
+
+Cure this by:
+
+ 1) Masking all table entries in hardware
+ 2) Initializing msi_desc::masked in msix_setup_entries()
+ 3) Removing the mask dance in msix_program_entries()
+ 4) Renaming msix_program_entries() to msix_update_entries() to
+ reflect the purpose of that function.
+
+As the masking of unused entries has never been done the Fixes tag refers
+to a commit in:
+ git://git.kernel.org/pub/scm/linux/kernel/git/tglx/history.git
+
+Fixes: f036d4ea5fa7 ("[PATCH] ia32 Message Signalled Interrupt support")
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Tested-by: Marc Zyngier <maz@kernel.org>
+Reviewed-by: Marc Zyngier <maz@kernel.org>
+Acked-by: Bjorn Helgaas <bhelgaas@google.com>
+Cc: stable@vger.kernel.org
+Link: https://lore.kernel.org/r/20210729222542.403833459@linutronix.de
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/pci/msi.c | 45 +++++++++++++++++++++++++++------------------
+ 1 file changed, 27 insertions(+), 18 deletions(-)
+
+--- a/drivers/pci/msi.c
++++ b/drivers/pci/msi.c
+@@ -691,6 +691,7 @@ static int msix_setup_entries(struct pci
+ {
+ struct irq_affinity_desc *curmsk, *masks = NULL;
+ struct msi_desc *entry;
++ void __iomem *addr;
+ int ret, i;
+ int vec_count = pci_msix_vec_count(dev);
+
+@@ -711,6 +712,7 @@ static int msix_setup_entries(struct pci
+
+ entry->msi_attrib.is_msix = 1;
+ entry->msi_attrib.is_64 = 1;
++
+ if (entries)
+ entry->msi_attrib.entry_nr = entries[i].entry;
+ else
+@@ -722,6 +724,10 @@ static int msix_setup_entries(struct pci
+ entry->msi_attrib.default_irq = dev->irq;
+ entry->mask_base = base;
+
++ addr = pci_msix_desc_addr(entry);
++ if (addr)
++ entry->masked = readl(addr + PCI_MSIX_ENTRY_VECTOR_CTRL);
++
+ list_add_tail(&entry->list, dev_to_msi_list(&dev->dev));
+ if (masks)
+ curmsk++;
+@@ -732,26 +738,25 @@ out:
+ return ret;
+ }
+
+-static void msix_program_entries(struct pci_dev *dev,
+- struct msix_entry *entries)
++static void msix_update_entries(struct pci_dev *dev, struct msix_entry *entries)
+ {
+ struct msi_desc *entry;
+- int i = 0;
+- void __iomem *desc_addr;
+
+ for_each_pci_msi_entry(entry, dev) {
+- if (entries)
+- entries[i++].vector = entry->irq;
++ if (entries) {
++ entries->vector = entry->irq;
++ entries++;
++ }
++ }
++}
+
+- desc_addr = pci_msix_desc_addr(entry);
+- if (desc_addr)
+- entry->masked = readl(desc_addr +
+- PCI_MSIX_ENTRY_VECTOR_CTRL);
+- else
+- entry->masked = 0;
++static void msix_mask_all(void __iomem *base, int tsize)
++{
++ u32 ctrl = PCI_MSIX_ENTRY_CTRL_MASKBIT;
++ int i;
+
+- msix_mask_irq(entry, 1);
+- }
++ for (i = 0; i < tsize; i++, base += PCI_MSIX_ENTRY_SIZE)
++ writel(ctrl, base + PCI_MSIX_ENTRY_VECTOR_CTRL);
+ }
+
+ /**
+@@ -768,9 +773,9 @@ static void msix_program_entries(struct
+ static int msix_capability_init(struct pci_dev *dev, struct msix_entry *entries,
+ int nvec, struct irq_affinity *affd)
+ {
+- int ret;
+- u16 control;
+ void __iomem *base;
++ int ret, tsize;
++ u16 control;
+
+ /*
+ * Some devices require MSI-X to be enabled before the MSI-X
+@@ -782,12 +787,16 @@ static int msix_capability_init(struct p
+
+ pci_read_config_word(dev, dev->msix_cap + PCI_MSIX_FLAGS, &control);
+ /* Request & Map MSI-X table region */
+- base = msix_map_region(dev, msix_table_size(control));
++ tsize = msix_table_size(control);
++ base = msix_map_region(dev, tsize);
+ if (!base) {
+ ret = -ENOMEM;
+ goto out_disable;
+ }
+
++ /* Ensure that all table entries are masked. */
++ msix_mask_all(base, tsize);
++
+ ret = msix_setup_entries(dev, base, entries, nvec, affd);
+ if (ret)
+ goto out_disable;
+@@ -801,7 +810,7 @@ static int msix_capability_init(struct p
+ if (ret)
+ goto out_free;
+
+- msix_program_entries(dev, entries);
++ msix_update_entries(dev, entries);
+
+ ret = populate_msi_sysfs(dev);
+ if (ret)
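
The new msix_mask_all() is a straight walk over the table. A stand-alone
sketch of the same walk, over a plain buffer standing in for the mapped
table and with a direct store in place of writel():

  #include <stdint.h>
  #include <stdio.h>

  #define PCI_MSIX_ENTRY_SIZE          16
  #define PCI_MSIX_ENTRY_VECTOR_CTRL   12
  #define PCI_MSIX_ENTRY_CTRL_MASKBIT  0x1u

  static void mask_all(uint8_t *base, int tsize)
  {
          int i;

          /* Mask every entry the table provides, not just the ones in use. */
          for (i = 0; i < tsize; i++, base += PCI_MSIX_ENTRY_SIZE)
                  *(uint32_t *)(base + PCI_MSIX_ENTRY_VECTOR_CTRL) =
                          PCI_MSIX_ENTRY_CTRL_MASKBIT;
  }

  int main(void)
  {
          /* Fake 4-entry table, kept 32-bit aligned. */
          uint32_t storage[4 * PCI_MSIX_ENTRY_SIZE / sizeof(uint32_t)] = { 0 };
          uint8_t *table = (uint8_t *)storage;

          mask_all(table, 4);
          printf("entry 0 ctrl = %#x\n",
                 *(uint32_t *)(table + PCI_MSIX_ENTRY_VECTOR_CTRL));
          return 0;
  }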
--- /dev/null
+From 77e89afc25f30abd56e76a809ee2884d7c1b63ce Mon Sep 17 00:00:00 2001
+From: Thomas Gleixner <tglx@linutronix.de>
+Date: Thu, 29 Jul 2021 23:51:47 +0200
+Subject: PCI/MSI: Protect msi_desc::masked for multi-MSI
+
+From: Thomas Gleixner <tglx@linutronix.de>
+
+commit 77e89afc25f30abd56e76a809ee2884d7c1b63ce upstream.
+
+Multi-MSI uses a single MSI descriptor and there is a single mask register
+when the device supports per vector masking. To avoid reading back the mask
+register the value is cached in the MSI descriptor and updates are done by
+clearing and setting bits in the cache and writing it to the device.
+
+But nothing protects msi_desc::masked and the mask register from being
+modified concurrently on two different CPUs for two different Linux
+interrupts which belong to the same multi-MSI descriptor.
+
+Add a lock to struct device and protect any operation on the mask and the
+mask register with it.
+
+This makes the update of msi_desc::masked unconditional, but there is no
+place which requires a modification of the hardware register without
+updating the masked cache.
+
+msi_mask_irq() is now an empty wrapper which will be cleaned up in follow
+up changes.
+
+The problem goes way back to the initial support of multi-MSI, but picking
+the commit which introduced the mask cache is a valid cut off point
+(2.6.30).
+
+Fixes: f2440d9acbe8 ("PCI MSI: Refactor interrupt masking code")
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Tested-by: Marc Zyngier <maz@kernel.org>
+Reviewed-by: Marc Zyngier <maz@kernel.org>
+Cc: stable@vger.kernel.org
+Link: https://lore.kernel.org/r/20210729222542.726833414@linutronix.de
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/base/core.c | 1 +
+ drivers/pci/msi.c | 19 ++++++++++---------
+ include/linux/device.h | 1 +
+ include/linux/msi.h | 2 +-
+ 4 files changed, 13 insertions(+), 10 deletions(-)
+
+--- a/drivers/base/core.c
++++ b/drivers/base/core.c
+@@ -2809,6 +2809,7 @@ void device_initialize(struct device *de
+ device_pm_init(dev);
+ set_dev_node(dev, -1);
+ #ifdef CONFIG_GENERIC_MSI_IRQ
++ raw_spin_lock_init(&dev->msi_lock);
+ INIT_LIST_HEAD(&dev->msi_list);
+ #endif
+ INIT_LIST_HEAD(&dev->links.consumers);
+--- a/drivers/pci/msi.c
++++ b/drivers/pci/msi.c
+@@ -143,24 +143,25 @@ static inline __attribute_const__ u32 ms
+ * reliably as devices without an INTx disable bit will then generate a
+ * level IRQ which will never be cleared.
+ */
+-u32 __pci_msi_desc_mask_irq(struct msi_desc *desc, u32 mask, u32 flag)
++void __pci_msi_desc_mask_irq(struct msi_desc *desc, u32 mask, u32 flag)
+ {
+- u32 mask_bits = desc->masked;
++ raw_spinlock_t *lock = &desc->dev->msi_lock;
++ unsigned long flags;
+
+ if (pci_msi_ignore_mask || !desc->msi_attrib.maskbit)
+- return 0;
++ return;
+
+- mask_bits &= ~mask;
+- mask_bits |= flag;
++ raw_spin_lock_irqsave(lock, flags);
++ desc->masked &= ~mask;
++ desc->masked |= flag;
+ pci_write_config_dword(msi_desc_to_pci_dev(desc), desc->mask_pos,
+- mask_bits);
+-
+- return mask_bits;
++ desc->masked);
++ raw_spin_unlock_irqrestore(lock, flags);
+ }
+
+ static void msi_mask_irq(struct msi_desc *desc, u32 mask, u32 flag)
+ {
+- desc->masked = __pci_msi_desc_mask_irq(desc, mask, flag);
++ __pci_msi_desc_mask_irq(desc, mask, flag);
+ }
+
+ static void __iomem *pci_msix_desc_addr(struct msi_desc *desc)
+--- a/include/linux/device.h
++++ b/include/linux/device.h
+@@ -496,6 +496,7 @@ struct device {
+ struct dev_pin_info *pins;
+ #endif
+ #ifdef CONFIG_GENERIC_MSI_IRQ
++ raw_spinlock_t msi_lock;
+ struct list_head msi_list;
+ #endif
+ #ifdef CONFIG_DMA_OPS
+--- a/include/linux/msi.h
++++ b/include/linux/msi.h
+@@ -233,7 +233,7 @@ void __pci_read_msi_msg(struct msi_desc
+ void __pci_write_msi_msg(struct msi_desc *entry, struct msi_msg *msg);
+
+ u32 __pci_msix_desc_mask_irq(struct msi_desc *desc, u32 flag);
+-u32 __pci_msi_desc_mask_irq(struct msi_desc *desc, u32 mask, u32 flag);
++void __pci_msi_desc_mask_irq(struct msi_desc *desc, u32 mask, u32 flag);
+ void pci_msi_mask_irq(struct irq_data *data);
+ void pci_msi_unmask_irq(struct irq_data *data);
+
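
The race being closed is a read-modify-write of msi_desc::masked shared by
all vectors of one multi-MSI block. A hedged pthread sketch of the same
serialization (a mutex here in place of the new per-device raw spinlock, and
the config-space write reduced to a comment):

  #include <pthread.h>
  #include <stdint.h>
  #include <stdio.h>

  /* Hypothetical per-device state: one cached mask for the whole block. */
  static uint32_t masked;
  static pthread_mutex_t msi_lock = PTHREAD_MUTEX_INITIALIZER;

  /* Same update rule as __pci_msi_desc_mask_irq(), now fully serialized. */
  static void desc_mask_irq(uint32_t mask, uint32_t flag)
  {
          pthread_mutex_lock(&msi_lock);
          masked &= ~mask;
          masked |= flag;
          /* pci_write_config_dword(..., masked) would go here */
          pthread_mutex_unlock(&msi_lock);
  }

  static void *mask_vector(void *arg)
  {
          uint32_t bit = 1u << (uintptr_t)arg;

          desc_mask_irq(bit, bit); /* mask this vector */
          return NULL;
  }

  int main(void)
  {
          pthread_t t[4];
          uintptr_t i;

          for (i = 0; i < 4; i++)
                  pthread_create(&t[i], NULL, mask_vector, (void *)i);
          for (i = 0; i < 4; i++)
                  pthread_join(t[i], NULL);

          /* With the lock in place no update is lost: masked == 0xf. */
          printf("masked = %#x\n", masked);
          return 0;
  }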
--- /dev/null
+From d28d4ad2a1aef27458b3383725bb179beb8d015c Mon Sep 17 00:00:00 2001
+From: Thomas Gleixner <tglx@linutronix.de>
+Date: Thu, 29 Jul 2021 23:51:46 +0200
+Subject: PCI/MSI: Use msi_mask_irq() in pci_msi_shutdown()
+
+From: Thomas Gleixner <tglx@linutronix.de>
+
+commit d28d4ad2a1aef27458b3383725bb179beb8d015c upstream.
+
+No point in using the raw write function from shutdown. Preparatory change
+to introduce proper serialization for the msi_desc::masked cache.
+
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Tested-by: Marc Zyngier <maz@kernel.org>
+Reviewed-by: Marc Zyngier <maz@kernel.org>
+Cc: stable@vger.kernel.org
+Link: https://lore.kernel.org/r/20210729222542.674391354@linutronix.de
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/pci/msi.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/drivers/pci/msi.c
++++ b/drivers/pci/msi.c
+@@ -961,7 +961,7 @@ static void pci_msi_shutdown(struct pci_
+
+ /* Return the device with MSI unmasked as initial states */
+ mask = msi_mask(desc->msi_attrib.multi_cap);
+- __pci_msi_desc_mask_irq(desc, mask, 0);
++ msi_mask_irq(desc, mask, 0);
+
+ /* Restore dev->irq to its default pin-assertion IRQ */
+ dev->irq = desc->msi_attrib.default_irq;
--- /dev/null
+From b5cfc9cd7b0426e94ffd9e9ed79d1b00ace7780a Mon Sep 17 00:00:00 2001
+From: Christophe Leroy <christophe.leroy@csgroup.eu>
+Date: Wed, 7 Jul 2021 05:55:07 +0000
+Subject: powerpc/32: Fix critical and debug interrupts on BOOKE
+
+From: Christophe Leroy <christophe.leroy@csgroup.eu>
+
+commit b5cfc9cd7b0426e94ffd9e9ed79d1b00ace7780a upstream.
+
+32-bit BOOKE has special interrupts for debug and other
+critical events.
+
+When handling those interrupts, dedicated registers are saved
+in the stack frame in addition to the standard registers, leading
+to a shift of the pt_regs struct.
+
+Since commit db297c3b07af ("powerpc/32: Don't save thread.regs on
+interrupt entry"), the pt_regs struct is expected to be at the
+same place all the time.
+
+Instead of handling a special struct in addition to pt_regs, just
+add those special registers to struct pt_regs.
+
+Fixes: db297c3b07af ("powerpc/32: Don't save thread.regs on interrupt entry")
+Cc: stable@vger.kernel.org
+Reported-by: Radu Rendec <radu.rendec@gmail.com>
+Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
+Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
+Link: https://lore.kernel.org/r/028d5483b4851b01ea4334d0751e7f260419092b.1625637264.git.christophe.leroy@csgroup.eu
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/powerpc/include/asm/ptrace.h | 16 ++++++++++++++++
+ arch/powerpc/kernel/asm-offsets.c | 31 ++++++++++++++-----------------
+ arch/powerpc/kernel/head_booke.h | 27 +++------------------------
+ 3 files changed, 33 insertions(+), 41 deletions(-)
+
+--- a/arch/powerpc/include/asm/ptrace.h
++++ b/arch/powerpc/include/asm/ptrace.h
+@@ -68,6 +68,22 @@ struct pt_regs
+ };
+ unsigned long __pad[4]; /* Maintain 16 byte interrupt stack alignment */
+ };
++#if defined(CONFIG_PPC32) && defined(CONFIG_BOOKE)
++ struct { /* Must be a multiple of 16 bytes */
++ unsigned long mas0;
++ unsigned long mas1;
++ unsigned long mas2;
++ unsigned long mas3;
++ unsigned long mas6;
++ unsigned long mas7;
++ unsigned long srr0;
++ unsigned long srr1;
++ unsigned long csrr0;
++ unsigned long csrr1;
++ unsigned long dsrr0;
++ unsigned long dsrr1;
++ };
++#endif
+ };
+ #endif
+
+--- a/arch/powerpc/kernel/asm-offsets.c
++++ b/arch/powerpc/kernel/asm-offsets.c
+@@ -348,24 +348,21 @@ int main(void)
+ #endif
+
+
+-#if defined(CONFIG_PPC32)
+-#if defined(CONFIG_BOOKE) || defined(CONFIG_40x)
+- DEFINE(EXC_LVL_SIZE, STACK_EXC_LVL_FRAME_SIZE);
+- DEFINE(MAS0, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, mas0));
++#if defined(CONFIG_PPC32) && defined(CONFIG_BOOKE)
++ STACK_PT_REGS_OFFSET(MAS0, mas0);
+ /* we overload MMUCR for 44x on MAS0 since they are mutually exclusive */
+- DEFINE(MMUCR, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, mas0));
+- DEFINE(MAS1, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, mas1));
+- DEFINE(MAS2, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, mas2));
+- DEFINE(MAS3, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, mas3));
+- DEFINE(MAS6, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, mas6));
+- DEFINE(MAS7, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, mas7));
+- DEFINE(_SRR0, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, srr0));
+- DEFINE(_SRR1, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, srr1));
+- DEFINE(_CSRR0, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, csrr0));
+- DEFINE(_CSRR1, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, csrr1));
+- DEFINE(_DSRR0, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, dsrr0));
+- DEFINE(_DSRR1, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, dsrr1));
+-#endif
++ STACK_PT_REGS_OFFSET(MMUCR, mas0);
++ STACK_PT_REGS_OFFSET(MAS1, mas1);
++ STACK_PT_REGS_OFFSET(MAS2, mas2);
++ STACK_PT_REGS_OFFSET(MAS3, mas3);
++ STACK_PT_REGS_OFFSET(MAS6, mas6);
++ STACK_PT_REGS_OFFSET(MAS7, mas7);
++ STACK_PT_REGS_OFFSET(_SRR0, srr0);
++ STACK_PT_REGS_OFFSET(_SRR1, srr1);
++ STACK_PT_REGS_OFFSET(_CSRR0, csrr0);
++ STACK_PT_REGS_OFFSET(_CSRR1, csrr1);
++ STACK_PT_REGS_OFFSET(_DSRR0, dsrr0);
++ STACK_PT_REGS_OFFSET(_DSRR1, dsrr1);
+ #endif
+
+ #ifndef CONFIG_PPC64
+--- a/arch/powerpc/kernel/head_booke.h
++++ b/arch/powerpc/kernel/head_booke.h
+@@ -185,20 +185,18 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_EMB_HV
+ /* only on e500mc */
+ #define DBG_STACK_BASE dbgirq_ctx
+
+-#define EXC_LVL_FRAME_OVERHEAD (THREAD_SIZE - INT_FRAME_SIZE - EXC_LVL_SIZE)
+-
+ #ifdef CONFIG_SMP
+ #define BOOKE_LOAD_EXC_LEVEL_STACK(level) \
+ mfspr r8,SPRN_PIR; \
+ slwi r8,r8,2; \
+ addis r8,r8,level##_STACK_BASE@ha; \
+ lwz r8,level##_STACK_BASE@l(r8); \
+- addi r8,r8,EXC_LVL_FRAME_OVERHEAD;
++ addi r8,r8,THREAD_SIZE - INT_FRAME_SIZE;
+ #else
+ #define BOOKE_LOAD_EXC_LEVEL_STACK(level) \
+ lis r8,level##_STACK_BASE@ha; \
+ lwz r8,level##_STACK_BASE@l(r8); \
+- addi r8,r8,EXC_LVL_FRAME_OVERHEAD;
++ addi r8,r8,THREAD_SIZE - INT_FRAME_SIZE;
+ #endif
+
+ /*
+@@ -225,7 +223,7 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_EMB_HV
+ mtmsr r11; \
+ mfspr r11,SPRN_SPRG_THREAD; /* if from user, start at top of */\
+ lwz r11, TASK_STACK - THREAD(r11); /* this thread's kernel stack */\
+- addi r11,r11,EXC_LVL_FRAME_OVERHEAD; /* allocate stack frame */\
++ addi r11,r11,THREAD_SIZE - INT_FRAME_SIZE; /* allocate stack frame */\
+ beq 1f; \
+ /* COMING FROM USER MODE */ \
+ stw r9,_CCR(r11); /* save CR */\
+@@ -533,24 +531,5 @@ label:
+ bl kernel_fp_unavailable_exception; \
+ b interrupt_return
+
+-#else /* __ASSEMBLY__ */
+-struct exception_regs {
+- unsigned long mas0;
+- unsigned long mas1;
+- unsigned long mas2;
+- unsigned long mas3;
+- unsigned long mas6;
+- unsigned long mas7;
+- unsigned long srr0;
+- unsigned long srr1;
+- unsigned long csrr0;
+- unsigned long csrr1;
+- unsigned long dsrr0;
+- unsigned long dsrr1;
+-};
+-
+-/* ensure this structure is always sized to a multiple of the stack alignment */
+-#define STACK_EXC_LVL_FRAME_SIZE ALIGN(sizeof (struct exception_regs), 16)
+-
+ #endif /* __ASSEMBLY__ */
+ #endif /* __HEAD_BOOKE_H__ */
--- /dev/null
+From 62376365048878f770d8b7d11b89b8b3e18018f1 Mon Sep 17 00:00:00 2001
+From: Christophe Leroy <christophe.leroy@csgroup.eu>
+Date: Tue, 3 Aug 2021 15:14:27 +0000
+Subject: powerpc/32s: Fix napping restore in data storage interrupt (DSI)
+
+From: Christophe Leroy <christophe.leroy@csgroup.eu>
+
+commit 62376365048878f770d8b7d11b89b8b3e18018f1 upstream.
+
+When a DSI (Data Storage Interrupt) is taken while in NAP mode,
+r11 doesn't survive the call to power_save_ppc32_restore().
+
+So use r1 instead of r11 as they both contain the virtual stack
+pointer at that point.
+
+Fixes: 4c0104a83fc3 ("powerpc/32: Dismantle EXC_XFER_STD/LITE/TEMPLATE")
+Cc: stable@vger.kernel.org # v5.13+
+Reported-by: Finn Thain <fthain@linux-m68k.org>
+Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
+Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
+Link: https://lore.kernel.org/r/731694e0885271f6ee9ffc179eb4bcee78313682.1628003562.git.christophe.leroy@csgroup.eu
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/powerpc/kernel/head_book3s_32.S | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/arch/powerpc/kernel/head_book3s_32.S
++++ b/arch/powerpc/kernel/head_book3s_32.S
+@@ -300,7 +300,7 @@ ALT_MMU_FTR_SECTION_END_IFSET(MMU_FTR_HP
+ EXCEPTION_PROLOG_1
+ EXCEPTION_PROLOG_2 INTERRUPT_DATA_STORAGE DataAccess handle_dar_dsisr=1
+ prepare_transfer_to_handler
+- lwz r5, _DSISR(r11)
++ lwz r5, _DSISR(r1)
+ andis. r0, r5, DSISR_DABRMATCH@h
+ bne- 1f
+ bl do_page_fault
--- /dev/null
+From 01fcac8e4dfc112f420dcaeb70056a74e326cacf Mon Sep 17 00:00:00 2001
+From: Christophe Leroy <christophe.leroy@csgroup.eu>
+Date: Tue, 10 Aug 2021 16:13:17 +0000
+Subject: powerpc/interrupt: Do not call single_step_exception() from other exceptions
+
+From: Christophe Leroy <christophe.leroy@csgroup.eu>
+
+commit 01fcac8e4dfc112f420dcaeb70056a74e326cacf upstream.
+
+single_step_exception() is called by emulate_single_step(), which
+is called from (at least) the alignment_exception() handler and the
+program_check_exception() handler.
+
+Redefine it as a regular __single_step_exception() which is called
+by both single_step_exception() handler and emulate_single_step()
+function.
+
+Fixes: 3a96570ffceb ("powerpc: convert interrupt handlers to use wrappers")
+Cc: stable@vger.kernel.org # v5.12+
+Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
+Reviewed-by: Nicholas Piggin <npiggin@gmail.com>
+Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
+Link: https://lore.kernel.org/r/aed174f5cbc06f2cf95233c071d8aac948e46043.1628611921.git.christophe.leroy@csgroup.eu
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/powerpc/kernel/traps.c | 9 +++++++--
+ 1 file changed, 7 insertions(+), 2 deletions(-)
+
+--- a/arch/powerpc/kernel/traps.c
++++ b/arch/powerpc/kernel/traps.c
+@@ -1103,7 +1103,7 @@ DEFINE_INTERRUPT_HANDLER(RunModeExceptio
+ _exception(SIGTRAP, regs, TRAP_UNK, 0);
+ }
+
+-DEFINE_INTERRUPT_HANDLER(single_step_exception)
++static void __single_step_exception(struct pt_regs *regs)
+ {
+ clear_single_step(regs);
+ clear_br_trace(regs);
+@@ -1120,6 +1120,11 @@ DEFINE_INTERRUPT_HANDLER(single_step_exc
+ _exception(SIGTRAP, regs, TRAP_TRACE, regs->nip);
+ }
+
++DEFINE_INTERRUPT_HANDLER(single_step_exception)
++{
++ __single_step_exception(regs);
++}
++
+ /*
+ * After we have successfully emulated an instruction, we have to
+ * check if the instruction was being single-stepped, and if so,
+@@ -1129,7 +1134,7 @@ DEFINE_INTERRUPT_HANDLER(single_step_exc
+ static void emulate_single_step(struct pt_regs *regs)
+ {
+ if (single_stepping(regs))
+- single_step_exception(regs);
++ __single_step_exception(regs);
+ }
+
+ static inline int __parse_fpscr(unsigned long fpscr)
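
The shape of the fix is a plain wrapper split: the interrupt-entry
bookkeeping stays in the handler generated by DEFINE_INTERRUPT_HANDLER(),
while the body moves to a helper that other handlers may call. A hedged
stand-alone sketch of that pattern, with the bookkeeping reduced to printfs:

  #include <stdio.h>

  struct pt_regs { unsigned long nip; };

  /* Stand-ins for the entry/exit bookkeeping the wrapper macro adds. */
  static void interrupt_enter(const char *who) { printf("enter %s\n", who); }
  static void interrupt_exit(const char *who)  { printf("exit  %s\n", who); }

  /* The actual work, callable from anywhere. */
  static void __single_step_exception(struct pt_regs *regs)
  {
          printf("handle single step at %#lx\n", regs->nip);
  }

  /* Roughly what the wrapper expands to: bookkeeping around the body. */
  static void single_step_exception(struct pt_regs *regs)
  {
          interrupt_enter("single_step_exception");
          __single_step_exception(regs);
          interrupt_exit("single_step_exception");
  }

  /* Another exception's emulation path calls the helper, not the handler,
   * so the interrupt entry/exit code does not run a second time. */
  static void emulate_single_step(struct pt_regs *regs)
  {
          __single_step_exception(regs);
  }

  int main(void)
  {
          struct pt_regs regs = { .nip = 0xc0001234UL };

          single_step_exception(&regs); /* a real trap */
          emulate_single_step(&regs);   /* called from another handler */
          return 0;
  }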
--- /dev/null
+From c18956e6e0b95f78dad2773ecc8c61a9e41f6405 Mon Sep 17 00:00:00 2001
+From: Laurent Dufour <ldufour@linux.ibm.com>
+Date: Thu, 5 Aug 2021 17:23:08 +0200
+Subject: powerpc/pseries: Fix update of LPAR security flavor after LPM
+
+From: Laurent Dufour <ldufour@linux.ibm.com>
+
+commit c18956e6e0b95f78dad2773ecc8c61a9e41f6405 upstream.
+
+After LPM, when migrating from a system with security mitigation enabled
+to a system with mitigation disabled, the security flavor exposed in
+/proc is not correctly set back to 0.
+
+Do not assume the value of the security flavor is set to 0 when entering
+init_cpu_char_feature_flags(), so that when it is called after an LPM, the
+value is set correctly even if the mitigations are not turned off.
+
+Fixes: 6ce56e1ac380 ("powerpc/pseries: export LPAR security flavor in lparcfg")
+Cc: stable@vger.kernel.org # v5.13+
+Signed-off-by: Laurent Dufour <ldufour@linux.ibm.com>
+Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
+Link: https://lore.kernel.org/r/20210805152308.33988-1-ldufour@linux.ibm.com
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/powerpc/platforms/pseries/setup.c | 5 +++--
+ 1 file changed, 3 insertions(+), 2 deletions(-)
+
+--- a/arch/powerpc/platforms/pseries/setup.c
++++ b/arch/powerpc/platforms/pseries/setup.c
+@@ -539,9 +539,10 @@ static void init_cpu_char_feature_flags(
+ * H_CPU_BEHAV_FAVOUR_SECURITY_H could be set only if
+ * H_CPU_BEHAV_FAVOUR_SECURITY is.
+ */
+- if (!(result->behaviour & H_CPU_BEHAV_FAVOUR_SECURITY))
++ if (!(result->behaviour & H_CPU_BEHAV_FAVOUR_SECURITY)) {
+ security_ftr_clear(SEC_FTR_FAVOUR_SECURITY);
+- else if (result->behaviour & H_CPU_BEHAV_FAVOUR_SECURITY_H)
++ pseries_security_flavor = 0;
++ } else if (result->behaviour & H_CPU_BEHAV_FAVOUR_SECURITY_H)
+ pseries_security_flavor = 1;
+ else
+ pseries_security_flavor = 2;
--- /dev/null
+From 8241461536f21bbe51308a6916d1c9fb2e6b75a7 Mon Sep 17 00:00:00 2001
+From: Christophe Leroy <christophe.leroy@csgroup.eu>
+Date: Wed, 4 Aug 2021 18:24:10 +0000
+Subject: powerpc/smp: Fix OOPS in topology_init()
+
+From: Christophe Leroy <christophe.leroy@csgroup.eu>
+
+commit 8241461536f21bbe51308a6916d1c9fb2e6b75a7 upstream.
+
+Running an SMP kernel on a UP platform that is not prepared for it,
+I encountered the following OOPS:
+
+ BUG: Kernel NULL pointer dereference on read at 0x00000034
+ Faulting instruction address: 0xc0a04110
+ Oops: Kernel access of bad area, sig: 11 [#1]
+ BE PAGE_SIZE=4K SMP NR_CPUS=2 CMPCPRO
+ Modules linked in:
+ CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.13.0-pmac-00001-g230fedfaad21 #5234
+ NIP: c0a04110 LR: c0a040d8 CTR: c0a04084
+ REGS: e100dda0 TRAP: 0300 Not tainted (5.13.0-pmac-00001-g230fedfaad21)
+ MSR: 00009032 <EE,ME,IR,DR,RI> CR: 84000284 XER: 00000000
+ DAR: 00000034 DSISR: 20000000
+ GPR00: c0006bd4 e100de60 c1033320 00000000 00000000 c0942274 00000000 00000000
+ GPR08: 00000000 00000000 00000001 00000063 00000007 00000000 c0006f30 00000000
+ GPR16: 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000005
+ GPR24: c0c67d74 c0c67f1c c0c60000 c0c67d70 c0c0c558 1efdf000 c0c00020 00000000
+ NIP [c0a04110] topology_init+0x8c/0x138
+ LR [c0a040d8] topology_init+0x54/0x138
+ Call Trace:
+ [e100de60] [80808080] 0x80808080 (unreliable)
+ [e100de90] [c0006bd4] do_one_initcall+0x48/0x1bc
+ [e100def0] [c0a0150c] kernel_init_freeable+0x1c8/0x278
+ [e100df20] [c0006f44] kernel_init+0x14/0x10c
+ [e100df30] [c00190fc] ret_from_kernel_thread+0x14/0x1c
+ Instruction dump:
+ 7c692e70 7d290194 7c035040 7c7f1b78 5529103a 546706fe 5468103a 39400001
+ 7c641b78 40800054 80c690b4 7fb9402e <81060034> 7fbeea14 2c080000 7fa3eb78
+ ---[ end trace b246ffbc6bbbb6fb ]---
+
+Fix it by checking smp_ops before using it, as is already done in
+several other places in arch/powerpc/kernel/smp.c.
+
+Fixes: 39f87561454d ("powerpc/smp: Move ppc_md.cpu_die() to smp_ops.cpu_offline_self()")
+Cc: stable@vger.kernel.org
+Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
+Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
+Link: https://lore.kernel.org/r/75287841cbb8740edd44880fe60be66d489160d9.1628097995.git.christophe.leroy@csgroup.eu
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/powerpc/kernel/sysfs.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/arch/powerpc/kernel/sysfs.c
++++ b/arch/powerpc/kernel/sysfs.c
+@@ -1167,7 +1167,7 @@ static int __init topology_init(void)
+ * CPU. For instance, the boot cpu might never be valid
+ * for hotplugging.
+ */
+- if (smp_ops->cpu_offline_self)
++ if (smp_ops && smp_ops->cpu_offline_self)
+ c->hotpluggable = 1;
+ #endif
+
--- /dev/null
+From cbc06f051c524dcfe52ef0d1f30647828e226d30 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= <clg@kaod.org>
+Date: Sat, 7 Aug 2021 09:20:57 +0200
+Subject: powerpc/xive: Do not skip CPU-less nodes when creating the IPIs
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Cédric Le Goater <clg@kaod.org>
+
+commit cbc06f051c524dcfe52ef0d1f30647828e226d30 upstream.
+
+On PowerVM, CPU-less nodes can be populated with hot-plugged CPUs at
+runtime. Today, the IPI is not created for such nodes, and hot-plugged
+CPUs use a bogus IPI, which leads to soft lockups.
+
+We cannot directly allocate and request the IPI on demand because
+bringup_up() is called under the IRQ sparse lock. The alternative is
+to allocate the IPIs for all possible nodes at startup and to request
+the mapping on demand when the first CPU of a node is brought up.
+
+Fixes: 7dcc37b3eff9 ("powerpc/xive: Map one IPI interrupt per node")
+Cc: stable@vger.kernel.org # v5.13
+Reported-by: Geetika Moolchandani <Geetika.Moolchandani1@ibm.com>
+Signed-off-by: Cédric Le Goater <clg@kaod.org>
+Tested-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
+Tested-by: Laurent Vivier <lvivier@redhat.com>
+Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
+Link: https://lore.kernel.org/r/20210807072057.184698-1-clg@kaod.org
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/powerpc/sysdev/xive/common.c | 35 ++++++++++++++++++++++++-----------
+ 1 file changed, 24 insertions(+), 11 deletions(-)
+
+--- a/arch/powerpc/sysdev/xive/common.c
++++ b/arch/powerpc/sysdev/xive/common.c
+@@ -67,6 +67,7 @@ static struct irq_domain *xive_irq_domai
+ static struct xive_ipi_desc {
+ unsigned int irq;
+ char name[16];
++ atomic_t started;
+ } *xive_ipis;
+
+ /*
+@@ -1120,7 +1121,7 @@ static const struct irq_domain_ops xive_
+ .alloc = xive_ipi_irq_domain_alloc,
+ };
+
+-static int __init xive_request_ipi(void)
++static int __init xive_init_ipis(void)
+ {
+ struct fwnode_handle *fwnode;
+ struct irq_domain *ipi_domain;
+@@ -1144,10 +1145,6 @@ static int __init xive_request_ipi(void)
+ struct xive_ipi_desc *xid = &xive_ipis[node];
+ struct xive_ipi_alloc_info info = { node };
+
+- /* Skip nodes without CPUs */
+- if (cpumask_empty(cpumask_of_node(node)))
+- continue;
+-
+ /*
+ * Map one IPI interrupt per node for all cpus of that node.
+ * Since the HW interrupt number doesn't have any meaning,
+@@ -1159,11 +1156,6 @@ static int __init xive_request_ipi(void)
+ xid->irq = ret;
+
+ snprintf(xid->name, sizeof(xid->name), "IPI-%d", node);
+-
+- ret = request_irq(xid->irq, xive_muxed_ipi_action,
+- IRQF_PERCPU | IRQF_NO_THREAD, xid->name, NULL);
+-
+- WARN(ret < 0, "Failed to request IPI %d: %d\n", xid->irq, ret);
+ }
+
+ return ret;
+@@ -1178,6 +1170,22 @@ out:
+ return ret;
+ }
+
++static int __init xive_request_ipi(unsigned int cpu)
++{
++ struct xive_ipi_desc *xid = &xive_ipis[early_cpu_to_node(cpu)];
++ int ret;
++
++ if (atomic_inc_return(&xid->started) > 1)
++ return 0;
++
++ ret = request_irq(xid->irq, xive_muxed_ipi_action,
++ IRQF_PERCPU | IRQF_NO_THREAD,
++ xid->name, NULL);
++
++ WARN(ret < 0, "Failed to request IPI %d: %d\n", xid->irq, ret);
++ return ret;
++}
++
+ static int xive_setup_cpu_ipi(unsigned int cpu)
+ {
+ unsigned int xive_ipi_irq = xive_ipi_cpu_to_irq(cpu);
+@@ -1192,6 +1200,9 @@ static int xive_setup_cpu_ipi(unsigned i
+ if (xc->hw_ipi != XIVE_BAD_IRQ)
+ return 0;
+
++ /* Register the IPI */
++ xive_request_ipi(cpu);
++
+ /* Grab an IPI from the backend, this will populate xc->hw_ipi */
+ if (xive_ops->get_ipi(cpu, xc))
+ return -EIO;
+@@ -1231,6 +1242,8 @@ static void xive_cleanup_cpu_ipi(unsigne
+ if (xc->hw_ipi == XIVE_BAD_IRQ)
+ return;
+
++ /* TODO: clear IPI mapping */
++
+ /* Mask the IPI */
+ xive_do_source_set_mask(&xc->ipi_data, true);
+
+@@ -1253,7 +1266,7 @@ void __init xive_smp_probe(void)
+ smp_ops->cause_ipi = xive_cause_ipi;
+
+ /* Register the IPI */
+- xive_request_ipi();
++ xive_init_ipis();
+
+ /* Allocate and setup IPI for the boot CPU */
+ xive_setup_cpu_ipi(smp_processor_id());
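
The once-per-node request that replaces the CPU-mask check can be sketched
with a plain atomic counter. Everything below is a hypothetical reduction
(node count, topology and IRQ numbers are made up); the point is only that
atomic_fetch_add() lets exactly one CPU of each node do the request_irq():

  #include <stdatomic.h>
  #include <stdio.h>

  #define NR_NODES 4

  /* Hypothetical reduction of xive_ipi_desc: one IPI per possible node. */
  struct ipi_desc {
          int irq;
          atomic_int started;
  } ipis[NR_NODES];

  static int cpu_to_node(int cpu) { return cpu / 2; } /* assumed topology */

  static void request_node_ipi(int cpu)
  {
          struct ipi_desc *xid = &ipis[cpu_to_node(cpu)];

          /* Only the first CPU of a node to come up requests the IRQ;
           * later CPUs (including hot-plugged ones) reuse it. */
          if (atomic_fetch_add(&xid->started, 1) > 0)
                  return;

          printf("cpu %d requests IPI for node %d (irq %d)\n",
                 cpu, cpu_to_node(cpu), xid->irq);
  }

  int main(void)
  {
          int node, cpu;

          /* "Startup": allocate an IPI number for every possible node,
           * whether it currently has CPUs or not. */
          for (node = 0; node < NR_NODES; node++)
                  ipis[node].irq = 100 + node;

          /* CPUs come up later, one node at a time or via hot-plug. */
          for (cpu = 0; cpu < 6; cpu++)
                  request_node_ipi(cpu);
          return 0;
  }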
genirq-timings-prevent-potential-array-overflow-in-__irq_timings_store.patch
powerpc-interrupt-fix-oops-by-not-calling-do_irq-from-timer_interrupt.patch
pci-msi-enable-and-mask-msi-x-early.patch
+pci-msi-mask-all-unused-msi-x-entries.patch
+pci-msi-enforce-that-msi-x-table-entry-is-masked-for-update.patch
+pci-msi-enforce-msi-entry-updates-to-be-visible.patch
+pci-msi-do-not-set-invalid-bits-in-msi-mask.patch
+pci-msi-correct-misleading-comments.patch
+pci-msi-use-msi_mask_irq-in-pci_msi_shutdown.patch
+pci-msi-protect-msi_desc-masked-for-multi-msi.patch
+powerpc-interrupt-do-not-call-single_step_exception-from-other-exceptions.patch
+powerpc-pseries-fix-update-of-lpar-security-flavor-after-lpm.patch
+powerpc-32s-fix-napping-restore-in-data-storage-interrupt-dsi.patch
+powerpc-smp-fix-oops-in-topology_init.patch
+powerpc-xive-do-not-skip-cpu-less-nodes-when-creating-the-ipis.patch
+powerpc-32-fix-critical-and-debug-interrupts-on-booke.patch
+efi-libstub-arm64-double-check-image-alignment-at-entry.patch
+locking-rtmutex-use-the-correct-rtmutex-debugging-config-option.patch
+kvm-vmx-use-current-vmcs-to-query-waitpkg-support-for-msr-emulation.patch
+kvm-nvmx-use-vmx_need_pf_intercept-when-deciding-if-l0-wants-a-pf.patch
+kvm-x86-mmu-don-t-leak-non-leaf-sptes-when-zapping-all-sptes.patch
+kvm-x86-mmu-protect-marking-sps-unsync-when-using-tdp-mmu-with-spinlock.patch