From: Greg Kroah-Hartman Date: Mon, 16 Aug 2021 08:27:45 +0000 (+0200) Subject: 5.13-stable patches X-Git-Tag: v5.4.142~25 X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=fb915482351804740dd2d2e390008c3591c237c6;p=thirdparty%2Fkernel%2Fstable-queue.git 5.13-stable patches added patches: efi-libstub-arm64-double-check-image-alignment-at-entry.patch kvm-nvmx-use-vmx_need_pf_intercept-when-deciding-if-l0-wants-a-pf.patch kvm-vmx-use-current-vmcs-to-query-waitpkg-support-for-msr-emulation.patch kvm-x86-mmu-don-t-leak-non-leaf-sptes-when-zapping-all-sptes.patch kvm-x86-mmu-protect-marking-sps-unsync-when-using-tdp-mmu-with-spinlock.patch locking-rtmutex-use-the-correct-rtmutex-debugging-config-option.patch pci-msi-correct-misleading-comments.patch pci-msi-do-not-set-invalid-bits-in-msi-mask.patch pci-msi-enforce-msi-entry-updates-to-be-visible.patch pci-msi-enforce-that-msi-x-table-entry-is-masked-for-update.patch pci-msi-mask-all-unused-msi-x-entries.patch pci-msi-protect-msi_desc-masked-for-multi-msi.patch pci-msi-use-msi_mask_irq-in-pci_msi_shutdown.patch powerpc-32-fix-critical-and-debug-interrupts-on-booke.patch powerpc-32s-fix-napping-restore-in-data-storage-interrupt-dsi.patch powerpc-interrupt-do-not-call-single_step_exception-from-other-exceptions.patch powerpc-pseries-fix-update-of-lpar-security-flavor-after-lpm.patch powerpc-smp-fix-oops-in-topology_init.patch powerpc-xive-do-not-skip-cpu-less-nodes-when-creating-the-ipis.patch --- diff --git a/queue-5.13/efi-libstub-arm64-double-check-image-alignment-at-entry.patch b/queue-5.13/efi-libstub-arm64-double-check-image-alignment-at-entry.patch new file mode 100644 index 00000000000..b4bc0502359 --- /dev/null +++ b/queue-5.13/efi-libstub-arm64-double-check-image-alignment-at-entry.patch @@ -0,0 +1,41 @@ +From c32ac11da3f83bb42b986702a9b92f0a14ed4182 Mon Sep 17 00:00:00 2001 +From: Ard Biesheuvel +Date: Mon, 26 Jul 2021 16:31:44 +0200 +Subject: efi/libstub: arm64: Double check image alignment at entry + +From: Ard Biesheuvel + +commit c32ac11da3f83bb42b986702a9b92f0a14ed4182 upstream. + +On arm64, the stub only moves the kernel image around in memory if +needed, which is typically only for KASLR, given that relocatable +kernels (which is the default) can run from any 64k aligned address, +which is also the minimum alignment communicated to EFI via the PE/COFF +header. + +Unfortunately, some loaders appear to ignore this header, and load the +kernel at some arbitrary offset in memory. We can deal with this, but +let's check for this condition anyway, so non-compliant code can be +spotted and fixed. 
+ +Cc: # v5.10+ +Signed-off-by: Ard Biesheuvel +Tested-by: Benjamin Herrenschmidt +Signed-off-by: Greg Kroah-Hartman +--- + drivers/firmware/efi/libstub/arm64-stub.c | 4 ++++ + 1 file changed, 4 insertions(+) + +--- a/drivers/firmware/efi/libstub/arm64-stub.c ++++ b/drivers/firmware/efi/libstub/arm64-stub.c +@@ -119,6 +119,10 @@ efi_status_t handle_kernel_image(unsigne + if (image->image_base != _text) + efi_err("FIRMWARE BUG: efi_loaded_image_t::image_base has bogus value\n"); + ++ if (!IS_ALIGNED((u64)_text, EFI_KIMG_ALIGN)) ++ efi_err("FIRMWARE BUG: kernel image not aligned on %ldk boundary\n", ++ EFI_KIMG_ALIGN >> 10); ++ + kernel_size = _edata - _text; + kernel_memsize = kernel_size + (_end - _edata); + *reserve_size = kernel_memsize; diff --git a/queue-5.13/kvm-nvmx-use-vmx_need_pf_intercept-when-deciding-if-l0-wants-a-pf.patch b/queue-5.13/kvm-nvmx-use-vmx_need_pf_intercept-when-deciding-if-l0-wants-a-pf.patch new file mode 100644 index 00000000000..b0233419eeb --- /dev/null +++ b/queue-5.13/kvm-nvmx-use-vmx_need_pf_intercept-when-deciding-if-l0-wants-a-pf.patch @@ -0,0 +1,46 @@ +From 18712c13709d2de9516c5d3414f707c4f0a9c190 Mon Sep 17 00:00:00 2001 +From: Sean Christopherson +Date: Wed, 11 Aug 2021 21:56:15 -0700 +Subject: KVM: nVMX: Use vmx_need_pf_intercept() when deciding if L0 wants a #PF + +From: Sean Christopherson + +commit 18712c13709d2de9516c5d3414f707c4f0a9c190 upstream. + +Use vmx_need_pf_intercept() when determining if L0 wants to handle a #PF +in L2 or if the VM-Exit should be forwarded to L1. The current logic fails +to account for the case where #PF is intercepted to handle +guest.MAXPHYADDR < host.MAXPHYADDR and ends up reflecting all #PFs into +L1. At best, L1 will complain and inject the #PF back into L2. At +worst, L1 will eat the unexpected fault and cause L2 to hang on infinite +page faults. + +Note, while the bug was technically introduced by the commit that added +support for the MAXPHYADDR madness, the shame is all on commit +a0c134347baf ("KVM: VMX: introduce vmx_need_pf_intercept"). 
+ +Fixes: 1dbf5d68af6f ("KVM: VMX: Add guest physical address check in EPT violation and misconfig") +Cc: stable@vger.kernel.org +Cc: Peter Shier +Cc: Oliver Upton +Cc: Jim Mattson +Signed-off-by: Sean Christopherson +Message-Id: <20210812045615.3167686-1-seanjc@google.com> +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kvm/vmx/nested.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +--- a/arch/x86/kvm/vmx/nested.c ++++ b/arch/x86/kvm/vmx/nested.c +@@ -5798,7 +5798,8 @@ static bool nested_vmx_l0_wants_exit(str + if (is_nmi(intr_info)) + return true; + else if (is_page_fault(intr_info)) +- return vcpu->arch.apf.host_apf_flags || !enable_ept; ++ return vcpu->arch.apf.host_apf_flags || ++ vmx_need_pf_intercept(vcpu); + else if (is_debug(intr_info) && + vcpu->guest_debug & + (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) diff --git a/queue-5.13/kvm-vmx-use-current-vmcs-to-query-waitpkg-support-for-msr-emulation.patch b/queue-5.13/kvm-vmx-use-current-vmcs-to-query-waitpkg-support-for-msr-emulation.patch new file mode 100644 index 00000000000..9d740289b3e --- /dev/null +++ b/queue-5.13/kvm-vmx-use-current-vmcs-to-query-waitpkg-support-for-msr-emulation.patch @@ -0,0 +1,41 @@ +From 7b9cae027ba3aaac295ae23a62f47876ed97da73 Mon Sep 17 00:00:00 2001 +From: Sean Christopherson +Date: Tue, 10 Aug 2021 10:19:49 -0700 +Subject: KVM: VMX: Use current VMCS to query WAITPKG support for MSR emulation + +From: Sean Christopherson + +commit 7b9cae027ba3aaac295ae23a62f47876ed97da73 upstream. + +Use the secondary_exec_controls_get() accessor in vmx_has_waitpkg() to +effectively get the controls for the current VMCS, as opposed to using +vmx->secondary_exec_controls, which is the cached value of KVM's desired +controls for vmcs01 and truly not reflective of any particular VMCS. + +While the waitpkg control is not dynamic, i.e. vmcs01 will always hold +the same waitpkg configuration as vmx->secondary_exec_controls, the same +does not hold true for vmcs02 if the L1 VMM hides the feature from L2. +If L1 hides the feature _and_ does not intercept MSR_IA32_UMWAIT_CONTROL, +L2 could incorrectly read/write L1's virtual MSR instead of taking a #GP. + +Fixes: 6e3ba4abcea5 ("KVM: vmx: Emulate MSR IA32_UMWAIT_CONTROL") +Cc: stable@vger.kernel.org +Signed-off-by: Sean Christopherson +Message-Id: <20210810171952.2758100-2-seanjc@google.com> +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kvm/vmx/vmx.h | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/arch/x86/kvm/vmx/vmx.h ++++ b/arch/x86/kvm/vmx/vmx.h +@@ -538,7 +538,7 @@ static inline void decache_tsc_multiplie + + static inline bool vmx_has_waitpkg(struct vcpu_vmx *vmx) + { +- return vmx->secondary_exec_control & ++ return secondary_exec_controls_get(vmx) & + SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE; + } + diff --git a/queue-5.13/kvm-x86-mmu-don-t-leak-non-leaf-sptes-when-zapping-all-sptes.patch b/queue-5.13/kvm-x86-mmu-don-t-leak-non-leaf-sptes-when-zapping-all-sptes.patch new file mode 100644 index 00000000000..d8d3b414f35 --- /dev/null +++ b/queue-5.13/kvm-x86-mmu-don-t-leak-non-leaf-sptes-when-zapping-all-sptes.patch @@ -0,0 +1,148 @@ +From 524a1e4e381fc5e7781008d5bd420fd1357c0113 Mon Sep 17 00:00:00 2001 +From: Sean Christopherson +Date: Thu, 12 Aug 2021 11:14:13 -0700 +Subject: KVM: x86/mmu: Don't leak non-leaf SPTEs when zapping all SPTEs + +From: Sean Christopherson + +commit 524a1e4e381fc5e7781008d5bd420fd1357c0113 upstream. 
+ +Pass "all ones" as the end GFN to signal "zap all" for the TDP MMU and +really zap all SPTEs in this case. As is, zap_gfn_range() skips non-leaf +SPTEs whose range exceeds the range to be zapped. If shadow_phys_bits is +not aligned to the range size of top-level SPTEs, e.g. 512gb with 4-level +paging, the "zap all" flows will skip top-level SPTEs whose range extends +beyond shadow_phys_bits and leak their SPs when the VM is destroyed. + +Use the current upper bound (based on host.MAXPHYADDR) to detect that the +caller wants to zap all SPTEs, e.g. instead of using the max theoretical +gfn, 1 << (52 - 12). The more precise upper bound allows the TDP iterator +to terminate its walk earlier when running on hosts with MAXPHYADDR < 52. + +Add a WARN on kmv->arch.tdp_mmu_pages when the TDP MMU is destroyed to +help future debuggers should KVM decide to leak SPTEs again. + +The bug is most easily reproduced by running (and unloading!) KVM in a +VM whose host.MAXPHYADDR < 39, as the SPTE for gfn=0 will be skipped. + + ============================================================================= + BUG kvm_mmu_page_header (Not tainted): Objects remaining in kvm_mmu_page_header on __kmem_cache_shutdown() + ----------------------------------------------------------------------------- + Slab 0x000000004d8f7af1 objects=22 used=2 fp=0x00000000624d29ac flags=0x4000000000000200(slab|zone=1) + CPU: 0 PID: 1582 Comm: rmmod Not tainted 5.14.0-rc2+ #420 + Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015 + Call Trace: + dump_stack_lvl+0x45/0x59 + slab_err+0x95/0xc9 + __kmem_cache_shutdown.cold+0x3c/0x158 + kmem_cache_destroy+0x3d/0xf0 + kvm_mmu_module_exit+0xa/0x30 [kvm] + kvm_arch_exit+0x5d/0x90 [kvm] + kvm_exit+0x78/0x90 [kvm] + vmx_exit+0x1a/0x50 [kvm_intel] + __x64_sys_delete_module+0x13f/0x220 + do_syscall_64+0x3b/0xc0 + entry_SYSCALL_64_after_hwframe+0x44/0xae + +Fixes: faaf05b00aec ("kvm: x86/mmu: Support zapping SPTEs in the TDP MMU") +Cc: stable@vger.kernel.org +Cc: Ben Gardon +Signed-off-by: Sean Christopherson +Message-Id: <20210812181414.3376143-2-seanjc@google.com> +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kvm/mmu/tdp_mmu.c | 26 ++++++++++++++++---------- + 1 file changed, 16 insertions(+), 10 deletions(-) + +--- a/arch/x86/kvm/mmu/tdp_mmu.c ++++ b/arch/x86/kvm/mmu/tdp_mmu.c +@@ -41,6 +41,7 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm * + if (!kvm->arch.tdp_mmu_enabled) + return; + ++ WARN_ON(!list_empty(&kvm->arch.tdp_mmu_pages)); + WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots)); + + /* +@@ -79,8 +80,6 @@ static void tdp_mmu_free_sp_rcu_callback + void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root, + bool shared) + { +- gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT); +- + kvm_lockdep_assert_mmu_lock_held(kvm, shared); + + if (!refcount_dec_and_test(&root->tdp_mmu_root_count)) +@@ -92,7 +91,7 @@ void kvm_tdp_mmu_put_root(struct kvm *kv + list_del_rcu(&root->link); + spin_unlock(&kvm->arch.tdp_mmu_pages_lock); + +- zap_gfn_range(kvm, root, 0, max_gfn, false, false, shared); ++ zap_gfn_range(kvm, root, 0, -1ull, false, false, shared); + + call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback); + } +@@ -722,8 +721,17 @@ static bool zap_gfn_range(struct kvm *kv + gfn_t start, gfn_t end, bool can_yield, bool flush, + bool shared) + { ++ gfn_t max_gfn_host = 1ULL << (shadow_phys_bits - PAGE_SHIFT); ++ bool zap_all = (start == 0 && end >= max_gfn_host); + struct tdp_iter iter; + ++ /* ++ * Bound the walk at 
host.MAXPHYADDR, guest accesses beyond that will ++ * hit a #PF(RSVD) and never get to an EPT Violation/Misconfig / #NPF, ++ * and so KVM will never install a SPTE for such addresses. ++ */ ++ end = min(end, max_gfn_host); ++ + kvm_lockdep_assert_mmu_lock_held(kvm, shared); + + rcu_read_lock(); +@@ -742,9 +750,10 @@ retry: + /* + * If this is a non-last-level SPTE that covers a larger range + * than should be zapped, continue, and zap the mappings at a +- * lower level. ++ * lower level, except when zapping all SPTEs. + */ +- if ((iter.gfn < start || ++ if (!zap_all && ++ (iter.gfn < start || + iter.gfn + KVM_PAGES_PER_HPAGE(iter.level) > end) && + !is_last_spte(iter.old_spte, iter.level)) + continue; +@@ -792,12 +801,11 @@ bool __kvm_tdp_mmu_zap_gfn_range(struct + + void kvm_tdp_mmu_zap_all(struct kvm *kvm) + { +- gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT); + bool flush = false; + int i; + + for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) +- flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, 0, max_gfn, ++ flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, 0, -1ull, + flush, false); + + if (flush) +@@ -836,7 +844,6 @@ static struct kvm_mmu_page *next_invalid + */ + void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm) + { +- gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT); + struct kvm_mmu_page *next_root; + struct kvm_mmu_page *root; + bool flush = false; +@@ -852,8 +859,7 @@ void kvm_tdp_mmu_zap_invalidated_roots(s + + rcu_read_unlock(); + +- flush = zap_gfn_range(kvm, root, 0, max_gfn, true, flush, +- true); ++ flush = zap_gfn_range(kvm, root, 0, -1ull, true, flush, true); + + /* + * Put the reference acquired in diff --git a/queue-5.13/kvm-x86-mmu-protect-marking-sps-unsync-when-using-tdp-mmu-with-spinlock.patch b/queue-5.13/kvm-x86-mmu-protect-marking-sps-unsync-when-using-tdp-mmu-with-spinlock.patch new file mode 100644 index 00000000000..29b36c49821 --- /dev/null +++ b/queue-5.13/kvm-x86-mmu-protect-marking-sps-unsync-when-using-tdp-mmu-with-spinlock.patch @@ -0,0 +1,159 @@ +From ce25681d59ffc4303321e555a2d71b1946af07da Mon Sep 17 00:00:00 2001 +From: Sean Christopherson +Date: Thu, 12 Aug 2021 11:18:15 -0700 +Subject: KVM: x86/mmu: Protect marking SPs unsync when using TDP MMU with spinlock + +From: Sean Christopherson + +commit ce25681d59ffc4303321e555a2d71b1946af07da upstream. + +Add yet another spinlock for the TDP MMU and take it when marking indirect +shadow pages unsync. When using the TDP MMU and L1 is running L2(s) with +nested TDP, KVM may encounter shadow pages for the TDP entries managed by +L1 (controlling L2) when handling a TDP MMU page fault. The unsync logic +is not thread safe, e.g. the kvm_mmu_page fields are not atomic, and +misbehaves when a shadow page is marked unsync via a TDP MMU page fault, +which runs with mmu_lock held for read, not write. + +Lack of a critical section manifests most visibly as an underflow of +unsync_children in clear_unsync_child_bit() due to unsync_children being +corrupted when multiple CPUs write it without a critical section and +without atomic operations. But underflow is the best case scenario. The +worst case scenario is that unsync_children prematurely hits '0' and +leads to guest memory corruption due to KVM neglecting to properly sync +shadow pages. + +Use an entirely new spinlock even though piggybacking tdp_mmu_pages_lock +would functionally be ok. 
Usurping the lock could degrade performance when +building upper level page tables on different vCPUs, especially since the +unsync flow could hold the lock for a comparatively long time depending on +the number of indirect shadow pages and the depth of the paging tree. + +For simplicity, take the lock for all MMUs, even though KVM could fairly +easily know that mmu_lock is held for write. If mmu_lock is held for +write, there cannot be contention for the inner spinlock, and marking +shadow pages unsync across multiple vCPUs will be slow enough that +bouncing the kvm_arch cacheline should be in the noise. + +Note, even though L2 could theoretically be given access to its own EPT +entries, a nested MMU must hold mmu_lock for write and thus cannot race +against a TDP MMU page fault. I.e. the additional spinlock only _needs_ to +be taken by the TDP MMU, as opposed to being taken by any MMU for a VM +that is running with the TDP MMU enabled. Holding mmu_lock for read also +prevents the indirect shadow page from being freed. But as above, keep +it simple and always take the lock. + +Alternative #1, the TDP MMU could simply pass "false" for can_unsync and +effectively disable unsync behavior for nested TDP. Write protecting leaf +shadow pages is unlikely to noticeably impact traditional L1 VMMs, as such +VMMs typically don't modify TDP entries, but the same may not hold true for +non-standard use cases and/or VMMs that are migrating physical pages (from +L1's perspective). + +Alternative #2, the unsync logic could be made thread safe. In theory, +simply converting all relevant kvm_mmu_page fields to atomics and using +atomic bitops for the bitmap would suffice. However, (a) an in-depth audit +would be required, (b) the code churn would be substantial, and (c) legacy +shadow paging would incur additional atomic operations in performance +sensitive paths for no benefit (to legacy shadow paging). + +Fixes: a2855afc7ee8 ("KVM: x86/mmu: Allow parallel page faults for the TDP MMU") +Cc: stable@vger.kernel.org +Cc: Ben Gardon +Signed-off-by: Sean Christopherson +Message-Id: <20210812181815.3378104-1-seanjc@google.com> +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman +--- + Documentation/virt/kvm/locking.rst | 8 ++++---- + arch/x86/include/asm/kvm_host.h | 7 +++++++ + arch/x86/kvm/mmu/mmu.c | 28 ++++++++++++++++++++++++++++ + 3 files changed, 39 insertions(+), 4 deletions(-) + +--- a/Documentation/virt/kvm/locking.rst ++++ b/Documentation/virt/kvm/locking.rst +@@ -20,10 +20,10 @@ On x86: + + - vcpu->mutex is taken outside kvm->arch.hyperv.hv_lock + +-- kvm->arch.mmu_lock is an rwlock. kvm->arch.tdp_mmu_pages_lock is +- taken inside kvm->arch.mmu_lock, and cannot be taken without already +- holding kvm->arch.mmu_lock (typically with ``read_lock``, otherwise +- there's no need to take kvm->arch.tdp_mmu_pages_lock at all). ++- kvm->arch.mmu_lock is an rwlock. kvm->arch.tdp_mmu_pages_lock and ++ kvm->arch.mmu_unsync_pages_lock are taken inside kvm->arch.mmu_lock, and ++ cannot be taken without already holding kvm->arch.mmu_lock (typically with ++ ``read_lock`` for the TDP MMU, thus the need for additional spinlocks). + + Everything else is a leaf: no other lock is taken inside the critical + sections. 
+--- a/arch/x86/include/asm/kvm_host.h ++++ b/arch/x86/include/asm/kvm_host.h +@@ -987,6 +987,13 @@ struct kvm_arch { + struct list_head lpage_disallowed_mmu_pages; + struct kvm_page_track_notifier_node mmu_sp_tracker; + struct kvm_page_track_notifier_head track_notifier_head; ++ /* ++ * Protects marking pages unsync during page faults, as TDP MMU page ++ * faults only take mmu_lock for read. For simplicity, the unsync ++ * pages lock is always taken when marking pages unsync regardless of ++ * whether mmu_lock is held for read or write. ++ */ ++ spinlock_t mmu_unsync_pages_lock; + + struct list_head assigned_dev_head; + struct iommu_domain *iommu_domain; +--- a/arch/x86/kvm/mmu/mmu.c ++++ b/arch/x86/kvm/mmu/mmu.c +@@ -2454,6 +2454,7 @@ bool mmu_need_write_protect(struct kvm_v + bool can_unsync) + { + struct kvm_mmu_page *sp; ++ bool locked = false; + + if (kvm_page_track_is_active(vcpu, gfn, KVM_PAGE_TRACK_WRITE)) + return true; +@@ -2465,9 +2466,34 @@ bool mmu_need_write_protect(struct kvm_v + if (sp->unsync) + continue; + ++ /* ++ * TDP MMU page faults require an additional spinlock as they ++ * run with mmu_lock held for read, not write, and the unsync ++ * logic is not thread safe. Take the spinklock regardless of ++ * the MMU type to avoid extra conditionals/parameters, there's ++ * no meaningful penalty if mmu_lock is held for write. ++ */ ++ if (!locked) { ++ locked = true; ++ spin_lock(&vcpu->kvm->arch.mmu_unsync_pages_lock); ++ ++ /* ++ * Recheck after taking the spinlock, a different vCPU ++ * may have since marked the page unsync. A false ++ * positive on the unprotected check above is not ++ * possible as clearing sp->unsync _must_ hold mmu_lock ++ * for write, i.e. unsync cannot transition from 0->1 ++ * while this CPU holds mmu_lock for read (or write). ++ */ ++ if (READ_ONCE(sp->unsync)) ++ continue; ++ } ++ + WARN_ON(sp->role.level != PG_LEVEL_4K); + kvm_unsync_page(vcpu, sp); + } ++ if (locked) ++ spin_unlock(&vcpu->kvm->arch.mmu_unsync_pages_lock); + + /* + * We need to ensure that the marking of unsync pages is visible +@@ -5514,6 +5540,8 @@ void kvm_mmu_init_vm(struct kvm *kvm) + { + struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker; + ++ spin_lock_init(&kvm->arch.mmu_unsync_pages_lock); ++ + kvm_mmu_init_tdp_mmu(kvm); + + node->track_write = kvm_mmu_pte_write; diff --git a/queue-5.13/locking-rtmutex-use-the-correct-rtmutex-debugging-config-option.patch b/queue-5.13/locking-rtmutex-use-the-correct-rtmutex-debugging-config-option.patch new file mode 100644 index 00000000000..387433be6c3 --- /dev/null +++ b/queue-5.13/locking-rtmutex-use-the-correct-rtmutex-debugging-config-option.patch @@ -0,0 +1,34 @@ +From 07d25971b220e477eb019fcb520a9f2e3ac966af Mon Sep 17 00:00:00 2001 +From: Zhen Lei +Date: Sat, 31 Jul 2021 20:30:11 +0800 +Subject: locking/rtmutex: Use the correct rtmutex debugging config option + +From: Zhen Lei + +commit 07d25971b220e477eb019fcb520a9f2e3ac966af upstream. + +It's CONFIG_DEBUG_RT_MUTEXES not CONFIG_DEBUG_RT_MUTEX. 
+ +Fixes: f7efc4799f81 ("locking/rtmutex: Inline chainwalk depth check") +Signed-off-by: Zhen Lei +Signed-off-by: Thomas Gleixner +Acked-by: Will Deacon +Acked-by: Boqun Feng +Cc: stable@vger.kernel.org +Link: https://lore.kernel.org/r/20210731123011.4555-1-thunder.leizhen@huawei.com +Signed-off-by: Greg Kroah-Hartman +--- + kernel/locking/rtmutex.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/kernel/locking/rtmutex.c ++++ b/kernel/locking/rtmutex.c +@@ -343,7 +343,7 @@ static __always_inline bool + rt_mutex_cond_detect_deadlock(struct rt_mutex_waiter *waiter, + enum rtmutex_chainwalk chwalk) + { +- if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEX)) ++ if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES)) + return waiter != NULL; + return chwalk == RT_MUTEX_FULL_CHAINWALK; + } diff --git a/queue-5.13/pci-msi-correct-misleading-comments.patch b/queue-5.13/pci-msi-correct-misleading-comments.patch new file mode 100644 index 00000000000..daced38f20e --- /dev/null +++ b/queue-5.13/pci-msi-correct-misleading-comments.patch @@ -0,0 +1,45 @@ +From 689e6b5351573c38ccf92a0dd8b3e2c2241e4aff Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner +Date: Thu, 29 Jul 2021 23:51:45 +0200 +Subject: PCI/MSI: Correct misleading comments + +From: Thomas Gleixner + +commit 689e6b5351573c38ccf92a0dd8b3e2c2241e4aff upstream. + +The comments about preserving the cached state in pci_msi[x]_shutdown() are +misleading as the MSI descriptors are freed right after those functions +return. So there is nothing to restore. Preparatory change. + +Signed-off-by: Thomas Gleixner +Tested-by: Marc Zyngier +Reviewed-by: Marc Zyngier +Cc: stable@vger.kernel.org +Link: https://lore.kernel.org/r/20210729222542.621609423@linutronix.de +Signed-off-by: Greg Kroah-Hartman +--- + drivers/pci/msi.c | 5 +---- + 1 file changed, 1 insertion(+), 4 deletions(-) + +--- a/drivers/pci/msi.c ++++ b/drivers/pci/msi.c +@@ -961,7 +961,6 @@ static void pci_msi_shutdown(struct pci_ + + /* Return the device with MSI unmasked as initial states */ + mask = msi_mask(desc->msi_attrib.multi_cap); +- /* Keep cached state to be restored */ + __pci_msi_desc_mask_irq(desc, mask, 0); + + /* Restore dev->irq to its default pin-assertion IRQ */ +@@ -1047,10 +1046,8 @@ static void pci_msix_shutdown(struct pci + } + + /* Return the device with MSI-X masked as initial states */ +- for_each_pci_msi_entry(entry, dev) { +- /* Keep cached states to be restored */ ++ for_each_pci_msi_entry(entry, dev) + __pci_msix_desc_mask_irq(entry, 1); +- } + + pci_msix_clear_and_set_ctrl(dev, PCI_MSIX_FLAGS_ENABLE, 0); + pci_intx_for_msi(dev, 1); diff --git a/queue-5.13/pci-msi-do-not-set-invalid-bits-in-msi-mask.patch b/queue-5.13/pci-msi-do-not-set-invalid-bits-in-msi-mask.patch new file mode 100644 index 00000000000..0dcc3f9065b --- /dev/null +++ b/queue-5.13/pci-msi-do-not-set-invalid-bits-in-msi-mask.patch @@ -0,0 +1,64 @@ +From 361fd37397f77578735907341579397d5bed0a2d Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner +Date: Thu, 29 Jul 2021 23:51:44 +0200 +Subject: PCI/MSI: Do not set invalid bits in MSI mask + +From: Thomas Gleixner + +commit 361fd37397f77578735907341579397d5bed0a2d upstream. + +msi_mask_irq() takes a mask and a flags argument. The mask argument is used +to mask out bits from the cached mask and the flags argument to set bits. + +Some places invoke it with a flags argument which sets bits which are not +used by the device, i.e. when the device supports up to 8 vectors a full +unmask in some places sets the mask to 0xFFFFFF00. 
While devices probably +do not care, it's still bad practice. + +Fixes: 7ba1930db02f ("PCI MSI: Unmask MSI if setup failed") +Signed-off-by: Thomas Gleixner +Tested-by: Marc Zyngier +Reviewed-by: Marc Zyngier +Cc: stable@vger.kernel.org +Link: https://lore.kernel.org/r/20210729222542.568173099@linutronix.de +Signed-off-by: Greg Kroah-Hartman +--- + drivers/pci/msi.c | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +--- a/drivers/pci/msi.c ++++ b/drivers/pci/msi.c +@@ -656,21 +656,21 @@ static int msi_capability_init(struct pc + /* Configure MSI capability structure */ + ret = pci_msi_setup_msi_irqs(dev, nvec, PCI_CAP_ID_MSI); + if (ret) { +- msi_mask_irq(entry, mask, ~mask); ++ msi_mask_irq(entry, mask, 0); + free_msi_irqs(dev); + return ret; + } + + ret = msi_verify_entries(dev); + if (ret) { +- msi_mask_irq(entry, mask, ~mask); ++ msi_mask_irq(entry, mask, 0); + free_msi_irqs(dev); + return ret; + } + + ret = populate_msi_sysfs(dev); + if (ret) { +- msi_mask_irq(entry, mask, ~mask); ++ msi_mask_irq(entry, mask, 0); + free_msi_irqs(dev); + return ret; + } +@@ -962,7 +962,7 @@ static void pci_msi_shutdown(struct pci_ + /* Return the device with MSI unmasked as initial states */ + mask = msi_mask(desc->msi_attrib.multi_cap); + /* Keep cached state to be restored */ +- __pci_msi_desc_mask_irq(desc, mask, ~mask); ++ __pci_msi_desc_mask_irq(desc, mask, 0); + + /* Restore dev->irq to its default pin-assertion IRQ */ + dev->irq = desc->msi_attrib.default_irq; diff --git a/queue-5.13/pci-msi-enforce-msi-entry-updates-to-be-visible.patch b/queue-5.13/pci-msi-enforce-msi-entry-updates-to-be-visible.patch new file mode 100644 index 00000000000..b45429d8f71 --- /dev/null +++ b/queue-5.13/pci-msi-enforce-msi-entry-updates-to-be-visible.patch @@ -0,0 +1,54 @@ +From b9255a7cb51754e8d2645b65dd31805e282b4f3e Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner +Date: Thu, 29 Jul 2021 23:51:43 +0200 +Subject: PCI/MSI: Enforce MSI[X] entry updates to be visible + +From: Thomas Gleixner + +commit b9255a7cb51754e8d2645b65dd31805e282b4f3e upstream. + +Nothing enforces the posted writes to be visible when the function +returns. Flush them even if the flush might be redundant when the entry is +masked already as the unmask will flush as well. This is either setup or a +rare affinity change event so the extra flush is not the end of the world. + +While this is more a theoretical issue especially the logic in the X86 +specific msi_set_affinity() function relies on the assumption that the +update has reached the hardware when the function returns. 
+ +Again, as this never has been enforced the Fixes tag refers to a commit in: + git://git.kernel.org/pub/scm/linux/kernel/git/tglx/history.git + +Fixes: f036d4ea5fa7 ("[PATCH] ia32 Message Signalled Interrupt support") +Signed-off-by: Thomas Gleixner +Tested-by: Marc Zyngier +Reviewed-by: Marc Zyngier +Acked-by: Bjorn Helgaas +Cc: stable@vger.kernel.org +Link: https://lore.kernel.org/r/20210729222542.515188147@linutronix.de +Signed-off-by: Greg Kroah-Hartman +--- + drivers/pci/msi.c | 5 +++++ + 1 file changed, 5 insertions(+) + +--- a/drivers/pci/msi.c ++++ b/drivers/pci/msi.c +@@ -311,6 +311,9 @@ void __pci_write_msi_msg(struct msi_desc + + if (unmasked) + __pci_msix_desc_mask_irq(entry, 0); ++ ++ /* Ensure that the writes are visible in the device */ ++ readl(base + PCI_MSIX_ENTRY_DATA); + } else { + int pos = dev->msi_cap; + u16 msgctl; +@@ -331,6 +334,8 @@ void __pci_write_msi_msg(struct msi_desc + pci_write_config_word(dev, pos + PCI_MSI_DATA_32, + msg->data); + } ++ /* Ensure that the writes are visible in the device */ ++ pci_read_config_word(dev, pos + PCI_MSI_FLAGS, &msgctl); + } + + skip: diff --git a/queue-5.13/pci-msi-enforce-that-msi-x-table-entry-is-masked-for-update.patch b/queue-5.13/pci-msi-enforce-that-msi-x-table-entry-is-masked-for-update.patch new file mode 100644 index 00000000000..b5cda436867 --- /dev/null +++ b/queue-5.13/pci-msi-enforce-that-msi-x-table-entry-is-masked-for-update.patch @@ -0,0 +1,73 @@ +From da181dc974ad667579baece33c2c8d2d1e4558d5 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner +Date: Thu, 29 Jul 2021 23:51:42 +0200 +Subject: PCI/MSI: Enforce that MSI-X table entry is masked for update + +From: Thomas Gleixner + +commit da181dc974ad667579baece33c2c8d2d1e4558d5 upstream. + +The specification (PCIe r5.0, sec 6.1.4.5) states: + + For MSI-X, a function is permitted to cache Address and Data values + from unmasked MSI-X Table entries. However, anytime software unmasks a + currently masked MSI-X Table entry either by clearing its Mask bit or + by clearing the Function Mask bit, the function must update any Address + or Data values that it cached from that entry. If software changes the + Address or Data value of an entry while the entry is unmasked, the + result is undefined. + +The Linux kernel's MSI-X support never enforced that the entry is masked +before the entry is modified hence the Fixes tag refers to a commit in: + git://git.kernel.org/pub/scm/linux/kernel/git/tglx/history.git + +Enforce the entry to be masked across the update. + +There is no point in enforcing this to be handled at all possible call +sites as this is just pointless code duplication and the common update +function is the obvious place to enforce this. 
+ +Fixes: f036d4ea5fa7 ("[PATCH] ia32 Message Signalled Interrupt support") +Reported-by: Kevin Tian +Signed-off-by: Thomas Gleixner +Tested-by: Marc Zyngier +Reviewed-by: Marc Zyngier +Acked-by: Bjorn Helgaas +Cc: stable@vger.kernel.org +Link: https://lore.kernel.org/r/20210729222542.462096385@linutronix.de +Signed-off-by: Greg Kroah-Hartman +--- + drivers/pci/msi.c | 15 +++++++++++++++ + 1 file changed, 15 insertions(+) + +--- a/drivers/pci/msi.c ++++ b/drivers/pci/msi.c +@@ -289,13 +289,28 @@ void __pci_write_msi_msg(struct msi_desc + /* Don't touch the hardware now */ + } else if (entry->msi_attrib.is_msix) { + void __iomem *base = pci_msix_desc_addr(entry); ++ bool unmasked = !(entry->masked & PCI_MSIX_ENTRY_CTRL_MASKBIT); + + if (!base) + goto skip; + ++ /* ++ * The specification mandates that the entry is masked ++ * when the message is modified: ++ * ++ * "If software changes the Address or Data value of an ++ * entry while the entry is unmasked, the result is ++ * undefined." ++ */ ++ if (unmasked) ++ __pci_msix_desc_mask_irq(entry, PCI_MSIX_ENTRY_CTRL_MASKBIT); ++ + writel(msg->address_lo, base + PCI_MSIX_ENTRY_LOWER_ADDR); + writel(msg->address_hi, base + PCI_MSIX_ENTRY_UPPER_ADDR); + writel(msg->data, base + PCI_MSIX_ENTRY_DATA); ++ ++ if (unmasked) ++ __pci_msix_desc_mask_irq(entry, 0); + } else { + int pos = dev->msi_cap; + u16 msgctl; diff --git a/queue-5.13/pci-msi-mask-all-unused-msi-x-entries.patch b/queue-5.13/pci-msi-mask-all-unused-msi-x-entries.patch new file mode 100644 index 00000000000..7e608aec4a1 --- /dev/null +++ b/queue-5.13/pci-msi-mask-all-unused-msi-x-entries.patch @@ -0,0 +1,181 @@ +From 7d5ec3d3612396dc6d4b76366d20ab9fc06f399f Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner +Date: Thu, 29 Jul 2021 23:51:41 +0200 +Subject: PCI/MSI: Mask all unused MSI-X entries + +From: Thomas Gleixner + +commit 7d5ec3d3612396dc6d4b76366d20ab9fc06f399f upstream. + +When MSI-X is enabled the ordering of calls is: + + msix_map_region(); + msix_setup_entries(); + pci_msi_setup_msi_irqs(); + msix_program_entries(); + +This has a few interesting issues: + + 1) msix_setup_entries() allocates the MSI descriptors and initializes them + except for the msi_desc:masked member which is left zero initialized. + + 2) pci_msi_setup_msi_irqs() allocates the interrupt descriptors and sets + up the MSI interrupts which ends up in pci_write_msi_msg() unless the + interrupt chip provides its own irq_write_msi_msg() function. + + 3) msix_program_entries() does not do what the name suggests. It solely + updates the entries array (if not NULL) and initializes the masked + member for each MSI descriptor by reading the hardware state and then + masks the entry. + +Obviously this has some issues: + + 1) The uninitialized masked member of msi_desc prevents the enforcement + of masking the entry in pci_write_msi_msg() depending on the cached + masked bit. Aside of that half initialized data is a NONO in general + + 2) msix_program_entries() only ensures that the actually allocated entries + are masked. This is wrong as experimentation with crash testing and + crash kernel kexec has shown. + + This limited testing unearthed that when the production kernel had more + entries in use and unmasked when it crashed and the crash kernel + allocated a smaller amount of entries, then a full scan of all entries + found unmasked entries which were in use in the production kernel. 
+ + This is obviously a device or emulation issue as the device reset + should mask all MSI-X table entries, but obviously that's just part + of the paper specification. + +Cure this by: + + 1) Masking all table entries in hardware + 2) Initializing msi_desc::masked in msix_setup_entries() + 3) Removing the mask dance in msix_program_entries() + 4) Renaming msix_program_entries() to msix_update_entries() to + reflect the purpose of that function. + +As the masking of unused entries has never been done the Fixes tag refers +to a commit in: + git://git.kernel.org/pub/scm/linux/kernel/git/tglx/history.git + +Fixes: f036d4ea5fa7 ("[PATCH] ia32 Message Signalled Interrupt support") +Signed-off-by: Thomas Gleixner +Tested-by: Marc Zyngier +Reviewed-by: Marc Zyngier +Acked-by: Bjorn Helgaas +Cc: stable@vger.kernel.org +Link: https://lore.kernel.org/r/20210729222542.403833459@linutronix.de +Signed-off-by: Greg Kroah-Hartman +--- + drivers/pci/msi.c | 45 +++++++++++++++++++++++++++------------------ + 1 file changed, 27 insertions(+), 18 deletions(-) + +--- a/drivers/pci/msi.c ++++ b/drivers/pci/msi.c +@@ -691,6 +691,7 @@ static int msix_setup_entries(struct pci + { + struct irq_affinity_desc *curmsk, *masks = NULL; + struct msi_desc *entry; ++ void __iomem *addr; + int ret, i; + int vec_count = pci_msix_vec_count(dev); + +@@ -711,6 +712,7 @@ static int msix_setup_entries(struct pci + + entry->msi_attrib.is_msix = 1; + entry->msi_attrib.is_64 = 1; ++ + if (entries) + entry->msi_attrib.entry_nr = entries[i].entry; + else +@@ -722,6 +724,10 @@ static int msix_setup_entries(struct pci + entry->msi_attrib.default_irq = dev->irq; + entry->mask_base = base; + ++ addr = pci_msix_desc_addr(entry); ++ if (addr) ++ entry->masked = readl(addr + PCI_MSIX_ENTRY_VECTOR_CTRL); ++ + list_add_tail(&entry->list, dev_to_msi_list(&dev->dev)); + if (masks) + curmsk++; +@@ -732,26 +738,25 @@ out: + return ret; + } + +-static void msix_program_entries(struct pci_dev *dev, +- struct msix_entry *entries) ++static void msix_update_entries(struct pci_dev *dev, struct msix_entry *entries) + { + struct msi_desc *entry; +- int i = 0; +- void __iomem *desc_addr; + + for_each_pci_msi_entry(entry, dev) { +- if (entries) +- entries[i++].vector = entry->irq; ++ if (entries) { ++ entries->vector = entry->irq; ++ entries++; ++ } ++ } ++} + +- desc_addr = pci_msix_desc_addr(entry); +- if (desc_addr) +- entry->masked = readl(desc_addr + +- PCI_MSIX_ENTRY_VECTOR_CTRL); +- else +- entry->masked = 0; ++static void msix_mask_all(void __iomem *base, int tsize) ++{ ++ u32 ctrl = PCI_MSIX_ENTRY_CTRL_MASKBIT; ++ int i; + +- msix_mask_irq(entry, 1); +- } ++ for (i = 0; i < tsize; i++, base += PCI_MSIX_ENTRY_SIZE) ++ writel(ctrl, base + PCI_MSIX_ENTRY_VECTOR_CTRL); + } + + /** +@@ -768,9 +773,9 @@ static void msix_program_entries(struct + static int msix_capability_init(struct pci_dev *dev, struct msix_entry *entries, + int nvec, struct irq_affinity *affd) + { +- int ret; +- u16 control; + void __iomem *base; ++ int ret, tsize; ++ u16 control; + + /* + * Some devices require MSI-X to be enabled before the MSI-X +@@ -782,12 +787,16 @@ static int msix_capability_init(struct p + + pci_read_config_word(dev, dev->msix_cap + PCI_MSIX_FLAGS, &control); + /* Request & Map MSI-X table region */ +- base = msix_map_region(dev, msix_table_size(control)); ++ tsize = msix_table_size(control); ++ base = msix_map_region(dev, tsize); + if (!base) { + ret = -ENOMEM; + goto out_disable; + } + ++ /* Ensure that all table entries are masked. 
*/ ++ msix_mask_all(base, tsize); ++ + ret = msix_setup_entries(dev, base, entries, nvec, affd); + if (ret) + goto out_disable; +@@ -801,7 +810,7 @@ static int msix_capability_init(struct p + if (ret) + goto out_free; + +- msix_program_entries(dev, entries); ++ msix_update_entries(dev, entries); + + ret = populate_msi_sysfs(dev); + if (ret) diff --git a/queue-5.13/pci-msi-protect-msi_desc-masked-for-multi-msi.patch b/queue-5.13/pci-msi-protect-msi_desc-masked-for-multi-msi.patch new file mode 100644 index 00000000000..4b69c45a693 --- /dev/null +++ b/queue-5.13/pci-msi-protect-msi_desc-masked-for-multi-msi.patch @@ -0,0 +1,114 @@ +From 77e89afc25f30abd56e76a809ee2884d7c1b63ce Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner +Date: Thu, 29 Jul 2021 23:51:47 +0200 +Subject: PCI/MSI: Protect msi_desc::masked for multi-MSI + +From: Thomas Gleixner + +commit 77e89afc25f30abd56e76a809ee2884d7c1b63ce upstream. + +Multi-MSI uses a single MSI descriptor and there is a single mask register +when the device supports per vector masking. To avoid reading back the mask +register the value is cached in the MSI descriptor and updates are done by +clearing and setting bits in the cache and writing it to the device. + +But nothing protects msi_desc::masked and the mask register from being +modified concurrently on two different CPUs for two different Linux +interrupts which belong to the same multi-MSI descriptor. + +Add a lock to struct device and protect any operation on the mask and the +mask register with it. + +This makes the update of msi_desc::masked unconditional, but there is no +place which requires a modification of the hardware register without +updating the masked cache. + +msi_mask_irq() is now an empty wrapper which will be cleaned up in follow +up changes. + +The problem goes way back to the initial support of multi-MSI, but picking +the commit which introduced the mask cache is a valid cut off point +(2.6.30). + +Fixes: f2440d9acbe8 ("PCI MSI: Refactor interrupt masking code") +Signed-off-by: Thomas Gleixner +Tested-by: Marc Zyngier +Reviewed-by: Marc Zyngier +Cc: stable@vger.kernel.org +Link: https://lore.kernel.org/r/20210729222542.726833414@linutronix.de +Signed-off-by: Greg Kroah-Hartman +--- + drivers/base/core.c | 1 + + drivers/pci/msi.c | 19 ++++++++++--------- + include/linux/device.h | 1 + + include/linux/msi.h | 2 +- + 4 files changed, 13 insertions(+), 10 deletions(-) + +--- a/drivers/base/core.c ++++ b/drivers/base/core.c +@@ -2809,6 +2809,7 @@ void device_initialize(struct device *de + device_pm_init(dev); + set_dev_node(dev, -1); + #ifdef CONFIG_GENERIC_MSI_IRQ ++ raw_spin_lock_init(&dev->msi_lock); + INIT_LIST_HEAD(&dev->msi_list); + #endif + INIT_LIST_HEAD(&dev->links.consumers); +--- a/drivers/pci/msi.c ++++ b/drivers/pci/msi.c +@@ -143,24 +143,25 @@ static inline __attribute_const__ u32 ms + * reliably as devices without an INTx disable bit will then generate a + * level IRQ which will never be cleared. 
+ */ +-u32 __pci_msi_desc_mask_irq(struct msi_desc *desc, u32 mask, u32 flag) ++void __pci_msi_desc_mask_irq(struct msi_desc *desc, u32 mask, u32 flag) + { +- u32 mask_bits = desc->masked; ++ raw_spinlock_t *lock = &desc->dev->msi_lock; ++ unsigned long flags; + + if (pci_msi_ignore_mask || !desc->msi_attrib.maskbit) +- return 0; ++ return; + +- mask_bits &= ~mask; +- mask_bits |= flag; ++ raw_spin_lock_irqsave(lock, flags); ++ desc->masked &= ~mask; ++ desc->masked |= flag; + pci_write_config_dword(msi_desc_to_pci_dev(desc), desc->mask_pos, +- mask_bits); +- +- return mask_bits; ++ desc->masked); ++ raw_spin_unlock_irqrestore(lock, flags); + } + + static void msi_mask_irq(struct msi_desc *desc, u32 mask, u32 flag) + { +- desc->masked = __pci_msi_desc_mask_irq(desc, mask, flag); ++ __pci_msi_desc_mask_irq(desc, mask, flag); + } + + static void __iomem *pci_msix_desc_addr(struct msi_desc *desc) +--- a/include/linux/device.h ++++ b/include/linux/device.h +@@ -496,6 +496,7 @@ struct device { + struct dev_pin_info *pins; + #endif + #ifdef CONFIG_GENERIC_MSI_IRQ ++ raw_spinlock_t msi_lock; + struct list_head msi_list; + #endif + #ifdef CONFIG_DMA_OPS +--- a/include/linux/msi.h ++++ b/include/linux/msi.h +@@ -233,7 +233,7 @@ void __pci_read_msi_msg(struct msi_desc + void __pci_write_msi_msg(struct msi_desc *entry, struct msi_msg *msg); + + u32 __pci_msix_desc_mask_irq(struct msi_desc *desc, u32 flag); +-u32 __pci_msi_desc_mask_irq(struct msi_desc *desc, u32 mask, u32 flag); ++void __pci_msi_desc_mask_irq(struct msi_desc *desc, u32 mask, u32 flag); + void pci_msi_mask_irq(struct irq_data *data); + void pci_msi_unmask_irq(struct irq_data *data); + diff --git a/queue-5.13/pci-msi-use-msi_mask_irq-in-pci_msi_shutdown.patch b/queue-5.13/pci-msi-use-msi_mask_irq-in-pci_msi_shutdown.patch new file mode 100644 index 00000000000..6e5ca2e91f0 --- /dev/null +++ b/queue-5.13/pci-msi-use-msi_mask_irq-in-pci_msi_shutdown.patch @@ -0,0 +1,33 @@ +From d28d4ad2a1aef27458b3383725bb179beb8d015c Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner +Date: Thu, 29 Jul 2021 23:51:46 +0200 +Subject: PCI/MSI: Use msi_mask_irq() in pci_msi_shutdown() + +From: Thomas Gleixner + +commit d28d4ad2a1aef27458b3383725bb179beb8d015c upstream. + +No point in using the raw write function from shutdown. Preparatory change +to introduce proper serialization for the msi_desc::masked cache. 
+ +Signed-off-by: Thomas Gleixner +Tested-by: Marc Zyngier +Reviewed-by: Marc Zyngier +Cc: stable@vger.kernel.org +Link: https://lore.kernel.org/r/20210729222542.674391354@linutronix.de +Signed-off-by: Greg Kroah-Hartman +--- + drivers/pci/msi.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/drivers/pci/msi.c ++++ b/drivers/pci/msi.c +@@ -961,7 +961,7 @@ static void pci_msi_shutdown(struct pci_ + + /* Return the device with MSI unmasked as initial states */ + mask = msi_mask(desc->msi_attrib.multi_cap); +- __pci_msi_desc_mask_irq(desc, mask, 0); ++ msi_mask_irq(desc, mask, 0); + + /* Restore dev->irq to its default pin-assertion IRQ */ + dev->irq = desc->msi_attrib.default_irq; diff --git a/queue-5.13/powerpc-32-fix-critical-and-debug-interrupts-on-booke.patch b/queue-5.13/powerpc-32-fix-critical-and-debug-interrupts-on-booke.patch new file mode 100644 index 00000000000..2b347c4d657 --- /dev/null +++ b/queue-5.13/powerpc-32-fix-critical-and-debug-interrupts-on-booke.patch @@ -0,0 +1,161 @@ +From b5cfc9cd7b0426e94ffd9e9ed79d1b00ace7780a Mon Sep 17 00:00:00 2001 +From: Christophe Leroy +Date: Wed, 7 Jul 2021 05:55:07 +0000 +Subject: powerpc/32: Fix critical and debug interrupts on BOOKE + +From: Christophe Leroy + +commit b5cfc9cd7b0426e94ffd9e9ed79d1b00ace7780a upstream. + +32 bits BOOKE have special interrupts for debug and other +critical events. + +When handling those interrupts, dedicated registers are saved +in the stack frame in addition to the standard registers, leading +to a shift of the pt_regs struct. + +Since commit db297c3b07af ("powerpc/32: Don't save thread.regs on +interrupt entry"), the pt_regs struct is expected to be at the +same place all the time. + +Instead of handling a special struct in addition to pt_regs, just +add those special registers to struct pt_regs. 
+ +Fixes: db297c3b07af ("powerpc/32: Don't save thread.regs on interrupt entry") +Cc: stable@vger.kernel.org +Reported-by: Radu Rendec +Signed-off-by: Christophe Leroy +Signed-off-by: Michael Ellerman +Link: https://lore.kernel.org/r/028d5483b4851b01ea4334d0751e7f260419092b.1625637264.git.christophe.leroy@csgroup.eu +Signed-off-by: Greg Kroah-Hartman +--- + arch/powerpc/include/asm/ptrace.h | 16 ++++++++++++++++ + arch/powerpc/kernel/asm-offsets.c | 31 ++++++++++++++----------------- + arch/powerpc/kernel/head_booke.h | 27 +++------------------------ + 3 files changed, 33 insertions(+), 41 deletions(-) + +--- a/arch/powerpc/include/asm/ptrace.h ++++ b/arch/powerpc/include/asm/ptrace.h +@@ -68,6 +68,22 @@ struct pt_regs + }; + unsigned long __pad[4]; /* Maintain 16 byte interrupt stack alignment */ + }; ++#if defined(CONFIG_PPC32) && defined(CONFIG_BOOKE) ++ struct { /* Must be a multiple of 16 bytes */ ++ unsigned long mas0; ++ unsigned long mas1; ++ unsigned long mas2; ++ unsigned long mas3; ++ unsigned long mas6; ++ unsigned long mas7; ++ unsigned long srr0; ++ unsigned long srr1; ++ unsigned long csrr0; ++ unsigned long csrr1; ++ unsigned long dsrr0; ++ unsigned long dsrr1; ++ }; ++#endif + }; + #endif + +--- a/arch/powerpc/kernel/asm-offsets.c ++++ b/arch/powerpc/kernel/asm-offsets.c +@@ -348,24 +348,21 @@ int main(void) + #endif + + +-#if defined(CONFIG_PPC32) +-#if defined(CONFIG_BOOKE) || defined(CONFIG_40x) +- DEFINE(EXC_LVL_SIZE, STACK_EXC_LVL_FRAME_SIZE); +- DEFINE(MAS0, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, mas0)); ++#if defined(CONFIG_PPC32) && defined(CONFIG_BOOKE) ++ STACK_PT_REGS_OFFSET(MAS0, mas0); + /* we overload MMUCR for 44x on MAS0 since they are mutually exclusive */ +- DEFINE(MMUCR, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, mas0)); +- DEFINE(MAS1, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, mas1)); +- DEFINE(MAS2, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, mas2)); +- DEFINE(MAS3, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, mas3)); +- DEFINE(MAS6, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, mas6)); +- DEFINE(MAS7, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, mas7)); +- DEFINE(_SRR0, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, srr0)); +- DEFINE(_SRR1, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, srr1)); +- DEFINE(_CSRR0, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, csrr0)); +- DEFINE(_CSRR1, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, csrr1)); +- DEFINE(_DSRR0, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, dsrr0)); +- DEFINE(_DSRR1, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, dsrr1)); +-#endif ++ STACK_PT_REGS_OFFSET(MMUCR, mas0); ++ STACK_PT_REGS_OFFSET(MAS1, mas1); ++ STACK_PT_REGS_OFFSET(MAS2, mas2); ++ STACK_PT_REGS_OFFSET(MAS3, mas3); ++ STACK_PT_REGS_OFFSET(MAS6, mas6); ++ STACK_PT_REGS_OFFSET(MAS7, mas7); ++ STACK_PT_REGS_OFFSET(_SRR0, srr0); ++ STACK_PT_REGS_OFFSET(_SRR1, srr1); ++ STACK_PT_REGS_OFFSET(_CSRR0, csrr0); ++ STACK_PT_REGS_OFFSET(_CSRR1, csrr1); ++ STACK_PT_REGS_OFFSET(_DSRR0, dsrr0); ++ STACK_PT_REGS_OFFSET(_DSRR1, dsrr1); + #endif + + #ifndef CONFIG_PPC64 +--- a/arch/powerpc/kernel/head_booke.h ++++ b/arch/powerpc/kernel/head_booke.h +@@ -185,20 +185,18 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_EMB_HV + /* only on e500mc */ + #define DBG_STACK_BASE dbgirq_ctx + +-#define EXC_LVL_FRAME_OVERHEAD (THREAD_SIZE - INT_FRAME_SIZE - EXC_LVL_SIZE) +- + #ifdef CONFIG_SMP + #define BOOKE_LOAD_EXC_LEVEL_STACK(level) \ + mfspr r8,SPRN_PIR; \ + 
slwi r8,r8,2; \ + addis r8,r8,level##_STACK_BASE@ha; \ + lwz r8,level##_STACK_BASE@l(r8); \ +- addi r8,r8,EXC_LVL_FRAME_OVERHEAD; ++ addi r8,r8,THREAD_SIZE - INT_FRAME_SIZE; + #else + #define BOOKE_LOAD_EXC_LEVEL_STACK(level) \ + lis r8,level##_STACK_BASE@ha; \ + lwz r8,level##_STACK_BASE@l(r8); \ +- addi r8,r8,EXC_LVL_FRAME_OVERHEAD; ++ addi r8,r8,THREAD_SIZE - INT_FRAME_SIZE; + #endif + + /* +@@ -225,7 +223,7 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_EMB_HV + mtmsr r11; \ + mfspr r11,SPRN_SPRG_THREAD; /* if from user, start at top of */\ + lwz r11, TASK_STACK - THREAD(r11); /* this thread's kernel stack */\ +- addi r11,r11,EXC_LVL_FRAME_OVERHEAD; /* allocate stack frame */\ ++ addi r11,r11,THREAD_SIZE - INT_FRAME_SIZE; /* allocate stack frame */\ + beq 1f; \ + /* COMING FROM USER MODE */ \ + stw r9,_CCR(r11); /* save CR */\ +@@ -533,24 +531,5 @@ label: + bl kernel_fp_unavailable_exception; \ + b interrupt_return + +-#else /* __ASSEMBLY__ */ +-struct exception_regs { +- unsigned long mas0; +- unsigned long mas1; +- unsigned long mas2; +- unsigned long mas3; +- unsigned long mas6; +- unsigned long mas7; +- unsigned long srr0; +- unsigned long srr1; +- unsigned long csrr0; +- unsigned long csrr1; +- unsigned long dsrr0; +- unsigned long dsrr1; +-}; +- +-/* ensure this structure is always sized to a multiple of the stack alignment */ +-#define STACK_EXC_LVL_FRAME_SIZE ALIGN(sizeof (struct exception_regs), 16) +- + #endif /* __ASSEMBLY__ */ + #endif /* __HEAD_BOOKE_H__ */ diff --git a/queue-5.13/powerpc-32s-fix-napping-restore-in-data-storage-interrupt-dsi.patch b/queue-5.13/powerpc-32s-fix-napping-restore-in-data-storage-interrupt-dsi.patch new file mode 100644 index 00000000000..6014ff90795 --- /dev/null +++ b/queue-5.13/powerpc-32s-fix-napping-restore-in-data-storage-interrupt-dsi.patch @@ -0,0 +1,37 @@ +From 62376365048878f770d8b7d11b89b8b3e18018f1 Mon Sep 17 00:00:00 2001 +From: Christophe Leroy +Date: Tue, 3 Aug 2021 15:14:27 +0000 +Subject: powerpc/32s: Fix napping restore in data storage interrupt (DSI) + +From: Christophe Leroy + +commit 62376365048878f770d8b7d11b89b8b3e18018f1 upstream. + +When a DSI (Data Storage Interrupt) is taken while in NAP mode, +r11 doesn't survive the call to power_save_ppc32_restore(). + +So use r1 instead of r11 as they both contain the virtual stack +pointer at that point. + +Fixes: 4c0104a83fc3 ("powerpc/32: Dismantle EXC_XFER_STD/LITE/TEMPLATE") +Cc: stable@vger.kernel.org # v5.13+ +Reported-by: Finn Thain +Signed-off-by: Christophe Leroy +Signed-off-by: Michael Ellerman +Link: https://lore.kernel.org/r/731694e0885271f6ee9ffc179eb4bcee78313682.1628003562.git.christophe.leroy@csgroup.eu +Signed-off-by: Greg Kroah-Hartman +--- + arch/powerpc/kernel/head_book3s_32.S | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/arch/powerpc/kernel/head_book3s_32.S ++++ b/arch/powerpc/kernel/head_book3s_32.S +@@ -300,7 +300,7 @@ ALT_MMU_FTR_SECTION_END_IFSET(MMU_FTR_HP + EXCEPTION_PROLOG_1 + EXCEPTION_PROLOG_2 INTERRUPT_DATA_STORAGE DataAccess handle_dar_dsisr=1 + prepare_transfer_to_handler +- lwz r5, _DSISR(r11) ++ lwz r5, _DSISR(r1) + andis. 
r0, r5, DSISR_DABRMATCH@h
+ bne- 1f
+ bl do_page_fault
diff --git a/queue-5.13/powerpc-interrupt-do-not-call-single_step_exception-from-other-exceptions.patch b/queue-5.13/powerpc-interrupt-do-not-call-single_step_exception-from-other-exceptions.patch
new file mode 100644
index 00000000000..50b208a3fa3
--- /dev/null
+++ b/queue-5.13/powerpc-interrupt-do-not-call-single_step_exception-from-other-exceptions.patch
@@ -0,0 +1,60 @@
+From 01fcac8e4dfc112f420dcaeb70056a74e326cacf Mon Sep 17 00:00:00 2001
+From: Christophe Leroy
+Date: Tue, 10 Aug 2021 16:13:17 +0000
+Subject: powerpc/interrupt: Do not call single_step_exception() from other exceptions
+
+From: Christophe Leroy
+
+commit 01fcac8e4dfc112f420dcaeb70056a74e326cacf upstream.
+
+single_step_exception() is called by emulate_single_step() which
+is called from (at least) alignment exception() handler and
+program_check_exception() handler.
+
+Redefine it as a regular __single_step_exception() which is called
+by both single_step_exception() handler and emulate_single_step()
+function.
+
+Fixes: 3a96570ffceb ("powerpc: convert interrupt handlers to use wrappers")
+Cc: stable@vger.kernel.org # v5.12+
+Signed-off-by: Christophe Leroy
+Reviewed-by: Nicholas Piggin
+Signed-off-by: Michael Ellerman
+Link: https://lore.kernel.org/r/aed174f5cbc06f2cf95233c071d8aac948e46043.1628611921.git.christophe.leroy@csgroup.eu
+Signed-off-by: Greg Kroah-Hartman
+---
+ arch/powerpc/kernel/traps.c | 9 +++++++--
+ 1 file changed, 7 insertions(+), 2 deletions(-)
+
+--- a/arch/powerpc/kernel/traps.c
++++ b/arch/powerpc/kernel/traps.c
+@@ -1103,7 +1103,7 @@ DEFINE_INTERRUPT_HANDLER(RunModeExceptio
+ _exception(SIGTRAP, regs, TRAP_UNK, 0);
+ }
+
+-DEFINE_INTERRUPT_HANDLER(single_step_exception)
++static void __single_step_exception(struct pt_regs *regs)
+ {
+ clear_single_step(regs);
+ clear_br_trace(regs);
+@@ -1120,6 +1120,11 @@ DEFINE_INTERRUPT_HANDLER(single_step_exc
+ _exception(SIGTRAP, regs, TRAP_TRACE, regs->nip);
+ }
+
++DEFINE_INTERRUPT_HANDLER(single_step_exception)
++{
++ __single_step_exception(regs);
++}
++
+ /*
+ * After we have successfully emulated an instruction, we have to
+ * check if the instruction was being single-stepped, and if so,
+@@ -1129,7 +1134,7 @@ DEFINE_INTERRUPT_HANDLER(single_step_exc
+ static void emulate_single_step(struct pt_regs *regs)
+ {
+ if (single_stepping(regs))
+- single_step_exception(regs);
++ __single_step_exception(regs);
+ }
+
+ static inline int __parse_fpscr(unsigned long fpscr)
diff --git a/queue-5.13/powerpc-pseries-fix-update-of-lpar-security-flavor-after-lpm.patch b/queue-5.13/powerpc-pseries-fix-update-of-lpar-security-flavor-after-lpm.patch
new file mode 100644
index 00000000000..bafb1771546
--- /dev/null
+++ b/queue-5.13/powerpc-pseries-fix-update-of-lpar-security-flavor-after-lpm.patch
@@ -0,0 +1,42 @@
+From c18956e6e0b95f78dad2773ecc8c61a9e41f6405 Mon Sep 17 00:00:00 2001
+From: Laurent Dufour
+Date: Thu, 5 Aug 2021 17:23:08 +0200
+Subject: powerpc/pseries: Fix update of LPAR security flavor after LPM
+
+From: Laurent Dufour
+
+commit c18956e6e0b95f78dad2773ecc8c61a9e41f6405 upstream.
+
+After LPM, when migrating from a system with security mitigation enabled
+to a system with mitigation disabled, the security flavor exposed in
+/proc is not correctly set back to 0.
+
+Do not assume the value of the security flavor is set to 0 when entering
+init_cpu_char_feature_flags(), so when called after a LPM, the value is
+set correctly even if the mitigation are not turned off.
+
+Fixes: 6ce56e1ac380 ("powerpc/pseries: export LPAR security flavor in lparcfg")
+Cc: stable@vger.kernel.org # v5.13+
+Signed-off-by: Laurent Dufour
+Signed-off-by: Michael Ellerman
+Link: https://lore.kernel.org/r/20210805152308.33988-1-ldufour@linux.ibm.com
+Signed-off-by: Greg Kroah-Hartman
+---
+ arch/powerpc/platforms/pseries/setup.c | 5 +++--
+ 1 file changed, 3 insertions(+), 2 deletions(-)
+
+--- a/arch/powerpc/platforms/pseries/setup.c
++++ b/arch/powerpc/platforms/pseries/setup.c
+@@ -539,9 +539,10 @@ static void init_cpu_char_feature_flags(
+ * H_CPU_BEHAV_FAVOUR_SECURITY_H could be set only if
+ * H_CPU_BEHAV_FAVOUR_SECURITY is.
+ */
+- if (!(result->behaviour & H_CPU_BEHAV_FAVOUR_SECURITY))
++ if (!(result->behaviour & H_CPU_BEHAV_FAVOUR_SECURITY)) {
+ security_ftr_clear(SEC_FTR_FAVOUR_SECURITY);
+- else if (result->behaviour & H_CPU_BEHAV_FAVOUR_SECURITY_H)
++ pseries_security_flavor = 0;
++ } else if (result->behaviour & H_CPU_BEHAV_FAVOUR_SECURITY_H)
+ pseries_security_flavor = 1;
+ else
+ pseries_security_flavor = 2;
diff --git a/queue-5.13/powerpc-smp-fix-oops-in-topology_init.patch b/queue-5.13/powerpc-smp-fix-oops-in-topology_init.patch
new file mode 100644
index 00000000000..0517de34c78
--- /dev/null
+++ b/queue-5.13/powerpc-smp-fix-oops-in-topology_init.patch
@@ -0,0 +1,63 @@
+From 8241461536f21bbe51308a6916d1c9fb2e6b75a7 Mon Sep 17 00:00:00 2001
+From: Christophe Leroy
+Date: Wed, 4 Aug 2021 18:24:10 +0000
+Subject: powerpc/smp: Fix OOPS in topology_init()
+
+From: Christophe Leroy
+
+commit 8241461536f21bbe51308a6916d1c9fb2e6b75a7 upstream.
+
+Running an SMP kernel on an UP platform not prepared for it,
+I encountered the following OOPS:
+
+ BUG: Kernel NULL pointer dereference on read at 0x00000034
+ Faulting instruction address: 0xc0a04110
+ Oops: Kernel access of bad area, sig: 11 [#1]
+ BE PAGE_SIZE=4K SMP NR_CPUS=2 CMPCPRO
+ Modules linked in:
+ CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.13.0-pmac-00001-g230fedfaad21 #5234
+ NIP: c0a04110 LR: c0a040d8 CTR: c0a04084
+ REGS: e100dda0 TRAP: 0300 Not tainted (5.13.0-pmac-00001-g230fedfaad21)
+ MSR: 00009032 CR: 84000284 XER: 00000000
+ DAR: 00000034 DSISR: 20000000
+ GPR00: c0006bd4 e100de60 c1033320 00000000 00000000 c0942274 00000000 00000000
+ GPR08: 00000000 00000000 00000001 00000063 00000007 00000000 c0006f30 00000000
+ GPR16: 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000005
+ GPR24: c0c67d74 c0c67f1c c0c60000 c0c67d70 c0c0c558 1efdf000 c0c00020 00000000
+ NIP [c0a04110] topology_init+0x8c/0x138
+ LR [c0a040d8] topology_init+0x54/0x138
+ Call Trace:
+ [e100de60] [80808080] 0x80808080 (unreliable)
+ [e100de90] [c0006bd4] do_one_initcall+0x48/0x1bc
+ [e100def0] [c0a0150c] kernel_init_freeable+0x1c8/0x278
+ [e100df20] [c0006f44] kernel_init+0x14/0x10c
+ [e100df30] [c00190fc] ret_from_kernel_thread+0x14/0x1c
+ Instruction dump:
+ 7c692e70 7d290194 7c035040 7c7f1b78 5529103a 546706fe 5468103a 39400001
+ 7c641b78 40800054 80c690b4 7fb9402e <81060034> 7fbeea14 2c080000 7fa3eb78
+ ---[ end trace b246ffbc6bbbb6fb ]---
+
+Fix it by checking smp_ops before using it, as already done in
+several other places in the arch/powerpc/kernel/smp.c
+
+Fixes: 39f87561454d ("powerpc/smp: Move ppc_md.cpu_die() to smp_ops.cpu_offline_self()")
+Cc: stable@vger.kernel.org
+Signed-off-by: Christophe Leroy
+Signed-off-by: Michael Ellerman
+Link: https://lore.kernel.org/r/75287841cbb8740edd44880fe60be66d489160d9.1628097995.git.christophe.leroy@csgroup.eu
+Signed-off-by: Greg Kroah-Hartman
+---
+ arch/powerpc/kernel/sysfs.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/arch/powerpc/kernel/sysfs.c
++++ b/arch/powerpc/kernel/sysfs.c
+@@ -1167,7 +1167,7 @@ static int __init topology_init(void)
+ * CPU. For instance, the boot cpu might never be valid
+ * for hotplugging.
+ */
+- if (smp_ops->cpu_offline_self)
++ if (smp_ops && smp_ops->cpu_offline_self)
+ c->hotpluggable = 1;
+ #endif
+
diff --git a/queue-5.13/powerpc-xive-do-not-skip-cpu-less-nodes-when-creating-the-ipis.patch b/queue-5.13/powerpc-xive-do-not-skip-cpu-less-nodes-when-creating-the-ipis.patch
new file mode 100644
index 00000000000..c5b551031a8
--- /dev/null
+++ b/queue-5.13/powerpc-xive-do-not-skip-cpu-less-nodes-when-creating-the-ipis.patch
@@ -0,0 +1,127 @@
+From cbc06f051c524dcfe52ef0d1f30647828e226d30 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?=
+Date: Sat, 7 Aug 2021 09:20:57 +0200
+Subject: powerpc/xive: Do not skip CPU-less nodes when creating the IPIs
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Cédric Le Goater
+
+commit cbc06f051c524dcfe52ef0d1f30647828e226d30 upstream.
+
+On PowerVM, CPU-less nodes can be populated with hot-plugged CPUs at
+runtime. Today, the IPI is not created for such nodes, and hot-plugged
+CPUs use a bogus IPI, which leads to soft lockups.
+
+We can not directly allocate and request the IPI on demand because
+bringup_up() is called under the IRQ sparse lock. The alternative is
+to allocate the IPIs for all possible nodes at startup and to request
+the mapping on demand when the first CPU of a node is brought up.
+
+Fixes: 7dcc37b3eff9 ("powerpc/xive: Map one IPI interrupt per node")
+Cc: stable@vger.kernel.org # v5.13
+Reported-by: Geetika Moolchandani
+Signed-off-by: Cédric Le Goater
+Tested-by: Srikar Dronamraju
+Tested-by: Laurent Vivier
+Signed-off-by: Michael Ellerman
+Link: https://lore.kernel.org/r/20210807072057.184698-1-clg@kaod.org
+Signed-off-by: Greg Kroah-Hartman
+---
+ arch/powerpc/sysdev/xive/common.c | 35 ++++++++++++++++++++++++-----------
+ 1 file changed, 24 insertions(+), 11 deletions(-)
+
+--- a/arch/powerpc/sysdev/xive/common.c
++++ b/arch/powerpc/sysdev/xive/common.c
+@@ -67,6 +67,7 @@ static struct irq_domain *xive_irq_domai
+ static struct xive_ipi_desc {
+ unsigned int irq;
+ char name[16];
++ atomic_t started;
+ } *xive_ipis;
+
+ /*
+@@ -1120,7 +1121,7 @@ static const struct irq_domain_ops xive_
+ .alloc = xive_ipi_irq_domain_alloc,
+ };
+
+-static int __init xive_request_ipi(void)
++static int __init xive_init_ipis(void)
+ {
+ struct fwnode_handle *fwnode;
+ struct irq_domain *ipi_domain;
+@@ -1144,10 +1145,6 @@ static int __init xive_request_ipi(void)
+ struct xive_ipi_desc *xid = &xive_ipis[node];
+ struct xive_ipi_alloc_info info = { node };
+
+- /* Skip nodes without CPUs */
+- if (cpumask_empty(cpumask_of_node(node)))
+- continue;
+-
+ /*
+ * Map one IPI interrupt per node for all cpus of that node.
+ * Since the HW interrupt number doesn't have any meaning,
+@@ -1159,11 +1156,6 @@ static int __init xive_request_ipi(void)
+ xid->irq = ret;
+
+ snprintf(xid->name, sizeof(xid->name), "IPI-%d", node);
+-
+- ret = request_irq(xid->irq, xive_muxed_ipi_action,
+- IRQF_PERCPU | IRQF_NO_THREAD, xid->name, NULL);
+-
+- WARN(ret < 0, "Failed to request IPI %d: %d\n", xid->irq, ret);
+ }
+
+ return ret;
+@@ -1178,6 +1170,22 @@ out:
+ return ret;
+ }
+
++static int __init xive_request_ipi(unsigned int cpu)
++{
++ struct xive_ipi_desc *xid = &xive_ipis[early_cpu_to_node(cpu)];
++ int ret;
++
++ if (atomic_inc_return(&xid->started) > 1)
++ return 0;
++
++ ret = request_irq(xid->irq, xive_muxed_ipi_action,
++ IRQF_PERCPU | IRQF_NO_THREAD,
++ xid->name, NULL);
++
++ WARN(ret < 0, "Failed to request IPI %d: %d\n", xid->irq, ret);
++ return ret;
++}
++
+ static int xive_setup_cpu_ipi(unsigned int cpu)
+ {
+ unsigned int xive_ipi_irq = xive_ipi_cpu_to_irq(cpu);
+@@ -1192,6 +1200,9 @@ static int xive_setup_cpu_ipi(unsigned i
+ if (xc->hw_ipi != XIVE_BAD_IRQ)
+ return 0;
+
++ /* Register the IPI */
++ xive_request_ipi(cpu);
++
+ /* Grab an IPI from the backend, this will populate xc->hw_ipi */
+ if (xive_ops->get_ipi(cpu, xc))
+ return -EIO;
+@@ -1231,6 +1242,8 @@ static void xive_cleanup_cpu_ipi(unsigne
+ if (xc->hw_ipi == XIVE_BAD_IRQ)
+ return;
+
++ /* TODO: clear IPI mapping */
++
+ /* Mask the IPI */
+ xive_do_source_set_mask(&xc->ipi_data, true);
+
+@@ -1253,7 +1266,7 @@ void __init xive_smp_probe(void)
+ smp_ops->cause_ipi = xive_cause_ipi;
+
+ /* Register the IPI */
+- xive_request_ipi();
++ xive_init_ipis();
+
+ /* Allocate and setup IPI for the boot CPU */
+ xive_setup_cpu_ipi(smp_processor_id());
diff --git a/queue-5.13/series b/queue-5.13/series
index 8c43d70fd79..259a9e1994c 100644
--- a/queue-5.13/series
+++ b/queue-5.13/series
@@ -126,3 +126,22 @@ genirq-msi-ensure-deactivation-on-teardown.patch
 genirq-timings-prevent-potential-array-overflow-in-__irq_timings_store.patch
 powerpc-interrupt-fix-oops-by-not-calling-do_irq-from-timer_interrupt.patch
 pci-msi-enable-and-mask-msi-x-early.patch
+pci-msi-mask-all-unused-msi-x-entries.patch
+pci-msi-enforce-that-msi-x-table-entry-is-masked-for-update.patch
+pci-msi-enforce-msi-entry-updates-to-be-visible.patch
+pci-msi-do-not-set-invalid-bits-in-msi-mask.patch
+pci-msi-correct-misleading-comments.patch
+pci-msi-use-msi_mask_irq-in-pci_msi_shutdown.patch
+pci-msi-protect-msi_desc-masked-for-multi-msi.patch
+powerpc-interrupt-do-not-call-single_step_exception-from-other-exceptions.patch
+powerpc-pseries-fix-update-of-lpar-security-flavor-after-lpm.patch
+powerpc-32s-fix-napping-restore-in-data-storage-interrupt-dsi.patch
+powerpc-smp-fix-oops-in-topology_init.patch
+powerpc-xive-do-not-skip-cpu-less-nodes-when-creating-the-ipis.patch
+powerpc-32-fix-critical-and-debug-interrupts-on-booke.patch
+efi-libstub-arm64-double-check-image-alignment-at-entry.patch
+locking-rtmutex-use-the-correct-rtmutex-debugging-config-option.patch
+kvm-vmx-use-current-vmcs-to-query-waitpkg-support-for-msr-emulation.patch
+kvm-nvmx-use-vmx_need_pf_intercept-when-deciding-if-l0-wants-a-pf.patch
+kvm-x86-mmu-don-t-leak-non-leaf-sptes-when-zapping-all-sptes.patch
+kvm-x86-mmu-protect-marking-sps-unsync-when-using-tdp-mmu-with-spinlock.patch
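
The queue-5.13/series file above is a quilt-style series: the patches added by this commit are meant to be applied to the 5.13 stable tree in the order listed. As a rough sketch of how such a queue can be consumed (the side-by-side ../linux checkout and the linux-5.13.y branch name are assumptions for illustration, not something this commit defines), the whole series can be imported as one commit per patch with git quiltimport:

  # Assumed layout: this stable-queue checkout sitting next to a stable kernel checkout.
  cd ../linux
  git checkout linux-5.13.y
  # Apply every patch listed in queue-5.13/series, in order, committing each with its own author/subject.
  git quiltimport --patches ../stable-queue/queue-5.13 \
                  --series ../stable-queue/queue-5.13/series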