From 7f0dcb0c33e448cfe2d4d5bbffea4318aa2a9f57 Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Mon, 9 May 2022 20:56:59 -0400 Subject: [PATCH] Fixes for 5.15 Signed-off-by: Sasha Levin --- ...lease-return-enodev-if-fbdev-was-unr.patch | 53 +++ .../gpio-mvebu-drop-pwm-base-assignment.patch | 48 ++ ...issing-module-owner-to-ops-structure.patch | 36 ++ ...-timer-posted-interrupt-only-when-mw.patch | 54 +++ ...-not-use-bitfields-larger-than-32-bi.patch | 433 ++++++++++++++++++ ...lence-compiler-warning-in-the-kvm_pa.patch | 48 ++ ...ly-provide-cpuid-leaf-0xa-if-host-ha.patch | 54 +++ ...change-icr-on-write-to-apic_self_ipi.patch | 43 ++ ...d-null-pointer-dereference-on-page-f.patch | 39 ++ ...b-out-of-bounds-while-reading-resour.patch | 151 ++++++ ...don-t-skip-fib-events-on-current-dst.patch | 114 +++++ ...-lag-fix-fib_info-pointer-assignment.patch | 40 ++ ...x-use-after-free-in-fib-event-handle.patch | 247 ++++++++++ ...sleep-in-atomic-bug-when-firmware-do.patch | 70 +++ ...t-vm-verify-mmap-addr-in-mremap_test.patch | 73 +++ ...fy-remap-destination-address-in-mrem.patch | 66 +++ queue-5.15/series | 17 + ...-bsp-msr_kvm_poll_control-across-sus.patch | 71 +++ 18 files changed, 1657 insertions(+) create mode 100644 queue-5.15/fbdev-make-fb_release-return-enodev-if-fbdev-was-unr.patch create mode 100644 queue-5.15/gpio-mvebu-drop-pwm-base-assignment.patch create mode 100644 queue-5.15/iommu-dart-add-missing-module-owner-to-ops-structure.patch create mode 100644 queue-5.15/kvm-lapic-enable-timer-posted-interrupt-only-when-mw.patch create mode 100644 queue-5.15/kvm-selftests-do-not-use-bitfields-larger-than-32-bi.patch create mode 100644 queue-5.15/kvm-selftests-silence-compiler-warning-in-the-kvm_pa.patch create mode 100644 queue-5.15/kvm-x86-cpuid-only-provide-cpuid-leaf-0xa-if-host-ha.patch create mode 100644 queue-5.15/kvm-x86-do-not-change-icr-on-write-to-apic_self_ipi.patch create mode 100644 queue-5.15/kvm-x86-mmu-avoid-null-pointer-dereference-on-page-f.patch 
create mode 100644 queue-5.15/net-mlx5-fix-slab-out-of-bounds-while-reading-resour.patch create mode 100644 queue-5.15/net-mlx5e-lag-don-t-skip-fib-events-on-current-dst.patch create mode 100644 queue-5.15/net-mlx5e-lag-fix-fib_info-pointer-assignment.patch create mode 100644 queue-5.15/net-mlx5e-lag-fix-use-after-free-in-fib-event-handle.patch create mode 100644 queue-5.15/nfc-netlink-fix-sleep-in-atomic-bug-when-firmware-do.patch create mode 100644 queue-5.15/selftest-vm-verify-mmap-addr-in-mremap_test.patch create mode 100644 queue-5.15/selftest-vm-verify-remap-destination-address-in-mrem.patch create mode 100644 queue-5.15/x86-kvm-preserve-bsp-msr_kvm_poll_control-across-sus.patch diff --git a/queue-5.15/fbdev-make-fb_release-return-enodev-if-fbdev-was-unr.patch b/queue-5.15/fbdev-make-fb_release-return-enodev-if-fbdev-was-unr.patch new file mode 100644 index 00000000000..11b99358356 --- /dev/null +++ b/queue-5.15/fbdev-make-fb_release-return-enodev-if-fbdev-was-unr.patch @@ -0,0 +1,53 @@ +From 386bb0a5603b5a0cf950d70cbeb5bbcff764b502 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 2 May 2022 15:50:14 +0200 +Subject: fbdev: Make fb_release() return -ENODEV if fbdev was unregistered + +From: Javier Martinez Canillas + +[ Upstream commit aafa025c76dcc7d1a8c8f0bdefcbe4eb480b2f6a ] + +A reference to the framebuffer device struct fb_info is stored in the file +private data, but this reference could no longer be valid and must not be +accessed directly. Instead, the file_fb_info() accessor function must be +used since it does sanity checking to make sure that the fb_info is valid. + +This can happen for example if the registered framebuffer device is for a +driver that just uses a framebuffer provided by the system firmware. In +that case, the fbdev core would unregister the framebuffer device when a +real video driver is probed and ask to remove conflicting framebuffers. 
+ +The bug has been present for a long time but commit 27599aacbaef ("fbdev: +Hot-unplug firmware fb devices on forced removal") unmasked it since the +fbdev core started unregistering the framebuffers' devices associated. + +Fixes: 27599aacbaef ("fbdev: Hot-unplug firmware fb devices on forced removal") +Reported-by: Maxime Ripard +Reported-by: Junxiao Chang +Signed-off-by: Javier Martinez Canillas +Reviewed-by: Thomas Zimmermann +Link: https://patchwork.freedesktop.org/patch/msgid/20220502135014.377945-1-javierm@redhat.com +Signed-off-by: Sasha Levin +--- + drivers/video/fbdev/core/fbmem.c | 5 ++++- + 1 file changed, 4 insertions(+), 1 deletion(-) + +diff --git a/drivers/video/fbdev/core/fbmem.c b/drivers/video/fbdev/core/fbmem.c +index 0371ad233fdf..8e38a7a5cf2f 100644 +--- a/drivers/video/fbdev/core/fbmem.c ++++ b/drivers/video/fbdev/core/fbmem.c +@@ -1436,7 +1436,10 @@ fb_release(struct inode *inode, struct file *file) + __acquires(&info->lock) + __releases(&info->lock) + { +- struct fb_info * const info = file->private_data; ++ struct fb_info * const info = file_fb_info(file); ++ ++ if (!info) ++ return -ENODEV; + + lock_fb_info(info); + if (info->fbops->fb_release) +-- +2.35.1 + diff --git a/queue-5.15/gpio-mvebu-drop-pwm-base-assignment.patch b/queue-5.15/gpio-mvebu-drop-pwm-base-assignment.patch new file mode 100644 index 00000000000..1f75726bbaf --- /dev/null +++ b/queue-5.15/gpio-mvebu-drop-pwm-base-assignment.patch @@ -0,0 +1,48 @@ +From 6559461a5b252da76d76fe4dae48d7a2d2a08842 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 11 Apr 2022 09:23:40 +0300 +Subject: gpio: mvebu: drop pwm base assignment +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Baruch Siach + +[ Upstream commit e5f6e5d554ac274f9c8ba60078103d0425b93c19 ] + +pwmchip_add() unconditionally assigns the base ID dynamically. 
Commit +f9a8ee8c8bcd1 ("pwm: Always allocate PWM chip base ID dynamically") +dropped all base assignment from drivers under drivers/pwm/. It missed +this driver. Fix that. + +Fixes: f9a8ee8c8bcd1 ("pwm: Always allocate PWM chip base ID dynamically") +Signed-off-by: Baruch Siach +Reviewed-by: Uwe Kleine-König +Acked-by: Linus Walleij +Signed-off-by: Bartosz Golaszewski +Signed-off-by: Sasha Levin +--- + drivers/gpio/gpio-mvebu.c | 7 ------- + 1 file changed, 7 deletions(-) + +diff --git a/drivers/gpio/gpio-mvebu.c b/drivers/gpio/gpio-mvebu.c +index 8f429d9f3661..ad8822da7c27 100644 +--- a/drivers/gpio/gpio-mvebu.c ++++ b/drivers/gpio/gpio-mvebu.c +@@ -871,13 +871,6 @@ static int mvebu_pwm_probe(struct platform_device *pdev, + mvpwm->chip.dev = dev; + mvpwm->chip.ops = &mvebu_pwm_ops; + mvpwm->chip.npwm = mvchip->chip.ngpio; +- /* +- * There may already be some PWM allocated, so we can't force +- * mvpwm->chip.base to a fixed point like mvchip->chip.base. +- * So, we let pwmchip_add() do the numbering and take the next free +- * region. +- */ +- mvpwm->chip.base = -1; + + spin_lock_init(&mvpwm->lock); + +-- +2.35.1 + diff --git a/queue-5.15/iommu-dart-add-missing-module-owner-to-ops-structure.patch b/queue-5.15/iommu-dart-add-missing-module-owner-to-ops-structure.patch new file mode 100644 index 00000000000..5abae8513c3 --- /dev/null +++ b/queue-5.15/iommu-dart-add-missing-module-owner-to-ops-structure.patch @@ -0,0 +1,36 @@ +From 3871d84c07782172308cd3b4056a42517bdc8d50 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 2 May 2022 18:22:38 +0900 +Subject: iommu/dart: Add missing module owner to ops structure + +From: Hector Martin + +[ Upstream commit 2ac2fab52917ae82cbca97cf6e5d2993530257ed ] + +This is required to make loading this as a module work. 
+ +Signed-off-by: Hector Martin +Fixes: 46d1fb072e76 ("iommu/dart: Add DART iommu driver") +Reviewed-by: Sven Peter +Link: https://lore.kernel.org/r/20220502092238.30486-1-marcan@marcan.st +Signed-off-by: Joerg Roedel +Signed-off-by: Sasha Levin +--- + drivers/iommu/apple-dart.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/drivers/iommu/apple-dart.c b/drivers/iommu/apple-dart.c +index 9c9bbccc00bd..baba4571c815 100644 +--- a/drivers/iommu/apple-dart.c ++++ b/drivers/iommu/apple-dart.c +@@ -757,6 +757,7 @@ static const struct iommu_ops apple_dart_iommu_ops = { + .of_xlate = apple_dart_of_xlate, + .def_domain_type = apple_dart_def_domain_type, + .pgsize_bitmap = -1UL, /* Restricted during dart probe */ ++ .owner = THIS_MODULE, + }; + + static irqreturn_t apple_dart_irq(int irq, void *dev) +-- +2.35.1 + diff --git a/queue-5.15/kvm-lapic-enable-timer-posted-interrupt-only-when-mw.patch b/queue-5.15/kvm-lapic-enable-timer-posted-interrupt-only-when-mw.patch new file mode 100644 index 00000000000..76150707117 --- /dev/null +++ b/queue-5.15/kvm-lapic-enable-timer-posted-interrupt-only-when-mw.patch @@ -0,0 +1,54 @@ +From 015b9b8612ebb385a08f2436f658a31289ab357e Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 25 Jan 2022 04:08:58 -0800 +Subject: KVM: LAPIC: Enable timer posted-interrupt only when mwait/hlt is + advertised + +From: Wanpeng Li + +[ Upstream commit 1714a4eb6fb0cb79f182873cd011a8ed60ac65e8 ] + +As commit 0c5f81dad46 ("KVM: LAPIC: Inject timer interrupt via posted +interrupt") mentioned that the host admin should well tune the guest +setup, so that vCPUs are placed on isolated pCPUs, and with several pCPUs +surplus for *busy* housekeeping. In this setup, it is preferable to +disable mwait/hlt/pause vmexits to keep the vCPUs in non-root mode. 
+ +However, if only some guests are isolated and others are not, they would not +have any benefit from posted timer interrupts, and at the same time lose +VMX preemption timer fast paths because kvm_can_post_timer_interrupt() +returns true and therefore forces kvm_can_use_hv_timer() to false. + +By guaranteeing that posted-interrupt timer is only used if MWAIT or +HLT are done without vmexit, KVM can make a better choice and use the +VMX preemption timer and the corresponding fast paths. + +Reported-by: Aili Yao +Reviewed-by: Sean Christopherson +Cc: Aili Yao +Cc: Sean Christopherson +Signed-off-by: Wanpeng Li +Message-Id: <1643112538-36743-1-git-send-email-wanpengli@tencent.com> +Signed-off-by: Paolo Bonzini +Signed-off-by: Sasha Levin +--- + arch/x86/kvm/lapic.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c +index 83d1743a1dd0..493d636e6231 100644 +--- a/arch/x86/kvm/lapic.c ++++ b/arch/x86/kvm/lapic.c +@@ -113,7 +113,8 @@ static inline u32 kvm_x2apic_id(struct kvm_lapic *apic) + + static bool kvm_can_post_timer_interrupt(struct kvm_vcpu *vcpu) + { +- return pi_inject_timer && kvm_vcpu_apicv_active(vcpu); ++ return pi_inject_timer && kvm_vcpu_apicv_active(vcpu) && ++ (kvm_mwait_in_guest(vcpu->kvm) || kvm_hlt_in_guest(vcpu->kvm)); + } + + bool kvm_can_use_hv_timer(struct kvm_vcpu *vcpu) +-- +2.35.1 + diff --git a/queue-5.15/kvm-selftests-do-not-use-bitfields-larger-than-32-bi.patch b/queue-5.15/kvm-selftests-do-not-use-bitfields-larger-than-32-bi.patch new file mode 100644 index 00000000000..54a2708d250 --- /dev/null +++ b/queue-5.15/kvm-selftests-do-not-use-bitfields-larger-than-32-bi.patch @@ -0,0 +1,433 @@ +From af02e8ffadf897472c41890285242a18d6e88632 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 20 Apr 2022 06:27:27 -0400 +Subject: kvm: selftests: do not use bitfields larger than 32-bits for PTEs + +From: Paolo Bonzini + +[ Upstream commit f18b4aebe107d092e384b1ae680b1e1de7a0196d ] + 
+Red Hat's QE team reported test failure on access_tracking_perf_test: + +Testing guest mode: PA-bits:ANY, VA-bits:48, 4K pages +guest physical test memory offset: 0x3fffbffff000 + +Populating memory : 0.684014577s +Writing to populated memory : 0.006230175s +Reading from populated memory : 0.004557805s +==== Test Assertion Failure ==== + lib/kvm_util.c:1411: false + pid=125806 tid=125809 errno=4 - Interrupted system call + 1 0x0000000000402f7c: addr_gpa2hva at kvm_util.c:1411 + 2 (inlined by) addr_gpa2hva at kvm_util.c:1405 + 3 0x0000000000401f52: lookup_pfn at access_tracking_perf_test.c:98 + 4 (inlined by) mark_vcpu_memory_idle at access_tracking_perf_test.c:152 + 5 (inlined by) vcpu_thread_main at access_tracking_perf_test.c:232 + 6 0x00007fefe9ff81ce: ?? ??:0 + 7 0x00007fefe9c64d82: ?? ??:0 + No vm physical memory at 0xffbffff000 + +I can easily reproduce it with an Intel(R) Xeon(R) CPU E5-2630 with 46 bits +PA. + +It turns out that the address translation for clearing idle page tracking +returned a wrong result; addr_gva2gpa()'s last step, which is based on +"pte[index[0]].pfn", did the calculation with 40 bits length and the +high 12 bits got truncated. In above case the GPA address to be returned +should be 0x3fffbffff000 for GVA 0xc0000000, but it got truncated into +0xffbffff000 and the subsequent gpa2hva lookup failed. + +The width of operations on bit fields greater than 32-bit is +implementation defined, and differs between GCC (which uses the bitfield +precision) and clang (which uses 64-bit arithmetic), so this is a +potential minefield. Remove the bit fields and use manual masking +instead. 
+ +Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=2075036 +Reported-by: Nana Liu +Reviewed-by: Peter Xu +Tested-by: Peter Xu +Signed-off-by: Paolo Bonzini +Signed-off-by: Sasha Levin +--- + .../selftests/kvm/include/x86_64/processor.h | 15 ++ + .../selftests/kvm/lib/x86_64/processor.c | 192 +++++++----------- + 2 files changed, 92 insertions(+), 115 deletions(-) + +diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h +index 05e65ca1c30c..23861c8faa61 100644 +--- a/tools/testing/selftests/kvm/include/x86_64/processor.h ++++ b/tools/testing/selftests/kvm/include/x86_64/processor.h +@@ -58,6 +58,21 @@ + /* CPUID.0x8000_0001.EDX */ + #define CPUID_GBPAGES (1ul << 26) + ++/* Page table bitfield declarations */ ++#define PTE_PRESENT_MASK BIT_ULL(0) ++#define PTE_WRITABLE_MASK BIT_ULL(1) ++#define PTE_USER_MASK BIT_ULL(2) ++#define PTE_ACCESSED_MASK BIT_ULL(5) ++#define PTE_DIRTY_MASK BIT_ULL(6) ++#define PTE_LARGE_MASK BIT_ULL(7) ++#define PTE_GLOBAL_MASK BIT_ULL(8) ++#define PTE_NX_MASK BIT_ULL(63) ++ ++#define PAGE_SHIFT 12 ++ ++#define PHYSICAL_PAGE_MASK GENMASK_ULL(51, 12) ++#define PTE_GET_PFN(pte) (((pte) & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT) ++ + /* General Registers in 64-Bit Mode */ + struct gpr64_regs { + u64 rax; +diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c +index da73b97e1e6d..46057079d8bb 100644 +--- a/tools/testing/selftests/kvm/lib/x86_64/processor.c ++++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c +@@ -19,38 +19,6 @@ + + vm_vaddr_t exception_handlers; + +-/* Virtual translation table structure declarations */ +-struct pageUpperEntry { +- uint64_t present:1; +- uint64_t writable:1; +- uint64_t user:1; +- uint64_t write_through:1; +- uint64_t cache_disable:1; +- uint64_t accessed:1; +- uint64_t ignored_06:1; +- uint64_t page_size:1; +- uint64_t ignored_11_08:4; +- uint64_t pfn:40; +- uint64_t 
ignored_62_52:11; +- uint64_t execute_disable:1; +-}; +- +-struct pageTableEntry { +- uint64_t present:1; +- uint64_t writable:1; +- uint64_t user:1; +- uint64_t write_through:1; +- uint64_t cache_disable:1; +- uint64_t accessed:1; +- uint64_t dirty:1; +- uint64_t reserved_07:1; +- uint64_t global:1; +- uint64_t ignored_11_09:3; +- uint64_t pfn:40; +- uint64_t ignored_62_52:11; +- uint64_t execute_disable:1; +-}; +- + void regs_dump(FILE *stream, struct kvm_regs *regs, + uint8_t indent) + { +@@ -195,23 +163,21 @@ static void *virt_get_pte(struct kvm_vm *vm, uint64_t pt_pfn, uint64_t vaddr, + return &page_table[index]; + } + +-static struct pageUpperEntry *virt_create_upper_pte(struct kvm_vm *vm, +- uint64_t pt_pfn, +- uint64_t vaddr, +- uint64_t paddr, +- int level, +- enum x86_page_size page_size) ++static uint64_t *virt_create_upper_pte(struct kvm_vm *vm, ++ uint64_t pt_pfn, ++ uint64_t vaddr, ++ uint64_t paddr, ++ int level, ++ enum x86_page_size page_size) + { +- struct pageUpperEntry *pte = virt_get_pte(vm, pt_pfn, vaddr, level); +- +- if (!pte->present) { +- pte->writable = true; +- pte->present = true; +- pte->page_size = (level == page_size); +- if (pte->page_size) +- pte->pfn = paddr >> vm->page_shift; ++ uint64_t *pte = virt_get_pte(vm, pt_pfn, vaddr, level); ++ ++ if (!(*pte & PTE_PRESENT_MASK)) { ++ *pte = PTE_PRESENT_MASK | PTE_WRITABLE_MASK; ++ if (level == page_size) ++ *pte |= PTE_LARGE_MASK | (paddr & PHYSICAL_PAGE_MASK); + else +- pte->pfn = vm_alloc_page_table(vm) >> vm->page_shift; ++ *pte |= vm_alloc_page_table(vm) & PHYSICAL_PAGE_MASK; + } else { + /* + * Entry already present. 
Assert that the caller doesn't want +@@ -221,7 +187,7 @@ static struct pageUpperEntry *virt_create_upper_pte(struct kvm_vm *vm, + TEST_ASSERT(level != page_size, + "Cannot create hugepage at level: %u, vaddr: 0x%lx\n", + page_size, vaddr); +- TEST_ASSERT(!pte->page_size, ++ TEST_ASSERT(!(*pte & PTE_LARGE_MASK), + "Cannot create page table at level: %u, vaddr: 0x%lx\n", + level, vaddr); + } +@@ -232,8 +198,8 @@ void __virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, + enum x86_page_size page_size) + { + const uint64_t pg_size = 1ull << ((page_size * 9) + 12); +- struct pageUpperEntry *pml4e, *pdpe, *pde; +- struct pageTableEntry *pte; ++ uint64_t *pml4e, *pdpe, *pde; ++ uint64_t *pte; + + TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K, + "Unknown or unsupported guest mode, mode: 0x%x", vm->mode); +@@ -257,24 +223,22 @@ void __virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, + */ + pml4e = virt_create_upper_pte(vm, vm->pgd >> vm->page_shift, + vaddr, paddr, 3, page_size); +- if (pml4e->page_size) ++ if (*pml4e & PTE_LARGE_MASK) + return; + +- pdpe = virt_create_upper_pte(vm, pml4e->pfn, vaddr, paddr, 2, page_size); +- if (pdpe->page_size) ++ pdpe = virt_create_upper_pte(vm, PTE_GET_PFN(*pml4e), vaddr, paddr, 2, page_size); ++ if (*pdpe & PTE_LARGE_MASK) + return; + +- pde = virt_create_upper_pte(vm, pdpe->pfn, vaddr, paddr, 1, page_size); +- if (pde->page_size) ++ pde = virt_create_upper_pte(vm, PTE_GET_PFN(*pdpe), vaddr, paddr, 1, page_size); ++ if (*pde & PTE_LARGE_MASK) + return; + + /* Fill in page table entry. 
*/ +- pte = virt_get_pte(vm, pde->pfn, vaddr, 0); +- TEST_ASSERT(!pte->present, ++ pte = virt_get_pte(vm, PTE_GET_PFN(*pde), vaddr, 0); ++ TEST_ASSERT(!(*pte & PTE_PRESENT_MASK), + "PTE already present for 4k page at vaddr: 0x%lx\n", vaddr); +- pte->pfn = paddr >> vm->page_shift; +- pte->writable = true; +- pte->present = 1; ++ *pte = PTE_PRESENT_MASK | PTE_WRITABLE_MASK | (paddr & PHYSICAL_PAGE_MASK); + } + + void virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr) +@@ -282,12 +246,12 @@ void virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr) + __virt_pg_map(vm, vaddr, paddr, X86_PAGE_SIZE_4K); + } + +-static struct pageTableEntry *_vm_get_page_table_entry(struct kvm_vm *vm, int vcpuid, ++static uint64_t *_vm_get_page_table_entry(struct kvm_vm *vm, int vcpuid, + uint64_t vaddr) + { + uint16_t index[4]; +- struct pageUpperEntry *pml4e, *pdpe, *pde; +- struct pageTableEntry *pte; ++ uint64_t *pml4e, *pdpe, *pde; ++ uint64_t *pte; + struct kvm_cpuid_entry2 *entry; + struct kvm_sregs sregs; + int max_phy_addr; +@@ -329,30 +293,29 @@ static struct pageTableEntry *_vm_get_page_table_entry(struct kvm_vm *vm, int vc + index[3] = (vaddr >> 39) & 0x1ffu; + + pml4e = addr_gpa2hva(vm, vm->pgd); +- TEST_ASSERT(pml4e[index[3]].present, ++ TEST_ASSERT(pml4e[index[3]] & PTE_PRESENT_MASK, + "Expected pml4e to be present for gva: 0x%08lx", vaddr); +- TEST_ASSERT((*(uint64_t*)(&pml4e[index[3]]) & +- (rsvd_mask | (1ull << 7))) == 0, ++ TEST_ASSERT((pml4e[index[3]] & (rsvd_mask | PTE_LARGE_MASK)) == 0, + "Unexpected reserved bits set."); + +- pdpe = addr_gpa2hva(vm, pml4e[index[3]].pfn * vm->page_size); +- TEST_ASSERT(pdpe[index[2]].present, ++ pdpe = addr_gpa2hva(vm, PTE_GET_PFN(pml4e[index[3]]) * vm->page_size); ++ TEST_ASSERT(pdpe[index[2]] & PTE_PRESENT_MASK, + "Expected pdpe to be present for gva: 0x%08lx", vaddr); +- TEST_ASSERT(pdpe[index[2]].page_size == 0, ++ TEST_ASSERT(!(pdpe[index[2]] & PTE_LARGE_MASK), + "Expected pdpe to map a pde not a 1-GByte 
page."); +- TEST_ASSERT((*(uint64_t*)(&pdpe[index[2]]) & rsvd_mask) == 0, ++ TEST_ASSERT((pdpe[index[2]] & rsvd_mask) == 0, + "Unexpected reserved bits set."); + +- pde = addr_gpa2hva(vm, pdpe[index[2]].pfn * vm->page_size); +- TEST_ASSERT(pde[index[1]].present, ++ pde = addr_gpa2hva(vm, PTE_GET_PFN(pdpe[index[2]]) * vm->page_size); ++ TEST_ASSERT(pde[index[1]] & PTE_PRESENT_MASK, + "Expected pde to be present for gva: 0x%08lx", vaddr); +- TEST_ASSERT(pde[index[1]].page_size == 0, ++ TEST_ASSERT(!(pde[index[1]] & PTE_LARGE_MASK), + "Expected pde to map a pte not a 2-MByte page."); +- TEST_ASSERT((*(uint64_t*)(&pde[index[1]]) & rsvd_mask) == 0, ++ TEST_ASSERT((pde[index[1]] & rsvd_mask) == 0, + "Unexpected reserved bits set."); + +- pte = addr_gpa2hva(vm, pde[index[1]].pfn * vm->page_size); +- TEST_ASSERT(pte[index[0]].present, ++ pte = addr_gpa2hva(vm, PTE_GET_PFN(pde[index[1]]) * vm->page_size); ++ TEST_ASSERT(pte[index[0]] & PTE_PRESENT_MASK, + "Expected pte to be present for gva: 0x%08lx", vaddr); + + return &pte[index[0]]; +@@ -360,7 +323,7 @@ static struct pageTableEntry *_vm_get_page_table_entry(struct kvm_vm *vm, int vc + + uint64_t vm_get_page_table_entry(struct kvm_vm *vm, int vcpuid, uint64_t vaddr) + { +- struct pageTableEntry *pte = _vm_get_page_table_entry(vm, vcpuid, vaddr); ++ uint64_t *pte = _vm_get_page_table_entry(vm, vcpuid, vaddr); + + return *(uint64_t *)pte; + } +@@ -368,18 +331,17 @@ uint64_t vm_get_page_table_entry(struct kvm_vm *vm, int vcpuid, uint64_t vaddr) + void vm_set_page_table_entry(struct kvm_vm *vm, int vcpuid, uint64_t vaddr, + uint64_t pte) + { +- struct pageTableEntry *new_pte = _vm_get_page_table_entry(vm, vcpuid, +- vaddr); ++ uint64_t *new_pte = _vm_get_page_table_entry(vm, vcpuid, vaddr); + + *(uint64_t *)new_pte = pte; + } + + void virt_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) + { +- struct pageUpperEntry *pml4e, *pml4e_start; +- struct pageUpperEntry *pdpe, *pdpe_start; +- struct pageUpperEntry *pde, 
*pde_start; +- struct pageTableEntry *pte, *pte_start; ++ uint64_t *pml4e, *pml4e_start; ++ uint64_t *pdpe, *pdpe_start; ++ uint64_t *pde, *pde_start; ++ uint64_t *pte, *pte_start; + + if (!vm->pgd_created) + return; +@@ -389,58 +351,58 @@ void virt_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) + fprintf(stream, "%*s index hvaddr gpaddr " + "addr w exec dirty\n", + indent, ""); +- pml4e_start = (struct pageUpperEntry *) addr_gpa2hva(vm, vm->pgd); ++ pml4e_start = (uint64_t *) addr_gpa2hva(vm, vm->pgd); + for (uint16_t n1 = 0; n1 <= 0x1ffu; n1++) { + pml4e = &pml4e_start[n1]; +- if (!pml4e->present) ++ if (!(*pml4e & PTE_PRESENT_MASK)) + continue; +- fprintf(stream, "%*spml4e 0x%-3zx %p 0x%-12lx 0x%-10lx %u " ++ fprintf(stream, "%*spml4e 0x%-3zx %p 0x%-12lx 0x%-10llx %u " + " %u\n", + indent, "", + pml4e - pml4e_start, pml4e, +- addr_hva2gpa(vm, pml4e), (uint64_t) pml4e->pfn, +- pml4e->writable, pml4e->execute_disable); ++ addr_hva2gpa(vm, pml4e), PTE_GET_PFN(*pml4e), ++ !!(*pml4e & PTE_WRITABLE_MASK), !!(*pml4e & PTE_NX_MASK)); + +- pdpe_start = addr_gpa2hva(vm, pml4e->pfn * vm->page_size); ++ pdpe_start = addr_gpa2hva(vm, *pml4e & PHYSICAL_PAGE_MASK); + for (uint16_t n2 = 0; n2 <= 0x1ffu; n2++) { + pdpe = &pdpe_start[n2]; +- if (!pdpe->present) ++ if (!(*pdpe & PTE_PRESENT_MASK)) + continue; +- fprintf(stream, "%*spdpe 0x%-3zx %p 0x%-12lx 0x%-10lx " ++ fprintf(stream, "%*spdpe 0x%-3zx %p 0x%-12lx 0x%-10llx " + "%u %u\n", + indent, "", + pdpe - pdpe_start, pdpe, + addr_hva2gpa(vm, pdpe), +- (uint64_t) pdpe->pfn, pdpe->writable, +- pdpe->execute_disable); ++ PTE_GET_PFN(*pdpe), !!(*pdpe & PTE_WRITABLE_MASK), ++ !!(*pdpe & PTE_NX_MASK)); + +- pde_start = addr_gpa2hva(vm, pdpe->pfn * vm->page_size); ++ pde_start = addr_gpa2hva(vm, *pdpe & PHYSICAL_PAGE_MASK); + for (uint16_t n3 = 0; n3 <= 0x1ffu; n3++) { + pde = &pde_start[n3]; +- if (!pde->present) ++ if (!(*pde & PTE_PRESENT_MASK)) + continue; + fprintf(stream, "%*spde 0x%-3zx %p " +- "0x%-12lx 0x%-10lx %u 
%u\n", ++ "0x%-12lx 0x%-10llx %u %u\n", + indent, "", pde - pde_start, pde, + addr_hva2gpa(vm, pde), +- (uint64_t) pde->pfn, pde->writable, +- pde->execute_disable); ++ PTE_GET_PFN(*pde), !!(*pde & PTE_WRITABLE_MASK), ++ !!(*pde & PTE_NX_MASK)); + +- pte_start = addr_gpa2hva(vm, pde->pfn * vm->page_size); ++ pte_start = addr_gpa2hva(vm, *pde & PHYSICAL_PAGE_MASK); + for (uint16_t n4 = 0; n4 <= 0x1ffu; n4++) { + pte = &pte_start[n4]; +- if (!pte->present) ++ if (!(*pte & PTE_PRESENT_MASK)) + continue; + fprintf(stream, "%*spte 0x%-3zx %p " +- "0x%-12lx 0x%-10lx %u %u " ++ "0x%-12lx 0x%-10llx %u %u " + " %u 0x%-10lx\n", + indent, "", + pte - pte_start, pte, + addr_hva2gpa(vm, pte), +- (uint64_t) pte->pfn, +- pte->writable, +- pte->execute_disable, +- pte->dirty, ++ PTE_GET_PFN(*pte), ++ !!(*pte & PTE_WRITABLE_MASK), ++ !!(*pte & PTE_NX_MASK), ++ !!(*pte & PTE_DIRTY_MASK), + ((uint64_t) n1 << 27) + | ((uint64_t) n2 << 18) + | ((uint64_t) n3 << 9) +@@ -558,8 +520,8 @@ static void kvm_seg_set_kernel_data_64bit(struct kvm_vm *vm, uint16_t selector, + vm_paddr_t addr_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva) + { + uint16_t index[4]; +- struct pageUpperEntry *pml4e, *pdpe, *pde; +- struct pageTableEntry *pte; ++ uint64_t *pml4e, *pdpe, *pde; ++ uint64_t *pte; + + TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K, "Attempt to use " + "unknown or unsupported guest mode, mode: 0x%x", vm->mode); +@@ -572,22 +534,22 @@ vm_paddr_t addr_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva) + if (!vm->pgd_created) + goto unmapped_gva; + pml4e = addr_gpa2hva(vm, vm->pgd); +- if (!pml4e[index[3]].present) ++ if (!(pml4e[index[3]] & PTE_PRESENT_MASK)) + goto unmapped_gva; + +- pdpe = addr_gpa2hva(vm, pml4e[index[3]].pfn * vm->page_size); +- if (!pdpe[index[2]].present) ++ pdpe = addr_gpa2hva(vm, PTE_GET_PFN(pml4e[index[3]]) * vm->page_size); ++ if (!(pdpe[index[2]] & PTE_PRESENT_MASK)) + goto unmapped_gva; + +- pde = addr_gpa2hva(vm, pdpe[index[2]].pfn * vm->page_size); +- if (!pde[index[1]].present) 
++ pde = addr_gpa2hva(vm, PTE_GET_PFN(pdpe[index[2]]) * vm->page_size); ++ if (!(pde[index[1]] & PTE_PRESENT_MASK)) + goto unmapped_gva; + +- pte = addr_gpa2hva(vm, pde[index[1]].pfn * vm->page_size); +- if (!pte[index[0]].present) ++ pte = addr_gpa2hva(vm, PTE_GET_PFN(pde[index[1]]) * vm->page_size); ++ if (!(pte[index[0]] & PTE_PRESENT_MASK)) + goto unmapped_gva; + +- return (pte[index[0]].pfn * vm->page_size) + (gva & 0xfffu); ++ return (PTE_GET_PFN(pte[index[0]]) * vm->page_size) + (gva & 0xfffu); + + unmapped_gva: + TEST_FAIL("No mapping for vm virtual address, gva: 0x%lx", gva); +-- +2.35.1 + diff --git a/queue-5.15/kvm-selftests-silence-compiler-warning-in-the-kvm_pa.patch b/queue-5.15/kvm-selftests-silence-compiler-warning-in-the-kvm_pa.patch new file mode 100644 index 00000000000..0368dedd7d6 --- /dev/null +++ b/queue-5.15/kvm-selftests-silence-compiler-warning-in-the-kvm_pa.patch @@ -0,0 +1,48 @@ +From 1c4fe2e5ba211b99c22d5f594a27a4e3af5af7ef Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 14 Apr 2022 12:30:31 +0200 +Subject: KVM: selftests: Silence compiler warning in the kvm_page_table_test + +From: Thomas Huth + +[ Upstream commit 266a19a0bc4fbfab4d981a47640ca98972a01865 ] + +When compiling kvm_page_table_test.c, I get this compiler warning +with gcc 11.2: + +kvm_page_table_test.c: In function 'pre_init_before_test': +../../../../tools/include/linux/kernel.h:44:24: warning: comparison of + distinct pointer types lacks a cast + 44 | (void) (&_max1 == &_max2); \ + | ^~ +kvm_page_table_test.c:281:21: note: in expansion of macro 'max' + 281 | alignment = max(0x100000, alignment); + | ^~~ + +Fix it by adjusting the type of the absolute value. 
+ +Signed-off-by: Thomas Huth +Reviewed-by: Claudio Imbrenda +Message-Id: <20220414103031.565037-1-thuth@redhat.com> +Signed-off-by: Paolo Bonzini +Signed-off-by: Sasha Levin +--- + tools/testing/selftests/kvm/kvm_page_table_test.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/tools/testing/selftests/kvm/kvm_page_table_test.c b/tools/testing/selftests/kvm/kvm_page_table_test.c +index 36407cb0ec85..f1ddfe4c4a03 100644 +--- a/tools/testing/selftests/kvm/kvm_page_table_test.c ++++ b/tools/testing/selftests/kvm/kvm_page_table_test.c +@@ -278,7 +278,7 @@ static struct kvm_vm *pre_init_before_test(enum vm_guest_mode mode, void *arg) + else + guest_test_phys_mem = p->phys_offset; + #ifdef __s390x__ +- alignment = max(0x100000, alignment); ++ alignment = max(0x100000UL, alignment); + #endif + guest_test_phys_mem &= ~(alignment - 1); + +-- +2.35.1 + diff --git a/queue-5.15/kvm-x86-cpuid-only-provide-cpuid-leaf-0xa-if-host-ha.patch b/queue-5.15/kvm-x86-cpuid-only-provide-cpuid-leaf-0xa-if-host-ha.patch new file mode 100644 index 00000000000..7d81b7f998c --- /dev/null +++ b/queue-5.15/kvm-x86-cpuid-only-provide-cpuid-leaf-0xa-if-host-ha.patch @@ -0,0 +1,54 @@ +From d1bf2b8d8bd864a1db8cc3677e64f07594122d8f Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 27 Apr 2022 17:01:49 +0530 +Subject: kvm: x86/cpuid: Only provide CPUID leaf 0xA if host has architectural + PMU + +From: Sandipan Das + +[ Upstream commit 5a1bde46f98b893cda6122b00e94c0c40a6ead3c ] + +On some x86 processors, CPUID leaf 0xA provides information +on Architectural Performance Monitoring features. It +advertises a PMU version which Qemu uses to determine the +availability of additional MSRs to manage the PMCs. + +Upon receiving a KVM_GET_SUPPORTED_CPUID ioctl request for +the same, the kernel constructs return values based on the +x86_pmu_capability irrespective of the vendor. + +This leaf and the additional MSRs are not supported on AMD +and Hygon processors. 
If AMD PerfMonV2 is detected, the PMU +version is set to 2 and guest startup breaks because of an +attempt to access a non-existent MSR. Return zeros to avoid +this. + +Fixes: a6c06ed1a60a ("KVM: Expose the architectural performance monitoring CPUID leaf") +Reported-by: Vasant Hegde +Signed-off-by: Sandipan Das +Message-Id: <3fef83d9c2b2f7516e8ff50d60851f29a4bcb716.1651058600.git.sandipan.das@amd.com> +Signed-off-by: Paolo Bonzini +Signed-off-by: Sasha Levin +--- + arch/x86/kvm/cpuid.c | 5 +++++ + 1 file changed, 5 insertions(+) + +diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c +index 5f1d4a5aa871..b17c9b00669e 100644 +--- a/arch/x86/kvm/cpuid.c ++++ b/arch/x86/kvm/cpuid.c +@@ -725,6 +725,11 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) + union cpuid10_eax eax; + union cpuid10_edx edx; + ++ if (!static_cpu_has(X86_FEATURE_ARCH_PERFMON)) { ++ entry->eax = entry->ebx = entry->ecx = entry->edx = 0; ++ break; ++ } ++ + perf_get_x86_pmu_capability(&cap); + + /* +-- +2.35.1 + diff --git a/queue-5.15/kvm-x86-do-not-change-icr-on-write-to-apic_self_ipi.patch b/queue-5.15/kvm-x86-do-not-change-icr-on-write-to-apic_self_ipi.patch new file mode 100644 index 00000000000..d57a983de9e --- /dev/null +++ b/queue-5.15/kvm-x86-do-not-change-icr-on-write-to-apic_self_ipi.patch @@ -0,0 +1,43 @@ +From d7f8ff94b3ea03fd24acbfff4a12e1dd6fde3b03 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 24 Feb 2022 09:53:36 -0500 +Subject: KVM: x86: Do not change ICR on write to APIC_SELF_IPI + +From: Paolo Bonzini + +[ Upstream commit d22a81b304a27fca6124174a8e842e826c193466 ] + +Emulating writes to SELF_IPI with a write to ICR has an unwanted side effect: +the value of ICR in vAPIC page gets changed. The SDM lists SELF_IPI as write-only, +with no associated MMIO offset, so any write should have no visible side +effect in the vAPIC page. 
+ +Reported-by: Chao Gao +Reviewed-by: Sean Christopherson +Signed-off-by: Paolo Bonzini +Signed-off-by: Sasha Levin +--- + arch/x86/kvm/lapic.c | 7 +++---- + 1 file changed, 3 insertions(+), 4 deletions(-) + +diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c +index 4d92fb4fdf69..83d1743a1dd0 100644 +--- a/arch/x86/kvm/lapic.c ++++ b/arch/x86/kvm/lapic.c +@@ -2125,10 +2125,9 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) + break; + + case APIC_SELF_IPI: +- if (apic_x2apic_mode(apic)) { +- kvm_lapic_reg_write(apic, APIC_ICR, +- APIC_DEST_SELF | (val & APIC_VECTOR_MASK)); +- } else ++ if (apic_x2apic_mode(apic)) ++ kvm_apic_send_ipi(apic, APIC_DEST_SELF | (val & APIC_VECTOR_MASK), 0); ++ else + ret = 1; + break; + default: +-- +2.35.1 + diff --git a/queue-5.15/kvm-x86-mmu-avoid-null-pointer-dereference-on-page-f.patch b/queue-5.15/kvm-x86-mmu-avoid-null-pointer-dereference-on-page-f.patch new file mode 100644 index 00000000000..c5d041d2196 --- /dev/null +++ b/queue-5.15/kvm-x86-mmu-avoid-null-pointer-dereference-on-page-f.patch @@ -0,0 +1,39 @@ +From f022b17acc9add41057f83b1314aa279448a072e Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 8 Feb 2022 19:08:33 -0500 +Subject: KVM: x86/mmu: avoid NULL-pointer dereference on page freeing bugs + +From: Paolo Bonzini + +[ Upstream commit 9191b8f0745e63edf519e4a54a4aaae1d3d46fbd ] + +WARN and bail if KVM attempts to free a root that isn't backed by a shadow +page. KVM allocates a bare page for "special" roots, e.g. when using PAE +paging or shadowing 2/3/4-level page tables with 4/5-level, and so root_hpa +will be valid but won't be backed by a shadow page. It's all too easy to +blindly call mmu_free_root_page() on root_hpa, be nice and WARN instead of +crashing KVM and possibly the kernel. 
+ +Reviewed-by: Sean Christopherson +Signed-off-by: Paolo Bonzini +Signed-off-by: Sasha Levin +--- + arch/x86/kvm/mmu/mmu.c | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c +index 34e828badc51..806f9d42bcce 100644 +--- a/arch/x86/kvm/mmu/mmu.c ++++ b/arch/x86/kvm/mmu/mmu.c +@@ -3314,6 +3314,8 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa, + return; + + sp = to_shadow_page(*root_hpa & PT64_BASE_ADDR_MASK); ++ if (WARN_ON(!sp)) ++ return; + + if (is_tdp_mmu_page(sp)) + kvm_tdp_mmu_put_root(kvm, sp, false); +-- +2.35.1 + diff --git a/queue-5.15/net-mlx5-fix-slab-out-of-bounds-while-reading-resour.patch b/queue-5.15/net-mlx5-fix-slab-out-of-bounds-while-reading-resour.patch new file mode 100644 index 00000000000..844482178c7 --- /dev/null +++ b/queue-5.15/net-mlx5-fix-slab-out-of-bounds-while-reading-resour.patch @@ -0,0 +1,151 @@ +From 3bd84b07d25dd16503d38a932562bdc7d0840fa9 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 3 Mar 2022 19:02:03 +0200 +Subject: net/mlx5: Fix slab-out-of-bounds while reading resource dump menu + +From: Aya Levin + +[ Upstream commit 7ba2d9d8de96696c1451fee1b01da11f45bdc2b9 ] + +Resource dump menu may span over more than a single page, support it. +Otherwise, menu read may result in a memory access violation: reading +outside of the allocated page. +Note that page format of the first menu page contains menu headers while +the proceeding menu pages contain only records. + +The KASAN logs are as follows: +BUG: KASAN: slab-out-of-bounds in strcmp+0x9b/0xb0 +Read of size 1 at addr ffff88812b2e1fd0 by task systemd-udevd/496 + +CPU: 5 PID: 496 Comm: systemd-udevd Tainted: G B 5.16.0_for_upstream_debug_2022_01_10_23_12 #1 +Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 +Call Trace: + + dump_stack_lvl+0x57/0x7d + print_address_description.constprop.0+0x1f/0x140 + ? strcmp+0x9b/0xb0 + ? 
strcmp+0x9b/0xb0 + kasan_report.cold+0x83/0xdf + ? strcmp+0x9b/0xb0 + strcmp+0x9b/0xb0 + mlx5_rsc_dump_init+0x4ab/0x780 [mlx5_core] + ? mlx5_rsc_dump_destroy+0x80/0x80 [mlx5_core] + ? lockdep_hardirqs_on_prepare+0x286/0x400 + ? raw_spin_unlock_irqrestore+0x47/0x50 + ? aomic_notifier_chain_register+0x32/0x40 + mlx5_load+0x104/0x2e0 [mlx5_core] + mlx5_init_one+0x41b/0x610 [mlx5_core] + .... +The buggy address belongs to the object at ffff88812b2e0000 + which belongs to the cache kmalloc-4k of size 4096 +The buggy address is located 4048 bytes to the right of + 4096-byte region [ffff88812b2e0000, ffff88812b2e1000) +The buggy address belongs to the page: +page:000000009d69807a refcount:1 mapcount:0 mapping:0000000000000000 index:0xffff88812b2e6000 pfn:0x12b2e0 +head:000000009d69807a order:3 compound_mapcount:0 compound_pincount:0 +flags: 0x8000000000010200(slab|head|zone=2) +raw: 8000000000010200 0000000000000000 dead000000000001 ffff888100043040 +raw: ffff88812b2e6000 0000000080040000 00000001ffffffff 0000000000000000 +page dumped because: kasan: bad access detected + +Memory state around the buggy address: + ffff88812b2e1e80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc + ffff88812b2e1f00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc +>ffff88812b2e1f80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc + ^ + ffff88812b2e2000: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb + ffff88812b2e2080: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb +================================================================== + +Fixes: 12206b17235a ("net/mlx5: Add support for resource dump") +Signed-off-by: Aya Levin +Reviewed-by: Moshe Shemesh +Signed-off-by: Saeed Mahameed +Signed-off-by: Sasha Levin +--- + .../mellanox/mlx5/core/diag/rsc_dump.c | 31 +++++++++++++++---- + 1 file changed, 25 insertions(+), 6 deletions(-) + +diff --git a/drivers/net/ethernet/mellanox/mlx5/core/diag/rsc_dump.c b/drivers/net/ethernet/mellanox/mlx5/core/diag/rsc_dump.c +index ed4fb79b4db7..75b6060f7a9a 
100644 +--- a/drivers/net/ethernet/mellanox/mlx5/core/diag/rsc_dump.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/diag/rsc_dump.c +@@ -31,6 +31,7 @@ static const char *const mlx5_rsc_sgmt_name[] = { + struct mlx5_rsc_dump { + u32 pdn; + struct mlx5_core_mkey mkey; ++ u32 number_of_menu_items; + u16 fw_segment_type[MLX5_SGMT_TYPE_NUM]; + }; + +@@ -50,21 +51,37 @@ static int mlx5_rsc_dump_sgmt_get_by_name(char *name) + return -EINVAL; + } + +-static void mlx5_rsc_dump_read_menu_sgmt(struct mlx5_rsc_dump *rsc_dump, struct page *page) ++#define MLX5_RSC_DUMP_MENU_HEADER_SIZE (MLX5_ST_SZ_BYTES(resource_dump_info_segment) + \ ++ MLX5_ST_SZ_BYTES(resource_dump_command_segment) + \ ++ MLX5_ST_SZ_BYTES(resource_dump_menu_segment)) ++ ++static int mlx5_rsc_dump_read_menu_sgmt(struct mlx5_rsc_dump *rsc_dump, struct page *page, ++ int read_size, int start_idx) + { + void *data = page_address(page); + enum mlx5_sgmt_type sgmt_idx; + int num_of_items; + char *sgmt_name; + void *member; ++ int size = 0; + void *menu; + int i; + +- menu = MLX5_ADDR_OF(menu_resource_dump_response, data, menu); +- num_of_items = MLX5_GET(resource_dump_menu_segment, menu, num_of_records); ++ if (!start_idx) { ++ menu = MLX5_ADDR_OF(menu_resource_dump_response, data, menu); ++ rsc_dump->number_of_menu_items = MLX5_GET(resource_dump_menu_segment, menu, ++ num_of_records); ++ size = MLX5_RSC_DUMP_MENU_HEADER_SIZE; ++ data += size; ++ } ++ num_of_items = rsc_dump->number_of_menu_items; ++ ++ for (i = 0; start_idx + i < num_of_items; i++) { ++ size += MLX5_ST_SZ_BYTES(resource_dump_menu_record); ++ if (size >= read_size) ++ return start_idx + i; + +- for (i = 0; i < num_of_items; i++) { +- member = MLX5_ADDR_OF(resource_dump_menu_segment, menu, record[i]); ++ member = data + MLX5_ST_SZ_BYTES(resource_dump_menu_record) * i; + sgmt_name = MLX5_ADDR_OF(resource_dump_menu_record, member, segment_name); + sgmt_idx = mlx5_rsc_dump_sgmt_get_by_name(sgmt_name); + if (sgmt_idx == -EINVAL) +@@ -72,6 +89,7 @@ 
static void mlx5_rsc_dump_read_menu_sgmt(struct mlx5_rsc_dump *rsc_dump, struct + rsc_dump->fw_segment_type[sgmt_idx] = MLX5_GET(resource_dump_menu_record, + member, segment_type); + } ++ return 0; + } + + static int mlx5_rsc_dump_trigger(struct mlx5_core_dev *dev, struct mlx5_rsc_dump_cmd *cmd, +@@ -168,6 +186,7 @@ static int mlx5_rsc_dump_menu(struct mlx5_core_dev *dev) + struct mlx5_rsc_dump_cmd *cmd = NULL; + struct mlx5_rsc_key key = {}; + struct page *page; ++ int start_idx = 0; + int size; + int err; + +@@ -189,7 +208,7 @@ static int mlx5_rsc_dump_menu(struct mlx5_core_dev *dev) + if (err < 0) + goto destroy_cmd; + +- mlx5_rsc_dump_read_menu_sgmt(dev->rsc_dump, page); ++ start_idx = mlx5_rsc_dump_read_menu_sgmt(dev->rsc_dump, page, size, start_idx); + + } while (err > 0); + +-- +2.35.1 + diff --git a/queue-5.15/net-mlx5e-lag-don-t-skip-fib-events-on-current-dst.patch b/queue-5.15/net-mlx5e-lag-don-t-skip-fib-events-on-current-dst.patch new file mode 100644 index 00000000000..922109b88b7 --- /dev/null +++ b/queue-5.15/net-mlx5e-lag-don-t-skip-fib-events-on-current-dst.patch @@ -0,0 +1,114 @@ +From 713bcccad531624658a1145da54f277378f13b80 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 18 Apr 2022 17:40:37 +0300 +Subject: net/mlx5e: Lag, Don't skip fib events on current dst + +From: Vlad Buslov + +[ Upstream commit 4a2a664ed87962c4ddb806a84b5c9634820bcf55 ] + +Referenced change added check to skip updating fib when new fib instance +has same or lower priority. However, new fib instance can be an update on +same dst address as existing one even though the structure is another +instance that has different address. Ignoring events on such instances +causes multipath LAG state to not be correctly updated. + +Track 'dst' and 'dst_len' fields of fib event fib_entry_notifier_info +structure and don't skip events that have the same value of that fields. 
+ +Fixes: ad11c4f1d8fd ("net/mlx5e: Lag, Only handle events from highest priority multipath entry") +Signed-off-by: Vlad Buslov +Reviewed-by: Maor Dickman +Signed-off-by: Saeed Mahameed +Signed-off-by: Sasha Levin +--- + .../net/ethernet/mellanox/mlx5/core/lag_mp.c | 20 +++++++++++-------- + .../net/ethernet/mellanox/mlx5/core/lag_mp.h | 2 ++ + 2 files changed, 14 insertions(+), 8 deletions(-) + +diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag_mp.c b/drivers/net/ethernet/mellanox/mlx5/core/lag_mp.c +index 9d50b9c2db5e..81786a9a424c 100644 +--- a/drivers/net/ethernet/mellanox/mlx5/core/lag_mp.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/lag_mp.c +@@ -100,10 +100,12 @@ static void mlx5_lag_fib_event_flush(struct notifier_block *nb) + flush_workqueue(mp->wq); + } + +-static void mlx5_lag_fib_set(struct lag_mp *mp, struct fib_info *fi) ++static void mlx5_lag_fib_set(struct lag_mp *mp, struct fib_info *fi, u32 dst, int dst_len) + { + mp->fib.mfi = fi; + mp->fib.priority = fi->fib_priority; ++ mp->fib.dst = dst; ++ mp->fib.dst_len = dst_len; + } + + struct mlx5_fib_event_work { +@@ -116,10 +118,10 @@ struct mlx5_fib_event_work { + }; + }; + +-static void mlx5_lag_fib_route_event(struct mlx5_lag *ldev, +- unsigned long event, +- struct fib_info *fi) ++static void mlx5_lag_fib_route_event(struct mlx5_lag *ldev, unsigned long event, ++ struct fib_entry_notifier_info *fen_info) + { ++ struct fib_info *fi = fen_info->fi; + struct lag_mp *mp = &ldev->lag_mp; + struct fib_nh *fib_nh0, *fib_nh1; + unsigned int nhs; +@@ -133,7 +135,9 @@ static void mlx5_lag_fib_route_event(struct mlx5_lag *ldev, + } + + /* Handle multipath entry with lower priority value */ +- if (mp->fib.mfi && mp->fib.mfi != fi && fi->fib_priority >= mp->fib.priority) ++ if (mp->fib.mfi && mp->fib.mfi != fi && ++ (mp->fib.dst != fen_info->dst || mp->fib.dst_len != fen_info->dst_len) && ++ fi->fib_priority >= mp->fib.priority) + return; + + /* Handle add/replace event */ +@@ -149,7 +153,7 @@ static 
void mlx5_lag_fib_route_event(struct mlx5_lag *ldev, + + i++; + mlx5_lag_set_port_affinity(ldev, i); +- mlx5_lag_fib_set(mp, fi); ++ mlx5_lag_fib_set(mp, fi, fen_info->dst, fen_info->dst_len); + } + + return; +@@ -179,7 +183,7 @@ static void mlx5_lag_fib_route_event(struct mlx5_lag *ldev, + } + + mlx5_lag_set_port_affinity(ldev, MLX5_LAG_NORMAL_AFFINITY); +- mlx5_lag_fib_set(mp, fi); ++ mlx5_lag_fib_set(mp, fi, fen_info->dst, fen_info->dst_len); + } + + static void mlx5_lag_fib_nexthop_event(struct mlx5_lag *ldev, +@@ -220,7 +224,7 @@ static void mlx5_lag_fib_update(struct work_struct *work) + case FIB_EVENT_ENTRY_REPLACE: + case FIB_EVENT_ENTRY_DEL: + mlx5_lag_fib_route_event(ldev, fib_work->event, +- fib_work->fen_info.fi); ++ &fib_work->fen_info); + fib_info_put(fib_work->fen_info.fi); + break; + case FIB_EVENT_NH_ADD: +diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag_mp.h b/drivers/net/ethernet/mellanox/mlx5/core/lag_mp.h +index e8380eb0dd6a..b3a7f18b9e30 100644 +--- a/drivers/net/ethernet/mellanox/mlx5/core/lag_mp.h ++++ b/drivers/net/ethernet/mellanox/mlx5/core/lag_mp.h +@@ -18,6 +18,8 @@ struct lag_mp { + struct { + const void *mfi; /* used in tracking fib events */ + u32 priority; ++ u32 dst; ++ int dst_len; + } fib; + struct workqueue_struct *wq; + }; +-- +2.35.1 + diff --git a/queue-5.15/net-mlx5e-lag-fix-fib_info-pointer-assignment.patch b/queue-5.15/net-mlx5e-lag-fix-fib_info-pointer-assignment.patch new file mode 100644 index 00000000000..8470372e7e9 --- /dev/null +++ b/queue-5.15/net-mlx5e-lag-fix-fib_info-pointer-assignment.patch @@ -0,0 +1,40 @@ +From 25544ec0a67cdd42e9d7ee795e8e2bd7aff82c3a Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 18 Apr 2022 17:32:54 +0300 +Subject: net/mlx5e: Lag, Fix fib_info pointer assignment + +From: Vlad Buslov + +[ Upstream commit a6589155ec9847918e00e7279b8aa6d4c272bea7 ] + +Referenced change incorrectly sets single path fib_info even when LAG is +not active. 
Fix it by moving call to mlx5_lag_fib_set() into conditional +that verifies LAG state. + +Fixes: ad11c4f1d8fd ("net/mlx5e: Lag, Only handle events from highest priority multipath entry") +Signed-off-by: Vlad Buslov +Reviewed-by: Maor Dickman +Signed-off-by: Saeed Mahameed +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/mellanox/mlx5/core/lag_mp.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag_mp.c b/drivers/net/ethernet/mellanox/mlx5/core/lag_mp.c +index 8d278c45e7cc..9d50b9c2db5e 100644 +--- a/drivers/net/ethernet/mellanox/mlx5/core/lag_mp.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/lag_mp.c +@@ -149,9 +149,9 @@ static void mlx5_lag_fib_route_event(struct mlx5_lag *ldev, + + i++; + mlx5_lag_set_port_affinity(ldev, i); ++ mlx5_lag_fib_set(mp, fi); + } + +- mlx5_lag_fib_set(mp, fi); + return; + } + +-- +2.35.1 + diff --git a/queue-5.15/net-mlx5e-lag-fix-use-after-free-in-fib-event-handle.patch b/queue-5.15/net-mlx5e-lag-fix-use-after-free-in-fib-event-handle.patch new file mode 100644 index 00000000000..0ea53523808 --- /dev/null +++ b/queue-5.15/net-mlx5e-lag-fix-use-after-free-in-fib-event-handle.patch @@ -0,0 +1,247 @@ +From 402301a852f26350a1417bd70828a51eb9716dba Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 18 Apr 2022 17:32:19 +0300 +Subject: net/mlx5e: Lag, Fix use-after-free in fib event handler + +From: Vlad Buslov + +[ Upstream commit 27b0420fd959e38e3500e60b637d39dfab065645 ] + +Recent commit that modified fib route event handler to handle events +according to their priority introduced use-after-free[0] in mp->mfi pointer +usage. The pointer now is not just cached in order to be compared to +following fib_info instances, but is also dereferenced to obtain +fib_priority. However, since mlx5 lag code doesn't hold the reference to +fib_info during whole mp->mfi lifetime, it could be used after fib_info +instance has already been freed by kernel infrastructure code. 
+ +Don't ever dereference mp->mfi pointer. Refactor it to be 'const void*' +type and cache fib_info priority in dedicated integer. Group +fib_info-related data into dedicated 'fib' structure that will be further +extended by following patches in the series. + +[0]: + +[ 203.588029] ================================================================== +[ 203.590161] BUG: KASAN: use-after-free in mlx5_lag_fib_update+0xabd/0xd60 [mlx5_core] +[ 203.592386] Read of size 4 at addr ffff888144df2050 by task kworker/u20:4/138 + +[ 203.594766] CPU: 3 PID: 138 Comm: kworker/u20:4 Tainted: G B 5.17.0-rc7+ #6 +[ 203.596751] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 +[ 203.598813] Workqueue: mlx5_lag_mp mlx5_lag_fib_update [mlx5_core] +[ 203.600053] Call Trace: +[ 203.600608] +[ 203.601110] dump_stack_lvl+0x48/0x5e +[ 203.601860] print_address_description.constprop.0+0x1f/0x160 +[ 203.602950] ? mlx5_lag_fib_update+0xabd/0xd60 [mlx5_core] +[ 203.604073] ? mlx5_lag_fib_update+0xabd/0xd60 [mlx5_core] +[ 203.605177] kasan_report.cold+0x83/0xdf +[ 203.605969] ? mlx5_lag_fib_update+0xabd/0xd60 [mlx5_core] +[ 203.607102] mlx5_lag_fib_update+0xabd/0xd60 [mlx5_core] +[ 203.608199] ? mlx5_lag_init_fib_work+0x1c0/0x1c0 [mlx5_core] +[ 203.609382] ? read_word_at_a_time+0xe/0x20 +[ 203.610463] ? strscpy+0xa0/0x2a0 +[ 203.611463] process_one_work+0x722/0x1270 +[ 203.612344] worker_thread+0x540/0x11e0 +[ 203.613136] ? rescuer_thread+0xd50/0xd50 +[ 203.613949] kthread+0x26e/0x300 +[ 203.614627] ? 
kthread_complete_and_exit+0x20/0x20 +[ 203.615542] ret_from_fork+0x1f/0x30 +[ 203.616273] + +[ 203.617174] Allocated by task 3746: +[ 203.617874] kasan_save_stack+0x1e/0x40 +[ 203.618644] __kasan_kmalloc+0x81/0xa0 +[ 203.619394] fib_create_info+0xb41/0x3c50 +[ 203.620213] fib_table_insert+0x190/0x1ff0 +[ 203.621020] fib_magic.isra.0+0x246/0x2e0 +[ 203.621803] fib_add_ifaddr+0x19f/0x670 +[ 203.622563] fib_inetaddr_event+0x13f/0x270 +[ 203.623377] blocking_notifier_call_chain+0xd4/0x130 +[ 203.624355] __inet_insert_ifa+0x641/0xb20 +[ 203.625185] inet_rtm_newaddr+0xc3d/0x16a0 +[ 203.626009] rtnetlink_rcv_msg+0x309/0x880 +[ 203.626826] netlink_rcv_skb+0x11d/0x340 +[ 203.627626] netlink_unicast+0x4cc/0x790 +[ 203.628430] netlink_sendmsg+0x762/0xc00 +[ 203.629230] sock_sendmsg+0xb2/0xe0 +[ 203.629955] ____sys_sendmsg+0x58a/0x770 +[ 203.630756] ___sys_sendmsg+0xd8/0x160 +[ 203.631523] __sys_sendmsg+0xb7/0x140 +[ 203.632294] do_syscall_64+0x35/0x80 +[ 203.633045] entry_SYSCALL_64_after_hwframe+0x44/0xae + +[ 203.634427] Freed by task 0: +[ 203.635063] kasan_save_stack+0x1e/0x40 +[ 203.635844] kasan_set_track+0x21/0x30 +[ 203.636618] kasan_set_free_info+0x20/0x30 +[ 203.637450] __kasan_slab_free+0xfc/0x140 +[ 203.638271] kfree+0x94/0x3b0 +[ 203.638903] rcu_core+0x5e4/0x1990 +[ 203.639640] __do_softirq+0x1ba/0x5d3 + +[ 203.640828] Last potentially related work creation: +[ 203.641785] kasan_save_stack+0x1e/0x40 +[ 203.642571] __kasan_record_aux_stack+0x9f/0xb0 +[ 203.643478] call_rcu+0x88/0x9c0 +[ 203.644178] fib_release_info+0x539/0x750 +[ 203.644997] fib_table_delete+0x659/0xb80 +[ 203.645809] fib_magic.isra.0+0x1a3/0x2e0 +[ 203.646617] fib_del_ifaddr+0x93f/0x1300 +[ 203.647415] fib_inetaddr_event+0x9f/0x270 +[ 203.648251] blocking_notifier_call_chain+0xd4/0x130 +[ 203.649225] __inet_del_ifa+0x474/0xc10 +[ 203.650016] devinet_ioctl+0x781/0x17f0 +[ 203.650788] inet_ioctl+0x1ad/0x290 +[ 203.651533] sock_do_ioctl+0xce/0x1c0 +[ 203.652315] sock_ioctl+0x27b/0x4f0 +[ 203.653058] 
__x64_sys_ioctl+0x124/0x190 +[ 203.653850] do_syscall_64+0x35/0x80 +[ 203.654608] entry_SYSCALL_64_after_hwframe+0x44/0xae + +[ 203.666952] The buggy address belongs to the object at ffff888144df2000 + which belongs to the cache kmalloc-256 of size 256 +[ 203.669250] The buggy address is located 80 bytes inside of + 256-byte region [ffff888144df2000, ffff888144df2100) +[ 203.671332] The buggy address belongs to the page: +[ 203.672273] page:00000000bf6c9314 refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x144df0 +[ 203.674009] head:00000000bf6c9314 order:2 compound_mapcount:0 compound_pincount:0 +[ 203.675422] flags: 0x2ffff800010200(slab|head|node=0|zone=2|lastcpupid=0x1ffff) +[ 203.676819] raw: 002ffff800010200 0000000000000000 dead000000000122 ffff888100042b40 +[ 203.678384] raw: 0000000000000000 0000000080200020 00000001ffffffff 0000000000000000 +[ 203.679928] page dumped because: kasan: bad access detected + +[ 203.681455] Memory state around the buggy address: +[ 203.682421] ffff888144df1f00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc +[ 203.683863] ffff888144df1f80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc +[ 203.685310] >ffff888144df2000: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb +[ 203.686701] ^ +[ 203.687820] ffff888144df2080: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb +[ 203.689226] ffff888144df2100: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc +[ 203.690620] ================================================================== + +Fixes: ad11c4f1d8fd ("net/mlx5e: Lag, Only handle events from highest priority multipath entry") +Signed-off-by: Vlad Buslov +Reviewed-by: Maor Dickman +Reviewed-by: Leon Romanovsky +Signed-off-by: Saeed Mahameed +Signed-off-by: Sasha Levin +--- + .../net/ethernet/mellanox/mlx5/core/lag_mp.c | 26 ++++++++++++------- + .../net/ethernet/mellanox/mlx5/core/lag_mp.h | 5 +++- + 2 files changed, 20 insertions(+), 11 deletions(-) + +diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag_mp.c 
b/drivers/net/ethernet/mellanox/mlx5/core/lag_mp.c +index cb0a48d374a3..8d278c45e7cc 100644 +--- a/drivers/net/ethernet/mellanox/mlx5/core/lag_mp.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/lag_mp.c +@@ -100,6 +100,12 @@ static void mlx5_lag_fib_event_flush(struct notifier_block *nb) + flush_workqueue(mp->wq); + } + ++static void mlx5_lag_fib_set(struct lag_mp *mp, struct fib_info *fi) ++{ ++ mp->fib.mfi = fi; ++ mp->fib.priority = fi->fib_priority; ++} ++ + struct mlx5_fib_event_work { + struct work_struct work; + struct mlx5_lag *ldev; +@@ -121,13 +127,13 @@ static void mlx5_lag_fib_route_event(struct mlx5_lag *ldev, + /* Handle delete event */ + if (event == FIB_EVENT_ENTRY_DEL) { + /* stop track */ +- if (mp->mfi == fi) +- mp->mfi = NULL; ++ if (mp->fib.mfi == fi) ++ mp->fib.mfi = NULL; + return; + } + + /* Handle multipath entry with lower priority value */ +- if (mp->mfi && mp->mfi != fi && fi->fib_priority >= mp->mfi->fib_priority) ++ if (mp->fib.mfi && mp->fib.mfi != fi && fi->fib_priority >= mp->fib.priority) + return; + + /* Handle add/replace event */ +@@ -145,7 +151,7 @@ static void mlx5_lag_fib_route_event(struct mlx5_lag *ldev, + mlx5_lag_set_port_affinity(ldev, i); + } + +- mp->mfi = fi; ++ mlx5_lag_fib_set(mp, fi); + return; + } + +@@ -165,7 +171,7 @@ static void mlx5_lag_fib_route_event(struct mlx5_lag *ldev, + } + + /* First time we see multipath route */ +- if (!mp->mfi && !__mlx5_lag_is_active(ldev)) { ++ if (!mp->fib.mfi && !__mlx5_lag_is_active(ldev)) { + struct lag_tracker tracker; + + tracker = ldev->tracker; +@@ -173,7 +179,7 @@ static void mlx5_lag_fib_route_event(struct mlx5_lag *ldev, + } + + mlx5_lag_set_port_affinity(ldev, MLX5_LAG_NORMAL_AFFINITY); +- mp->mfi = fi; ++ mlx5_lag_fib_set(mp, fi); + } + + static void mlx5_lag_fib_nexthop_event(struct mlx5_lag *ldev, +@@ -184,7 +190,7 @@ static void mlx5_lag_fib_nexthop_event(struct mlx5_lag *ldev, + struct lag_mp *mp = &ldev->lag_mp; + + /* Check the nh event is related to the route 
*/ +- if (!mp->mfi || mp->mfi != fi) ++ if (!mp->fib.mfi || mp->fib.mfi != fi) + return; + + /* nh added/removed */ +@@ -313,7 +319,7 @@ void mlx5_lag_mp_reset(struct mlx5_lag *ldev) + /* Clear mfi, as it might become stale when a route delete event + * has been missed, see mlx5_lag_fib_route_event(). + */ +- ldev->lag_mp.mfi = NULL; ++ ldev->lag_mp.fib.mfi = NULL; + } + + int mlx5_lag_mp_init(struct mlx5_lag *ldev) +@@ -324,7 +330,7 @@ int mlx5_lag_mp_init(struct mlx5_lag *ldev) + /* always clear mfi, as it might become stale when a route delete event + * has been missed + */ +- mp->mfi = NULL; ++ mp->fib.mfi = NULL; + + if (mp->fib_nb.notifier_call) + return 0; +@@ -354,5 +360,5 @@ void mlx5_lag_mp_cleanup(struct mlx5_lag *ldev) + unregister_fib_notifier(&init_net, &mp->fib_nb); + destroy_workqueue(mp->wq); + mp->fib_nb.notifier_call = NULL; +- mp->mfi = NULL; ++ mp->fib.mfi = NULL; + } +diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag_mp.h b/drivers/net/ethernet/mellanox/mlx5/core/lag_mp.h +index dea199e79bed..e8380eb0dd6a 100644 +--- a/drivers/net/ethernet/mellanox/mlx5/core/lag_mp.h ++++ b/drivers/net/ethernet/mellanox/mlx5/core/lag_mp.h +@@ -15,7 +15,10 @@ enum mlx5_lag_port_affinity { + + struct lag_mp { + struct notifier_block fib_nb; +- struct fib_info *mfi; /* used in tracking fib events */ ++ struct { ++ const void *mfi; /* used in tracking fib events */ ++ u32 priority; ++ } fib; + struct workqueue_struct *wq; + }; + +-- +2.35.1 + diff --git a/queue-5.15/nfc-netlink-fix-sleep-in-atomic-bug-when-firmware-do.patch b/queue-5.15/nfc-netlink-fix-sleep-in-atomic-bug-when-firmware-do.patch new file mode 100644 index 00000000000..f5dbc1bb2f3 --- /dev/null +++ b/queue-5.15/nfc-netlink-fix-sleep-in-atomic-bug-when-firmware-do.patch @@ -0,0 +1,70 @@ +From 27a214b70554e2dea2d36d8b6344d681049d826e Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 4 May 2022 13:58:47 +0800 +Subject: NFC: netlink: fix sleep in atomic bug when firmware download timeout + 
+From: Duoming Zhou + +[ Upstream commit 4071bf121d59944d5cd2238de0642f3d7995a997 ] + +There is a sleep-in-atomic bug that could cause a kernel panic during +the firmware download process. The root cause is that nlmsg_new with +GFP_KERNEL parameter is called in fw_dnld_timeout which is a timer +handler. The call trace is shown below: + +BUG: sleeping function called from invalid context at include/linux/sched/mm.h:265 +Call Trace: +kmem_cache_alloc_node +__alloc_skb +nfc_genl_fw_download_done +call_timer_fn +__run_timers.part.0 +run_timer_softirq +__do_softirq +... + +The nlmsg_new with GFP_KERNEL parameter may sleep during memory +allocation process, and the timer handler is run as the result of +a "software interrupt" that should not call any other function +that could sleep. + +This patch changes allocation mode of netlink message from GFP_KERNEL +to GFP_ATOMIC in order to prevent the sleep-in-atomic bug. The GFP_ATOMIC +flag allows the memory allocation to be performed in atomic context. + +Fixes: 9674da8759df ("NFC: Add firmware upload netlink command") +Fixes: 9ea7187c53f6 ("NFC: netlink: Rename CMD_FW_UPLOAD to CMD_FW_DOWNLOAD") +Signed-off-by: Duoming Zhou +Reviewed-by: Krzysztof Kozlowski +Link: https://lore.kernel.org/r/20220504055847.38026-1-duoming@zju.edu.cn +Signed-off-by: Paolo Abeni +Signed-off-by: Sasha Levin +--- + net/nfc/netlink.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/net/nfc/netlink.c b/net/nfc/netlink.c +index a207f0b8137b..60fc85781373 100644 +--- a/net/nfc/netlink.c ++++ b/net/nfc/netlink.c +@@ -534,7 +534,7 @@ int nfc_genl_se_connectivity(struct nfc_dev *dev, u8 se_idx) + struct sk_buff *msg; + void *hdr; + +- msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); ++ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); + if (!msg) + return -ENOMEM; + +@@ -554,7 +554,7 @@ int nfc_genl_se_connectivity(struct nfc_dev *dev, u8 se_idx) + + genlmsg_end(msg, hdr); + +- genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_KERNEL); 
++ genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_ATOMIC); + + return 0; + +-- +2.35.1 + diff --git a/queue-5.15/selftest-vm-verify-mmap-addr-in-mremap_test.patch b/queue-5.15/selftest-vm-verify-mmap-addr-in-mremap_test.patch new file mode 100644 index 00000000000..06462b0f625 --- /dev/null +++ b/queue-5.15/selftest-vm-verify-mmap-addr-in-mremap_test.patch @@ -0,0 +1,73 @@ +From 2d5fcd6a350f33e110f5b7da453fd5db9bc0f3ad Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 21 Apr 2022 16:35:49 -0700 +Subject: selftest/vm: verify mmap addr in mremap_test + +From: Sidhartha Kumar + +[ Upstream commit 9c85a9bae267f6b5e5e374d0d023bbbe9db096d3 ] + +Avoid calling mmap with requested addresses that are less than the +system's mmap_min_addr. When run as root, mmap returns EACCES when +trying to map addresses < mmap_min_addr. This is not one of the error +codes for the condition to retry the mmap in the test. + +Rather than arbitrarily retrying on EACCES, don't attempt an mmap until +addr > vm.mmap_min_addr. + +Add a munmap call after an alignment check as the mappings are retained +after the retry and can reach the vm.max_map_count sysctl. 
+ +Link: https://lkml.kernel.org/r/20220420215721.4868-1-sidhartha.kumar@oracle.com +Signed-off-by: Sidhartha Kumar +Reviewed-by: Shuah Khan +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Sasha Levin +--- + tools/testing/selftests/vm/mremap_test.c | 29 ++++++++++++++++++++++++ + 1 file changed, 29 insertions(+) + +diff --git a/tools/testing/selftests/vm/mremap_test.c b/tools/testing/selftests/vm/mremap_test.c +index e3ce33a9954e..efcbf537b3d5 100644 +--- a/tools/testing/selftests/vm/mremap_test.c ++++ b/tools/testing/selftests/vm/mremap_test.c +@@ -66,6 +66,35 @@ enum { + .expect_failure = should_fail \ + } + ++/* Returns mmap_min_addr sysctl tunable from procfs */ ++static unsigned long long get_mmap_min_addr(void) ++{ ++ FILE *fp; ++ int n_matched; ++ static unsigned long long addr; ++ ++ if (addr) ++ return addr; ++ ++ fp = fopen("/proc/sys/vm/mmap_min_addr", "r"); ++ if (fp == NULL) { ++ ksft_print_msg("Failed to open /proc/sys/vm/mmap_min_addr: %s\n", ++ strerror(errno)); ++ exit(KSFT_SKIP); ++ } ++ ++ n_matched = fscanf(fp, "%llu", &addr); ++ if (n_matched != 1) { ++ ksft_print_msg("Failed to read /proc/sys/vm/mmap_min_addr: %s\n", ++ strerror(errno)); ++ fclose(fp); ++ exit(KSFT_SKIP); ++ } ++ ++ fclose(fp); ++ return addr; ++} ++ + /* + * Returns false if the requested remap region overlaps with an + * existing mapping (e.g text, stack) else returns true. 
+-- +2.35.1 + diff --git a/queue-5.15/selftest-vm-verify-remap-destination-address-in-mrem.patch b/queue-5.15/selftest-vm-verify-remap-destination-address-in-mrem.patch new file mode 100644 index 00000000000..3e004f42d34 --- /dev/null +++ b/queue-5.15/selftest-vm-verify-remap-destination-address-in-mrem.patch @@ -0,0 +1,66 @@ +From 6d916cf1190f0a1c637fd714c8b129266c8b05a7 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 21 Apr 2022 16:35:52 -0700 +Subject: selftest/vm: verify remap destination address in mremap_test + +From: Sidhartha Kumar + +[ Upstream commit 18d609daa546c919fd36b62a7b510c18de4b4af8 ] + +Because mremap does not have a MAP_FIXED_NOREPLACE flag, it can destroy +existing mappings. This causes a segfault when regions such as text are +remapped and the permissions are changed. + +Verify the requested mremap destination address does not overlap any +existing mappings by using mmap's MAP_FIXED_NOREPLACE flag. Keep +incrementing the destination address until a valid mapping is found or +fail the current test once the max address is reached. + +Link: https://lkml.kernel.org/r/20220420215721.4868-2-sidhartha.kumar@oracle.com +Signed-off-by: Sidhartha Kumar +Reviewed-by: Shuah Khan +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Sasha Levin +--- + tools/testing/selftests/vm/mremap_test.c | 24 ++++++++++++++++++++++++ + 1 file changed, 24 insertions(+) + +diff --git a/tools/testing/selftests/vm/mremap_test.c b/tools/testing/selftests/vm/mremap_test.c +index efcbf537b3d5..8f4dbbd60c09 100644 +--- a/tools/testing/selftests/vm/mremap_test.c ++++ b/tools/testing/selftests/vm/mremap_test.c +@@ -66,6 +66,30 @@ enum { + .expect_failure = should_fail \ + } + ++/* ++ * Returns false if the requested remap region overlaps with an ++ * existing mapping (e.g text, stack) else returns true. 
++ */ ++static bool is_remap_region_valid(void *addr, unsigned long long size) ++{ ++ void *remap_addr = NULL; ++ bool ret = true; ++ ++ /* Use MAP_FIXED_NOREPLACE flag to ensure region is not mapped */ ++ remap_addr = mmap(addr, size, PROT_READ | PROT_WRITE, ++ MAP_FIXED_NOREPLACE | MAP_ANONYMOUS | MAP_SHARED, ++ -1, 0); ++ ++ if (remap_addr == MAP_FAILED) { ++ if (errno == EEXIST) ++ ret = false; ++ } else { ++ munmap(remap_addr, size); ++ } ++ ++ return ret; ++} ++ + /* Returns mmap_min_addr sysctl tunable from procfs */ + static unsigned long long get_mmap_min_addr(void) + { +-- +2.35.1 + diff --git a/queue-5.15/series b/queue-5.15/series index cb925cbf5c0..6245fd44481 100644 --- a/queue-5.15/series +++ b/queue-5.15/series @@ -83,3 +83,20 @@ drm-amdgpu-unify-bo-evicting-method-in-amdgpu_ttm.patch drm-amdgpu-explicitly-check-for-s0ix-when-evicting-resources.patch drm-amdgpu-don-t-set-s3-and-s0ix-at-the-same-time.patch drm-amdgpu-ensure-hda-function-is-suspended-before-asic-reset.patch +gpio-mvebu-drop-pwm-base-assignment.patch +kvm-x86-cpuid-only-provide-cpuid-leaf-0xa-if-host-ha.patch +fbdev-make-fb_release-return-enodev-if-fbdev-was-unr.patch +net-mlx5-fix-slab-out-of-bounds-while-reading-resour.patch +net-mlx5e-lag-fix-use-after-free-in-fib-event-handle.patch +net-mlx5e-lag-fix-fib_info-pointer-assignment.patch +net-mlx5e-lag-don-t-skip-fib-events-on-current-dst.patch +iommu-dart-add-missing-module-owner-to-ops-structure.patch +nfc-netlink-fix-sleep-in-atomic-bug-when-firmware-do.patch +kvm-selftests-do-not-use-bitfields-larger-than-32-bi.patch +kvm-selftests-silence-compiler-warning-in-the-kvm_pa.patch +x86-kvm-preserve-bsp-msr_kvm_poll_control-across-sus.patch +kvm-x86-do-not-change-icr-on-write-to-apic_self_ipi.patch +kvm-x86-mmu-avoid-null-pointer-dereference-on-page-f.patch +kvm-lapic-enable-timer-posted-interrupt-only-when-mw.patch +selftest-vm-verify-mmap-addr-in-mremap_test.patch +selftest-vm-verify-remap-destination-address-in-mrem.patch diff --git 
a/queue-5.15/x86-kvm-preserve-bsp-msr_kvm_poll_control-across-sus.patch b/queue-5.15/x86-kvm-preserve-bsp-msr_kvm_poll_control-across-sus.patch new file mode 100644 index 00000000000..27988e620b6 --- /dev/null +++ b/queue-5.15/x86-kvm-preserve-bsp-msr_kvm_poll_control-across-sus.patch @@ -0,0 +1,71 @@ +From 14f11f7d7e6120d6a539c60a1555a7db0a4cf726 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 18 Apr 2022 00:42:32 -0700 +Subject: x86/kvm: Preserve BSP MSR_KVM_POLL_CONTROL across suspend/resume + +From: Wanpeng Li + +[ Upstream commit 0361bdfddca20c8855ea3bdbbbc9c999912b10ff ] + +MSR_KVM_POLL_CONTROL is cleared on reset, thus reverting guests to +host-side polling after suspend/resume. Non-bootstrap CPUs are +restored correctly by the haltpoll driver because they are hot-unplugged +during suspend and hot-plugged during resume; however, the BSP +is not hotpluggable and remains in host-side polling mode after +the guest resume. This makes the guest pay for the cost of vmexits +every time the guest enters idle. + +Fix it by recording BSP's haltpoll state and resuming it during guest +resume. 
+ +Cc: Marcelo Tosatti +Signed-off-by: Wanpeng Li +Message-Id: <1650267752-46796-1-git-send-email-wanpengli@tencent.com> +Signed-off-by: Paolo Bonzini +Signed-off-by: Sasha Levin +--- + arch/x86/kernel/kvm.c | 13 +++++++++++++ + 1 file changed, 13 insertions(+) + +diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c +index bd7b65081eb0..d36b58e705b6 100644 +--- a/arch/x86/kernel/kvm.c ++++ b/arch/x86/kernel/kvm.c +@@ -66,6 +66,7 @@ static DEFINE_PER_CPU_DECRYPTED(struct kvm_vcpu_pv_apf_data, apf_reason) __align + DEFINE_PER_CPU_DECRYPTED(struct kvm_steal_time, steal_time) __aligned(64) __visible; + static int has_steal_clock = 0; + ++static int has_guest_poll = 0; + /* + * No need for any "IO delay" on KVM + */ +@@ -650,14 +651,26 @@ static int kvm_cpu_down_prepare(unsigned int cpu) + + static int kvm_suspend(void) + { ++ u64 val = 0; ++ + kvm_guest_cpu_offline(false); + ++#ifdef CONFIG_ARCH_CPUIDLE_HALTPOLL ++ if (kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL)) ++ rdmsrl(MSR_KVM_POLL_CONTROL, val); ++ has_guest_poll = !(val & 1); ++#endif + return 0; + } + + static void kvm_resume(void) + { + kvm_cpu_online(raw_smp_processor_id()); ++ ++#ifdef CONFIG_ARCH_CPUIDLE_HALTPOLL ++ if (kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL) && has_guest_poll) ++ wrmsrl(MSR_KVM_POLL_CONTROL, 0); ++#endif + } + + static struct syscore_ops kvm_syscore_ops = { +-- +2.35.1 + -- 2.47.3