git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
Fixes for 5.17
author Sasha Levin <sashal@kernel.org>
Tue, 10 May 2022 00:56:59 +0000 (20:56 -0400)
committer Sasha Levin <sashal@kernel.org>
Tue, 10 May 2022 00:56:59 +0000 (20:56 -0400)
Signed-off-by: Sasha Levin <sashal@kernel.org>
17 files changed:
queue-5.17/fbdev-make-fb_release-return-enodev-if-fbdev-was-unr.patch [new file with mode: 0644]
queue-5.17/gpio-mvebu-drop-pwm-base-assignment.patch [new file with mode: 0644]
queue-5.17/iommu-dart-add-missing-module-owner-to-ops-structure.patch [new file with mode: 0644]
queue-5.17/kvm-lapic-enable-timer-posted-interrupt-only-when-mw.patch [new file with mode: 0644]
queue-5.17/kvm-selftests-do-not-use-bitfields-larger-than-32-bi.patch [new file with mode: 0644]
queue-5.17/kvm-selftests-silence-compiler-warning-in-the-kvm_pa.patch [new file with mode: 0644]
queue-5.17/kvm-sev-mark-nested-locking-of-vcpu-lock.patch [new file with mode: 0644]
queue-5.17/kvm-vmx-exit-to-userspace-if-vcpu-has-injected-excep.patch [new file with mode: 0644]
queue-5.17/kvm-x86-cpuid-only-provide-cpuid-leaf-0xa-if-host-ha.patch [new file with mode: 0644]
queue-5.17/kvm-x86-do-not-change-icr-on-write-to-apic_self_ipi.patch [new file with mode: 0644]
queue-5.17/kvm-x86-mmu-avoid-null-pointer-dereference-on-page-f.patch [new file with mode: 0644]
queue-5.17/net-rds-acquire-refcount-on-tcp-sockets.patch [new file with mode: 0644]
queue-5.17/nfc-netlink-fix-sleep-in-atomic-bug-when-firmware-do.patch [new file with mode: 0644]
queue-5.17/selftest-vm-verify-mmap-addr-in-mremap_test.patch [new file with mode: 0644]
queue-5.17/selftest-vm-verify-remap-destination-address-in-mrem.patch [new file with mode: 0644]
queue-5.17/series
queue-5.17/x86-kvm-preserve-bsp-msr_kvm_poll_control-across-sus.patch [new file with mode: 0644]

diff --git a/queue-5.17/fbdev-make-fb_release-return-enodev-if-fbdev-was-unr.patch b/queue-5.17/fbdev-make-fb_release-return-enodev-if-fbdev-was-unr.patch
new file mode 100644 (file)
index 0000000..6292a67
--- /dev/null
@@ -0,0 +1,53 @@
+From a3f9c0d3c3156d066cc2a86ae8e14c7e75f651ad Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 2 May 2022 15:50:14 +0200
+Subject: fbdev: Make fb_release() return -ENODEV if fbdev was unregistered
+
+From: Javier Martinez Canillas <javierm@redhat.com>
+
+[ Upstream commit aafa025c76dcc7d1a8c8f0bdefcbe4eb480b2f6a ]
+
+A reference to the framebuffer device struct fb_info is stored in the file
+private data, but this reference could no longer be valid and must not be
+accessed directly. Instead, the file_fb_info() accessor function must be
+used since it does sanity checking to make sure that the fb_info is valid.
+
+This can happen for example if the registered framebuffer device is for a
+driver that just uses a framebuffer provided by the system firmware. In
+that case, the fbdev core would unregister the framebuffer device when a
+real video driver is probed and ask to remove conflicting framebuffers.
+
+The bug has been present for a long time but commit 27599aacbaef ("fbdev:
+Hot-unplug firmware fb devices on forced removal") unmasked it since the
+fbdev core started unregistering the associated framebuffer devices.
+
+Fixes: 27599aacbaef ("fbdev: Hot-unplug firmware fb devices on forced removal")
+Reported-by: Maxime Ripard <maxime@cerno.tech>
+Reported-by: Junxiao Chang <junxiao.chang@intel.com>
+Signed-off-by: Javier Martinez Canillas <javierm@redhat.com>
+Reviewed-by: Thomas Zimmermann <tzimmermann@suse.de>
+Link: https://patchwork.freedesktop.org/patch/msgid/20220502135014.377945-1-javierm@redhat.com
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/video/fbdev/core/fbmem.c | 5 ++++-
+ 1 file changed, 4 insertions(+), 1 deletion(-)
+
+diff --git a/drivers/video/fbdev/core/fbmem.c b/drivers/video/fbdev/core/fbmem.c
+index 00f0f282e7a1..10a9369c9dea 100644
+--- a/drivers/video/fbdev/core/fbmem.c
++++ b/drivers/video/fbdev/core/fbmem.c
+@@ -1438,7 +1438,10 @@ fb_release(struct inode *inode, struct file *file)
+ __acquires(&info->lock)
+ __releases(&info->lock)
+ {
+-      struct fb_info * const info = file->private_data;
++      struct fb_info * const info = file_fb_info(file);
++
++      if (!info)
++              return -ENODEV;
+       lock_fb_info(info);
+       if (info->fbops->fb_release)
+-- 
+2.35.1
+
diff --git a/queue-5.17/gpio-mvebu-drop-pwm-base-assignment.patch b/queue-5.17/gpio-mvebu-drop-pwm-base-assignment.patch
new file mode 100644 (file)
index 0000000..3e6d098
--- /dev/null
@@ -0,0 +1,48 @@
+From 2b15a2b6026846af01a1d9fd376ae6450a74aacd Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 11 Apr 2022 09:23:40 +0300
+Subject: gpio: mvebu: drop pwm base assignment
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Baruch Siach <baruch@tkos.co.il>
+
+[ Upstream commit e5f6e5d554ac274f9c8ba60078103d0425b93c19 ]
+
+pwmchip_add() unconditionally assigns the base ID dynamically. Commit
+f9a8ee8c8bcd1 ("pwm: Always allocate PWM chip base ID dynamically")
+dropped all base assignment from drivers under drivers/pwm/. It missed
+this driver. Fix that.
+
+Fixes: f9a8ee8c8bcd1 ("pwm: Always allocate PWM chip base ID dynamically")
+Signed-off-by: Baruch Siach <baruch@tkos.co.il>
+Reviewed-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
+Acked-by: Linus Walleij <linus.walleij@linaro.org>
+Signed-off-by: Bartosz Golaszewski <brgl@bgdev.pl>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/gpio/gpio-mvebu.c | 7 -------
+ 1 file changed, 7 deletions(-)
+
+diff --git a/drivers/gpio/gpio-mvebu.c b/drivers/gpio/gpio-mvebu.c
+index 4c1f9e1091b7..a2c8dd329b31 100644
+--- a/drivers/gpio/gpio-mvebu.c
++++ b/drivers/gpio/gpio-mvebu.c
+@@ -871,13 +871,6 @@ static int mvebu_pwm_probe(struct platform_device *pdev,
+       mvpwm->chip.dev = dev;
+       mvpwm->chip.ops = &mvebu_pwm_ops;
+       mvpwm->chip.npwm = mvchip->chip.ngpio;
+-      /*
+-       * There may already be some PWM allocated, so we can't force
+-       * mvpwm->chip.base to a fixed point like mvchip->chip.base.
+-       * So, we let pwmchip_add() do the numbering and take the next free
+-       * region.
+-       */
+-      mvpwm->chip.base = -1;
+       spin_lock_init(&mvpwm->lock);
+-- 
+2.35.1
+
diff --git a/queue-5.17/iommu-dart-add-missing-module-owner-to-ops-structure.patch b/queue-5.17/iommu-dart-add-missing-module-owner-to-ops-structure.patch
new file mode 100644 (file)
index 0000000..bfab7b5
--- /dev/null
@@ -0,0 +1,36 @@
+From a8ec4d2f98e12a967e0e32461a21e57ab758c01c Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 2 May 2022 18:22:38 +0900
+Subject: iommu/dart: Add missing module owner to ops structure
+
+From: Hector Martin <marcan@marcan.st>
+
+[ Upstream commit 2ac2fab52917ae82cbca97cf6e5d2993530257ed ]
+
+This is required to make loading this as a module work.
+
+Signed-off-by: Hector Martin <marcan@marcan.st>
+Fixes: 46d1fb072e76 ("iommu/dart: Add DART iommu driver")
+Reviewed-by: Sven Peter <sven@svenpeter.dev>
+Link: https://lore.kernel.org/r/20220502092238.30486-1-marcan@marcan.st
+Signed-off-by: Joerg Roedel <jroedel@suse.de>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/iommu/apple-dart.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+diff --git a/drivers/iommu/apple-dart.c b/drivers/iommu/apple-dart.c
+index 6c111bd8283d..68821f86b063 100644
+--- a/drivers/iommu/apple-dart.c
++++ b/drivers/iommu/apple-dart.c
+@@ -782,6 +782,7 @@ static const struct iommu_ops apple_dart_iommu_ops = {
+       .get_resv_regions = apple_dart_get_resv_regions,
+       .put_resv_regions = generic_iommu_put_resv_regions,
+       .pgsize_bitmap = -1UL, /* Restricted during dart probe */
++      .owner = THIS_MODULE,
+ };
+ static irqreturn_t apple_dart_irq(int irq, void *dev)
+-- 
+2.35.1
+
diff --git a/queue-5.17/kvm-lapic-enable-timer-posted-interrupt-only-when-mw.patch b/queue-5.17/kvm-lapic-enable-timer-posted-interrupt-only-when-mw.patch
new file mode 100644 (file)
index 0000000..398a25f
--- /dev/null
@@ -0,0 +1,54 @@
+From 3232e0d9f14e63dbe25a9f9082cd9192ece0fe7a Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 25 Jan 2022 04:08:58 -0800
+Subject: KVM: LAPIC: Enable timer posted-interrupt only when mwait/hlt is
+ advertised
+
+From: Wanpeng Li <wanpengli@tencent.com>
+
+[ Upstream commit 1714a4eb6fb0cb79f182873cd011a8ed60ac65e8 ]
+
+As commit 0c5f81dad46 ("KVM: LAPIC: Inject timer interrupt via posted
+interrupt") mentioned, the host admin should tune the guest setup well,
+so that vCPUs are placed on isolated pCPUs, with several surplus pCPUs
+left for *busy* housekeeping.  In this setup, it is preferable to
+disable mwait/hlt/pause vmexits to keep the vCPUs in non-root mode.
+
+However, if only some guests are isolated and others are not, they would not
+have any benefit from posted timer interrupts, and at the same time lose
+VMX preemption timer fast paths because kvm_can_post_timer_interrupt()
+returns true and therefore forces kvm_can_use_hv_timer() to false.
+
+By guaranteeing that posted-interrupt timer is only used if MWAIT or
+HLT are done without vmexit, KVM can make a better choice and use the
+VMX preemption timer and the corresponding fast paths.
+
+Reported-by: Aili Yao <yaoaili@kingsoft.com>
+Reviewed-by: Sean Christopherson <seanjc@google.com>
+Cc: Aili Yao <yaoaili@kingsoft.com>
+Cc: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Wanpeng Li <wanpengli@tencent.com>
+Message-Id: <1643112538-36743-1-git-send-email-wanpengli@tencent.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/kvm/lapic.c | 3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
+index 6b6f9359d29e..970d5c740b00 100644
+--- a/arch/x86/kvm/lapic.c
++++ b/arch/x86/kvm/lapic.c
+@@ -113,7 +113,8 @@ static inline u32 kvm_x2apic_id(struct kvm_lapic *apic)
+ static bool kvm_can_post_timer_interrupt(struct kvm_vcpu *vcpu)
+ {
+-      return pi_inject_timer && kvm_vcpu_apicv_active(vcpu);
++      return pi_inject_timer && kvm_vcpu_apicv_active(vcpu) &&
++              (kvm_mwait_in_guest(vcpu->kvm) || kvm_hlt_in_guest(vcpu->kvm));
+ }
+ bool kvm_can_use_hv_timer(struct kvm_vcpu *vcpu)
+-- 
+2.35.1
+
diff --git a/queue-5.17/kvm-selftests-do-not-use-bitfields-larger-than-32-bi.patch b/queue-5.17/kvm-selftests-do-not-use-bitfields-larger-than-32-bi.patch
new file mode 100644 (file)
index 0000000..c9c936f
--- /dev/null
@@ -0,0 +1,433 @@
+From 27bcef1a3f31be9235092604b7a4f5861c9c3c15 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 20 Apr 2022 06:27:27 -0400
+Subject: kvm: selftests: do not use bitfields larger than 32-bits for PTEs
+
+From: Paolo Bonzini <pbonzini@redhat.com>
+
+[ Upstream commit f18b4aebe107d092e384b1ae680b1e1de7a0196d ]
+
+Red Hat's QE team reported test failure on access_tracking_perf_test:
+
+Testing guest mode: PA-bits:ANY, VA-bits:48,  4K pages
+guest physical test memory offset: 0x3fffbffff000
+
+Populating memory             : 0.684014577s
+Writing to populated memory   : 0.006230175s
+Reading from populated memory : 0.004557805s
+==== Test Assertion Failure ====
+  lib/kvm_util.c:1411: false
+  pid=125806 tid=125809 errno=4 - Interrupted system call
+     1  0x0000000000402f7c: addr_gpa2hva at kvm_util.c:1411
+     2   (inlined by) addr_gpa2hva at kvm_util.c:1405
+     3  0x0000000000401f52: lookup_pfn at access_tracking_perf_test.c:98
+     4   (inlined by) mark_vcpu_memory_idle at access_tracking_perf_test.c:152
+     5   (inlined by) vcpu_thread_main at access_tracking_perf_test.c:232
+     6  0x00007fefe9ff81ce: ?? ??:0
+     7  0x00007fefe9c64d82: ?? ??:0
+  No vm physical memory at 0xffbffff000
+
+I can easily reproduce it with an Intel(R) Xeon(R) CPU E5-2630 with 46 bits
+PA.
+
+It turns out that the address translation for clearing idle page tracking
+returned a wrong result; addr_gva2gpa()'s last step, which is based on
+"pte[index[0]].pfn", did the calculation with 40 bits length and the
+high 12 bits got truncated.  In above case the GPA address to be returned
+should be 0x3fffbffff000 for GVA 0xc0000000, but it got truncated into
+0xffbffff000 and the subsequent gpa2hva lookup failed.
+
+The width of operations on bit fields greater than 32-bit is
+implementation defined, and differs between GCC (which uses the bitfield
+precision) and clang (which uses 64-bit arithmetic), so this is a
+potential minefield.  Remove the bit fields and use manual masking
+instead.
+
+Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=2075036
+Reported-by: Nana Liu <nanliu@redhat.com>
+Reviewed-by: Peter Xu <peterx@redhat.com>
+Tested-by: Peter Xu <peterx@redhat.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ .../selftests/kvm/include/x86_64/processor.h  |  15 ++
+ .../selftests/kvm/lib/x86_64/processor.c      | 192 +++++++-----------
+ 2 files changed, 92 insertions(+), 115 deletions(-)
+
+diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h
+index 8a470da7b71a..15a2875698b5 100644
+--- a/tools/testing/selftests/kvm/include/x86_64/processor.h
++++ b/tools/testing/selftests/kvm/include/x86_64/processor.h
+@@ -60,6 +60,21 @@
+ /* CPUID.0x8000_0001.EDX */
+ #define CPUID_GBPAGES         (1ul << 26)
++/* Page table bitfield declarations */
++#define PTE_PRESENT_MASK        BIT_ULL(0)
++#define PTE_WRITABLE_MASK       BIT_ULL(1)
++#define PTE_USER_MASK           BIT_ULL(2)
++#define PTE_ACCESSED_MASK       BIT_ULL(5)
++#define PTE_DIRTY_MASK          BIT_ULL(6)
++#define PTE_LARGE_MASK          BIT_ULL(7)
++#define PTE_GLOBAL_MASK         BIT_ULL(8)
++#define PTE_NX_MASK             BIT_ULL(63)
++
++#define PAGE_SHIFT            12
++
++#define PHYSICAL_PAGE_MASK      GENMASK_ULL(51, 12)
++#define PTE_GET_PFN(pte)        (((pte) & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT)
++
+ /* General Registers in 64-Bit Mode */
+ struct gpr64_regs {
+       u64 rax;
+diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c
+index 9f000dfb5594..0dd442c26015 100644
+--- a/tools/testing/selftests/kvm/lib/x86_64/processor.c
++++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c
+@@ -19,38 +19,6 @@
+ vm_vaddr_t exception_handlers;
+-/* Virtual translation table structure declarations */
+-struct pageUpperEntry {
+-      uint64_t present:1;
+-      uint64_t writable:1;
+-      uint64_t user:1;
+-      uint64_t write_through:1;
+-      uint64_t cache_disable:1;
+-      uint64_t accessed:1;
+-      uint64_t ignored_06:1;
+-      uint64_t page_size:1;
+-      uint64_t ignored_11_08:4;
+-      uint64_t pfn:40;
+-      uint64_t ignored_62_52:11;
+-      uint64_t execute_disable:1;
+-};
+-
+-struct pageTableEntry {
+-      uint64_t present:1;
+-      uint64_t writable:1;
+-      uint64_t user:1;
+-      uint64_t write_through:1;
+-      uint64_t cache_disable:1;
+-      uint64_t accessed:1;
+-      uint64_t dirty:1;
+-      uint64_t reserved_07:1;
+-      uint64_t global:1;
+-      uint64_t ignored_11_09:3;
+-      uint64_t pfn:40;
+-      uint64_t ignored_62_52:11;
+-      uint64_t execute_disable:1;
+-};
+-
+ void regs_dump(FILE *stream, struct kvm_regs *regs,
+              uint8_t indent)
+ {
+@@ -195,23 +163,21 @@ static void *virt_get_pte(struct kvm_vm *vm, uint64_t pt_pfn, uint64_t vaddr,
+       return &page_table[index];
+ }
+-static struct pageUpperEntry *virt_create_upper_pte(struct kvm_vm *vm,
+-                                                  uint64_t pt_pfn,
+-                                                  uint64_t vaddr,
+-                                                  uint64_t paddr,
+-                                                  int level,
+-                                                  enum x86_page_size page_size)
++static uint64_t *virt_create_upper_pte(struct kvm_vm *vm,
++                                     uint64_t pt_pfn,
++                                     uint64_t vaddr,
++                                     uint64_t paddr,
++                                     int level,
++                                     enum x86_page_size page_size)
+ {
+-      struct pageUpperEntry *pte = virt_get_pte(vm, pt_pfn, vaddr, level);
+-
+-      if (!pte->present) {
+-              pte->writable = true;
+-              pte->present = true;
+-              pte->page_size = (level == page_size);
+-              if (pte->page_size)
+-                      pte->pfn = paddr >> vm->page_shift;
++      uint64_t *pte = virt_get_pte(vm, pt_pfn, vaddr, level);
++
++      if (!(*pte & PTE_PRESENT_MASK)) {
++              *pte = PTE_PRESENT_MASK | PTE_WRITABLE_MASK;
++              if (level == page_size)
++                      *pte |= PTE_LARGE_MASK | (paddr & PHYSICAL_PAGE_MASK);
+               else
+-                      pte->pfn = vm_alloc_page_table(vm) >> vm->page_shift;
++                      *pte |= vm_alloc_page_table(vm) & PHYSICAL_PAGE_MASK;
+       } else {
+               /*
+                * Entry already present.  Assert that the caller doesn't want
+@@ -221,7 +187,7 @@ static struct pageUpperEntry *virt_create_upper_pte(struct kvm_vm *vm,
+               TEST_ASSERT(level != page_size,
+                           "Cannot create hugepage at level: %u, vaddr: 0x%lx\n",
+                           page_size, vaddr);
+-              TEST_ASSERT(!pte->page_size,
++              TEST_ASSERT(!(*pte & PTE_LARGE_MASK),
+                           "Cannot create page table at level: %u, vaddr: 0x%lx\n",
+                           level, vaddr);
+       }
+@@ -232,8 +198,8 @@ void __virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr,
+                  enum x86_page_size page_size)
+ {
+       const uint64_t pg_size = 1ull << ((page_size * 9) + 12);
+-      struct pageUpperEntry *pml4e, *pdpe, *pde;
+-      struct pageTableEntry *pte;
++      uint64_t *pml4e, *pdpe, *pde;
++      uint64_t *pte;
+       TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K,
+                   "Unknown or unsupported guest mode, mode: 0x%x", vm->mode);
+@@ -257,24 +223,22 @@ void __virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr,
+        */
+       pml4e = virt_create_upper_pte(vm, vm->pgd >> vm->page_shift,
+                                     vaddr, paddr, 3, page_size);
+-      if (pml4e->page_size)
++      if (*pml4e & PTE_LARGE_MASK)
+               return;
+-      pdpe = virt_create_upper_pte(vm, pml4e->pfn, vaddr, paddr, 2, page_size);
+-      if (pdpe->page_size)
++      pdpe = virt_create_upper_pte(vm, PTE_GET_PFN(*pml4e), vaddr, paddr, 2, page_size);
++      if (*pdpe & PTE_LARGE_MASK)
+               return;
+-      pde = virt_create_upper_pte(vm, pdpe->pfn, vaddr, paddr, 1, page_size);
+-      if (pde->page_size)
++      pde = virt_create_upper_pte(vm, PTE_GET_PFN(*pdpe), vaddr, paddr, 1, page_size);
++      if (*pde & PTE_LARGE_MASK)
+               return;
+       /* Fill in page table entry. */
+-      pte = virt_get_pte(vm, pde->pfn, vaddr, 0);
+-      TEST_ASSERT(!pte->present,
++      pte = virt_get_pte(vm, PTE_GET_PFN(*pde), vaddr, 0);
++      TEST_ASSERT(!(*pte & PTE_PRESENT_MASK),
+                   "PTE already present for 4k page at vaddr: 0x%lx\n", vaddr);
+-      pte->pfn = paddr >> vm->page_shift;
+-      pte->writable = true;
+-      pte->present = 1;
++      *pte = PTE_PRESENT_MASK | PTE_WRITABLE_MASK | (paddr & PHYSICAL_PAGE_MASK);
+ }
+ void virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr)
+@@ -282,12 +246,12 @@ void virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr)
+       __virt_pg_map(vm, vaddr, paddr, X86_PAGE_SIZE_4K);
+ }
+-static struct pageTableEntry *_vm_get_page_table_entry(struct kvm_vm *vm, int vcpuid,
++static uint64_t *_vm_get_page_table_entry(struct kvm_vm *vm, int vcpuid,
+                                                      uint64_t vaddr)
+ {
+       uint16_t index[4];
+-      struct pageUpperEntry *pml4e, *pdpe, *pde;
+-      struct pageTableEntry *pte;
++      uint64_t *pml4e, *pdpe, *pde;
++      uint64_t *pte;
+       struct kvm_cpuid_entry2 *entry;
+       struct kvm_sregs sregs;
+       int max_phy_addr;
+@@ -329,30 +293,29 @@ static struct pageTableEntry *_vm_get_page_table_entry(struct kvm_vm *vm, int vc
+       index[3] = (vaddr >> 39) & 0x1ffu;
+       pml4e = addr_gpa2hva(vm, vm->pgd);
+-      TEST_ASSERT(pml4e[index[3]].present,
++      TEST_ASSERT(pml4e[index[3]] & PTE_PRESENT_MASK,
+               "Expected pml4e to be present for gva: 0x%08lx", vaddr);
+-      TEST_ASSERT((*(uint64_t*)(&pml4e[index[3]]) &
+-              (rsvd_mask | (1ull << 7))) == 0,
++      TEST_ASSERT((pml4e[index[3]] & (rsvd_mask | PTE_LARGE_MASK)) == 0,
+               "Unexpected reserved bits set.");
+-      pdpe = addr_gpa2hva(vm, pml4e[index[3]].pfn * vm->page_size);
+-      TEST_ASSERT(pdpe[index[2]].present,
++      pdpe = addr_gpa2hva(vm, PTE_GET_PFN(pml4e[index[3]]) * vm->page_size);
++      TEST_ASSERT(pdpe[index[2]] & PTE_PRESENT_MASK,
+               "Expected pdpe to be present for gva: 0x%08lx", vaddr);
+-      TEST_ASSERT(pdpe[index[2]].page_size == 0,
++      TEST_ASSERT(!(pdpe[index[2]] & PTE_LARGE_MASK),
+               "Expected pdpe to map a pde not a 1-GByte page.");
+-      TEST_ASSERT((*(uint64_t*)(&pdpe[index[2]]) & rsvd_mask) == 0,
++      TEST_ASSERT((pdpe[index[2]] & rsvd_mask) == 0,
+               "Unexpected reserved bits set.");
+-      pde = addr_gpa2hva(vm, pdpe[index[2]].pfn * vm->page_size);
+-      TEST_ASSERT(pde[index[1]].present,
++      pde = addr_gpa2hva(vm, PTE_GET_PFN(pdpe[index[2]]) * vm->page_size);
++      TEST_ASSERT(pde[index[1]] & PTE_PRESENT_MASK,
+               "Expected pde to be present for gva: 0x%08lx", vaddr);
+-      TEST_ASSERT(pde[index[1]].page_size == 0,
++      TEST_ASSERT(!(pde[index[1]] & PTE_LARGE_MASK),
+               "Expected pde to map a pte not a 2-MByte page.");
+-      TEST_ASSERT((*(uint64_t*)(&pde[index[1]]) & rsvd_mask) == 0,
++      TEST_ASSERT((pde[index[1]] & rsvd_mask) == 0,
+               "Unexpected reserved bits set.");
+-      pte = addr_gpa2hva(vm, pde[index[1]].pfn * vm->page_size);
+-      TEST_ASSERT(pte[index[0]].present,
++      pte = addr_gpa2hva(vm, PTE_GET_PFN(pde[index[1]]) * vm->page_size);
++      TEST_ASSERT(pte[index[0]] & PTE_PRESENT_MASK,
+               "Expected pte to be present for gva: 0x%08lx", vaddr);
+       return &pte[index[0]];
+@@ -360,7 +323,7 @@ static struct pageTableEntry *_vm_get_page_table_entry(struct kvm_vm *vm, int vc
+ uint64_t vm_get_page_table_entry(struct kvm_vm *vm, int vcpuid, uint64_t vaddr)
+ {
+-      struct pageTableEntry *pte = _vm_get_page_table_entry(vm, vcpuid, vaddr);
++      uint64_t *pte = _vm_get_page_table_entry(vm, vcpuid, vaddr);
+       return *(uint64_t *)pte;
+ }
+@@ -368,18 +331,17 @@ uint64_t vm_get_page_table_entry(struct kvm_vm *vm, int vcpuid, uint64_t vaddr)
+ void vm_set_page_table_entry(struct kvm_vm *vm, int vcpuid, uint64_t vaddr,
+                            uint64_t pte)
+ {
+-      struct pageTableEntry *new_pte = _vm_get_page_table_entry(vm, vcpuid,
+-                                                                vaddr);
++      uint64_t *new_pte = _vm_get_page_table_entry(vm, vcpuid, vaddr);
+       *(uint64_t *)new_pte = pte;
+ }
+ void virt_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent)
+ {
+-      struct pageUpperEntry *pml4e, *pml4e_start;
+-      struct pageUpperEntry *pdpe, *pdpe_start;
+-      struct pageUpperEntry *pde, *pde_start;
+-      struct pageTableEntry *pte, *pte_start;
++      uint64_t *pml4e, *pml4e_start;
++      uint64_t *pdpe, *pdpe_start;
++      uint64_t *pde, *pde_start;
++      uint64_t *pte, *pte_start;
+       if (!vm->pgd_created)
+               return;
+@@ -389,58 +351,58 @@ void virt_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent)
+       fprintf(stream, "%*s      index hvaddr         gpaddr         "
+               "addr         w exec dirty\n",
+               indent, "");
+-      pml4e_start = (struct pageUpperEntry *) addr_gpa2hva(vm, vm->pgd);
++      pml4e_start = (uint64_t *) addr_gpa2hva(vm, vm->pgd);
+       for (uint16_t n1 = 0; n1 <= 0x1ffu; n1++) {
+               pml4e = &pml4e_start[n1];
+-              if (!pml4e->present)
++              if (!(*pml4e & PTE_PRESENT_MASK))
+                       continue;
+-              fprintf(stream, "%*spml4e 0x%-3zx %p 0x%-12lx 0x%-10lx %u "
++              fprintf(stream, "%*spml4e 0x%-3zx %p 0x%-12lx 0x%-10llx %u "
+                       " %u\n",
+                       indent, "",
+                       pml4e - pml4e_start, pml4e,
+-                      addr_hva2gpa(vm, pml4e), (uint64_t) pml4e->pfn,
+-                      pml4e->writable, pml4e->execute_disable);
++                      addr_hva2gpa(vm, pml4e), PTE_GET_PFN(*pml4e),
++                      !!(*pml4e & PTE_WRITABLE_MASK), !!(*pml4e & PTE_NX_MASK));
+-              pdpe_start = addr_gpa2hva(vm, pml4e->pfn * vm->page_size);
++              pdpe_start = addr_gpa2hva(vm, *pml4e & PHYSICAL_PAGE_MASK);
+               for (uint16_t n2 = 0; n2 <= 0x1ffu; n2++) {
+                       pdpe = &pdpe_start[n2];
+-                      if (!pdpe->present)
++                      if (!(*pdpe & PTE_PRESENT_MASK))
+                               continue;
+-                      fprintf(stream, "%*spdpe  0x%-3zx %p 0x%-12lx 0x%-10lx "
++                      fprintf(stream, "%*spdpe  0x%-3zx %p 0x%-12lx 0x%-10llx "
+                               "%u  %u\n",
+                               indent, "",
+                               pdpe - pdpe_start, pdpe,
+                               addr_hva2gpa(vm, pdpe),
+-                              (uint64_t) pdpe->pfn, pdpe->writable,
+-                              pdpe->execute_disable);
++                              PTE_GET_PFN(*pdpe), !!(*pdpe & PTE_WRITABLE_MASK),
++                              !!(*pdpe & PTE_NX_MASK));
+-                      pde_start = addr_gpa2hva(vm, pdpe->pfn * vm->page_size);
++                      pde_start = addr_gpa2hva(vm, *pdpe & PHYSICAL_PAGE_MASK);
+                       for (uint16_t n3 = 0; n3 <= 0x1ffu; n3++) {
+                               pde = &pde_start[n3];
+-                              if (!pde->present)
++                              if (!(*pde & PTE_PRESENT_MASK))
+                                       continue;
+                               fprintf(stream, "%*spde   0x%-3zx %p "
+-                                      "0x%-12lx 0x%-10lx %u  %u\n",
++                                      "0x%-12lx 0x%-10llx %u  %u\n",
+                                       indent, "", pde - pde_start, pde,
+                                       addr_hva2gpa(vm, pde),
+-                                      (uint64_t) pde->pfn, pde->writable,
+-                                      pde->execute_disable);
++                                      PTE_GET_PFN(*pde), !!(*pde & PTE_WRITABLE_MASK),
++                                      !!(*pde & PTE_NX_MASK));
+-                              pte_start = addr_gpa2hva(vm, pde->pfn * vm->page_size);
++                              pte_start = addr_gpa2hva(vm, *pde & PHYSICAL_PAGE_MASK);
+                               for (uint16_t n4 = 0; n4 <= 0x1ffu; n4++) {
+                                       pte = &pte_start[n4];
+-                                      if (!pte->present)
++                                      if (!(*pte & PTE_PRESENT_MASK))
+                                               continue;
+                                       fprintf(stream, "%*spte   0x%-3zx %p "
+-                                              "0x%-12lx 0x%-10lx %u  %u "
++                                              "0x%-12lx 0x%-10llx %u  %u "
+                                               "    %u    0x%-10lx\n",
+                                               indent, "",
+                                               pte - pte_start, pte,
+                                               addr_hva2gpa(vm, pte),
+-                                              (uint64_t) pte->pfn,
+-                                              pte->writable,
+-                                              pte->execute_disable,
+-                                              pte->dirty,
++                                              PTE_GET_PFN(*pte),
++                                              !!(*pte & PTE_WRITABLE_MASK),
++                                              !!(*pte & PTE_NX_MASK),
++                                              !!(*pte & PTE_DIRTY_MASK),
+                                               ((uint64_t) n1 << 27)
+                                                       | ((uint64_t) n2 << 18)
+                                                       | ((uint64_t) n3 << 9)
+@@ -558,8 +520,8 @@ static void kvm_seg_set_kernel_data_64bit(struct kvm_vm *vm, uint16_t selector,
+ vm_paddr_t addr_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva)
+ {
+       uint16_t index[4];
+-      struct pageUpperEntry *pml4e, *pdpe, *pde;
+-      struct pageTableEntry *pte;
++      uint64_t *pml4e, *pdpe, *pde;
++      uint64_t *pte;
+       TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K, "Attempt to use "
+               "unknown or unsupported guest mode, mode: 0x%x", vm->mode);
+@@ -572,22 +534,22 @@ vm_paddr_t addr_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva)
+       if (!vm->pgd_created)
+               goto unmapped_gva;
+       pml4e = addr_gpa2hva(vm, vm->pgd);
+-      if (!pml4e[index[3]].present)
++      if (!(pml4e[index[3]] & PTE_PRESENT_MASK))
+               goto unmapped_gva;
+-      pdpe = addr_gpa2hva(vm, pml4e[index[3]].pfn * vm->page_size);
+-      if (!pdpe[index[2]].present)
++      pdpe = addr_gpa2hva(vm, PTE_GET_PFN(pml4e[index[3]]) * vm->page_size);
++      if (!(pdpe[index[2]] & PTE_PRESENT_MASK))
+               goto unmapped_gva;
+-      pde = addr_gpa2hva(vm, pdpe[index[2]].pfn * vm->page_size);
+-      if (!pde[index[1]].present)
++      pde = addr_gpa2hva(vm, PTE_GET_PFN(pdpe[index[2]]) * vm->page_size);
++      if (!(pde[index[1]] & PTE_PRESENT_MASK))
+               goto unmapped_gva;
+-      pte = addr_gpa2hva(vm, pde[index[1]].pfn * vm->page_size);
+-      if (!pte[index[0]].present)
++      pte = addr_gpa2hva(vm, PTE_GET_PFN(pde[index[1]]) * vm->page_size);
++      if (!(pte[index[0]] & PTE_PRESENT_MASK))
+               goto unmapped_gva;
+-      return (pte[index[0]].pfn * vm->page_size) + (gva & 0xfffu);
++      return (PTE_GET_PFN(pte[index[0]]) * vm->page_size) + (gva & 0xfffu);
+ unmapped_gva:
+       TEST_FAIL("No mapping for vm virtual address, gva: 0x%lx", gva);
+-- 
+2.35.1
+
diff --git a/queue-5.17/kvm-selftests-silence-compiler-warning-in-the-kvm_pa.patch b/queue-5.17/kvm-selftests-silence-compiler-warning-in-the-kvm_pa.patch
new file mode 100644 (file)
index 0000000..2e76f47
--- /dev/null
@@ -0,0 +1,48 @@
+From b1a04d22db5ec060b0e3820f91c490c6cba495d0 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 14 Apr 2022 12:30:31 +0200
+Subject: KVM: selftests: Silence compiler warning in the kvm_page_table_test
+
+From: Thomas Huth <thuth@redhat.com>
+
+[ Upstream commit 266a19a0bc4fbfab4d981a47640ca98972a01865 ]
+
+When compiling kvm_page_table_test.c, I get this compiler warning
+with gcc 11.2:
+
+kvm_page_table_test.c: In function 'pre_init_before_test':
+../../../../tools/include/linux/kernel.h:44:24: warning: comparison of
+ distinct pointer types lacks a cast
+   44 |         (void) (&_max1 == &_max2);              \
+      |                        ^~
+kvm_page_table_test.c:281:21: note: in expansion of macro 'max'
+  281 |         alignment = max(0x100000, alignment);
+      |                     ^~~
+
+Fix it by adjusting the type of the absolute value.
+
+Signed-off-by: Thomas Huth <thuth@redhat.com>
+Reviewed-by: Claudio Imbrenda <imbrenda@linux.ibm.com>
+Message-Id: <20220414103031.565037-1-thuth@redhat.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ tools/testing/selftests/kvm/kvm_page_table_test.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/tools/testing/selftests/kvm/kvm_page_table_test.c b/tools/testing/selftests/kvm/kvm_page_table_test.c
+index ba1fdc3dcf4a..2c4a7563a4f8 100644
+--- a/tools/testing/selftests/kvm/kvm_page_table_test.c
++++ b/tools/testing/selftests/kvm/kvm_page_table_test.c
+@@ -278,7 +278,7 @@ static struct kvm_vm *pre_init_before_test(enum vm_guest_mode mode, void *arg)
+       else
+               guest_test_phys_mem = p->phys_offset;
+ #ifdef __s390x__
+-      alignment = max(0x100000, alignment);
++      alignment = max(0x100000UL, alignment);
+ #endif
+       guest_test_phys_mem = align_down(guest_test_phys_mem, alignment);
+-- 
+2.35.1
+
diff --git a/queue-5.17/kvm-sev-mark-nested-locking-of-vcpu-lock.patch b/queue-5.17/kvm-sev-mark-nested-locking-of-vcpu-lock.patch
new file mode 100644 (file)
index 0000000..daf2fa5
--- /dev/null
@@ -0,0 +1,122 @@
+From b3033435794eaf2e5f29ef686ed42bc17d3bc4cd Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 2 May 2022 09:58:07 -0700
+Subject: KVM: SEV: Mark nested locking of vcpu->lock
+
+From: Peter Gonda <pgonda@google.com>
+
+[ Upstream commit 0c2c7c069285374fc8feacddc0498f8ab7627117 ]
+
+svm_vm_migrate_from() uses sev_lock_vcpus_for_migration() to lock all
+source and target vcpu->locks. Unfortunately there is an 8 subclass
+limit, so a new subclass cannot be used for each vCPU. Instead maintain
+ownership of the first vcpu's mutex.dep_map using a role specific
+subclass: source vs target. Release the other vcpu's mutex.dep_maps.
+
+Fixes: b56639318bb2b ("KVM: SEV: Add support for SEV intra host migration")
+Reported-by: John Sperbeck <jsperbeck@google.com>
+Suggested-by: David Rientjes <rientjes@google.com>
+Suggested-by: Sean Christopherson <seanjc@google.com>
+Suggested-by: Paolo Bonzini <pbonzini@redhat.com>
+Cc: Hillf Danton <hdanton@sina.com>
+Cc: kvm@vger.kernel.org
+Cc: linux-kernel@vger.kernel.org
+Signed-off-by: Peter Gonda <pgonda@google.com>
+
+Message-Id: <20220502165807.529624-1-pgonda@google.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/kvm/svm/sev.c | 42 ++++++++++++++++++++++++++++++++++++++----
+ 1 file changed, 38 insertions(+), 4 deletions(-)
+
+diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
+index e5cecd4ad2d4..76e6411d4dde 100644
+--- a/arch/x86/kvm/svm/sev.c
++++ b/arch/x86/kvm/svm/sev.c
+@@ -1590,24 +1590,51 @@ static void sev_unlock_two_vms(struct kvm *dst_kvm, struct kvm *src_kvm)
+       atomic_set_release(&src_sev->migration_in_progress, 0);
+ }
++/* vCPU mutex subclasses.  */
++enum sev_migration_role {
++      SEV_MIGRATION_SOURCE = 0,
++      SEV_MIGRATION_TARGET,
++      SEV_NR_MIGRATION_ROLES,
++};
+-static int sev_lock_vcpus_for_migration(struct kvm *kvm)
++static int sev_lock_vcpus_for_migration(struct kvm *kvm,
++                                      enum sev_migration_role role)
+ {
+       struct kvm_vcpu *vcpu;
+       unsigned long i, j;
++      bool first = true;
+       kvm_for_each_vcpu(i, vcpu, kvm) {
+-              if (mutex_lock_killable(&vcpu->mutex))
++              if (mutex_lock_killable_nested(&vcpu->mutex, role))
+                       goto out_unlock;
++
++              if (first) {
++                      /*
++                       * Reset the role to one that avoids colliding with
++                       * the role used for the first vcpu mutex.
++                       */
++                      role = SEV_NR_MIGRATION_ROLES;
++                      first = false;
++              } else {
++                      mutex_release(&vcpu->mutex.dep_map, _THIS_IP_);
++              }
+       }
+       return 0;
+ out_unlock:
++
++      first = true;
+       kvm_for_each_vcpu(j, vcpu, kvm) {
+               if (i == j)
+                       break;
++              if (first)
++                      first = false;
++              else
++                      mutex_acquire(&vcpu->mutex.dep_map, role, 0, _THIS_IP_);
++
++
+               mutex_unlock(&vcpu->mutex);
+       }
+       return -EINTR;
+@@ -1617,8 +1644,15 @@ static void sev_unlock_vcpus_for_migration(struct kvm *kvm)
+ {
+       struct kvm_vcpu *vcpu;
+       unsigned long i;
++      bool first = true;
+       kvm_for_each_vcpu(i, vcpu, kvm) {
++              if (first)
++                      first = false;
++              else
++                      mutex_acquire(&vcpu->mutex.dep_map,
++                                    SEV_NR_MIGRATION_ROLES, 0, _THIS_IP_);
++
+               mutex_unlock(&vcpu->mutex);
+       }
+ }
+@@ -1726,10 +1760,10 @@ int svm_vm_migrate_from(struct kvm *kvm, unsigned int source_fd)
+               charged = true;
+       }
+-      ret = sev_lock_vcpus_for_migration(kvm);
++      ret = sev_lock_vcpus_for_migration(kvm, SEV_MIGRATION_SOURCE);
+       if (ret)
+               goto out_dst_cgroup;
+-      ret = sev_lock_vcpus_for_migration(source_kvm);
++      ret = sev_lock_vcpus_for_migration(source_kvm, SEV_MIGRATION_TARGET);
+       if (ret)
+               goto out_dst_vcpu;
+-- 
+2.35.1
+
diff --git a/queue-5.17/kvm-vmx-exit-to-userspace-if-vcpu-has-injected-excep.patch b/queue-5.17/kvm-vmx-exit-to-userspace-if-vcpu-has-injected-excep.patch
new file mode 100644 (file)
index 0000000..429920b
--- /dev/null
@@ -0,0 +1,47 @@
+From 73269526818f6ff1cb19f06817df6e27664ef609 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 2 May 2022 22:18:50 +0000
+Subject: KVM: VMX: Exit to userspace if vCPU has injected exception and
+ invalid state
+
+From: Sean Christopherson <seanjc@google.com>
+
+[ Upstream commit 053d2290c0307e3642e75e0185ddadf084dc36c1 ]
+
+Exit to userspace with an emulation error if KVM encounters an injected
+exception with invalid guest state, in addition to the existing check of
+bailing if there's a pending exception (KVM doesn't support emulating
+exceptions except when emulating real mode via vm86).
+
+In theory, KVM should never get to such a situation as KVM is supposed to
+exit to userspace before injecting an exception with invalid guest state.
+But in practice, userspace can intervene and manually inject an exception
+and/or stuff registers to force invalid guest state while a previously
+injected exception is awaiting reinjection.
+
+Fixes: fc4fad79fc3d ("KVM: VMX: Reject KVM_RUN if emulation is required with pending exception")
+Reported-by: syzbot+cfafed3bb76d3e37581b@syzkaller.appspotmail.com
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Message-Id: <20220502221850.131873-1-seanjc@google.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/kvm/vmx/vmx.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
+index ef63cfd57029..267d6dc4b818 100644
+--- a/arch/x86/kvm/vmx/vmx.c
++++ b/arch/x86/kvm/vmx/vmx.c
+@@ -5473,7 +5473,7 @@ static bool vmx_emulation_required_with_pending_exception(struct kvm_vcpu *vcpu)
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+       return vmx->emulation_required && !vmx->rmode.vm86_active &&
+-             vcpu->arch.exception.pending;
++             (vcpu->arch.exception.pending || vcpu->arch.exception.injected);
+ }
+ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
+-- 
+2.35.1
+
diff --git a/queue-5.17/kvm-x86-cpuid-only-provide-cpuid-leaf-0xa-if-host-ha.patch b/queue-5.17/kvm-x86-cpuid-only-provide-cpuid-leaf-0xa-if-host-ha.patch
new file mode 100644 (file)
index 0000000..8112af6
--- /dev/null
@@ -0,0 +1,54 @@
+From d26a1165592ab38bd4c6efb3389f50566929e568 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 27 Apr 2022 17:01:49 +0530
+Subject: kvm: x86/cpuid: Only provide CPUID leaf 0xA if host has architectural
+ PMU
+
+From: Sandipan Das <sandipan.das@amd.com>
+
+[ Upstream commit 5a1bde46f98b893cda6122b00e94c0c40a6ead3c ]
+
+On some x86 processors, CPUID leaf 0xA provides information
+on Architectural Performance Monitoring features. It
+advertises a PMU version which Qemu uses to determine the
+availability of additional MSRs to manage the PMCs.
+
+Upon receiving a KVM_GET_SUPPORTED_CPUID ioctl request for
+the same, the kernel constructs return values based on the
+x86_pmu_capability irrespective of the vendor.
+
+This leaf and the additional MSRs are not supported on AMD
+and Hygon processors. If AMD PerfMonV2 is detected, the PMU
+version is set to 2 and guest startup breaks because of an
+attempt to access a non-existent MSR. Return zeros to avoid
+this.
+
+Fixes: a6c06ed1a60a ("KVM: Expose the architectural performance monitoring CPUID leaf")
+Reported-by: Vasant Hegde <vasant.hegde@amd.com>
+Signed-off-by: Sandipan Das <sandipan.das@amd.com>
+Message-Id: <3fef83d9c2b2f7516e8ff50d60851f29a4bcb716.1651058600.git.sandipan.das@amd.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/kvm/cpuid.c | 5 +++++
+ 1 file changed, 5 insertions(+)
+
+diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
+index b8f8d268d058..ee15db75fd62 100644
+--- a/arch/x86/kvm/cpuid.c
++++ b/arch/x86/kvm/cpuid.c
+@@ -865,6 +865,11 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
+               union cpuid10_eax eax;
+               union cpuid10_edx edx;
++              if (!static_cpu_has(X86_FEATURE_ARCH_PERFMON)) {
++                      entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
++                      break;
++              }
++
+               perf_get_x86_pmu_capability(&cap);
+               /*
+-- 
+2.35.1
+
diff --git a/queue-5.17/kvm-x86-do-not-change-icr-on-write-to-apic_self_ipi.patch b/queue-5.17/kvm-x86-do-not-change-icr-on-write-to-apic_self_ipi.patch
new file mode 100644 (file)
index 0000000..ca94df3
--- /dev/null
@@ -0,0 +1,43 @@
+From f011c5d593bdf8d3e080c1843ddba6379d1462c3 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 24 Feb 2022 09:53:36 -0500
+Subject: KVM: x86: Do not change ICR on write to APIC_SELF_IPI
+
+From: Paolo Bonzini <pbonzini@redhat.com>
+
+[ Upstream commit d22a81b304a27fca6124174a8e842e826c193466 ]
+
+Emulating writes to SELF_IPI with a write to ICR has an unwanted side effect:
+the value of ICR in vAPIC page gets changed.  The SDM lists SELF_IPI as write-only,
+with no associated MMIO offset, so any write should have no visible side
+effect in the vAPIC page.
+
+Reported-by: Chao Gao <chao.gao@intel.com>
+Reviewed-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/kvm/lapic.c | 7 +++----
+ 1 file changed, 3 insertions(+), 4 deletions(-)
+
+diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
+index 2a10d0033c96..6b6f9359d29e 100644
+--- a/arch/x86/kvm/lapic.c
++++ b/arch/x86/kvm/lapic.c
+@@ -2125,10 +2125,9 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
+               break;
+       case APIC_SELF_IPI:
+-              if (apic_x2apic_mode(apic)) {
+-                      kvm_lapic_reg_write(apic, APIC_ICR,
+-                                          APIC_DEST_SELF | (val & APIC_VECTOR_MASK));
+-              } else
++              if (apic_x2apic_mode(apic))
++                      kvm_apic_send_ipi(apic, APIC_DEST_SELF | (val & APIC_VECTOR_MASK), 0);
++              else
+                       ret = 1;
+               break;
+       default:
+-- 
+2.35.1
+
diff --git a/queue-5.17/kvm-x86-mmu-avoid-null-pointer-dereference-on-page-f.patch b/queue-5.17/kvm-x86-mmu-avoid-null-pointer-dereference-on-page-f.patch
new file mode 100644 (file)
index 0000000..8314fbb
--- /dev/null
@@ -0,0 +1,39 @@
+From 0944ea3101ac263ce605337fa00a9b79c177509d Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 8 Feb 2022 19:08:33 -0500
+Subject: KVM: x86/mmu: avoid NULL-pointer dereference on page freeing bugs
+
+From: Paolo Bonzini <pbonzini@redhat.com>
+
+[ Upstream commit 9191b8f0745e63edf519e4a54a4aaae1d3d46fbd ]
+
+WARN and bail if KVM attempts to free a root that isn't backed by a shadow
+page.  KVM allocates a bare page for "special" roots, e.g. when using PAE
+paging or shadowing 2/3/4-level page tables with 4/5-level, and so root_hpa
+will be valid but won't be backed by a shadow page.  It's all too easy to
+blindly call mmu_free_root_page() on root_hpa, be nice and WARN instead of
+crashing KVM and possibly the kernel.
+
+Reviewed-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/kvm/mmu/mmu.c | 2 ++
+ 1 file changed, 2 insertions(+)
+
+diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
+index 7f009ebb319a..e7cd16e1e0a0 100644
+--- a/arch/x86/kvm/mmu/mmu.c
++++ b/arch/x86/kvm/mmu/mmu.c
+@@ -3239,6 +3239,8 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
+               return;
+       sp = to_shadow_page(*root_hpa & PT64_BASE_ADDR_MASK);
++      if (WARN_ON(!sp))
++              return;
+       if (is_tdp_mmu_page(sp))
+               kvm_tdp_mmu_put_root(kvm, sp, false);
+-- 
+2.35.1
+
diff --git a/queue-5.17/net-rds-acquire-refcount-on-tcp-sockets.patch b/queue-5.17/net-rds-acquire-refcount-on-tcp-sockets.patch
new file mode 100644 (file)
index 0000000..6e53360
--- /dev/null
@@ -0,0 +1,50 @@
+From 6ceabe05ef3469e4dfe3274d9ced791e044a3d7a Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 2 May 2022 10:40:18 +0900
+Subject: net: rds: acquire refcount on TCP sockets
+
+From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
+
+[ Upstream commit 3a58f13a881ed351198ffab4cf9953cf19d2ab3a ]
+
+syzbot is reporting use-after-free read in tcp_retransmit_timer() [1],
+for TCP socket used by RDS is accessing sock_net() without acquiring a
+refcount on net namespace. Since TCP's retransmission can happen after
+a process which created net namespace terminated, we need to explicitly
+acquire a refcount.
+
+Link: https://syzkaller.appspot.com/bug?extid=694120e1002c117747ed [1]
+Reported-by: syzbot <syzbot+694120e1002c117747ed@syzkaller.appspotmail.com>
+Fixes: 26abe14379f8e2fa ("net: Modify sk_alloc to not reference count the netns of kernel sockets.")
+Fixes: 8a68173691f03661 ("net: sk_clone_lock() should only do get_net() if the parent is not a kernel socket")
+Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
+Tested-by: syzbot <syzbot+694120e1002c117747ed@syzkaller.appspotmail.com>
+Link: https://lore.kernel.org/r/a5fb1fc4-2284-3359-f6a0-e4e390239d7b@I-love.SAKURA.ne.jp
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/rds/tcp.c | 8 ++++++++
+ 1 file changed, 8 insertions(+)
+
+diff --git a/net/rds/tcp.c b/net/rds/tcp.c
+index 5327d130c4b5..2f638f8b7b1e 100644
+--- a/net/rds/tcp.c
++++ b/net/rds/tcp.c
+@@ -495,6 +495,14 @@ void rds_tcp_tune(struct socket *sock)
+       tcp_sock_set_nodelay(sock->sk);
+       lock_sock(sk);
++      /* TCP timer functions might access net namespace even after
++       * a process which created this net namespace terminated.
++       */
++      if (!sk->sk_net_refcnt) {
++              sk->sk_net_refcnt = 1;
++              get_net_track(net, &sk->ns_tracker, GFP_KERNEL);
++              sock_inuse_add(net, 1);
++      }
+       if (rtn->sndbuf_size > 0) {
+               sk->sk_sndbuf = rtn->sndbuf_size;
+               sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
+-- 
+2.35.1
+
diff --git a/queue-5.17/nfc-netlink-fix-sleep-in-atomic-bug-when-firmware-do.patch b/queue-5.17/nfc-netlink-fix-sleep-in-atomic-bug-when-firmware-do.patch
new file mode 100644 (file)
index 0000000..31aa2d8
--- /dev/null
@@ -0,0 +1,70 @@
+From 3704254200cd569167a7c4f81923cb6be71901ef Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 4 May 2022 13:58:47 +0800
+Subject: NFC: netlink: fix sleep in atomic bug when firmware download timeout
+
+From: Duoming Zhou <duoming@zju.edu.cn>
+
+[ Upstream commit 4071bf121d59944d5cd2238de0642f3d7995a997 ]
+
+There is a sleep-in-atomic bug that could cause a kernel panic during
+the firmware download process. The root cause is that nlmsg_new with
+the GFP_KERNEL parameter is called in fw_dnld_timeout, which is a timer
+handler. The call trace is shown below:
+
+BUG: sleeping function called from invalid context at include/linux/sched/mm.h:265
+Call Trace:
+kmem_cache_alloc_node
+__alloc_skb
+nfc_genl_fw_download_done
+call_timer_fn
+__run_timers.part.0
+run_timer_softirq
+__do_softirq
+...
+
+The nlmsg_new with GFP_KERNEL parameter may sleep during memory
+allocation process, and the timer handler is run as the result of
+a "software interrupt" that should not call any other function
+that could sleep.
+
+This patch changes the allocation mode of the netlink message from GFP_KERNEL
+to GFP_ATOMIC in order to prevent the sleep-in-atomic bug. The GFP_ATOMIC
+flag allows the memory allocation to be performed in atomic context.
+
+Fixes: 9674da8759df ("NFC: Add firmware upload netlink command")
+Fixes: 9ea7187c53f6 ("NFC: netlink: Rename CMD_FW_UPLOAD to CMD_FW_DOWNLOAD")
+Signed-off-by: Duoming Zhou <duoming@zju.edu.cn>
+Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
+Link: https://lore.kernel.org/r/20220504055847.38026-1-duoming@zju.edu.cn
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/nfc/netlink.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+diff --git a/net/nfc/netlink.c b/net/nfc/netlink.c
+index 7c62417ccfd7..5c429e25bcf7 100644
+--- a/net/nfc/netlink.c
++++ b/net/nfc/netlink.c
+@@ -534,7 +534,7 @@ int nfc_genl_se_connectivity(struct nfc_dev *dev, u8 se_idx)
+       struct sk_buff *msg;
+       void *hdr;
+-      msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
++      msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC);
+       if (!msg)
+               return -ENOMEM;
+@@ -554,7 +554,7 @@ int nfc_genl_se_connectivity(struct nfc_dev *dev, u8 se_idx)
+       genlmsg_end(msg, hdr);
+-      genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_KERNEL);
++      genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_ATOMIC);
+       return 0;
+-- 
+2.35.1
+
diff --git a/queue-5.17/selftest-vm-verify-mmap-addr-in-mremap_test.patch b/queue-5.17/selftest-vm-verify-mmap-addr-in-mremap_test.patch
new file mode 100644 (file)
index 0000000..0d5b08a
--- /dev/null
@@ -0,0 +1,73 @@
+From 8c9460b1c291a9ea9f7b9ae2c3c588670e33de2b Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 21 Apr 2022 16:35:49 -0700
+Subject: selftest/vm: verify mmap addr in mremap_test
+
+From: Sidhartha Kumar <sidhartha.kumar@oracle.com>
+
+[ Upstream commit 9c85a9bae267f6b5e5e374d0d023bbbe9db096d3 ]
+
+Avoid calling mmap with requested addresses that are less than the
+system's mmap_min_addr.  When run as root, mmap returns EACCES when
+trying to map addresses < mmap_min_addr.  This is not one of the error
+codes for the condition to retry the mmap in the test.
+
+Rather than arbitrarily retrying on EACCES, don't attempt an mmap until
+addr > vm.mmap_min_addr.
+
+Add a munmap call after an alignment check as the mappings are retained
+after the retry and can reach the vm.max_map_count sysctl.
+
+Link: https://lkml.kernel.org/r/20220420215721.4868-1-sidhartha.kumar@oracle.com
+Signed-off-by: Sidhartha Kumar <sidhartha.kumar@oracle.com>
+Reviewed-by: Shuah Khan <skhan@linuxfoundation.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ tools/testing/selftests/vm/mremap_test.c | 29 ++++++++++++++++++++++++
+ 1 file changed, 29 insertions(+)
+
+diff --git a/tools/testing/selftests/vm/mremap_test.c b/tools/testing/selftests/vm/mremap_test.c
+index 58775dab3cc6..380a4593dbd6 100644
+--- a/tools/testing/selftests/vm/mremap_test.c
++++ b/tools/testing/selftests/vm/mremap_test.c
+@@ -65,6 +65,35 @@ enum {
+       .expect_failure = should_fail                           \
+ }
++/* Returns mmap_min_addr sysctl tunable from procfs */
++static unsigned long long get_mmap_min_addr(void)
++{
++      FILE *fp;
++      int n_matched;
++      static unsigned long long addr;
++
++      if (addr)
++              return addr;
++
++      fp = fopen("/proc/sys/vm/mmap_min_addr", "r");
++      if (fp == NULL) {
++              ksft_print_msg("Failed to open /proc/sys/vm/mmap_min_addr: %s\n",
++                      strerror(errno));
++              exit(KSFT_SKIP);
++      }
++
++      n_matched = fscanf(fp, "%llu", &addr);
++      if (n_matched != 1) {
++              ksft_print_msg("Failed to read /proc/sys/vm/mmap_min_addr: %s\n",
++                      strerror(errno));
++              fclose(fp);
++              exit(KSFT_SKIP);
++      }
++
++      fclose(fp);
++      return addr;
++}
++
+ /*
+  * Returns false if the requested remap region overlaps with an
+  * existing mapping (e.g text, stack) else returns true.
+-- 
+2.35.1
+
diff --git a/queue-5.17/selftest-vm-verify-remap-destination-address-in-mrem.patch b/queue-5.17/selftest-vm-verify-remap-destination-address-in-mrem.patch
new file mode 100644 (file)
index 0000000..9283477
--- /dev/null
@@ -0,0 +1,66 @@
+From fbc2ee9e0bb4f0bdc3494d5120fabc046b6b2f2c Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 21 Apr 2022 16:35:52 -0700
+Subject: selftest/vm: verify remap destination address in mremap_test
+
+From: Sidhartha Kumar <sidhartha.kumar@oracle.com>
+
+[ Upstream commit 18d609daa546c919fd36b62a7b510c18de4b4af8 ]
+
+Because mremap does not have a MAP_FIXED_NOREPLACE flag, it can destroy
+existing mappings.  This causes a segfault when regions such as text are
+remapped and the permissions are changed.
+
+Verify the requested mremap destination address does not overlap any
+existing mappings by using mmap's MAP_FIXED_NOREPLACE flag.  Keep
+incrementing the destination address until a valid mapping is found or
+fail the current test once the max address is reached.
+
+Link: https://lkml.kernel.org/r/20220420215721.4868-2-sidhartha.kumar@oracle.com
+Signed-off-by: Sidhartha Kumar <sidhartha.kumar@oracle.com>
+Reviewed-by: Shuah Khan <skhan@linuxfoundation.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ tools/testing/selftests/vm/mremap_test.c | 24 ++++++++++++++++++++++++
+ 1 file changed, 24 insertions(+)
+
+diff --git a/tools/testing/selftests/vm/mremap_test.c b/tools/testing/selftests/vm/mremap_test.c
+index 380a4593dbd6..5ef41640d657 100644
+--- a/tools/testing/selftests/vm/mremap_test.c
++++ b/tools/testing/selftests/vm/mremap_test.c
+@@ -65,6 +65,30 @@ enum {
+       .expect_failure = should_fail                           \
+ }
++/*
++ * Returns false if the requested remap region overlaps with an
++ * existing mapping (e.g text, stack) else returns true.
++ */
++static bool is_remap_region_valid(void *addr, unsigned long long size)
++{
++      void *remap_addr = NULL;
++      bool ret = true;
++
++      /* Use MAP_FIXED_NOREPLACE flag to ensure region is not mapped */
++      remap_addr = mmap(addr, size, PROT_READ | PROT_WRITE,
++                                       MAP_FIXED_NOREPLACE | MAP_ANONYMOUS | MAP_SHARED,
++                                       -1, 0);
++
++      if (remap_addr == MAP_FAILED) {
++              if (errno == EEXIST)
++                      ret = false;
++      } else {
++              munmap(remap_addr, size);
++      }
++
++      return ret;
++}
++
+ /* Returns mmap_min_addr sysctl tunable from procfs */
+ static unsigned long long get_mmap_min_addr(void)
+ {
+-- 
+2.35.1
+
index b6c9a90a7a7420d19729dc5802ccdf4e551f79c1..1b8cecc461120ebc457dd32a2680a157391de5fb 100644 (file)
@@ -103,3 +103,19 @@ selftests-ocelot-tc_flower_chains-specify-conform-exceed-action-for-policer.patc
 smsc911x-allow-using-irq0.patch
 btrfs-always-log-symlinks-in-full-mode.patch
 parisc-mark-cr16-clock-unstable-on-all-smp-machines.patch
+gpio-mvebu-drop-pwm-base-assignment.patch
+net-rds-acquire-refcount-on-tcp-sockets.patch
+kvm-x86-cpuid-only-provide-cpuid-leaf-0xa-if-host-ha.patch
+fbdev-make-fb_release-return-enodev-if-fbdev-was-unr.patch
+iommu-dart-add-missing-module-owner-to-ops-structure.patch
+nfc-netlink-fix-sleep-in-atomic-bug-when-firmware-do.patch
+kvm-sev-mark-nested-locking-of-vcpu-lock.patch
+kvm-vmx-exit-to-userspace-if-vcpu-has-injected-excep.patch
+kvm-selftests-do-not-use-bitfields-larger-than-32-bi.patch
+kvm-selftests-silence-compiler-warning-in-the-kvm_pa.patch
+x86-kvm-preserve-bsp-msr_kvm_poll_control-across-sus.patch
+kvm-x86-do-not-change-icr-on-write-to-apic_self_ipi.patch
+kvm-x86-mmu-avoid-null-pointer-dereference-on-page-f.patch
+kvm-lapic-enable-timer-posted-interrupt-only-when-mw.patch
+selftest-vm-verify-mmap-addr-in-mremap_test.patch
+selftest-vm-verify-remap-destination-address-in-mrem.patch
diff --git a/queue-5.17/x86-kvm-preserve-bsp-msr_kvm_poll_control-across-sus.patch b/queue-5.17/x86-kvm-preserve-bsp-msr_kvm_poll_control-across-sus.patch
new file mode 100644 (file)
index 0000000..79611f2
--- /dev/null
@@ -0,0 +1,71 @@
+From 0b2e6b6b44f0e4afd018c4e1e07388e50c564d1f Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 18 Apr 2022 00:42:32 -0700
+Subject: x86/kvm: Preserve BSP MSR_KVM_POLL_CONTROL across suspend/resume
+
+From: Wanpeng Li <wanpengli@tencent.com>
+
+[ Upstream commit 0361bdfddca20c8855ea3bdbbbc9c999912b10ff ]
+
+MSR_KVM_POLL_CONTROL is cleared on reset, thus reverting guests to
+host-side polling after suspend/resume.  Non-bootstrap CPUs are
+restored correctly by the haltpoll driver because they are hot-unplugged
+during suspend and hot-plugged during resume; however, the BSP
+is not hotpluggable and remains in host-side polling mode after
+the guest resume.  This makes the guest pay for the cost of vmexits
+every time the guest enters idle.
+
+Fix it by recording BSP's haltpoll state and resuming it during guest
+resume.
+
+Cc: Marcelo Tosatti <mtosatti@redhat.com>
+Signed-off-by: Wanpeng Li <wanpengli@tencent.com>
+Message-Id: <1650267752-46796-1-git-send-email-wanpengli@tencent.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/kernel/kvm.c | 13 +++++++++++++
+ 1 file changed, 13 insertions(+)
+
+diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
+index ed8a13ac4ab2..4c2a158bb6c4 100644
+--- a/arch/x86/kernel/kvm.c
++++ b/arch/x86/kernel/kvm.c
+@@ -69,6 +69,7 @@ static DEFINE_PER_CPU_DECRYPTED(struct kvm_vcpu_pv_apf_data, apf_reason) __align
+ DEFINE_PER_CPU_DECRYPTED(struct kvm_steal_time, steal_time) __aligned(64) __visible;
+ static int has_steal_clock = 0;
++static int has_guest_poll = 0;
+ /*
+  * No need for any "IO delay" on KVM
+  */
+@@ -706,14 +707,26 @@ static int kvm_cpu_down_prepare(unsigned int cpu)
+ static int kvm_suspend(void)
+ {
++      u64 val = 0;
++
+       kvm_guest_cpu_offline(false);
++#ifdef CONFIG_ARCH_CPUIDLE_HALTPOLL
++      if (kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL))
++              rdmsrl(MSR_KVM_POLL_CONTROL, val);
++      has_guest_poll = !(val & 1);
++#endif
+       return 0;
+ }
+ static void kvm_resume(void)
+ {
+       kvm_cpu_online(raw_smp_processor_id());
++
++#ifdef CONFIG_ARCH_CPUIDLE_HALTPOLL
++      if (kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL) && has_guest_poll)
++              wrmsrl(MSR_KVM_POLL_CONTROL, 0);
++#endif
+ }
+ static struct syscore_ops kvm_syscore_ops = {
+-- 
+2.35.1
+