]> git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
Fixes for 5.18
authorSasha Levin <sashal@kernel.org>
Sat, 6 Aug 2022 15:31:09 +0000 (11:31 -0400)
committerSasha Levin <sashal@kernel.org>
Sat, 6 Aug 2022 15:31:09 +0000 (11:31 -0400)
Signed-off-by: Sasha Levin <sashal@kernel.org>
12 files changed:
queue-5.18/entry-kvm-exit-to-user-mode-when-tif_notify_signal-i.patch [new file with mode: 0644]
queue-5.18/kvm-selftests-make-hyperv_clock-selftest-more-stable.patch [new file with mode: 0644]
queue-5.18/kvm-selftests-restrict-test-region-to-48-bit-physica.patch [new file with mode: 0644]
queue-5.18/kvm-x86-disable-preemption-around-the-call-to-kvm_ar.patch [new file with mode: 0644]
queue-5.18/kvm-x86-disable-preemption-while-updating-apicv-inhi.patch [new file with mode: 0644]
queue-5.18/kvm-x86-do-not-report-a-vcpu-as-preempted-outside-in.patch [new file with mode: 0644]
queue-5.18/kvm-x86-do-not-set-st-preempted-when-going-back-to-u.patch [new file with mode: 0644]
queue-5.18/kvm-x86-mmu-zap-non-leaf-sptes-when-disabling-dirty-.patch [new file with mode: 0644]
queue-5.18/kvm-x86-svm-add-__gfp_account-to-__sev_dbg_-en-de-cr.patch [new file with mode: 0644]
queue-5.18/selftests-kvm-handle-compiler-optimizations-in-ucall.patch [new file with mode: 0644]
queue-5.18/series
queue-5.18/tools-kvm_stat-fix-display-of-error-when-multiple-pr.patch [new file with mode: 0644]

diff --git a/queue-5.18/entry-kvm-exit-to-user-mode-when-tif_notify_signal-i.patch b/queue-5.18/entry-kvm-exit-to-user-mode-when-tif_notify_signal-i.patch
new file mode 100644 (file)
index 0000000..f3ec713
--- /dev/null
@@ -0,0 +1,57 @@
+From 2464e2253fce3d174d78d95e6266a72659ab8476 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 4 May 2022 13:08:40 -0500
+Subject: entry/kvm: Exit to user mode when TIF_NOTIFY_SIGNAL is set
+
+From: Seth Forshee <sforshee@digitalocean.com>
+
+[ Upstream commit 3e684903a8574ffc9475fdf13c4780a7adb506ad ]
+
+A livepatch transition may stall indefinitely when a kvm vCPU is heavily
+loaded. To the host, the vCPU task is a user thread which is spending a
+very long time in the ioctl(KVM_RUN) syscall. During livepatch
+transition, set_notify_signal() will be called on such tasks to
+interrupt the syscall so that the task can be transitioned. This
+interrupts guest execution, but when xfer_to_guest_mode_work() sees that
+TIF_NOTIFY_SIGNAL is set but not TIF_SIGPENDING it concludes that an
+exit to user mode is unnecessary, and guest execution is resumed without
+transitioning the task for the livepatch.
+
+This handling of TIF_NOTIFY_SIGNAL is incorrect, as set_notify_signal()
+is expected to break tasks out of interruptible kernel loops and cause
+them to return to userspace. Change xfer_to_guest_mode_work() to handle
+TIF_NOTIFY_SIGNAL the same as TIF_SIGPENDING, signaling to the vCPU run
+loop that an exit to userspace is needed. Any pending task_work will be
+run when get_signal() is called from exit_to_user_mode_loop(), so there
+is no longer any need to run task work from xfer_to_guest_mode_work().
+
+Suggested-by: "Eric W. Biederman" <ebiederm@xmission.com>
+Cc: Petr Mladek <pmladek@suse.com>
+Signed-off-by: Seth Forshee <sforshee@digitalocean.com>
+Message-Id: <20220504180840.2907296-1-sforshee@digitalocean.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ kernel/entry/kvm.c | 6 ------
+ 1 file changed, 6 deletions(-)
+
+diff --git a/kernel/entry/kvm.c b/kernel/entry/kvm.c
+index 9d09f489b60e..2e0f75bcb7fd 100644
+--- a/kernel/entry/kvm.c
++++ b/kernel/entry/kvm.c
+@@ -9,12 +9,6 @@ static int xfer_to_guest_mode_work(struct kvm_vcpu *vcpu, unsigned long ti_work)
+               int ret;
+               if (ti_work & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL)) {
+-                      clear_notify_signal();
+-                      if (task_work_pending(current))
+-                              task_work_run();
+-              }
+-
+-              if (ti_work & _TIF_SIGPENDING) {
+                       kvm_handle_signal_exit(vcpu);
+                       return -EINTR;
+               }
+-- 
+2.35.1
+
diff --git a/queue-5.18/kvm-selftests-make-hyperv_clock-selftest-more-stable.patch b/queue-5.18/kvm-selftests-make-hyperv_clock-selftest-more-stable.patch
new file mode 100644 (file)
index 0000000..02750c5
--- /dev/null
@@ -0,0 +1,75 @@
+From e7c7b8f2f24abbc4915f07595c198d089552495b Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 1 Jun 2022 16:43:22 +0200
+Subject: KVM: selftests: Make hyperv_clock selftest more stable
+
+From: Vitaly Kuznetsov <vkuznets@redhat.com>
+
+[ Upstream commit eae260be3a0111a28fe95923e117a55dddec0384 ]
+
+hyperv_clock doesn't always give a stable test result, especially with
+AMD CPUs. The test compares Hyper-V MSR clocksource (acquired either
+with rdmsr() from within the guest or KVM_GET_MSRS from the host)
+against rdtsc(). To increase the accuracy, increase the measured delay
+(done with nop loop) by two orders of magnitude and take the mean rdtsc()
+value before and after rdmsr()/KVM_GET_MSRS.
+
+Reported-by: Maxim Levitsky <mlevitsk@redhat.com>
+Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
+Reviewed-by: Maxim Levitsky <mlevitsk@redhat.com>
+Tested-by: Maxim Levitsky <mlevitsk@redhat.com>
+Message-Id: <20220601144322.1968742-1-vkuznets@redhat.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ tools/testing/selftests/kvm/x86_64/hyperv_clock.c | 10 +++++++---
+ 1 file changed, 7 insertions(+), 3 deletions(-)
+
+diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_clock.c b/tools/testing/selftests/kvm/x86_64/hyperv_clock.c
+index e0b2bb1339b1..3330fb183c68 100644
+--- a/tools/testing/selftests/kvm/x86_64/hyperv_clock.c
++++ b/tools/testing/selftests/kvm/x86_64/hyperv_clock.c
+@@ -44,7 +44,7 @@ static inline void nop_loop(void)
+ {
+       int i;
+-      for (i = 0; i < 1000000; i++)
++      for (i = 0; i < 100000000; i++)
+               asm volatile("nop");
+ }
+@@ -56,12 +56,14 @@ static inline void check_tsc_msr_rdtsc(void)
+       tsc_freq = rdmsr(HV_X64_MSR_TSC_FREQUENCY);
+       GUEST_ASSERT(tsc_freq > 0);
+-      /* First, check MSR-based clocksource */
++      /* For increased accuracy, take mean rdtsc() before and afrer rdmsr() */
+       r1 = rdtsc();
+       t1 = rdmsr(HV_X64_MSR_TIME_REF_COUNT);
++      r1 = (r1 + rdtsc()) / 2;
+       nop_loop();
+       r2 = rdtsc();
+       t2 = rdmsr(HV_X64_MSR_TIME_REF_COUNT);
++      r2 = (r2 + rdtsc()) / 2;
+       GUEST_ASSERT(r2 > r1 && t2 > t1);
+@@ -181,12 +183,14 @@ static void host_check_tsc_msr_rdtsc(struct kvm_vm *vm)
+       tsc_freq = vcpu_get_msr(vm, VCPU_ID, HV_X64_MSR_TSC_FREQUENCY);
+       TEST_ASSERT(tsc_freq > 0, "TSC frequency must be nonzero");
+-      /* First, check MSR-based clocksource */
++      /* For increased accuracy, take mean rdtsc() before and afrer ioctl */
+       r1 = rdtsc();
+       t1 = vcpu_get_msr(vm, VCPU_ID, HV_X64_MSR_TIME_REF_COUNT);
++      r1 = (r1 + rdtsc()) / 2;
+       nop_loop();
+       r2 = rdtsc();
+       t2 = vcpu_get_msr(vm, VCPU_ID, HV_X64_MSR_TIME_REF_COUNT);
++      r2 = (r2 + rdtsc()) / 2;
+       TEST_ASSERT(t2 > t1, "Time reference MSR is not monotonic (%ld <= %ld)", t1, t2);
+-- 
+2.35.1
+
diff --git a/queue-5.18/kvm-selftests-restrict-test-region-to-48-bit-physica.patch b/queue-5.18/kvm-selftests-restrict-test-region-to-48-bit-physica.patch
new file mode 100644 (file)
index 0000000..d962179
--- /dev/null
@@ -0,0 +1,79 @@
+From 153db4a959fd4efda9196c92f49e655a9204d9da Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 20 May 2022 23:32:49 +0000
+Subject: KVM: selftests: Restrict test region to 48-bit physical addresses
+ when using nested
+
+From: David Matlack <dmatlack@google.com>
+
+[ Upstream commit e0f3f46e42064a51573914766897b4ab95d943e3 ]
+
+The selftests nested code only supports 4-level paging at the moment.
+This means it cannot map nested guest physical addresses with more than
+48 bits. Allow perf_test_util nested mode to work on hosts with more
+than 48 physical addresses by restricting the guest test region to
+48-bits.
+
+While here, opportunistically fix an off-by-one error when dealing with
+vm_get_max_gfn(). perf_test_util.c was treating this as the maximum
+number of GFNs, rather than the maximum allowed GFN. This didn't result
+in any correctness issues, but it did end up shifting the test region
+down slightly when using huge pages.
+
+Suggested-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: David Matlack <dmatlack@google.com>
+Message-Id: <20220520233249.3776001-12-dmatlack@google.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ .../testing/selftests/kvm/lib/perf_test_util.c | 18 +++++++++++++++---
+ 1 file changed, 15 insertions(+), 3 deletions(-)
+
+diff --git a/tools/testing/selftests/kvm/lib/perf_test_util.c b/tools/testing/selftests/kvm/lib/perf_test_util.c
+index 722df3a28791..ddd68ba0c99f 100644
+--- a/tools/testing/selftests/kvm/lib/perf_test_util.c
++++ b/tools/testing/selftests/kvm/lib/perf_test_util.c
+@@ -110,6 +110,7 @@ struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, int vcpus,
+       struct kvm_vm *vm;
+       uint64_t guest_num_pages;
+       uint64_t backing_src_pagesz = get_backing_src_pagesz(backing_src);
++      uint64_t region_end_gfn;
+       int i;
+       pr_info("Testing guest mode: %s\n", vm_guest_mode_string(mode));
+@@ -144,18 +145,29 @@ struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, int vcpus,
+       pta->vm = vm;
++      /* Put the test region at the top of guest physical memory. */
++      region_end_gfn = vm_get_max_gfn(vm) + 1;
++
++#ifdef __x86_64__
++      /*
++       * When running vCPUs in L2, restrict the test region to 48 bits to
++       * avoid needing 5-level page tables to identity map L2.
++       */
++      if (pta->nested)
++              region_end_gfn = min(region_end_gfn, (1UL << 48) / pta->guest_page_size);
++#endif
+       /*
+        * If there should be more memory in the guest test region than there
+        * can be pages in the guest, it will definitely cause problems.
+        */
+-      TEST_ASSERT(guest_num_pages < vm_get_max_gfn(vm),
++      TEST_ASSERT(guest_num_pages < region_end_gfn,
+                   "Requested more guest memory than address space allows.\n"
+                   "    guest pages: %" PRIx64 " max gfn: %" PRIx64
+                   " vcpus: %d wss: %" PRIx64 "]\n",
+-                  guest_num_pages, vm_get_max_gfn(vm), vcpus,
++                  guest_num_pages, region_end_gfn - 1, vcpus,
+                   vcpu_memory_bytes);
+-      pta->gpa = (vm_get_max_gfn(vm) - guest_num_pages) * pta->guest_page_size;
++      pta->gpa = (region_end_gfn - guest_num_pages) * pta->guest_page_size;
+       pta->gpa = align_down(pta->gpa, backing_src_pagesz);
+ #ifdef __s390x__
+       /* Align to 1M (segment size) */
+-- 
+2.35.1
+
diff --git a/queue-5.18/kvm-x86-disable-preemption-around-the-call-to-kvm_ar.patch b/queue-5.18/kvm-x86-disable-preemption-around-the-call-to-kvm_ar.patch
new file mode 100644 (file)
index 0000000..b8dcb1f
--- /dev/null
@@ -0,0 +1,62 @@
+From 51da2a430f49e1115c2e805118fe91667ced63cf Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 6 Jun 2022 21:08:28 +0300
+Subject: KVM: x86: disable preemption around the call to
+ kvm_arch_vcpu_{un|}blocking
+
+From: Maxim Levitsky <mlevitsk@redhat.com>
+
+[ Upstream commit 18869f26df1a11ed11031dfb7392bc7d774062e8 ]
+
+On SVM, if preemption happens right after the call to finish_rcuwait
+but before call to kvm_arch_vcpu_unblocking on SVM/AVIC, it itself
+will re-enable AVIC, and then we will try to re-enable it again
+in kvm_arch_vcpu_unblocking which will lead to a warning
+in __avic_vcpu_load.
+
+The same problem can happen if the vCPU is preempted right after the call
+to kvm_arch_vcpu_blocking but before the call to prepare_to_rcuwait
+and in this case, we will end up with AVIC enabled during sleep -
+Ooops.
+
+Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
+Message-Id: <20220606180829.102503-7-mlevitsk@redhat.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ virt/kvm/kvm_main.c | 8 ++++++--
+ 1 file changed, 6 insertions(+), 2 deletions(-)
+
+diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
+index 24cb37d19c63..7f1d19689701 100644
+--- a/virt/kvm/kvm_main.c
++++ b/virt/kvm/kvm_main.c
+@@ -3327,9 +3327,11 @@ bool kvm_vcpu_block(struct kvm_vcpu *vcpu)
+       vcpu->stat.generic.blocking = 1;
++      preempt_disable();
+       kvm_arch_vcpu_blocking(vcpu);
+-
+       prepare_to_rcuwait(wait);
++      preempt_enable();
++
+       for (;;) {
+               set_current_state(TASK_INTERRUPTIBLE);
+@@ -3339,9 +3341,11 @@ bool kvm_vcpu_block(struct kvm_vcpu *vcpu)
+               waited = true;
+               schedule();
+       }
+-      finish_rcuwait(wait);
++      preempt_disable();
++      finish_rcuwait(wait);
+       kvm_arch_vcpu_unblocking(vcpu);
++      preempt_enable();
+       vcpu->stat.generic.blocking = 0;
+-- 
+2.35.1
+
diff --git a/queue-5.18/kvm-x86-disable-preemption-while-updating-apicv-inhi.patch b/queue-5.18/kvm-x86-disable-preemption-while-updating-apicv-inhi.patch
new file mode 100644 (file)
index 0000000..0f27881
--- /dev/null
@@ -0,0 +1,53 @@
+From 4fc1688d87063c8fba524640fb2ab8073c7836c3 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 6 Jun 2022 21:08:27 +0300
+Subject: KVM: x86: disable preemption while updating apicv inhibition
+
+From: Maxim Levitsky <mlevitsk@redhat.com>
+
+[ Upstream commit 66c768d30e64e1280520f34dbef83419f55f3459 ]
+
+Currently nothing prevents preemption in kvm_vcpu_update_apicv.
+
+On SVM, If the preemption happens after we update the
+vcpu->arch.apicv_active, the preemption itself will
+'update' the inhibition since the AVIC will be first disabled
+on vCPU unload and then enabled, when the current task
+is loaded again.
+
+Then we will try to update it again, which will lead to a warning
+in __avic_vcpu_load, that the AVIC is already enabled.
+
+Fix this by disabling preemption in this code.
+
+Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
+Message-Id: <20220606180829.102503-6-mlevitsk@redhat.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/kvm/x86.c | 2 ++
+ 1 file changed, 2 insertions(+)
+
+diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
+index 91d887fd10ab..65b0ec28bd52 100644
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -9784,6 +9784,7 @@ void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu)
+               return;
+       down_read(&vcpu->kvm->arch.apicv_update_lock);
++      preempt_disable();
+       activate = kvm_apicv_activated(vcpu->kvm);
+       if (vcpu->arch.apicv_active == activate)
+@@ -9803,6 +9804,7 @@ void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu)
+               kvm_make_request(KVM_REQ_EVENT, vcpu);
+ out:
++      preempt_enable();
+       up_read(&vcpu->kvm->arch.apicv_update_lock);
+ }
+ EXPORT_SYMBOL_GPL(kvm_vcpu_update_apicv);
+-- 
+2.35.1
+
diff --git a/queue-5.18/kvm-x86-do-not-report-a-vcpu-as-preempted-outside-in.patch b/queue-5.18/kvm-x86-do-not-report-a-vcpu-as-preempted-outside-in.patch
new file mode 100644 (file)
index 0000000..8c3126f
--- /dev/null
@@ -0,0 +1,136 @@
+From 491ff8c3605ee8f022817a87843afbb2dea86fc9 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 7 Jun 2022 10:09:03 -0400
+Subject: KVM: x86: do not report a vCPU as preempted outside instruction
+ boundaries
+
+From: Paolo Bonzini <pbonzini@redhat.com>
+
+[ Upstream commit 6cd88243c7e03845a450795e134b488fc2afb736 ]
+
+If a vCPU is outside guest mode and is scheduled out, it might be in the
+process of making a memory access.  A problem occurs if another vCPU uses
+the PV TLB flush feature during the period when the vCPU is scheduled
+out, and a virtual address has already been translated but has not yet
+been accessed, because this is equivalent to using a stale TLB entry.
+
+To avoid this, only report a vCPU as preempted if sure that the guest
+is at an instruction boundary.  A rescheduling request will be delivered
+to the host physical CPU as an external interrupt, so for simplicity
+consider any vmexit *not* instruction boundary except for external
+interrupts.
+
+It would in principle be okay to report the vCPU as preempted also
+if it is sleeping in kvm_vcpu_block(): a TLB flush IPI will incur the
+vmentry/vmexit overhead unnecessarily, and optimistic spinning is
+also unlikely to succeed.  However, leave it for later because right
+now kvm_vcpu_check_block() is doing memory accesses.  Even
+though the TLB flush issue only applies to virtual memory address,
+it's very much preferable to be conservative.
+
+Reported-by: Jann Horn <jannh@google.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/include/asm/kvm_host.h |  3 +++
+ arch/x86/kvm/svm/svm.c          |  2 ++
+ arch/x86/kvm/vmx/vmx.c          |  1 +
+ arch/x86/kvm/x86.c              | 22 ++++++++++++++++++++++
+ 4 files changed, 28 insertions(+)
+
+diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
+index 4ff36610af6a..9fdaa847d4b6 100644
+--- a/arch/x86/include/asm/kvm_host.h
++++ b/arch/x86/include/asm/kvm_host.h
+@@ -651,6 +651,7 @@ struct kvm_vcpu_arch {
+       u64 ia32_misc_enable_msr;
+       u64 smbase;
+       u64 smi_count;
++      bool at_instruction_boundary;
+       bool tpr_access_reporting;
+       bool xsaves_enabled;
+       bool xfd_no_write_intercept;
+@@ -1289,6 +1290,8 @@ struct kvm_vcpu_stat {
+       u64 nested_run;
+       u64 directed_yield_attempted;
+       u64 directed_yield_successful;
++      u64 preemption_reported;
++      u64 preemption_other;
+       u64 guest_mode;
+ };
+diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
+index 6bfb0b0e66bd..c667214c630b 100644
+--- a/arch/x86/kvm/svm/svm.c
++++ b/arch/x86/kvm/svm/svm.c
+@@ -4166,6 +4166,8 @@ static int svm_check_intercept(struct kvm_vcpu *vcpu,
+ static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu)
+ {
++      if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_INTR)
++              vcpu->arch.at_instruction_boundary = true;
+ }
+ static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu)
+diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
+index 4b6a0268c78e..597c3c08da50 100644
+--- a/arch/x86/kvm/vmx/vmx.c
++++ b/arch/x86/kvm/vmx/vmx.c
+@@ -6630,6 +6630,7 @@ static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu)
+               return;
+       handle_interrupt_nmi_irqoff(vcpu, gate_offset(desc));
++      vcpu->arch.at_instruction_boundary = true;
+ }
+ static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
+diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
+index 53b6fdf30c99..df74ec51c7f3 100644
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -291,6 +291,8 @@ const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = {
+       STATS_DESC_COUNTER(VCPU, nested_run),
+       STATS_DESC_COUNTER(VCPU, directed_yield_attempted),
+       STATS_DESC_COUNTER(VCPU, directed_yield_successful),
++      STATS_DESC_COUNTER(VCPU, preemption_reported),
++      STATS_DESC_COUNTER(VCPU, preemption_other),
+       STATS_DESC_ICOUNTER(VCPU, guest_mode)
+ };
+@@ -4607,6 +4609,19 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
+       struct kvm_memslots *slots;
+       static const u8 preempted = KVM_VCPU_PREEMPTED;
++      /*
++       * The vCPU can be marked preempted if and only if the VM-Exit was on
++       * an instruction boundary and will not trigger guest emulation of any
++       * kind (see vcpu_run).  Vendor specific code controls (conservatively)
++       * when this is true, for example allowing the vCPU to be marked
++       * preempted if and only if the VM-Exit was due to a host interrupt.
++       */
++      if (!vcpu->arch.at_instruction_boundary) {
++              vcpu->stat.preemption_other++;
++              return;
++      }
++
++      vcpu->stat.preemption_reported++;
+       if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
+               return;
+@@ -10363,6 +10378,13 @@ static int vcpu_run(struct kvm_vcpu *vcpu)
+       vcpu->arch.l1tf_flush_l1d = true;
+       for (;;) {
++              /*
++               * If another guest vCPU requests a PV TLB flush in the middle
++               * of instruction emulation, the rest of the emulation could
++               * use a stale page translation. Assume that any code after
++               * this point can start executing an instruction.
++               */
++              vcpu->arch.at_instruction_boundary = false;
+               if (kvm_vcpu_running(vcpu)) {
+                       r = vcpu_enter_guest(vcpu);
+               } else {
+-- 
+2.35.1
+
diff --git a/queue-5.18/kvm-x86-do-not-set-st-preempted-when-going-back-to-u.patch b/queue-5.18/kvm-x86-do-not-set-st-preempted-when-going-back-to-u.patch
new file mode 100644 (file)
index 0000000..90ffbdb
--- /dev/null
@@ -0,0 +1,81 @@
+From 24c49abc49511c32c73ab946cd5b29bdf71c21cb Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 7 Jun 2022 10:07:11 -0400
+Subject: KVM: x86: do not set st->preempted when going back to user space
+
+From: Paolo Bonzini <pbonzini@redhat.com>
+
+[ Upstream commit 54aa83c90198e68eee8b0850c749bc70efb548da ]
+
+Similar to the Xen path, only change the vCPU's reported state if the vCPU
+was actually preempted.  The reason for KVM's behavior is that for example
+optimistic spinning might not be a good idea if the guest is doing repeated
+exits to userspace; however, it is confusing and unlikely to make a difference,
+because well-tuned guests will hardly ever exit KVM_RUN in the first place.
+
+Suggested-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/kvm/x86.c | 26 ++++++++++++++------------
+ arch/x86/kvm/xen.h |  6 ++++--
+ 2 files changed, 18 insertions(+), 14 deletions(-)
+
+diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
+index df74ec51c7f3..91d887fd10ab 100644
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -4651,19 +4651,21 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
+ {
+       int idx;
+-      if (vcpu->preempted && !vcpu->arch.guest_state_protected)
+-              vcpu->arch.preempted_in_kernel = !static_call(kvm_x86_get_cpl)(vcpu);
++      if (vcpu->preempted) {
++              if (!vcpu->arch.guest_state_protected)
++                      vcpu->arch.preempted_in_kernel = !static_call(kvm_x86_get_cpl)(vcpu);
+-      /*
+-       * Take the srcu lock as memslots will be accessed to check the gfn
+-       * cache generation against the memslots generation.
+-       */
+-      idx = srcu_read_lock(&vcpu->kvm->srcu);
+-      if (kvm_xen_msr_enabled(vcpu->kvm))
+-              kvm_xen_runstate_set_preempted(vcpu);
+-      else
+-              kvm_steal_time_set_preempted(vcpu);
+-      srcu_read_unlock(&vcpu->kvm->srcu, idx);
++              /*
++               * Take the srcu lock as memslots will be accessed to check the gfn
++               * cache generation against the memslots generation.
++               */
++              idx = srcu_read_lock(&vcpu->kvm->srcu);
++              if (kvm_xen_msr_enabled(vcpu->kvm))
++                      kvm_xen_runstate_set_preempted(vcpu);
++              else
++                      kvm_steal_time_set_preempted(vcpu);
++              srcu_read_unlock(&vcpu->kvm->srcu, idx);
++      }
+       static_call(kvm_x86_vcpu_put)(vcpu);
+       vcpu->arch.last_host_tsc = rdtsc();
+diff --git a/arch/x86/kvm/xen.h b/arch/x86/kvm/xen.h
+index adbcc9ed59db..fda1413f8af9 100644
+--- a/arch/x86/kvm/xen.h
++++ b/arch/x86/kvm/xen.h
+@@ -103,8 +103,10 @@ static inline void kvm_xen_runstate_set_preempted(struct kvm_vcpu *vcpu)
+        * behalf of the vCPU. Only if the VMM does actually block
+        * does it need to enter RUNSTATE_blocked.
+        */
+-      if (vcpu->preempted)
+-              kvm_xen_update_runstate_guest(vcpu, RUNSTATE_runnable);
++      if (WARN_ON_ONCE(!vcpu->preempted))
++              return;
++
++      kvm_xen_update_runstate_guest(vcpu, RUNSTATE_runnable);
+ }
+ /* 32-bit compatibility definitions, also used natively in 32-bit build */
+-- 
+2.35.1
+
diff --git a/queue-5.18/kvm-x86-mmu-zap-non-leaf-sptes-when-disabling-dirty-.patch b/queue-5.18/kvm-x86-mmu-zap-non-leaf-sptes-when-disabling-dirty-.patch
new file mode 100644 (file)
index 0000000..4aca9d8
--- /dev/null
@@ -0,0 +1,131 @@
+From c8b327fe59257653dfee8d80ebfce83def9c1d2c Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 25 May 2022 23:09:04 +0000
+Subject: KVM: x86/MMU: Zap non-leaf SPTEs when disabling dirty logging
+
+From: Ben Gardon <bgardon@google.com>
+
+[ Upstream commit 5ba7c4c6d1c7af47a916f728bb5940669684a087 ]
+
+Currently disabling dirty logging with the TDP MMU is extremely slow.
+On a 96 vCPU / 96G VM backed with gigabyte pages, it takes ~200 seconds
+to disable dirty logging with the TDP MMU, as opposed to ~4 seconds with
+the shadow MMU.
+
+When disabling dirty logging, zap non-leaf parent entries to allow
+replacement with huge pages instead of recursing and zapping all of the
+child, leaf entries. This reduces the number of TLB flushes required.
+and reduces the disable dirty log time with the TDP MMU to ~3 seconds.
+
+Opportunistically add a WARN() to catch GFNs that are mapped at a
+higher level than their max level.
+
+Signed-off-by: Ben Gardon <bgardon@google.com>
+Message-Id: <20220525230904.1584480-1-bgardon@google.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/kvm/mmu/tdp_iter.c |  9 +++++++++
+ arch/x86/kvm/mmu/tdp_iter.h |  1 +
+ arch/x86/kvm/mmu/tdp_mmu.c  | 38 +++++++++++++++++++++++++++++++------
+ 3 files changed, 42 insertions(+), 6 deletions(-)
+
+diff --git a/arch/x86/kvm/mmu/tdp_iter.c b/arch/x86/kvm/mmu/tdp_iter.c
+index 6d3b3e5a5533..ee4802d7b36c 100644
+--- a/arch/x86/kvm/mmu/tdp_iter.c
++++ b/arch/x86/kvm/mmu/tdp_iter.c
+@@ -145,6 +145,15 @@ static bool try_step_up(struct tdp_iter *iter)
+       return true;
+ }
++/*
++ * Step the iterator back up a level in the paging structure. Should only be
++ * used when the iterator is below the root level.
++ */
++void tdp_iter_step_up(struct tdp_iter *iter)
++{
++      WARN_ON(!try_step_up(iter));
++}
++
+ /*
+  * Step to the next SPTE in a pre-order traversal of the paging structure.
+  * To get to the next SPTE, the iterator either steps down towards the goal
+diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h
+index f0af385c56e0..adfca0cf94d3 100644
+--- a/arch/x86/kvm/mmu/tdp_iter.h
++++ b/arch/x86/kvm/mmu/tdp_iter.h
+@@ -114,5 +114,6 @@ void tdp_iter_start(struct tdp_iter *iter, struct kvm_mmu_page *root,
+                   int min_level, gfn_t next_last_level_gfn);
+ void tdp_iter_next(struct tdp_iter *iter);
+ void tdp_iter_restart(struct tdp_iter *iter);
++void tdp_iter_step_up(struct tdp_iter *iter);
+ #endif /* __KVM_X86_MMU_TDP_ITER_H */
+diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
+index 922b06bf4b94..b61a11d462cc 100644
+--- a/arch/x86/kvm/mmu/tdp_mmu.c
++++ b/arch/x86/kvm/mmu/tdp_mmu.c
+@@ -1748,12 +1748,12 @@ static void zap_collapsible_spte_range(struct kvm *kvm,
+       gfn_t start = slot->base_gfn;
+       gfn_t end = start + slot->npages;
+       struct tdp_iter iter;
++      int max_mapping_level;
+       kvm_pfn_t pfn;
+       rcu_read_lock();
+       tdp_root_for_each_pte(iter, root, start, end) {
+-retry:
+               if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
+                       continue;
+@@ -1761,15 +1761,41 @@ static void zap_collapsible_spte_range(struct kvm *kvm,
+                   !is_last_spte(iter.old_spte, iter.level))
+                       continue;
++              /*
++               * This is a leaf SPTE. Check if the PFN it maps can
++               * be mapped at a higher level.
++               */
+               pfn = spte_to_pfn(iter.old_spte);
+-              if (kvm_is_reserved_pfn(pfn) ||
+-                  iter.level >= kvm_mmu_max_mapping_level(kvm, slot, iter.gfn,
+-                                                          pfn, PG_LEVEL_NUM))
++
++              if (kvm_is_reserved_pfn(pfn))
+                       continue;
++              max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot,
++                              iter.gfn, pfn, PG_LEVEL_NUM);
++
++              WARN_ON(max_mapping_level < iter.level);
++
++              /*
++               * If this page is already mapped at the highest
++               * viable level, there's nothing more to do.
++               */
++              if (max_mapping_level == iter.level)
++                      continue;
++
++              /*
++               * The page can be remapped at a higher level, so step
++               * up to zap the parent SPTE.
++               */
++              while (max_mapping_level > iter.level)
++                      tdp_iter_step_up(&iter);
++
+               /* Note, a successful atomic zap also does a remote TLB flush. */
+-              if (tdp_mmu_zap_spte_atomic(kvm, &iter))
+-                      goto retry;
++              tdp_mmu_zap_spte_atomic(kvm, &iter);
++
++              /*
++               * If the atomic zap fails, the iter will recurse back into
++               * the same subtree to retry.
++               */
+       }
+       rcu_read_unlock();
+-- 
+2.35.1
+
diff --git a/queue-5.18/kvm-x86-svm-add-__gfp_account-to-__sev_dbg_-en-de-cr.patch b/queue-5.18/kvm-x86-svm-add-__gfp_account-to-__sev_dbg_-en-de-cr.patch
new file mode 100644 (file)
index 0000000..b006c48
--- /dev/null
@@ -0,0 +1,47 @@
+From dc63942c964430e4cd9989784898f4bc54b59b96 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 23 Jun 2022 17:18:58 +0000
+Subject: KVM: x86/svm: add __GFP_ACCOUNT to __sev_dbg_{en,de}crypt_user()
+
+From: Mingwei Zhang <mizhang@google.com>
+
+[ Upstream commit ebdec859faa8cfbfef9f6c1f83d79dd6c8f4ab8c ]
+
+Adding the accounting flag when allocating pages within the SEV function,
+since these memory pages should belong to individual VM.
+
+No functional change intended.
+
+Signed-off-by: Mingwei Zhang <mizhang@google.com>
+Message-Id: <20220623171858.2083637-1-mizhang@google.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/kvm/svm/sev.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
+index 76e9e6eb71d6..7aa1ce34a520 100644
+--- a/arch/x86/kvm/svm/sev.c
++++ b/arch/x86/kvm/svm/sev.c
+@@ -844,7 +844,7 @@ static int __sev_dbg_encrypt_user(struct kvm *kvm, unsigned long paddr,
+       /* If source buffer is not aligned then use an intermediate buffer */
+       if (!IS_ALIGNED((unsigned long)vaddr, 16)) {
+-              src_tpage = alloc_page(GFP_KERNEL);
++              src_tpage = alloc_page(GFP_KERNEL_ACCOUNT);
+               if (!src_tpage)
+                       return -ENOMEM;
+@@ -865,7 +865,7 @@ static int __sev_dbg_encrypt_user(struct kvm *kvm, unsigned long paddr,
+       if (!IS_ALIGNED((unsigned long)dst_vaddr, 16) || !IS_ALIGNED(size, 16)) {
+               int dst_offset;
+-              dst_tpage = alloc_page(GFP_KERNEL);
++              dst_tpage = alloc_page(GFP_KERNEL_ACCOUNT);
+               if (!dst_tpage) {
+                       ret = -ENOMEM;
+                       goto e_free;
+-- 
+2.35.1
+
diff --git a/queue-5.18/selftests-kvm-handle-compiler-optimizations-in-ucall.patch b/queue-5.18/selftests-kvm-handle-compiler-optimizations-in-ucall.patch
new file mode 100644 (file)
index 0000000..1060926
--- /dev/null
@@ -0,0 +1,61 @@
+From 803343021ef3a73530cf46c9ad37759888b6f078 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 15 Jun 2022 18:57:06 +0000
+Subject: selftests: KVM: Handle compiler optimizations in ucall
+
+From: Raghavendra Rao Ananta <rananta@google.com>
+
+[ Upstream commit 9e2f6498efbbc880d7caa7935839e682b64fe5a6 ]
+
+The selftests, when built with newer versions of clang, is found
+to have over optimized guests' ucall() function, and eliminating
+the stores for uc.cmd (perhaps due to no immediate readers). This
+resulted in the userspace side always reading a value of '0', and
+causing multiple test failures.
+
+As a result, prevent the compiler from optimizing the stores in
+ucall() with WRITE_ONCE().
+
+Suggested-by: Ricardo Koller <ricarkol@google.com>
+Suggested-by: Reiji Watanabe <reijiw@google.com>
+Signed-off-by: Raghavendra Rao Ananta <rananta@google.com>
+Message-Id: <20220615185706.1099208-1-rananta@google.com>
+Reviewed-by: Andrew Jones <drjones@redhat.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ tools/testing/selftests/kvm/lib/aarch64/ucall.c | 9 ++++-----
+ 1 file changed, 4 insertions(+), 5 deletions(-)
+
+diff --git a/tools/testing/selftests/kvm/lib/aarch64/ucall.c b/tools/testing/selftests/kvm/lib/aarch64/ucall.c
+index e0b0164e9af8..be1d9728c4ce 100644
+--- a/tools/testing/selftests/kvm/lib/aarch64/ucall.c
++++ b/tools/testing/selftests/kvm/lib/aarch64/ucall.c
+@@ -73,20 +73,19 @@ void ucall_uninit(struct kvm_vm *vm)
+ void ucall(uint64_t cmd, int nargs, ...)
+ {
+-      struct ucall uc = {
+-              .cmd = cmd,
+-      };
++      struct ucall uc = {};
+       va_list va;
+       int i;
++      WRITE_ONCE(uc.cmd, cmd);
+       nargs = nargs <= UCALL_MAX_ARGS ? nargs : UCALL_MAX_ARGS;
+       va_start(va, nargs);
+       for (i = 0; i < nargs; ++i)
+-              uc.args[i] = va_arg(va, uint64_t);
++              WRITE_ONCE(uc.args[i], va_arg(va, uint64_t));
+       va_end(va);
+-      *ucall_exit_mmio_addr = (vm_vaddr_t)&uc;
++      WRITE_ONCE(*ucall_exit_mmio_addr, (vm_vaddr_t)&uc);
+ }
+ uint64_t get_ucall(struct kvm_vm *vm, uint32_t vcpu_id, struct ucall *uc)
+-- 
+2.35.1
+
index 377329a7ca10068ffab4194d6b042f4276ccbc34..f1c221207fe4326abace0051b94f6af794f66efe 100644 (file)
@@ -5,3 +5,14 @@ acpi-video-force-backlight-native-for-some-tongfang-devices.patch
 acpi-video-shortening-quirk-list-by-identifying-clevo-by-board_name-only.patch
 acpi-apei-better-fix-to-avoid-spamming-the-console-with-old-error-logs.patch
 crypto-arm64-poly1305-fix-a-read-out-of-bound.patch
+kvm-x86-do-not-report-a-vcpu-as-preempted-outside-in.patch
+kvm-x86-do-not-set-st-preempted-when-going-back-to-u.patch
+kvm-selftests-make-hyperv_clock-selftest-more-stable.patch
+kvm-x86-mmu-zap-non-leaf-sptes-when-disabling-dirty-.patch
+entry-kvm-exit-to-user-mode-when-tif_notify_signal-i.patch
+kvm-x86-disable-preemption-while-updating-apicv-inhi.patch
+kvm-x86-disable-preemption-around-the-call-to-kvm_ar.patch
+kvm-selftests-restrict-test-region-to-48-bit-physica.patch
+tools-kvm_stat-fix-display-of-error-when-multiple-pr.patch
+selftests-kvm-handle-compiler-optimizations-in-ucall.patch
+kvm-x86-svm-add-__gfp_account-to-__sev_dbg_-en-de-cr.patch
diff --git a/queue-5.18/tools-kvm_stat-fix-display-of-error-when-multiple-pr.patch b/queue-5.18/tools-kvm_stat-fix-display-of-error-when-multiple-pr.patch
new file mode 100644 (file)
index 0000000..6934efc
--- /dev/null
@@ -0,0 +1,64 @@
+From 10ae01db75615479483fc1f09f3970477727bee3 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 14 Jun 2022 15:11:41 +0300
+Subject: tools/kvm_stat: fix display of error when multiple processes are
+ found
+
+From: Dmitry Klochkov <kdmitry556@gmail.com>
+
+[ Upstream commit 933b5f9f98da29af646b51b36a0753692908ef64 ]
+
+Instead of printing an error message, kvm_stat script fails when we
+restrict statistics to a guest by its name and there are multiple guests
+with such name:
+
+  # kvm_stat -g my_vm
+  Traceback (most recent call last):
+    File "/usr/bin/kvm_stat", line 1819, in <module>
+      main()
+    File "/usr/bin/kvm_stat", line 1779, in main
+      options = get_options()
+    File "/usr/bin/kvm_stat", line 1718, in get_options
+      options = argparser.parse_args()
+    File "/usr/lib64/python3.10/argparse.py", line 1825, in parse_args
+      args, argv = self.parse_known_args(args, namespace)
+    File "/usr/lib64/python3.10/argparse.py", line 1858, in parse_known_args
+      namespace, args = self._parse_known_args(args, namespace)
+    File "/usr/lib64/python3.10/argparse.py", line 2067, in _parse_known_args
+      start_index = consume_optional(start_index)
+    File "/usr/lib64/python3.10/argparse.py", line 2007, in consume_optional
+      take_action(action, args, option_string)
+    File "/usr/lib64/python3.10/argparse.py", line 1935, in take_action
+      action(self, namespace, argument_values, option_string)
+    File "/usr/bin/kvm_stat", line 1649, in __call__
+      ' to specify the desired pid'.format(" ".join(pids)))
+  TypeError: sequence item 0: expected str instance, int found
+
+To avoid this, it's needed to convert pids int values to strings before
+pass them to join().
+
+Signed-off-by: Dmitry Klochkov <kdmitry556@gmail.com>
+Message-Id: <20220614121141.160689-1-kdmitry556@gmail.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ tools/kvm/kvm_stat/kvm_stat | 3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+diff --git a/tools/kvm/kvm_stat/kvm_stat b/tools/kvm/kvm_stat/kvm_stat
+index 5a5bd74f55bd..9c366b3a676d 100755
+--- a/tools/kvm/kvm_stat/kvm_stat
++++ b/tools/kvm/kvm_stat/kvm_stat
+@@ -1646,7 +1646,8 @@ Press any other key to refresh statistics immediately.
+                          .format(values))
+             if len(pids) > 1:
+                 sys.exit('Error: Multiple processes found (pids: {}). Use "-p"'
+-                         ' to specify the desired pid'.format(" ".join(pids)))
++                         ' to specify the desired pid'
++                         .format(" ".join(map(str, pids))))
+             namespace.pid = pids[0]
+     argparser = argparse.ArgumentParser(description=description_text,
+-- 
+2.35.1
+