From: Sasha Levin <sashal@kernel.org>
Date: Sat, 6 Aug 2022 15:31:09 +0000 (-0400)
Subject: Fixes for 5.18
X-Git-Tag: v4.19.255~23
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=24db5d7bd6b6b22b2f701e99eb00ec088e57cb38;p=thirdparty%2Fkernel%2Fstable-queue.git

Fixes for 5.18

Signed-off-by: Sasha Levin <sashal@kernel.org>
---

diff --git a/queue-5.18/entry-kvm-exit-to-user-mode-when-tif_notify_signal-i.patch b/queue-5.18/entry-kvm-exit-to-user-mode-when-tif_notify_signal-i.patch
new file mode 100644
index 00000000000..f3ec713157c
--- /dev/null
+++ b/queue-5.18/entry-kvm-exit-to-user-mode-when-tif_notify_signal-i.patch
@@ -0,0 +1,57 @@
+From 2464e2253fce3d174d78d95e6266a72659ab8476 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 4 May 2022 13:08:40 -0500
+Subject: entry/kvm: Exit to user mode when TIF_NOTIFY_SIGNAL is set
+
+From: Seth Forshee <sforshee@digitalocean.com>
+
+[ Upstream commit 3e684903a8574ffc9475fdf13c4780a7adb506ad ]
+
+A livepatch transition may stall indefinitely when a kvm vCPU is heavily
+loaded. To the host, the vCPU task is a user thread which is spending a
+very long time in the ioctl(KVM_RUN) syscall. During livepatch
+transition, set_notify_signal() will be called on such tasks to
+interrupt the syscall so that the task can be transitioned. This
+interrupts guest execution, but when xfer_to_guest_mode_work() sees that
+TIF_NOTIFY_SIGNAL is set but not TIF_SIGPENDING it concludes that an
+exit to user mode is unnecessary, and guest execution is resumed without
+transitioning the task for the livepatch.
+
+This handling of TIF_NOTIFY_SIGNAL is incorrect, as set_notify_signal()
+is expected to break tasks out of interruptible kernel loops and cause
+them to return to userspace. Change xfer_to_guest_mode_work() to handle
+TIF_NOTIFY_SIGNAL the same as TIF_SIGPENDING, signaling to the vCPU run
+loop that an exit to userspace is needed. Any pending task_work will be
+run when get_signal() is called from exit_to_user_mode_loop(), so there
+is no longer any need to run task work from xfer_to_guest_mode_work().
+
+Suggested-by: "Eric W. Biederman" <ebiederm@xmission.com>
+Cc: Petr Mladek <pmladek@suse.com>
+Signed-off-by: Seth Forshee <sforshee@digitalocean.com>
+Message-Id: <20220504180840.2907296-1-sforshee@digitalocean.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ kernel/entry/kvm.c | 6 ------
+ 1 file changed, 6 deletions(-)
+
+diff --git a/kernel/entry/kvm.c b/kernel/entry/kvm.c
+index 9d09f489b60e..2e0f75bcb7fd 100644
+--- a/kernel/entry/kvm.c
++++ b/kernel/entry/kvm.c
+@@ -9,12 +9,6 @@ static int xfer_to_guest_mode_work(struct kvm_vcpu *vcpu, unsigned long ti_work)
+ 		int ret;
+ 
+ 		if (ti_work & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL)) {
+-			clear_notify_signal();
+-			if (task_work_pending(current))
+-				task_work_run();
+-		}
+-
+-		if (ti_work & _TIF_SIGPENDING) {
+ 			kvm_handle_signal_exit(vcpu);
+ 			return -EINTR;
+ 		}
+-- 
+2.35.1
+
diff --git a/queue-5.18/kvm-selftests-make-hyperv_clock-selftest-more-stable.patch b/queue-5.18/kvm-selftests-make-hyperv_clock-selftest-more-stable.patch
new file mode 100644
index 00000000000..02750c57ffe
--- /dev/null
+++ b/queue-5.18/kvm-selftests-make-hyperv_clock-selftest-more-stable.patch
@@ -0,0 +1,75 @@
+From e7c7b8f2f24abbc4915f07595c198d089552495b Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 1 Jun 2022 16:43:22 +0200
+Subject: KVM: selftests: Make hyperv_clock selftest more stable
+
+From: Vitaly Kuznetsov <vkuznets@redhat.com>
+
+[ Upstream commit eae260be3a0111a28fe95923e117a55dddec0384 ]
+
+hyperv_clock doesn't always give a stable test result, especially with
+AMD CPUs. The test compares Hyper-V MSR clocksource (acquired either
+with rdmsr() from within the guest or KVM_GET_MSRS from the host)
+against rdtsc(). To increase the accuracy, increase the measured delay
+(done with nop loop) by two orders of magnitude and take the mean rdtsc()
+value before and after rdmsr()/KVM_GET_MSRS.
+
+Reported-by: Maxim Levitsky <mlevitsk@redhat.com>
+Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
+Reviewed-by: Maxim Levitsky <mlevitsk@redhat.com>
+Tested-by: Maxim Levitsky <mlevitsk@redhat.com>
+Message-Id: <20220601144322.1968742-1-vkuznets@redhat.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ tools/testing/selftests/kvm/x86_64/hyperv_clock.c | 10 +++++++---
+ 1 file changed, 7 insertions(+), 3 deletions(-)
+
+diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_clock.c b/tools/testing/selftests/kvm/x86_64/hyperv_clock.c
+index e0b2bb1339b1..3330fb183c68 100644
+--- a/tools/testing/selftests/kvm/x86_64/hyperv_clock.c
++++ b/tools/testing/selftests/kvm/x86_64/hyperv_clock.c
+@@ -44,7 +44,7 @@ static inline void nop_loop(void)
+ {
+ 	int i;
+ 
+-	for (i = 0; i < 1000000; i++)
++	for (i = 0; i < 100000000; i++)
+ 		asm volatile("nop");
+ }
+ 
+@@ -56,12 +56,14 @@ static inline void check_tsc_msr_rdtsc(void)
+ 	tsc_freq = rdmsr(HV_X64_MSR_TSC_FREQUENCY);
+ 	GUEST_ASSERT(tsc_freq > 0);
+ 
+-	/* First, check MSR-based clocksource */
++	/* For increased accuracy, take mean rdtsc() before and after rdmsr() */
+ 	r1 = rdtsc();
+ 	t1 = rdmsr(HV_X64_MSR_TIME_REF_COUNT);
++	r1 = (r1 + rdtsc()) / 2;
+ 	nop_loop();
+ 	r2 = rdtsc();
+ 	t2 = rdmsr(HV_X64_MSR_TIME_REF_COUNT);
++	r2 = (r2 + rdtsc()) / 2;
+ 
+ 	GUEST_ASSERT(r2 > r1 && t2 > t1);
+ 
+@@ -181,12 +183,14 @@ static void host_check_tsc_msr_rdtsc(struct kvm_vm *vm)
+ 	tsc_freq = vcpu_get_msr(vm, VCPU_ID, HV_X64_MSR_TSC_FREQUENCY);
+ 	TEST_ASSERT(tsc_freq > 0, "TSC frequency must be nonzero");
+ 
+-	/* First, check MSR-based clocksource */
++	/* For increased accuracy, take mean rdtsc() before and after ioctl */
+ 	r1 = rdtsc();
+ 	t1 = vcpu_get_msr(vm, VCPU_ID, HV_X64_MSR_TIME_REF_COUNT);
++	r1 = (r1 + rdtsc()) / 2;
+ 	nop_loop();
+ 	r2 = rdtsc();
+ 	t2 = vcpu_get_msr(vm, VCPU_ID, HV_X64_MSR_TIME_REF_COUNT);
++	r2 = (r2 + rdtsc()) / 2;
+ 
+ 	TEST_ASSERT(t2 > t1, "Time reference MSR is not monotonic (%ld <= %ld)", t1, t2);
+ 
+-- 
+2.35.1
+
diff --git a/queue-5.18/kvm-selftests-restrict-test-region-to-48-bit-physica.patch b/queue-5.18/kvm-selftests-restrict-test-region-to-48-bit-physica.patch
new file mode 100644
index 00000000000..d9621796a01
--- /dev/null
+++ b/queue-5.18/kvm-selftests-restrict-test-region-to-48-bit-physica.patch
@@ -0,0 +1,79 @@
+From 153db4a959fd4efda9196c92f49e655a9204d9da Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 20 May 2022 23:32:49 +0000
+Subject: KVM: selftests: Restrict test region to 48-bit physical addresses
+ when using nested
+
+From: David Matlack <dmatlack@google.com>
+
+[ Upstream commit e0f3f46e42064a51573914766897b4ab95d943e3 ]
+
+The selftests nested code only supports 4-level paging at the moment.
+This means it cannot map nested guest physical addresses with more than
+48 bits. Allow perf_test_util nested mode to work on hosts with more
+than 48 physical address bits by restricting the guest test region to
+48-bits.
+
+While here, opportunistically fix an off-by-one error when dealing with
+vm_get_max_gfn(). perf_test_util.c was treating this as the maximum
+number of GFNs, rather than the maximum allowed GFN. This didn't result
+in any correctness issues, but it did end up shifting the test region
+down slightly when using huge pages.
+
+Suggested-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: David Matlack <dmatlack@google.com>
+Message-Id: <20220520233249.3776001-12-dmatlack@google.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ .../testing/selftests/kvm/lib/perf_test_util.c | 18 +++++++++++++++---
+ 1 file changed, 15 insertions(+), 3 deletions(-)
+
+diff --git a/tools/testing/selftests/kvm/lib/perf_test_util.c b/tools/testing/selftests/kvm/lib/perf_test_util.c
+index 722df3a28791..ddd68ba0c99f 100644
+--- a/tools/testing/selftests/kvm/lib/perf_test_util.c
++++ b/tools/testing/selftests/kvm/lib/perf_test_util.c
+@@ -110,6 +110,7 @@ struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, int vcpus,
+ 	struct kvm_vm *vm;
+ 	uint64_t guest_num_pages;
+ 	uint64_t backing_src_pagesz = get_backing_src_pagesz(backing_src);
++	uint64_t region_end_gfn;
+ 	int i;
+ 
+ 	pr_info("Testing guest mode: %s\n", vm_guest_mode_string(mode));
+@@ -144,18 +145,29 @@ struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, int vcpus,
+ 
+ 	pta->vm = vm;
+ 
++	/* Put the test region at the top of guest physical memory. */
++	region_end_gfn = vm_get_max_gfn(vm) + 1;
++
++#ifdef __x86_64__
++	/*
++	 * When running vCPUs in L2, restrict the test region to 48 bits to
++	 * avoid needing 5-level page tables to identity map L2.
++	 */
++	if (pta->nested)
++		region_end_gfn = min(region_end_gfn, (1UL << 48) / pta->guest_page_size);
++#endif
+ 	/*
+ 	 * If there should be more memory in the guest test region than there
+ 	 * can be pages in the guest, it will definitely cause problems.
+ 	 */
+-	TEST_ASSERT(guest_num_pages < vm_get_max_gfn(vm),
++	TEST_ASSERT(guest_num_pages < region_end_gfn,
+ 		    "Requested more guest memory than address space allows.\n"
+ 		    "    guest pages: %" PRIx64 " max gfn: %" PRIx64
+ 		    " vcpus: %d wss: %" PRIx64 "]\n",
+-		    guest_num_pages, vm_get_max_gfn(vm), vcpus,
++		    guest_num_pages, region_end_gfn - 1, vcpus,
+ 		    vcpu_memory_bytes);
+ 
+-	pta->gpa = (vm_get_max_gfn(vm) - guest_num_pages) * pta->guest_page_size;
++	pta->gpa = (region_end_gfn - guest_num_pages) * pta->guest_page_size;
+ 	pta->gpa = align_down(pta->gpa, backing_src_pagesz);
+ #ifdef __s390x__
+ 	/* Align to 1M (segment size) */
+-- 
+2.35.1
+
diff --git a/queue-5.18/kvm-x86-disable-preemption-around-the-call-to-kvm_ar.patch b/queue-5.18/kvm-x86-disable-preemption-around-the-call-to-kvm_ar.patch
new file mode 100644
index 00000000000..b8dcb1fccf2
--- /dev/null
+++ b/queue-5.18/kvm-x86-disable-preemption-around-the-call-to-kvm_ar.patch
@@ -0,0 +1,62 @@
+From 51da2a430f49e1115c2e805118fe91667ced63cf Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 6 Jun 2022 21:08:28 +0300
+Subject: KVM: x86: disable preemption around the call to
+ kvm_arch_vcpu_{un|}blocking
+
+From: Maxim Levitsky <mlevitsk@redhat.com>
+
+[ Upstream commit 18869f26df1a11ed11031dfb7392bc7d774062e8 ]
+
+On SVM, if preemption happens right after the call to finish_rcuwait
+but before call to kvm_arch_vcpu_unblocking on SVM/AVIC, it itself
+will re-enable AVIC, and then we will try to re-enable it again
+in kvm_arch_vcpu_unblocking which will lead to a warning
+in __avic_vcpu_load.
+
+The same problem can happen if the vCPU is preempted right after the call
+to kvm_arch_vcpu_blocking but before the call to prepare_to_rcuwait
+and in this case, we will end up with AVIC enabled during sleep -
+Ooops.
+
+Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
+Message-Id: <20220606180829.102503-7-mlevitsk@redhat.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ virt/kvm/kvm_main.c | 8 ++++++--
+ 1 file changed, 6 insertions(+), 2 deletions(-)
+
+diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
+index 24cb37d19c63..7f1d19689701 100644
+--- a/virt/kvm/kvm_main.c
++++ b/virt/kvm/kvm_main.c
+@@ -3327,9 +3327,11 @@ bool kvm_vcpu_block(struct kvm_vcpu *vcpu)
+ 
+ 	vcpu->stat.generic.blocking = 1;
+ 
++	preempt_disable();
+ 	kvm_arch_vcpu_blocking(vcpu);
+-
+ 	prepare_to_rcuwait(wait);
++	preempt_enable();
++
+ 	for (;;) {
+ 		set_current_state(TASK_INTERRUPTIBLE);
+ 
+@@ -3339,9 +3341,11 @@ bool kvm_vcpu_block(struct kvm_vcpu *vcpu)
+ 		waited = true;
+ 		schedule();
+ 	}
+-	finish_rcuwait(wait);
+ 
++	preempt_disable();
++	finish_rcuwait(wait);
+ 	kvm_arch_vcpu_unblocking(vcpu);
++	preempt_enable();
+ 
+ 	vcpu->stat.generic.blocking = 0;
+ 
+-- 
+2.35.1
+
diff --git a/queue-5.18/kvm-x86-disable-preemption-while-updating-apicv-inhi.patch b/queue-5.18/kvm-x86-disable-preemption-while-updating-apicv-inhi.patch
new file mode 100644
index 00000000000..0f278814083
--- /dev/null
+++ b/queue-5.18/kvm-x86-disable-preemption-while-updating-apicv-inhi.patch
@@ -0,0 +1,53 @@
+From 4fc1688d87063c8fba524640fb2ab8073c7836c3 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 6 Jun 2022 21:08:27 +0300
+Subject: KVM: x86: disable preemption while updating apicv inhibition
+
+From: Maxim Levitsky <mlevitsk@redhat.com>
+
+[ Upstream commit 66c768d30e64e1280520f34dbef83419f55f3459 ]
+
+Currently nothing prevents preemption in kvm_vcpu_update_apicv.
+
+On SVM, if the preemption happens after we update the
+vcpu->arch.apicv_active, the preemption itself will
+'update' the inhibition since the AVIC will be first disabled
+on vCPU unload and then enabled, when the current task
+is loaded again.
+
+Then we will try to update it again, which will lead to a warning
+in __avic_vcpu_load, that the AVIC is already enabled.
+
+Fix this by disabling preemption in this code.
+
+Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
+Message-Id: <20220606180829.102503-6-mlevitsk@redhat.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/kvm/x86.c | 2 ++
+ 1 file changed, 2 insertions(+)
+
+diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
+index 91d887fd10ab..65b0ec28bd52 100644
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -9784,6 +9784,7 @@ void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu)
+ 		return;
+ 
+ 	down_read(&vcpu->kvm->arch.apicv_update_lock);
++	preempt_disable();
+ 
+ 	activate = kvm_apicv_activated(vcpu->kvm);
+ 	if (vcpu->arch.apicv_active == activate)
+@@ -9803,6 +9804,7 @@ void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu)
+ 		kvm_make_request(KVM_REQ_EVENT, vcpu);
+ 
+ out:
++	preempt_enable();
+ 	up_read(&vcpu->kvm->arch.apicv_update_lock);
+ }
+ EXPORT_SYMBOL_GPL(kvm_vcpu_update_apicv);
+-- 
+2.35.1
+
diff --git a/queue-5.18/kvm-x86-do-not-report-a-vcpu-as-preempted-outside-in.patch b/queue-5.18/kvm-x86-do-not-report-a-vcpu-as-preempted-outside-in.patch
new file mode 100644
index 00000000000..8c3126f1192
--- /dev/null
+++ b/queue-5.18/kvm-x86-do-not-report-a-vcpu-as-preempted-outside-in.patch
@@ -0,0 +1,136 @@
+From 491ff8c3605ee8f022817a87843afbb2dea86fc9 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 7 Jun 2022 10:09:03 -0400
+Subject: KVM: x86: do not report a vCPU as preempted outside instruction
+ boundaries
+
+From: Paolo Bonzini <pbonzini@redhat.com>
+
+[ Upstream commit 6cd88243c7e03845a450795e134b488fc2afb736 ]
+
+If a vCPU is outside guest mode and is scheduled out, it might be in the
+process of making a memory access.  A problem occurs if another vCPU uses
+the PV TLB flush feature during the period when the vCPU is scheduled
+out, and a virtual address has already been translated but has not yet
+been accessed, because this is equivalent to using a stale TLB entry.
+
+To avoid this, only report a vCPU as preempted if sure that the guest
+is at an instruction boundary.  A rescheduling request will be delivered
+to the host physical CPU as an external interrupt, so for simplicity
+consider any vmexit *not* instruction boundary except for external
+interrupts.
+
+It would in principle be okay to report the vCPU as preempted also
+if it is sleeping in kvm_vcpu_block(): a TLB flush IPI will incur the
+vmentry/vmexit overhead unnecessarily, and optimistic spinning is
+also unlikely to succeed.  However, leave it for later because right
+now kvm_vcpu_check_block() is doing memory accesses.  Even
+though the TLB flush issue only applies to virtual memory addresses,
+it's very much preferable to be conservative.
+
+Reported-by: Jann Horn <jannh@google.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/include/asm/kvm_host.h |  3 +++
+ arch/x86/kvm/svm/svm.c          |  2 ++
+ arch/x86/kvm/vmx/vmx.c          |  1 +
+ arch/x86/kvm/x86.c              | 22 ++++++++++++++++++++++
+ 4 files changed, 28 insertions(+)
+
+diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
+index 4ff36610af6a..9fdaa847d4b6 100644
+--- a/arch/x86/include/asm/kvm_host.h
++++ b/arch/x86/include/asm/kvm_host.h
+@@ -651,6 +651,7 @@ struct kvm_vcpu_arch {
+ 	u64 ia32_misc_enable_msr;
+ 	u64 smbase;
+ 	u64 smi_count;
++	bool at_instruction_boundary;
+ 	bool tpr_access_reporting;
+ 	bool xsaves_enabled;
+ 	bool xfd_no_write_intercept;
+@@ -1289,6 +1290,8 @@ struct kvm_vcpu_stat {
+ 	u64 nested_run;
+ 	u64 directed_yield_attempted;
+ 	u64 directed_yield_successful;
++	u64 preemption_reported;
++	u64 preemption_other;
+ 	u64 guest_mode;
+ };
+ 
+diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
+index 6bfb0b0e66bd..c667214c630b 100644
+--- a/arch/x86/kvm/svm/svm.c
++++ b/arch/x86/kvm/svm/svm.c
+@@ -4166,6 +4166,8 @@ static int svm_check_intercept(struct kvm_vcpu *vcpu,
+ 
+ static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu)
+ {
++	if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_INTR)
++		vcpu->arch.at_instruction_boundary = true;
+ }
+ 
+ static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu)
+diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
+index 4b6a0268c78e..597c3c08da50 100644
+--- a/arch/x86/kvm/vmx/vmx.c
++++ b/arch/x86/kvm/vmx/vmx.c
+@@ -6630,6 +6630,7 @@ static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu)
+ 		return;
+ 
+ 	handle_interrupt_nmi_irqoff(vcpu, gate_offset(desc));
++	vcpu->arch.at_instruction_boundary = true;
+ }
+ 
+ static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
+diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
+index 53b6fdf30c99..df74ec51c7f3 100644
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -291,6 +291,8 @@ const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = {
+ 	STATS_DESC_COUNTER(VCPU, nested_run),
+ 	STATS_DESC_COUNTER(VCPU, directed_yield_attempted),
+ 	STATS_DESC_COUNTER(VCPU, directed_yield_successful),
++	STATS_DESC_COUNTER(VCPU, preemption_reported),
++	STATS_DESC_COUNTER(VCPU, preemption_other),
+ 	STATS_DESC_ICOUNTER(VCPU, guest_mode)
+ };
+ 
+@@ -4607,6 +4609,19 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
+ 	struct kvm_memslots *slots;
+ 	static const u8 preempted = KVM_VCPU_PREEMPTED;
+ 
++	/*
++	 * The vCPU can be marked preempted if and only if the VM-Exit was on
++	 * an instruction boundary and will not trigger guest emulation of any
++	 * kind (see vcpu_run).  Vendor specific code controls (conservatively)
++	 * when this is true, for example allowing the vCPU to be marked
++	 * preempted if and only if the VM-Exit was due to a host interrupt.
++	 */
++	if (!vcpu->arch.at_instruction_boundary) {
++		vcpu->stat.preemption_other++;
++		return;
++	}
++
++	vcpu->stat.preemption_reported++;
+ 	if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
+ 		return;
+ 
+@@ -10363,6 +10378,13 @@ static int vcpu_run(struct kvm_vcpu *vcpu)
+ 	vcpu->arch.l1tf_flush_l1d = true;
+ 
+ 	for (;;) {
++		/*
++		 * If another guest vCPU requests a PV TLB flush in the middle
++		 * of instruction emulation, the rest of the emulation could
++		 * use a stale page translation. Assume that any code after
++		 * this point can start executing an instruction.
++		 */
++		vcpu->arch.at_instruction_boundary = false;
+ 		if (kvm_vcpu_running(vcpu)) {
+ 			r = vcpu_enter_guest(vcpu);
+ 		} else {
+-- 
+2.35.1
+
diff --git a/queue-5.18/kvm-x86-do-not-set-st-preempted-when-going-back-to-u.patch b/queue-5.18/kvm-x86-do-not-set-st-preempted-when-going-back-to-u.patch
new file mode 100644
index 00000000000..90ffbdb0fca
--- /dev/null
+++ b/queue-5.18/kvm-x86-do-not-set-st-preempted-when-going-back-to-u.patch
@@ -0,0 +1,81 @@
+From 24c49abc49511c32c73ab946cd5b29bdf71c21cb Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 7 Jun 2022 10:07:11 -0400
+Subject: KVM: x86: do not set st->preempted when going back to user space
+
+From: Paolo Bonzini <pbonzini@redhat.com>
+
+[ Upstream commit 54aa83c90198e68eee8b0850c749bc70efb548da ]
+
+Similar to the Xen path, only change the vCPU's reported state if the vCPU
+was actually preempted.  The reason for KVM's behavior is that for example
+optimistic spinning might not be a good idea if the guest is doing repeated
+exits to userspace; however, it is confusing and unlikely to make a difference,
+because well-tuned guests will hardly ever exit KVM_RUN in the first place.
+
+Suggested-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/kvm/x86.c | 26 ++++++++++++++------------
+ arch/x86/kvm/xen.h |  6 ++++--
+ 2 files changed, 18 insertions(+), 14 deletions(-)
+
+diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
+index df74ec51c7f3..91d887fd10ab 100644
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -4651,19 +4651,21 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
+ {
+ 	int idx;
+ 
+-	if (vcpu->preempted && !vcpu->arch.guest_state_protected)
+-		vcpu->arch.preempted_in_kernel = !static_call(kvm_x86_get_cpl)(vcpu);
++	if (vcpu->preempted) {
++		if (!vcpu->arch.guest_state_protected)
++			vcpu->arch.preempted_in_kernel = !static_call(kvm_x86_get_cpl)(vcpu);
+ 
+-	/*
+-	 * Take the srcu lock as memslots will be accessed to check the gfn
+-	 * cache generation against the memslots generation.
+-	 */
+-	idx = srcu_read_lock(&vcpu->kvm->srcu);
+-	if (kvm_xen_msr_enabled(vcpu->kvm))
+-		kvm_xen_runstate_set_preempted(vcpu);
+-	else
+-		kvm_steal_time_set_preempted(vcpu);
+-	srcu_read_unlock(&vcpu->kvm->srcu, idx);
++		/*
++		 * Take the srcu lock as memslots will be accessed to check the gfn
++		 * cache generation against the memslots generation.
++		 */
++		idx = srcu_read_lock(&vcpu->kvm->srcu);
++		if (kvm_xen_msr_enabled(vcpu->kvm))
++			kvm_xen_runstate_set_preempted(vcpu);
++		else
++			kvm_steal_time_set_preempted(vcpu);
++		srcu_read_unlock(&vcpu->kvm->srcu, idx);
++	}
+ 
+ 	static_call(kvm_x86_vcpu_put)(vcpu);
+ 	vcpu->arch.last_host_tsc = rdtsc();
+diff --git a/arch/x86/kvm/xen.h b/arch/x86/kvm/xen.h
+index adbcc9ed59db..fda1413f8af9 100644
+--- a/arch/x86/kvm/xen.h
++++ b/arch/x86/kvm/xen.h
+@@ -103,8 +103,10 @@ static inline void kvm_xen_runstate_set_preempted(struct kvm_vcpu *vcpu)
+ 	 * behalf of the vCPU. Only if the VMM does actually block
+ 	 * does it need to enter RUNSTATE_blocked.
+ 	 */
+-	if (vcpu->preempted)
+-		kvm_xen_update_runstate_guest(vcpu, RUNSTATE_runnable);
++	if (WARN_ON_ONCE(!vcpu->preempted))
++		return;
++
++	kvm_xen_update_runstate_guest(vcpu, RUNSTATE_runnable);
+ }
+ 
+ /* 32-bit compatibility definitions, also used natively in 32-bit build */
+-- 
+2.35.1
+
diff --git a/queue-5.18/kvm-x86-mmu-zap-non-leaf-sptes-when-disabling-dirty-.patch b/queue-5.18/kvm-x86-mmu-zap-non-leaf-sptes-when-disabling-dirty-.patch
new file mode 100644
index 00000000000..4aca9d8677e
--- /dev/null
+++ b/queue-5.18/kvm-x86-mmu-zap-non-leaf-sptes-when-disabling-dirty-.patch
@@ -0,0 +1,131 @@
+From c8b327fe59257653dfee8d80ebfce83def9c1d2c Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 25 May 2022 23:09:04 +0000
+Subject: KVM: x86/MMU: Zap non-leaf SPTEs when disabling dirty logging
+
+From: Ben Gardon <bgardon@google.com>
+
+[ Upstream commit 5ba7c4c6d1c7af47a916f728bb5940669684a087 ]
+
+Currently disabling dirty logging with the TDP MMU is extremely slow.
+On a 96 vCPU / 96G VM backed with gigabyte pages, it takes ~200 seconds
+to disable dirty logging with the TDP MMU, as opposed to ~4 seconds with
+the shadow MMU.
+
+When disabling dirty logging, zap non-leaf parent entries to allow
+replacement with huge pages instead of recursing and zapping all of the
+child, leaf entries. This reduces the number of TLB flushes required
+and reduces the disable dirty log time with the TDP MMU to ~3 seconds.
+
+Opportunistically add a WARN() to catch GFNs that are mapped at a
+higher level than their max level.
+
+Signed-off-by: Ben Gardon <bgardon@google.com>
+Message-Id: <20220525230904.1584480-1-bgardon@google.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/kvm/mmu/tdp_iter.c |  9 +++++++++
+ arch/x86/kvm/mmu/tdp_iter.h |  1 +
+ arch/x86/kvm/mmu/tdp_mmu.c  | 38 +++++++++++++++++++++++++++++++------
+ 3 files changed, 42 insertions(+), 6 deletions(-)
+
+diff --git a/arch/x86/kvm/mmu/tdp_iter.c b/arch/x86/kvm/mmu/tdp_iter.c
+index 6d3b3e5a5533..ee4802d7b36c 100644
+--- a/arch/x86/kvm/mmu/tdp_iter.c
++++ b/arch/x86/kvm/mmu/tdp_iter.c
+@@ -145,6 +145,15 @@ static bool try_step_up(struct tdp_iter *iter)
+ 	return true;
+ }
+ 
++/*
++ * Step the iterator back up a level in the paging structure. Should only be
++ * used when the iterator is below the root level.
++ */
++void tdp_iter_step_up(struct tdp_iter *iter)
++{
++	WARN_ON(!try_step_up(iter));
++}
++
+ /*
+  * Step to the next SPTE in a pre-order traversal of the paging structure.
+  * To get to the next SPTE, the iterator either steps down towards the goal
+diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h
+index f0af385c56e0..adfca0cf94d3 100644
+--- a/arch/x86/kvm/mmu/tdp_iter.h
++++ b/arch/x86/kvm/mmu/tdp_iter.h
+@@ -114,5 +114,6 @@ void tdp_iter_start(struct tdp_iter *iter, struct kvm_mmu_page *root,
+ 		    int min_level, gfn_t next_last_level_gfn);
+ void tdp_iter_next(struct tdp_iter *iter);
+ void tdp_iter_restart(struct tdp_iter *iter);
++void tdp_iter_step_up(struct tdp_iter *iter);
+ 
+ #endif /* __KVM_X86_MMU_TDP_ITER_H */
+diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
+index 922b06bf4b94..b61a11d462cc 100644
+--- a/arch/x86/kvm/mmu/tdp_mmu.c
++++ b/arch/x86/kvm/mmu/tdp_mmu.c
+@@ -1748,12 +1748,12 @@ static void zap_collapsible_spte_range(struct kvm *kvm,
+ 	gfn_t start = slot->base_gfn;
+ 	gfn_t end = start + slot->npages;
+ 	struct tdp_iter iter;
++	int max_mapping_level;
+ 	kvm_pfn_t pfn;
+ 
+ 	rcu_read_lock();
+ 
+ 	tdp_root_for_each_pte(iter, root, start, end) {
+-retry:
+ 		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
+ 			continue;
+ 
+@@ -1761,15 +1761,41 @@ static void zap_collapsible_spte_range(struct kvm *kvm,
+ 		    !is_last_spte(iter.old_spte, iter.level))
+ 			continue;
+ 
++		/*
++		 * This is a leaf SPTE. Check if the PFN it maps can
++		 * be mapped at a higher level.
++		 */
+ 		pfn = spte_to_pfn(iter.old_spte);
+-		if (kvm_is_reserved_pfn(pfn) ||
+-		    iter.level >= kvm_mmu_max_mapping_level(kvm, slot, iter.gfn,
+-							    pfn, PG_LEVEL_NUM))
++
++		if (kvm_is_reserved_pfn(pfn))
+ 			continue;
+ 
++		max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot,
++				iter.gfn, pfn, PG_LEVEL_NUM);
++
++		WARN_ON(max_mapping_level < iter.level);
++
++		/*
++		 * If this page is already mapped at the highest
++		 * viable level, there's nothing more to do.
++		 */
++		if (max_mapping_level == iter.level)
++			continue;
++
++		/*
++		 * The page can be remapped at a higher level, so step
++		 * up to zap the parent SPTE.
++		 */
++		while (max_mapping_level > iter.level)
++			tdp_iter_step_up(&iter);
++
+ 		/* Note, a successful atomic zap also does a remote TLB flush. */
+-		if (tdp_mmu_zap_spte_atomic(kvm, &iter))
+-			goto retry;
++		tdp_mmu_zap_spte_atomic(kvm, &iter);
++
++		/*
++		 * If the atomic zap fails, the iter will recurse back into
++		 * the same subtree to retry.
++		 */
+ 	}
+ 
+ 	rcu_read_unlock();
+-- 
+2.35.1
+
diff --git a/queue-5.18/kvm-x86-svm-add-__gfp_account-to-__sev_dbg_-en-de-cr.patch b/queue-5.18/kvm-x86-svm-add-__gfp_account-to-__sev_dbg_-en-de-cr.patch
new file mode 100644
index 00000000000..b006c48472f
--- /dev/null
+++ b/queue-5.18/kvm-x86-svm-add-__gfp_account-to-__sev_dbg_-en-de-cr.patch
@@ -0,0 +1,47 @@
+From dc63942c964430e4cd9989784898f4bc54b59b96 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 23 Jun 2022 17:18:58 +0000
+Subject: KVM: x86/svm: add __GFP_ACCOUNT to __sev_dbg_{en,de}crypt_user()
+
+From: Mingwei Zhang <mizhang@google.com>
+
+[ Upstream commit ebdec859faa8cfbfef9f6c1f83d79dd6c8f4ab8c ]
+
+Add the accounting flag when allocating pages within the SEV function,
+since these memory pages should belong to an individual VM.
+
+No functional change intended.
+
+Signed-off-by: Mingwei Zhang <mizhang@google.com>
+Message-Id: <20220623171858.2083637-1-mizhang@google.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/kvm/svm/sev.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
+index 76e9e6eb71d6..7aa1ce34a520 100644
+--- a/arch/x86/kvm/svm/sev.c
++++ b/arch/x86/kvm/svm/sev.c
+@@ -844,7 +844,7 @@ static int __sev_dbg_encrypt_user(struct kvm *kvm, unsigned long paddr,
+ 
+ 	/* If source buffer is not aligned then use an intermediate buffer */
+ 	if (!IS_ALIGNED((unsigned long)vaddr, 16)) {
+-		src_tpage = alloc_page(GFP_KERNEL);
++		src_tpage = alloc_page(GFP_KERNEL_ACCOUNT);
+ 		if (!src_tpage)
+ 			return -ENOMEM;
+ 
+@@ -865,7 +865,7 @@ static int __sev_dbg_encrypt_user(struct kvm *kvm, unsigned long paddr,
+ 	if (!IS_ALIGNED((unsigned long)dst_vaddr, 16) || !IS_ALIGNED(size, 16)) {
+ 		int dst_offset;
+ 
+-		dst_tpage = alloc_page(GFP_KERNEL);
++		dst_tpage = alloc_page(GFP_KERNEL_ACCOUNT);
+ 		if (!dst_tpage) {
+ 			ret = -ENOMEM;
+ 			goto e_free;
+-- 
+2.35.1
+
diff --git a/queue-5.18/selftests-kvm-handle-compiler-optimizations-in-ucall.patch b/queue-5.18/selftests-kvm-handle-compiler-optimizations-in-ucall.patch
new file mode 100644
index 00000000000..1060926ca10
--- /dev/null
+++ b/queue-5.18/selftests-kvm-handle-compiler-optimizations-in-ucall.patch
@@ -0,0 +1,61 @@
+From 803343021ef3a73530cf46c9ad37759888b6f078 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 15 Jun 2022 18:57:06 +0000
+Subject: selftests: KVM: Handle compiler optimizations in ucall
+
+From: Raghavendra Rao Ananta <rananta@google.com>
+
+[ Upstream commit 9e2f6498efbbc880d7caa7935839e682b64fe5a6 ]
+
+The selftests, when built with newer versions of clang, are found
+to have the guest's ucall() function over-optimized, eliminating
+the stores for uc.cmd (perhaps due to no immediate readers). This
+resulted in the userspace side always reading a value of '0', and
+caused multiple test failures.
+
+As a result, prevent the compiler from optimizing the stores in
+ucall() with WRITE_ONCE().
+
+Suggested-by: Ricardo Koller <ricarkol@google.com>
+Suggested-by: Reiji Watanabe <reijiw@google.com>
+Signed-off-by: Raghavendra Rao Ananta <rananta@google.com>
+Message-Id: <20220615185706.1099208-1-rananta@google.com>
+Reviewed-by: Andrew Jones <drjones@redhat.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ tools/testing/selftests/kvm/lib/aarch64/ucall.c | 9 ++++-----
+ 1 file changed, 4 insertions(+), 5 deletions(-)
+
+diff --git a/tools/testing/selftests/kvm/lib/aarch64/ucall.c b/tools/testing/selftests/kvm/lib/aarch64/ucall.c
+index e0b0164e9af8..be1d9728c4ce 100644
+--- a/tools/testing/selftests/kvm/lib/aarch64/ucall.c
++++ b/tools/testing/selftests/kvm/lib/aarch64/ucall.c
+@@ -73,20 +73,19 @@ void ucall_uninit(struct kvm_vm *vm)
+ 
+ void ucall(uint64_t cmd, int nargs, ...)
+ {
+-	struct ucall uc = {
+-		.cmd = cmd,
+-	};
++	struct ucall uc = {};
+ 	va_list va;
+ 	int i;
+ 
++	WRITE_ONCE(uc.cmd, cmd);
+ 	nargs = nargs <= UCALL_MAX_ARGS ? nargs : UCALL_MAX_ARGS;
+ 
+ 	va_start(va, nargs);
+ 	for (i = 0; i < nargs; ++i)
+-		uc.args[i] = va_arg(va, uint64_t);
++		WRITE_ONCE(uc.args[i], va_arg(va, uint64_t));
+ 	va_end(va);
+ 
+-	*ucall_exit_mmio_addr = (vm_vaddr_t)&uc;
++	WRITE_ONCE(*ucall_exit_mmio_addr, (vm_vaddr_t)&uc);
+ }
+ 
+ uint64_t get_ucall(struct kvm_vm *vm, uint32_t vcpu_id, struct ucall *uc)
+-- 
+2.35.1
+
diff --git a/queue-5.18/series b/queue-5.18/series
index 377329a7ca1..f1c221207fe 100644
--- a/queue-5.18/series
+++ b/queue-5.18/series
@@ -5,3 +5,14 @@ acpi-video-force-backlight-native-for-some-tongfang-devices.patch
 acpi-video-shortening-quirk-list-by-identifying-clevo-by-board_name-only.patch
 acpi-apei-better-fix-to-avoid-spamming-the-console-with-old-error-logs.patch
 crypto-arm64-poly1305-fix-a-read-out-of-bound.patch
+kvm-x86-do-not-report-a-vcpu-as-preempted-outside-in.patch
+kvm-x86-do-not-set-st-preempted-when-going-back-to-u.patch
+kvm-selftests-make-hyperv_clock-selftest-more-stable.patch
+kvm-x86-mmu-zap-non-leaf-sptes-when-disabling-dirty-.patch
+entry-kvm-exit-to-user-mode-when-tif_notify_signal-i.patch
+kvm-x86-disable-preemption-while-updating-apicv-inhi.patch
+kvm-x86-disable-preemption-around-the-call-to-kvm_ar.patch
+kvm-selftests-restrict-test-region-to-48-bit-physica.patch
+tools-kvm_stat-fix-display-of-error-when-multiple-pr.patch
+selftests-kvm-handle-compiler-optimizations-in-ucall.patch
+kvm-x86-svm-add-__gfp_account-to-__sev_dbg_-en-de-cr.patch
diff --git a/queue-5.18/tools-kvm_stat-fix-display-of-error-when-multiple-pr.patch b/queue-5.18/tools-kvm_stat-fix-display-of-error-when-multiple-pr.patch
new file mode 100644
index 00000000000..6934efc5478
--- /dev/null
+++ b/queue-5.18/tools-kvm_stat-fix-display-of-error-when-multiple-pr.patch
@@ -0,0 +1,64 @@
+From 10ae01db75615479483fc1f09f3970477727bee3 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 14 Jun 2022 15:11:41 +0300
+Subject: tools/kvm_stat: fix display of error when multiple processes are
+ found
+
+From: Dmitry Klochkov <kdmitry556@gmail.com>
+
+[ Upstream commit 933b5f9f98da29af646b51b36a0753692908ef64 ]
+
+Instead of printing an error message, the kvm_stat script fails when we
+restrict statistics to a guest by its name and there are multiple guests
+with that name:
+
+  # kvm_stat -g my_vm
+  Traceback (most recent call last):
+    File "/usr/bin/kvm_stat", line 1819, in <module>
+      main()
+    File "/usr/bin/kvm_stat", line 1779, in main
+      options = get_options()
+    File "/usr/bin/kvm_stat", line 1718, in get_options
+      options = argparser.parse_args()
+    File "/usr/lib64/python3.10/argparse.py", line 1825, in parse_args
+      args, argv = self.parse_known_args(args, namespace)
+    File "/usr/lib64/python3.10/argparse.py", line 1858, in parse_known_args
+      namespace, args = self._parse_known_args(args, namespace)
+    File "/usr/lib64/python3.10/argparse.py", line 2067, in _parse_known_args
+      start_index = consume_optional(start_index)
+    File "/usr/lib64/python3.10/argparse.py", line 2007, in consume_optional
+      take_action(action, args, option_string)
+    File "/usr/lib64/python3.10/argparse.py", line 1935, in take_action
+      action(self, namespace, argument_values, option_string)
+    File "/usr/bin/kvm_stat", line 1649, in __call__
+      ' to specify the desired pid'.format(" ".join(pids)))
+  TypeError: sequence item 0: expected str instance, int found
+
+To avoid this, convert the integer pid values to strings before
+passing them to join().
+
+Signed-off-by: Dmitry Klochkov <kdmitry556@gmail.com>
+Message-Id: <20220614121141.160689-1-kdmitry556@gmail.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ tools/kvm/kvm_stat/kvm_stat | 3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+diff --git a/tools/kvm/kvm_stat/kvm_stat b/tools/kvm/kvm_stat/kvm_stat
+index 5a5bd74f55bd..9c366b3a676d 100755
+--- a/tools/kvm/kvm_stat/kvm_stat
++++ b/tools/kvm/kvm_stat/kvm_stat
+@@ -1646,7 +1646,8 @@ Press any other key to refresh statistics immediately.
+                          .format(values))
+             if len(pids) > 1:
+                 sys.exit('Error: Multiple processes found (pids: {}). Use "-p"'
+-                         ' to specify the desired pid'.format(" ".join(pids)))
++                         ' to specify the desired pid'
++                         .format(" ".join(map(str, pids))))
+             namespace.pid = pids[0]
+ 
+     argparser = argparse.ArgumentParser(description=description_text,
+-- 
+2.35.1
+