--- /dev/null
+From 3899d94e3831ee07ea6821c032dc297aec80586a Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Christoph=20B=C3=B6hmwalder?=
+ <christoph.boehmwalder@linbit.com>
+Date: Wed, 3 May 2023 14:19:37 +0200
+Subject: drbd: correctly submit flush bio on barrier
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Christoph Böhmwalder <christoph.boehmwalder@linbit.com>
+
+commit 3899d94e3831ee07ea6821c032dc297aec80586a upstream.
+
+When we receive a flush command (or "barrier" in DRBD), we currently use
+a REQ_OP_FLUSH with the REQ_PREFLUSH flag set.
+
+The correct way to submit a flush bio is to use a REQ_OP_WRITE without
+any data and set the REQ_PREFLUSH flag.
+
+Since commit b4a6bb3a67aa ("block: add a sanity check for non-write
+flush/fua bios"), this triggers a warning in the block layer, but this
+has been broken for quite some time before that.
+
+So use the correct set of flags to actually make the flush happen.
+
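+A hedged aside (not part of the upstream commit): assuming the pre-5.18
+bio_alloc(gfp_mask, nr_iovecs) API used by this kernel, a data-less flush
+bio is typically built along these lines; "bdev" and "my_endio" are
+placeholders, not names taken from drbd:
+
+        struct bio *bio = bio_alloc(GFP_NOIO, 0);
+
+        /* A flush carries no payload: it is an empty write with PREFLUSH set. */
+        bio_set_dev(bio, bdev);
+        bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
+        bio->bi_end_io = my_endio;      /* caller-supplied completion handler */
+        submit_bio(bio);
+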
+Cc: Christoph Hellwig <hch@infradead.org>
+Cc: stable@vger.kernel.org
+Fixes: f9ff0da56437 ("drbd: allow parallel flushes for multi-volume resources")
+Reported-by: Thomas Voegtle <tv@lio96.de>
+Signed-off-by: Christoph Böhmwalder <christoph.boehmwalder@linbit.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Link: https://lore.kernel.org/r/20230503121937.17232-1-christoph.boehmwalder@linbit.com
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/block/drbd/drbd_receiver.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/drivers/block/drbd/drbd_receiver.c
++++ b/drivers/block/drbd/drbd_receiver.c
+@@ -1299,7 +1299,7 @@ static void submit_one_flush(struct drbd
+ bio_set_dev(bio, device->ldev->backing_bdev);
+ bio->bi_private = octx;
+ bio->bi_end_io = one_flush_endio;
+- bio->bi_opf = REQ_OP_FLUSH | REQ_PREFLUSH;
++ bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
+
+ device->flush_jif = jiffies;
+ set_bit(FLUSH_PENDING, &device->flags);
--- /dev/null
+From stable-owner@vger.kernel.org Wed May 10 20:16:36 2023
+From: Rishabh Bhatnagar <risbhat@amazon.com>
+Date: Wed, 10 May 2023 18:15:41 +0000
+Subject: KVM: Fix steal time asm constraints
+To: <gregkh@linuxfoundation.org>, <stable@vger.kernel.org>
+Cc: <lee@kernel.org>, <seanjc@google.com>, <kvm@vger.kernel.org>, <bp@alien8.de>, <mingo@redhat.com>, <tglx@linutronix.de>, <pbonzini@redhat.com>, <vkuznets@redhat.com>, <wanpengli@tencent.com>, <jmattson@google.com>, <joro@8bytes.org>, David Woodhouse <dwmw@amazon.co.uk>, kernel test robot <lkp@intel.com>, Rishabh Bhatnagar <risbhat@amazon.com>, Allen Pais <apais@linux.microsoft.com>
+Message-ID: <20230510181547.22451-4-risbhat@amazon.com>
+
+From: Rishabh Bhatnagar <risbhat@amazon.com>
+
+From: David Woodhouse <dwmw@amazon.co.uk>
+
+commit 964b7aa0b040bdc6ec1c543ee620cda3f8b4c68a upstream.
+
+In 64-bit mode, x86 instruction encoding allows us to use the low 8 bits
+of any GPR as an 8-bit operand. In 32-bit mode, however, we can only use
+the [abcd] registers. For those, GCC provides the "q" constraint instead
+of the less restrictive "r".
+
+Also fix st->preempted, which is an input/output operand rather than an
+input.
+
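+A hedged sketch of the difference (illustrative only, not from this patch;
+"claim_byte" is a made-up helper): on 32-bit x86 only %al/%bl/%cl/%dl can
+encode a byte operand, which "q" guarantees and "r" does not:
+
+        static u8 claim_byte(u8 *shared)
+        {
+                u8 val = 1;
+
+                /* "q": val must sit in a/b/c/d so the byte-sized xchgb can
+                 * be encoded on 32-bit builds.  The memory operand is both
+                 * read and written, hence "+m" rather than a plain input.
+                 */
+                asm volatile("xchgb %0, %1"
+                             : "+q" (val), "+m" (*shared));
+                return val;
+        }
+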
+Fixes: 7e2175ebd695 ("KVM: x86: Fix recording of guest steal time / preempted status")
+Reported-by: kernel test robot <lkp@intel.com>
+Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
+Message-Id: <89bf72db1b859990355f9c40713a34e0d2d86c98.camel@infradead.org>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Rishabh Bhatnagar <risbhat@amazon.com>
+Tested-by: Allen Pais <apais@linux.microsoft.com>
+Acked-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/x86.c | 6 +++---
+ 1 file changed, 3 insertions(+), 3 deletions(-)
+
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -3064,9 +3064,9 @@ static void record_steal_time(struct kvm
+ "xor %1, %1\n"
+ "2:\n"
+ _ASM_EXTABLE_UA(1b, 2b)
+- : "+r" (st_preempted),
+- "+&r" (err)
+- : "m" (st->preempted));
++ : "+q" (st_preempted),
++ "+&r" (err),
++ "+m" (st->preempted));
+ if (err)
+ goto out;
+
--- /dev/null
+From stable-owner@vger.kernel.org Wed May 10 20:16:28 2023
+From: Rishabh Bhatnagar <risbhat@amazon.com>
+Date: Wed, 10 May 2023 18:15:46 +0000
+Subject: KVM: x86: do not report preemption if the steal time cache is stale
+To: <gregkh@linuxfoundation.org>, <stable@vger.kernel.org>
+Cc: <lee@kernel.org>, <seanjc@google.com>, <kvm@vger.kernel.org>, <bp@alien8.de>, <mingo@redhat.com>, <tglx@linutronix.de>, <pbonzini@redhat.com>, <vkuznets@redhat.com>, <wanpengli@tencent.com>, <jmattson@google.com>, <joro@8bytes.org>, David Woodhouse <dwmw@amazon.co.uk>, Rishabh Bhatnagar <risbhat@amazon.com>, Allen Pais <apais@linux.microsoft.com>
+Message-ID: <20230510181547.22451-9-risbhat@amazon.com>
+
+From: Rishabh Bhatnagar <risbhat@amazon.com>
+
+From: Paolo Bonzini <pbonzini@redhat.com>
+
+commit c3c28d24d910a746b02f496d190e0e8c6560224b upstream.
+
+Commit 7e2175ebd695 ("KVM: x86: Fix recording of guest steal time
+/ preempted status", 2021-11-11) open coded the previous call to
+kvm_map_gfn, but in doing so it dropped the comparison between the cached
+guest physical address and the one in the MSR. This causes an incorrect
+cache hit if the guest modifies the steal time address while the memslots
+remain the same. This can happen with kexec, in which case the preempted
+bit is written at the address used by the old kernel instead of
+the new one.
+
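+For context, a hedged guest-side sketch (not part of this patch; "st" is
+assumed to be the guest's per-CPU steal time structure): the cached GPA is
+simply whatever the guest last wrote to MSR_KVM_STEAL_TIME, so a kexec'd
+kernel can move it while the host memslots stay identical, roughly as
+arch/x86/kernel/kvm.c does when registering steal time:
+
+        /* The low bits carry flags; the rest is the physical address of st.
+         * After kexec the new kernel repeats this with a different address,
+         * which is exactly the situation the gpa check below has to catch.
+         */
+        wrmsrl(MSR_KVM_STEAL_TIME, slow_virt_to_phys(st) | KVM_MSR_ENABLED);
+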
+Cc: David Woodhouse <dwmw@amazon.co.uk>
+Cc: stable@vger.kernel.org
+Fixes: 7e2175ebd695 ("KVM: x86: Fix recording of guest steal time / preempted status")
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Rishabh Bhatnagar <risbhat@amazon.com>
+Tested-by: Allen Pais <apais@linux.microsoft.com>
+Acked-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/x86.c | 2 ++
+ 1 file changed, 2 insertions(+)
+
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -4096,6 +4096,7 @@ static void kvm_steal_time_set_preempted
+ struct kvm_steal_time __user *st;
+ struct kvm_memslots *slots;
+ static const u8 preempted = KVM_VCPU_PREEMPTED;
++ gpa_t gpa = vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS;
+
+ /*
+ * The vCPU can be marked preempted if and only if the VM-Exit was on
+@@ -4123,6 +4124,7 @@ static void kvm_steal_time_set_preempted
+ slots = kvm_memslots(vcpu->kvm);
+
+ if (unlikely(slots->generation != ghc->generation ||
++ gpa != ghc->gpa ||
+ kvm_is_error_hva(ghc->hva) || !ghc->memslot))
+ return;
+
--- /dev/null
+From stable-owner@vger.kernel.org Wed May 10 20:16:31 2023
+From: Rishabh Bhatnagar <risbhat@amazon.com>
+Date: Wed, 10 May 2023 18:15:43 +0000
+Subject: KVM: x86: do not set st->preempted when going back to user space
+To: <gregkh@linuxfoundation.org>, <stable@vger.kernel.org>
+Cc: <lee@kernel.org>, <seanjc@google.com>, <kvm@vger.kernel.org>, <bp@alien8.de>, <mingo@redhat.com>, <tglx@linutronix.de>, <pbonzini@redhat.com>, <vkuznets@redhat.com>, <wanpengli@tencent.com>, <jmattson@google.com>, <joro@8bytes.org>, Rishabh Bhatnagar <risbhat@amazon.com>, Allen Pais <apais@linux.microsoft.com>
+Message-ID: <20230510181547.22451-6-risbhat@amazon.com>
+
+From: Rishabh Bhatnagar <risbhat@amazon.com>
+
+From: Paolo Bonzini <pbonzini@redhat.com>
+
+commit 54aa83c90198e68eee8b0850c749bc70efb548da upstream.
+
+Similar to the Xen path, only change the vCPU's reported state if the vCPU
+was actually preempted. The reason for KVM's behavior is that, for example,
+optimistic spinning might not be a good idea if the guest is doing repeated
+exits to userspace; however, it is confusing and unlikely to make a difference,
+because well-tuned guests will hardly ever exit KVM_RUN in the first place.
+
+Suggested-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+[risbhat@amazon.com: Don't check for xen msr as support is not available
+and skip the SEV-ES condition]
+Signed-off-by: Rishabh Bhatnagar <risbhat@amazon.com>
+Tested-by: Allen Pais <apais@linux.microsoft.com>
+Acked-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/x86.c | 18 ++++++++++--------
+ 1 file changed, 10 insertions(+), 8 deletions(-)
+
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -4139,16 +4139,18 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *
+ {
+ int idx;
+
+- if (vcpu->preempted)
++ if (vcpu->preempted) {
+ vcpu->arch.preempted_in_kernel = !kvm_x86_ops.get_cpl(vcpu);
+
+- /*
+- * kvm_memslots() will be called by
+- * kvm_write_guest_offset_cached() so take the srcu lock.
+- */
+- idx = srcu_read_lock(&vcpu->kvm->srcu);
+- kvm_steal_time_set_preempted(vcpu);
+- srcu_read_unlock(&vcpu->kvm->srcu, idx);
++ /*
++ * Take the srcu lock as memslots will be accessed to check the gfn
++ * cache generation against the memslots generation.
++ */
++ idx = srcu_read_lock(&vcpu->kvm->srcu);
++ kvm_steal_time_set_preempted(vcpu);
++ srcu_read_unlock(&vcpu->kvm->srcu, idx);
++ }
++
+ kvm_x86_ops.vcpu_put(vcpu);
+ vcpu->arch.last_host_tsc = rdtsc();
+ /*
--- /dev/null
+From stable-owner@vger.kernel.org Wed May 10 20:16:27 2023
+From: Rishabh Bhatnagar <risbhat@amazon.com>
+Date: Wed, 10 May 2023 18:15:39 +0000
+Subject: KVM: x86: Ensure PV TLB flush tracepoint reflects KVM behavior
+To: <gregkh@linuxfoundation.org>, <stable@vger.kernel.org>
+Cc: <lee@kernel.org>, <seanjc@google.com>, <kvm@vger.kernel.org>, <bp@alien8.de>, <mingo@redhat.com>, <tglx@linutronix.de>, <pbonzini@redhat.com>, <vkuznets@redhat.com>, <wanpengli@tencent.com>, <jmattson@google.com>, <joro@8bytes.org>, Lai Jiangshan <laijs@linux.alibaba.com>, Rishabh Bhatnagar <risbhat@amazon.com>, Allen Pais <apais@linux.microsoft.com>
+Message-ID: <20230510181547.22451-2-risbhat@amazon.com>
+
+From: Rishabh Bhatnagar <risbhat@amazon.com>
+
+From: Lai Jiangshan <laijs@linux.alibaba.com>
+
+commit af3511ff7fa2107d6410831f3d71030f5e8d2b25 upstream.
+
+In record_steal_time(), st->preempted is read twice, so
+trace_kvm_pv_tlb_flush() might report an inconsistent result if
+kvm_vcpu_flush_tlb_guest() sees a different st->preempted later.
+
+It is a very trivial problem that hardly causes actual harm, and it can be
+avoided by resetting and reading st->preempted atomically via xchg().
+
+Signed-off-by: Lai Jiangshan <laijs@linux.alibaba.com>
+
+Message-Id: <20210531174628.10265-1-jiangshanlai@gmail.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Rishabh Bhatnagar <risbhat@amazon.com>
+Tested-by: Allen Pais <apais@linux.microsoft.com>
+Acked-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/x86.c | 6 ++++--
+ 1 file changed, 4 insertions(+), 2 deletions(-)
+
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -3041,9 +3041,11 @@ static void record_steal_time(struct kvm
+ * expensive IPIs.
+ */
+ if (guest_pv_has(vcpu, KVM_FEATURE_PV_TLB_FLUSH)) {
++ u8 st_preempted = xchg(&st->preempted, 0);
++
+ trace_kvm_pv_tlb_flush(vcpu->vcpu_id,
+- st->preempted & KVM_VCPU_FLUSH_TLB);
+- if (xchg(&st->preempted, 0) & KVM_VCPU_FLUSH_TLB)
++ st_preempted & KVM_VCPU_FLUSH_TLB);
++ if (st_preempted & KVM_VCPU_FLUSH_TLB)
+ kvm_vcpu_flush_tlb_guest(vcpu);
+ } else {
+ st->preempted = 0;
--- /dev/null
+From stable-owner@vger.kernel.org Wed May 10 20:16:24 2023
+From: Rishabh Bhatnagar <risbhat@amazon.com>
+Date: Wed, 10 May 2023 18:15:40 +0000
+Subject: KVM: x86: Fix recording of guest steal time / preempted status
+To: <gregkh@linuxfoundation.org>, <stable@vger.kernel.org>
+Cc: <lee@kernel.org>, <seanjc@google.com>, <kvm@vger.kernel.org>, <bp@alien8.de>, <mingo@redhat.com>, <tglx@linutronix.de>, <pbonzini@redhat.com>, <vkuznets@redhat.com>, <wanpengli@tencent.com>, <jmattson@google.com>, <joro@8bytes.org>, David Woodhouse <dwmw2@infradead.org>, David Woodhouse <dwmw@amazon.co.uk>, Rishabh Bhatnagar <risbhat@amazon.com>, Allen Pais <apais@linux.microsoft.com>
+Message-ID: <20230510181547.22451-3-risbhat@amazon.com>
+
+From: Rishabh Bhatnagar <risbhat@amazon.com>
+
+From: David Woodhouse <dwmw2@infradead.org>
+
+commit 7e2175ebd695f17860c5bd4ad7616cce12ed4591 upstream.
+
+In commit b043138246a4 ("x86/KVM: Make sure KVM_VCPU_FLUSH_TLB flag is
+not missed") we switched to using a gfn_to_pfn_cache for accessing the
+guest steal time structure in order to allow for an atomic xchg of the
+preempted field. This has a couple of problems.
+
+Firstly, kvm_map_gfn() doesn't work at all for IOMEM pages when the
+atomic flag is set, which it is in kvm_steal_time_set_preempted(). So a
+guest vCPU using an IOMEM page for its steal time would never have its
+preempted field set.
+
+Secondly, the gfn_to_pfn_cache is not invalidated in all cases where it
+should have been. There are two stages to the GFN->PFN conversion;
+first the GFN is converted to a userspace HVA, and then that HVA is
+looked up in the process page tables to find the underlying host PFN.
+Correct invalidation of the latter would require being hooked up to the
+MMU notifiers, but that doesn't happen---so it just keeps mapping and
+unmapping the *wrong* PFN after the userspace page tables change.
+
+In the !IOMEM case at least the stale page *is* pinned all the time it's
+cached, so it won't be freed and reused by anyone else while still
+receiving the steal time updates. The map/unmap dance only takes care
+of the KVM administrivia such as marking the page dirty.
+
+Until the gfn_to_pfn cache handles the remapping automatically by
+integrating with the MMU notifiers, we might as well not get a
+kernel mapping of it, and use the perfectly serviceable userspace HVA
+that we already have. We just need to implement the atomic xchg on
+the userspace address with appropriate exception handling, which is
+fairly trivial.
+
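+For readers unfamiliar with the cache this patch switches to, a hedged
+sketch of how a gfn_to_hva_cache is resolved (the helper names are the
+existing KVM ones, but the fragment is illustrative rather than a quote of
+the patch):
+
+        struct gfn_to_hva_cache *ghc = &vcpu->arch.st.cache;
+        gpa_t gpa = vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS;
+
+        /* Stage 1 (GFN -> HVA) is cached and only redone when the memslot
+         * generation changes; stage 2 (HVA -> PFN) is left to the normal
+         * user-access machinery, so stale-PFN problems disappear.
+         */
+        if (kvm_memslots(vcpu->kvm)->generation != ghc->generation &&
+            kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, gpa, sizeof(struct kvm_steal_time)))
+                return;
+
+        /* ghc->hva is now an ordinary userspace address. */
+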
+Cc: stable@vger.kernel.org
+Fixes: b043138246a4 ("x86/KVM: Make sure KVM_VCPU_FLUSH_TLB flag is not missed")
+Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
+Message-Id: <3645b9b889dac6438394194bb5586a46b68d581f.camel@infradead.org>
+[I didn't entirely agree with David's assessment of the
+ usefulness of the gfn_to_pfn cache, and integrated the outcome
+ of the discussion in the above commit message. - Paolo]
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+[risbhat@amazon.com: Use the older mark_page_dirty_in_slot api without
+kvm argument]
+Signed-off-by: Rishabh Bhatnagar <risbhat@amazon.com>
+Tested-by: Allen Pais <apais@linux.microsoft.com>
+Acked-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/include/asm/kvm_host.h | 2
+ arch/x86/kvm/x86.c | 105 ++++++++++++++++++++++++++++------------
+ 2 files changed, 76 insertions(+), 31 deletions(-)
+
+--- a/arch/x86/include/asm/kvm_host.h
++++ b/arch/x86/include/asm/kvm_host.h
+@@ -664,7 +664,7 @@ struct kvm_vcpu_arch {
+ u8 preempted;
+ u64 msr_val;
+ u64 last_steal;
+- struct gfn_to_pfn_cache cache;
++ struct gfn_to_hva_cache cache;
+ } st;
+
+ u64 l1_tsc_offset;
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -3022,53 +3022,92 @@ static void kvm_vcpu_flush_tlb_guest(str
+
+ static void record_steal_time(struct kvm_vcpu *vcpu)
+ {
+- struct kvm_host_map map;
+- struct kvm_steal_time *st;
++ struct gfn_to_hva_cache *ghc = &vcpu->arch.st.cache;
++ struct kvm_steal_time __user *st;
++ struct kvm_memslots *slots;
++ u64 steal;
++ u32 version;
+
+ if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
+ return;
+
+- /* -EAGAIN is returned in atomic context so we can just return. */
+- if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT,
+- &map, &vcpu->arch.st.cache, false))
++ if (WARN_ON_ONCE(current->mm != vcpu->kvm->mm))
+ return;
+
+- st = map.hva +
+- offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS);
++ slots = kvm_memslots(vcpu->kvm);
++
++ if (unlikely(slots->generation != ghc->generation ||
++ kvm_is_error_hva(ghc->hva) || !ghc->memslot)) {
++ gfn_t gfn = vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS;
++
++ /* We rely on the fact that it fits in a single page. */
++ BUILD_BUG_ON((sizeof(*st) - 1) & KVM_STEAL_VALID_BITS);
++
++ if (kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, gfn, sizeof(*st)) ||
++ kvm_is_error_hva(ghc->hva) || !ghc->memslot)
++ return;
++ }
++
++ st = (struct kvm_steal_time __user *)ghc->hva;
++ if (!user_access_begin(st, sizeof(*st)))
++ return;
+
+ /*
+ * Doing a TLB flush here, on the guest's behalf, can avoid
+ * expensive IPIs.
+ */
+ if (guest_pv_has(vcpu, KVM_FEATURE_PV_TLB_FLUSH)) {
+- u8 st_preempted = xchg(&st->preempted, 0);
++ u8 st_preempted = 0;
++ int err = -EFAULT;
++
++ asm volatile("1: xchgb %0, %2\n"
++ "xor %1, %1\n"
++ "2:\n"
++ _ASM_EXTABLE_UA(1b, 2b)
++ : "+r" (st_preempted),
++ "+&r" (err)
++ : "m" (st->preempted));
++ if (err)
++ goto out;
++
++ user_access_end();
++
++ vcpu->arch.st.preempted = 0;
+
+ trace_kvm_pv_tlb_flush(vcpu->vcpu_id,
+ st_preempted & KVM_VCPU_FLUSH_TLB);
+ if (st_preempted & KVM_VCPU_FLUSH_TLB)
+ kvm_vcpu_flush_tlb_guest(vcpu);
++
++ if (!user_access_begin(st, sizeof(*st)))
++ goto dirty;
+ } else {
+- st->preempted = 0;
++ unsafe_put_user(0, &st->preempted, out);
++ vcpu->arch.st.preempted = 0;
+ }
+
+- vcpu->arch.st.preempted = 0;
+-
+- if (st->version & 1)
+- st->version += 1; /* first time write, random junk */
++ unsafe_get_user(version, &st->version, out);
++ if (version & 1)
++ version += 1; /* first time write, random junk */
+
+- st->version += 1;
++ version += 1;
++ unsafe_put_user(version, &st->version, out);
+
+ smp_wmb();
+
+- st->steal += current->sched_info.run_delay -
++ unsafe_get_user(steal, &st->steal, out);
++ steal += current->sched_info.run_delay -
+ vcpu->arch.st.last_steal;
+ vcpu->arch.st.last_steal = current->sched_info.run_delay;
++ unsafe_put_user(steal, &st->steal, out);
+
+- smp_wmb();
+-
+- st->version += 1;
++ version += 1;
++ unsafe_put_user(version, &st->version, out);
+
+- kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, false);
++ out:
++ user_access_end();
++ dirty:
++ mark_page_dirty_in_slot(ghc->memslot, gpa_to_gfn(ghc->gpa));
+ }
+
+ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+@@ -4053,8 +4092,10 @@ void kvm_arch_vcpu_load(struct kvm_vcpu
+
+ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
+ {
+- struct kvm_host_map map;
+- struct kvm_steal_time *st;
++ struct gfn_to_hva_cache *ghc = &vcpu->arch.st.cache;
++ struct kvm_steal_time __user *st;
++ struct kvm_memslots *slots;
++ static const u8 preempted = KVM_VCPU_PREEMPTED;
+
+ /*
+ * The vCPU can be marked preempted if and only if the VM-Exit was on
+@@ -4075,16 +4116,23 @@ static void kvm_steal_time_set_preempted
+ if (vcpu->arch.st.preempted)
+ return;
+
+- if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT, &map,
+- &vcpu->arch.st.cache, true))
++ /* This happens on process exit */
++ if (unlikely(current->mm != vcpu->kvm->mm))
+ return;
+
+- st = map.hva +
+- offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS);
++ slots = kvm_memslots(vcpu->kvm);
++
++ if (unlikely(slots->generation != ghc->generation ||
++ kvm_is_error_hva(ghc->hva) || !ghc->memslot))
++ return;
+
+- st->preempted = vcpu->arch.st.preempted = KVM_VCPU_PREEMPTED;
++ st = (struct kvm_steal_time __user *)ghc->hva;
++ BUILD_BUG_ON(sizeof(st->preempted) != sizeof(preempted));
+
+- kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, true);
++ if (!copy_to_user_nofault(&st->preempted, &preempted, sizeof(preempted)))
++ vcpu->arch.st.preempted = KVM_VCPU_PREEMPTED;
++
++ mark_page_dirty_in_slot(ghc->memslot, gpa_to_gfn(ghc->gpa));
+ }
+
+ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
+@@ -10266,11 +10314,8 @@ void kvm_arch_vcpu_postcreate(struct kvm
+
+ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
+ {
+- struct gfn_to_pfn_cache *cache = &vcpu->arch.st.cache;
+ int idx;
+
+- kvm_release_pfn(cache->pfn, cache->dirty, cache);
+-
+ kvmclock_reset(vcpu);
+
+ kvm_x86_ops.vcpu_free(vcpu);
--- /dev/null
+From stable-owner@vger.kernel.org Wed May 10 20:16:44 2023
+From: Rishabh Bhatnagar <risbhat@amazon.com>
+Date: Wed, 10 May 2023 18:15:47 +0000
+Subject: KVM: x86: move guest_pv_has out of user_access section
+To: <gregkh@linuxfoundation.org>, <stable@vger.kernel.org>
+Cc: <lee@kernel.org>, <seanjc@google.com>, <kvm@vger.kernel.org>, <bp@alien8.de>, <mingo@redhat.com>, <tglx@linutronix.de>, <pbonzini@redhat.com>, <vkuznets@redhat.com>, <wanpengli@tencent.com>, <jmattson@google.com>, <joro@8bytes.org>, Stephen Rothwell <sfr@canb.auug.org.au>, David Woodhouse <dwmw2@infradead.org>, "Rishabh Bhatnagar" <risbhat@amazon.com>, Allen Pais <apais@linux.microsoft.com>
+Message-ID: <20230510181547.22451-10-risbhat@amazon.com>
+
+From: Rishabh Bhatnagar <risbhat@amazon.com>
+
+From: Paolo Bonzini <pbonzini@redhat.com>
+
+commit 3e067fd8503d6205aa0c1c8f48f6b209c592d19c upstream.
+
+When UBSAN is enabled, the code emitted for the call to guest_pv_has
+includes a call to __ubsan_handle_load_invalid_value. objtool
+complains that this call happens with UACCESS enabled; to avoid
+the warning, pull the calls to user_access_begin into both arms
+of the "if" statement, after the check for guest_pv_has.
+
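+A hedged structural sketch of the rule being satisfied (not the resulting
+code itself; see the hunks below for that): any call that may be
+instrumented has to complete before the UACCESS window opens, and only
+unsafe_*_user() accessors plus plain inline code may run inside it:
+
+        if (guest_pv_has(vcpu, KVM_FEATURE_PV_TLB_FLUSH)) {
+                /* The instrumented call above has finished; it is now safe
+                 * to open the user-access window.
+                 */
+                if (!user_access_begin(st, sizeof(*st)))
+                        return;
+                unsafe_put_user(0, &st->preempted, out);
+out:
+                user_access_end();
+        }
+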
+Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
+Cc: David Woodhouse <dwmw2@infradead.org>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Rishabh Bhatnagar <risbhat@amazon.com>
+Tested-by: Allen Pais <apais@linux.microsoft.com>
+Acked-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/x86.c | 9 ++++++---
+ 1 file changed, 6 insertions(+), 3 deletions(-)
+
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -3049,9 +3049,6 @@ static void record_steal_time(struct kvm
+ }
+
+ st = (struct kvm_steal_time __user *)ghc->hva;
+- if (!user_access_begin(st, sizeof(*st)))
+- return;
+-
+ /*
+ * Doing a TLB flush here, on the guest's behalf, can avoid
+ * expensive IPIs.
+@@ -3060,6 +3057,9 @@ static void record_steal_time(struct kvm
+ u8 st_preempted = 0;
+ int err = -EFAULT;
+
++ if (!user_access_begin(st, sizeof(*st)))
++ return;
++
+ asm volatile("1: xchgb %0, %2\n"
+ "xor %1, %1\n"
+ "2:\n"
+@@ -3082,6 +3082,9 @@ static void record_steal_time(struct kvm
+ if (!user_access_begin(st, sizeof(*st)))
+ goto dirty;
+ } else {
++ if (!user_access_begin(st, sizeof(*st)))
++ return;
++
+ unsafe_put_user(0, &st->preempted, out);
+ vcpu->arch.st.preempted = 0;
+ }
--- /dev/null
+From stable-owner@vger.kernel.org Wed May 10 20:16:27 2023
+From: Rishabh Bhatnagar <risbhat@amazon.com>
+Date: Wed, 10 May 2023 18:15:42 +0000
+Subject: KVM: x86: Remove obsolete disabling of page faults in kvm_arch_vcpu_put()
+To: <gregkh@linuxfoundation.org>, <stable@vger.kernel.org>
+Cc: <lee@kernel.org>, <seanjc@google.com>, <kvm@vger.kernel.org>, <bp@alien8.de>, <mingo@redhat.com>, <tglx@linutronix.de>, <pbonzini@redhat.com>, <vkuznets@redhat.com>, <wanpengli@tencent.com>, <jmattson@google.com>, <joro@8bytes.org>, Rishabh Bhatnagar <risbhat@amazon.com>, Allen Pais <apais@linux.microsoft.com>
+Message-ID: <20230510181547.22451-5-risbhat@amazon.com>
+
+From: Rishabh Bhatnagar <risbhat@amazon.com>
+
+From: Sean Christopherson <seanjc@google.com>
+
+commit 19979fba9bfaeab427a8e106d915f0627c952828 upstream.
+
+Remove the disabling of page faults across kvm_steal_time_set_preempted()
+as KVM now accesses the steal time struct (shared with the guest) via a
+cached mapping (see commit b043138246a4, "x86/KVM: Make sure
+KVM_VCPU_FLUSH_TLB flag is not missed".) The cache lookup is flagged as
+atomic, thus it would be a bug if KVM tried to resolve a new pfn, i.e.
+we want the splat that would be reached via might_fault().
+
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Message-Id: <20210123000334.3123628-2-seanjc@google.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Rishabh Bhatnagar <risbhat@amazon.com>
+Tested-by: Allen Pais <apais@linux.microsoft.com>
+Acked-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/x86.c | 10 ----------
+ 1 file changed, 10 deletions(-)
+
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -4143,22 +4143,12 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *
+ vcpu->arch.preempted_in_kernel = !kvm_x86_ops.get_cpl(vcpu);
+
+ /*
+- * Disable page faults because we're in atomic context here.
+- * kvm_write_guest_offset_cached() would call might_fault()
+- * that relies on pagefault_disable() to tell if there's a
+- * bug. NOTE: the write to guest memory may not go through if
+- * during postcopy live migration or if there's heavy guest
+- * paging.
+- */
+- pagefault_disable();
+- /*
+ * kvm_memslots() will be called by
+ * kvm_write_guest_offset_cached() so take the srcu lock.
+ */
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
+ kvm_steal_time_set_preempted(vcpu);
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
+- pagefault_enable();
+ kvm_x86_ops.vcpu_put(vcpu);
+ vcpu->arch.last_host_tsc = rdtsc();
+ /*
--- /dev/null
+From stable-owner@vger.kernel.org Wed May 10 20:16:44 2023
+From: Rishabh Bhatnagar <risbhat@amazon.com>
+Date: Wed, 10 May 2023 18:15:45 +0000
+Subject: KVM: x86: revalidate steal time cache if MSR value changes
+To: <gregkh@linuxfoundation.org>, <stable@vger.kernel.org>
+Cc: <lee@kernel.org>, <seanjc@google.com>, <kvm@vger.kernel.org>, <bp@alien8.de>, <mingo@redhat.com>, <tglx@linutronix.de>, <pbonzini@redhat.com>, <vkuznets@redhat.com>, <wanpengli@tencent.com>, <jmattson@google.com>, <joro@8bytes.org>, Dave Young <ruyang@redhat.com>, Xiaoying Yan <yiyan@redhat.com>, "Dr . David Alan Gilbert" <dgilbert@redhat.com>, David Woodhouse <dwmw@amazon.co.uk>, Rishabh Bhatnagar <risbhat@amazon.com>, Allen Pais <apais@linux.microsoft.com>
+Message-ID: <20230510181547.22451-8-risbhat@amazon.com>
+
+From: Rishabh Bhatnagar <risbhat@amazon.com>
+
+From: Paolo Bonzini <pbonzini@redhat.com>
+
+commit 901d3765fa804ce42812f1d5b1f3de2dfbb26723 upstream.
+
+Commit 7e2175ebd695 ("KVM: x86: Fix recording of guest steal time
+/ preempted status", 2021-11-11) open coded the previous call to
+kvm_map_gfn, but in doing so it dropped the comparison between the cached
+guest physical address and the one in the MSR. This causes an incorrect
+cache hit if the guest modifies the steal time address while the memslots
+remain the same. This can happen with kexec, in which case the steal
+time data is written at the address used by the old kernel instead of
+the new one.
+
+While at it, rename the variable from gfn to gpa since it is a plain
+physical address and not a right-shifted one.
+
+Reported-by: Dave Young <ruyang@redhat.com>
+Reported-by: Xiaoying Yan <yiyan@redhat.com>
+Analyzed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
+Cc: David Woodhouse <dwmw@amazon.co.uk>
+Cc: stable@vger.kernel.org
+Fixes: 7e2175ebd695 ("KVM: x86: Fix recording of guest steal time / preempted status")
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Rishabh Bhatnagar <risbhat@amazon.com>
+Tested-by: Allen Pais <apais@linux.microsoft.com>
+Acked-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/x86.c | 6 +++---
+ 1 file changed, 3 insertions(+), 3 deletions(-)
+
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -3025,6 +3025,7 @@ static void record_steal_time(struct kvm
+ struct gfn_to_hva_cache *ghc = &vcpu->arch.st.cache;
+ struct kvm_steal_time __user *st;
+ struct kvm_memslots *slots;
++ gpa_t gpa = vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS;
+ u64 steal;
+ u32 version;
+
+@@ -3037,13 +3038,12 @@ static void record_steal_time(struct kvm
+ slots = kvm_memslots(vcpu->kvm);
+
+ if (unlikely(slots->generation != ghc->generation ||
++ gpa != ghc->gpa ||
+ kvm_is_error_hva(ghc->hva) || !ghc->memslot)) {
+- gfn_t gfn = vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS;
+-
+ /* We rely on the fact that it fits in a single page. */
+ BUILD_BUG_ON((sizeof(*st) - 1) & KVM_STEAL_VALID_BITS);
+
+- if (kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, gfn, sizeof(*st)) ||
++ if (kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, gpa, sizeof(*st)) ||
+ kvm_is_error_hva(ghc->hva) || !ghc->memslot)
+ return;
+ }
--- /dev/null
+From 146a37e05d620cef4ad430e5d1c9c077fe6fa76f Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= <ilpo.jarvinen@linux.intel.com>
+Date: Fri, 17 Mar 2023 13:33:18 +0200
+Subject: serial: 8250: Fix serial8250_tx_empty() race with DMA Tx
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
+
+commit 146a37e05d620cef4ad430e5d1c9c077fe6fa76f upstream.
+
+There's a potential race before THRE/TEMT deasserts when DMA Tx is
+starting up (or the next batch of continuous Tx is being submitted).
+This can lead to misdetecting Tx empty condition.
+
+It is entirely normal for THRE/TEMT to be set for some time after the
+DMA Tx has been set up in serial8250_tx_dma(). As the Tx side is definitely
+not empty at that point, it seems incorrect for serial8250_tx_empty()
+to claim Tx is empty.
+
+Fix the race by also checking in serial8250_tx_empty() whether there's
+DMA Tx active.
+
+Note: This fix only addresses the in-kernel race, mainly to make using
+TCSADRAIN/FLUSH robust. Userspace can still cause other races, but those
+appear to be userspace concurrency control problems.
+
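+A hedged userspace-side illustration (not part of the patch; fd and tio are
+assumed to be an open serial port descriptor and a populated struct
+termios): drain-style operations end up polling the driver's tx_empty()
+hook, so a spurious "transmitter empty" answer lets them return while DMA
+is still feeding the FIFO.
+
+        #include <termios.h>
+
+        /* Both calls wait for the driver to report an empty transmitter;
+         * with the race described above they could return too early.
+         */
+        tcdrain(fd);                     /* wait for all queued output to go out */
+        tcsetattr(fd, TCSADRAIN, &tio);  /* apply settings after draining */
+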
+Fixes: 9ee4b83e51f74 ("serial: 8250: Add support for dmaengine")
+Cc: stable@vger.kernel.org
+Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
+Link: https://lore.kernel.org/r/20230317113318.31327-3-ilpo.jarvinen@linux.intel.com
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/tty/serial/8250/8250.h | 12 ++++++++++++
+ drivers/tty/serial/8250/8250_port.c | 12 +++++++++---
+ 2 files changed, 21 insertions(+), 3 deletions(-)
+
+--- a/drivers/tty/serial/8250/8250.h
++++ b/drivers/tty/serial/8250/8250.h
+@@ -330,6 +330,13 @@ extern int serial8250_rx_dma(struct uart
+ extern void serial8250_rx_dma_flush(struct uart_8250_port *);
+ extern int serial8250_request_dma(struct uart_8250_port *);
+ extern void serial8250_release_dma(struct uart_8250_port *);
++
++static inline bool serial8250_tx_dma_running(struct uart_8250_port *p)
++{
++ struct uart_8250_dma *dma = p->dma;
++
++ return dma && dma->tx_running;
++}
+ #else
+ static inline int serial8250_tx_dma(struct uart_8250_port *p)
+ {
+@@ -345,6 +352,11 @@ static inline int serial8250_request_dma
+ return -1;
+ }
+ static inline void serial8250_release_dma(struct uart_8250_port *p) { }
++
++static inline bool serial8250_tx_dma_running(struct uart_8250_port *p)
++{
++ return false;
++}
+ #endif
+
+ static inline int ns16550a_goto_highspeed(struct uart_8250_port *up)
+--- a/drivers/tty/serial/8250/8250_port.c
++++ b/drivers/tty/serial/8250/8250_port.c
+@@ -1971,19 +1971,25 @@ static int serial8250_tx_threshold_handl
+ static unsigned int serial8250_tx_empty(struct uart_port *port)
+ {
+ struct uart_8250_port *up = up_to_u8250p(port);
++ unsigned int result = 0;
+ unsigned long flags;
+ unsigned int lsr;
+
+ serial8250_rpm_get(up);
+
+ spin_lock_irqsave(&port->lock, flags);
+- lsr = serial_port_in(port, UART_LSR);
+- up->lsr_saved_flags |= lsr & LSR_SAVE_FLAGS;
++ if (!serial8250_tx_dma_running(up)) {
++ lsr = serial_port_in(port, UART_LSR);
++ up->lsr_saved_flags |= lsr & LSR_SAVE_FLAGS;
++
++ if ((lsr & BOTH_EMPTY) == BOTH_EMPTY)
++ result = TIOCSER_TEMT;
++ }
+ spin_unlock_irqrestore(&port->lock, flags);
+
+ serial8250_rpm_put(up);
+
+- return (lsr & BOTH_EMPTY) == BOTH_EMPTY ? TIOCSER_TEMT : 0;
++ return result;
+ }
+
+ unsigned int serial8250_do_get_mctrl(struct uart_port *port)
ext4-bail-out-of-ext4_xattr_ibody_get-fails-for-any-reason.patch
ext4-remove-a-bug_on-in-ext4_mb_release_group_pa.patch
ext4-fix-invalid-free-tracking-in-ext4_xattr_move_to_block.patch
+serial-8250-fix-serial8250_tx_empty-race-with-dma-tx.patch
+drbd-correctly-submit-flush-bio-on-barrier.patch
+kvm-x86-ensure-pv-tlb-flush-tracepoint-reflects-kvm-behavior.patch
+kvm-x86-fix-recording-of-guest-steal-time-preempted-status.patch
+kvm-fix-steal-time-asm-constraints.patch
+kvm-x86-remove-obsolete-disabling-of-page-faults-in-kvm_arch_vcpu_put.patch
+kvm-x86-do-not-set-st-preempted-when-going-back-to-user-space.patch
+kvm-x86-revalidate-steal-time-cache-if-msr-value-changes.patch
+kvm-x86-do-not-report-preemption-if-the-steal-time-cache-is-stale.patch
+kvm-x86-move-guest_pv_has-out-of-user_access-section.patch