--- /dev/null
+From ef7b782465bd6943fbcacb4af4fbe790137a5820 Mon Sep 17 00:00:00 2001
+From: Pavel Begunkov <asml.silence@gmail.com>
+Date: Wed, 7 Dec 2022 03:53:35 +0000
+Subject: io_uring: extract an io_msg_install_complete helper
+
+From: Pavel Begunkov <asml.silence@gmail.com>
+
+Commit 172113101641cf1f9628c528ec790cb809f2b704 upstream.
+
+Extract a helper called io_msg_install_complete() from io_msg_send_fd();
+it will be used later.
+
+Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
+Link: https://lore.kernel.org/r/1500ca1054cc4286a3ee1c60aacead57fcdfa02a.1670384893.git.asml.silence@gmail.com
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ io_uring/msg_ring.c | 38 +++++++++++++++++++++-----------------
+ 1 file changed, 21 insertions(+), 17 deletions(-)
+
+--- a/io_uring/msg_ring.c
++++ b/io_uring/msg_ring.c
+@@ -94,40 +94,25 @@ static struct file *io_msg_grab_file(str
+ return file;
+ }
+
+-static int io_msg_send_fd(struct io_kiocb *req, unsigned int issue_flags)
++static int io_msg_install_complete(struct io_kiocb *req, unsigned int issue_flags)
+ {
+ struct io_ring_ctx *target_ctx = req->file->private_data;
+ struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg);
+- struct io_ring_ctx *ctx = req->ctx;
+ struct file *src_file = msg->src_file;
+ int ret;
+
+- if (msg->len)
+- return -EINVAL;
+- if (target_ctx == ctx)
+- return -EINVAL;
+- if (target_ctx->flags & IORING_SETUP_R_DISABLED)
+- return -EBADFD;
+- if (!src_file) {
+- src_file = io_msg_grab_file(req, issue_flags);
+- if (!src_file)
+- return -EBADF;
+- msg->src_file = src_file;
+- req->flags |= REQ_F_NEED_CLEANUP;
+- }
+-
+ if (unlikely(io_double_lock_ctx(target_ctx, issue_flags)))
+ return -EAGAIN;
+
+ ret = __io_fixed_fd_install(target_ctx, src_file, msg->dst_fd);
+ if (ret < 0)
+ goto out_unlock;
++
+ msg->src_file = NULL;
+ req->flags &= ~REQ_F_NEED_CLEANUP;
+
+ if (msg->flags & IORING_MSG_RING_CQE_SKIP)
+ goto out_unlock;
+-
+ /*
+ * If this fails, the target still received the file descriptor but
+ * wasn't notified of the fact. This means that if this request
+@@ -141,6 +126,25 @@ out_unlock:
+ return ret;
+ }
+
++static int io_msg_send_fd(struct io_kiocb *req, unsigned int issue_flags)
++{
++ struct io_ring_ctx *target_ctx = req->file->private_data;
++ struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg);
++ struct io_ring_ctx *ctx = req->ctx;
++ struct file *src_file = msg->src_file;
++
++ if (target_ctx == ctx)
++ return -EINVAL;
++ if (!src_file) {
++ src_file = io_msg_grab_file(req, issue_flags);
++ if (!src_file)
++ return -EBADF;
++ msg->src_file = src_file;
++ req->flags |= REQ_F_NEED_CLEANUP;
++ }
++ return io_msg_install_complete(req, issue_flags);
++}
++
+ int io_msg_ring_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+ {
+ struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg);
--- /dev/null
+From f10ba483becae0b8c7de0e9fe3e0f6d07a405b7a Mon Sep 17 00:00:00 2001
+From: Pavel Begunkov <asml.silence@gmail.com>
+Date: Wed, 7 Dec 2022 03:53:34 +0000
+Subject: io_uring: get rid of double locking
+
+From: Pavel Begunkov <asml.silence@gmail.com>
+
+Commit 11373026f2960390d5e330df4e92735c4265c440 upstream.
+
+We don't need to take both uring_locks at once: msg_ring can be split into
+two parts, first getting a file from the filetable of the first ring and
+then installing it into the second one.
+
+Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
+Link: https://lore.kernel.org/r/a80ecc2bc99c3b3f2cf20015d618b7c51419a797.1670384893.git.asml.silence@gmail.com
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
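+
+As a rough userspace illustration of the split described above (take a
+reference under the source lock, drop it, then take only the target lock to
+install), here is a small pthread sketch with made-up names; it is not
+io_uring code, and it leaves out the trylock-or-punt handling the kernel
+still needs when the issuer already holds its own ring lock:
+
+	#include <pthread.h>
+	#include <stdio.h>
+
+	struct ring {
+		pthread_mutex_t lock;
+		void *slot;	/* one "file" per ring, for brevity */
+	};
+
+	/* Step 1: grab the object while holding only the source lock. */
+	static void *grab_locked(struct ring *src)
+	{
+		void *obj;
+
+		pthread_mutex_lock(&src->lock);
+		obj = src->slot;	/* the real code also takes a reference */
+		pthread_mutex_unlock(&src->lock);
+		return obj;
+	}
+
+	/* Step 2: install it while holding only the target lock. */
+	static void install_locked(struct ring *dst, void *obj)
+	{
+		pthread_mutex_lock(&dst->lock);
+		dst->slot = obj;
+		pthread_mutex_unlock(&dst->lock);
+	}
+
+	int main(void)
+	{
+		struct ring a = { PTHREAD_MUTEX_INITIALIZER, "fd" };
+		struct ring b = { PTHREAD_MUTEX_INITIALIZER, NULL };
+
+		/* The two locks are never held at the same time, so no
+		 * lock-ordering rules are needed between the rings. */
+		install_locked(&b, grab_locked(&a));
+		printf("installed: %s\n", (char *)b.slot);
+		return 0;
+	}
+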
+ io_uring/msg_ring.c | 85 +++++++++++++++++++++++++++++-----------------------
+ io_uring/msg_ring.h | 1
+ io_uring/opdef.c | 1
+ 3 files changed, 51 insertions(+), 36 deletions(-)
+
+--- a/io_uring/msg_ring.c
++++ b/io_uring/msg_ring.c
+@@ -15,6 +15,7 @@
+
+ struct io_msg {
+ struct file *file;
++ struct file *src_file;
+ u64 user_data;
+ u32 len;
+ u32 cmd;
+@@ -23,6 +24,17 @@ struct io_msg {
+ u32 flags;
+ };
+
++void io_msg_ring_cleanup(struct io_kiocb *req)
++{
++ struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg);
++
++ if (WARN_ON_ONCE(!msg->src_file))
++ return;
++
++ fput(msg->src_file);
++ msg->src_file = NULL;
++}
++
+ static int io_msg_ring_data(struct io_kiocb *req)
+ {
+ struct io_ring_ctx *target_ctx = req->file->private_data;
+@@ -39,17 +51,13 @@ static int io_msg_ring_data(struct io_ki
+ return -EOVERFLOW;
+ }
+
+-static void io_double_unlock_ctx(struct io_ring_ctx *ctx,
+- struct io_ring_ctx *octx,
++static void io_double_unlock_ctx(struct io_ring_ctx *octx,
+ unsigned int issue_flags)
+ {
+- if (issue_flags & IO_URING_F_UNLOCKED)
+- mutex_unlock(&ctx->uring_lock);
+ mutex_unlock(&octx->uring_lock);
+ }
+
+-static int io_double_lock_ctx(struct io_ring_ctx *ctx,
+- struct io_ring_ctx *octx,
++static int io_double_lock_ctx(struct io_ring_ctx *octx,
+ unsigned int issue_flags)
+ {
+ /*
+@@ -62,17 +70,28 @@ static int io_double_lock_ctx(struct io_
+ return -EAGAIN;
+ return 0;
+ }
++ mutex_lock(&octx->uring_lock);
++ return 0;
++}
+
+- /* Always grab smallest value ctx first. We know ctx != octx. */
+- if (ctx < octx) {
+- mutex_lock(&ctx->uring_lock);
+- mutex_lock(&octx->uring_lock);
+- } else {
+- mutex_lock(&octx->uring_lock);
+- mutex_lock(&ctx->uring_lock);
+- }
++static struct file *io_msg_grab_file(struct io_kiocb *req, unsigned int issue_flags)
++{
++ struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg);
++ struct io_ring_ctx *ctx = req->ctx;
++ struct file *file = NULL;
++ unsigned long file_ptr;
++ int idx = msg->src_fd;
+
+- return 0;
++ io_ring_submit_lock(ctx, issue_flags);
++ if (likely(idx < ctx->nr_user_files)) {
++ idx = array_index_nospec(idx, ctx->nr_user_files);
++ file_ptr = io_fixed_file_slot(&ctx->file_table, idx)->file_ptr;
++ file = (struct file *) (file_ptr & FFS_MASK);
++ if (file)
++ get_file(file);
++ }
++ io_ring_submit_unlock(ctx, issue_flags);
++ return file;
+ }
+
+ static int io_msg_send_fd(struct io_kiocb *req, unsigned int issue_flags)
+@@ -80,8 +99,7 @@ static int io_msg_send_fd(struct io_kioc
+ struct io_ring_ctx *target_ctx = req->file->private_data;
+ struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg);
+ struct io_ring_ctx *ctx = req->ctx;
+- unsigned long file_ptr;
+- struct file *src_file;
++ struct file *src_file = msg->src_file;
+ int ret;
+
+ if (msg->len)
+@@ -90,28 +108,22 @@ static int io_msg_send_fd(struct io_kioc
+ return -EINVAL;
+ if (target_ctx->flags & IORING_SETUP_R_DISABLED)
+ return -EBADFD;
++ if (!src_file) {
++ src_file = io_msg_grab_file(req, issue_flags);
++ if (!src_file)
++ return -EBADF;
++ msg->src_file = src_file;
++ req->flags |= REQ_F_NEED_CLEANUP;
++ }
+
+- ret = io_double_lock_ctx(ctx, target_ctx, issue_flags);
+- if (unlikely(ret))
+- return ret;
+-
+- ret = -EBADF;
+- if (unlikely(msg->src_fd >= ctx->nr_user_files))
+- goto out_unlock;
+-
+- msg->src_fd = array_index_nospec(msg->src_fd, ctx->nr_user_files);
+- file_ptr = io_fixed_file_slot(&ctx->file_table, msg->src_fd)->file_ptr;
+- if (!file_ptr)
+- goto out_unlock;
+-
+- src_file = (struct file *) (file_ptr & FFS_MASK);
+- get_file(src_file);
++ if (unlikely(io_double_lock_ctx(target_ctx, issue_flags)))
++ return -EAGAIN;
+
+ ret = __io_fixed_fd_install(target_ctx, src_file, msg->dst_fd);
+- if (ret < 0) {
+- fput(src_file);
++ if (ret < 0)
+ goto out_unlock;
+- }
++ msg->src_file = NULL;
++ req->flags &= ~REQ_F_NEED_CLEANUP;
+
+ if (msg->flags & IORING_MSG_RING_CQE_SKIP)
+ goto out_unlock;
+@@ -125,7 +137,7 @@ static int io_msg_send_fd(struct io_kioc
+ if (!io_post_aux_cqe(target_ctx, msg->user_data, ret, 0, true))
+ ret = -EOVERFLOW;
+ out_unlock:
+- io_double_unlock_ctx(ctx, target_ctx, issue_flags);
++ io_double_unlock_ctx(target_ctx, issue_flags);
+ return ret;
+ }
+
+@@ -136,6 +148,7 @@ int io_msg_ring_prep(struct io_kiocb *re
+ if (unlikely(sqe->buf_index || sqe->personality))
+ return -EINVAL;
+
++ msg->src_file = NULL;
+ msg->user_data = READ_ONCE(sqe->off);
+ msg->len = READ_ONCE(sqe->len);
+ msg->cmd = READ_ONCE(sqe->addr);
+--- a/io_uring/msg_ring.h
++++ b/io_uring/msg_ring.h
+@@ -2,3 +2,4 @@
+
+ int io_msg_ring_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
+ int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags);
++void io_msg_ring_cleanup(struct io_kiocb *req);
+--- a/io_uring/opdef.c
++++ b/io_uring/opdef.c
+@@ -445,6 +445,7 @@ const struct io_op_def io_op_defs[] = {
+ .name = "MSG_RING",
+ .prep = io_msg_ring_prep,
+ .issue = io_msg_ring,
++ .cleanup = io_msg_ring_cleanup,
+ },
+ [IORING_OP_FSETXATTR] = {
+ .needs_file = 1,
--- /dev/null
+From c47db9ca937e232606060077bb09cff49e75dbb7 Mon Sep 17 00:00:00 2001
+From: Jens Axboe <axboe@kernel.dk>
+Date: Tue, 22 Aug 2023 18:00:02 -0600
+Subject: io_uring/msg_ring: fix missing lock on overflow for IOPOLL
+
+From: Jens Axboe <axboe@kernel.dk>
+
+Commit e12d7a46f65ae4b7d58a5e0c1cbfa825cf8d830d upstream.
+
+If the target ring is configured with IOPOLL, then we always need to hold
+the target ring uring_lock before posting CQEs. We could just grab it
+unconditionally, but since we don't expect many target rings to be of this
+type, make grabbing the uring_lock conditional on the ring type.
+
+Link: https://lore.kernel.org/io-uring/Y8krlYa52%2F0YGqkg@ip-172-31-85-199.ec2.internal/
+Reported-by: Xingyuan Mo <hdthky0@gmail.com>
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
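+
+The conditional grab reuses the same trylock-or-punt rule as the fd-passing
+path: if the issuer may already hold its own ring lock, only try to take the
+target lock and return -EAGAIN so the request is retried from io-wq, where
+blocking is safe. A standalone sketch of that rule in plain pthreads, with
+invented names rather than the actual io_uring helpers:
+
+	#include <errno.h>
+	#include <pthread.h>
+	#include <stdio.h>
+
+	/* Roughly mirrors the io_double_lock_ctx() idea. */
+	static int lock_target(pthread_mutex_t *target, int own_lock_held)
+	{
+		if (own_lock_held) {
+			if (pthread_mutex_trylock(target) != 0)
+				return -EAGAIN;	/* punt to a worker */
+			return 0;
+		}
+		pthread_mutex_lock(target);	/* worker context: may block */
+		return 0;
+	}
+
+	int main(void)
+	{
+		pthread_mutex_t target = PTHREAD_MUTEX_INITIALIZER;
+		int ret;
+
+		/* Simulate the target lock being contended. */
+		pthread_mutex_lock(&target);
+		ret = lock_target(&target, 1);
+		printf("inline issue: %s\n", ret == -EAGAIN ? "punt" : "locked");
+		pthread_mutex_unlock(&target);
+
+		/* Retry from a "worker": blocking is fine, so just wait. */
+		ret = lock_target(&target, 0);
+		printf("worker retry: %s\n", ret ? "error" : "locked");
+		pthread_mutex_unlock(&target);
+		return 0;
+	}
+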
+ io_uring/msg_ring.c | 20 +++++++++++++++-----
+ 1 file changed, 15 insertions(+), 5 deletions(-)
+
+--- a/io_uring/msg_ring.c
++++ b/io_uring/msg_ring.c
+@@ -57,20 +57,30 @@ void io_msg_ring_cleanup(struct io_kiocb
+ msg->src_file = NULL;
+ }
+
+-static int io_msg_ring_data(struct io_kiocb *req)
++static int io_msg_ring_data(struct io_kiocb *req, unsigned int issue_flags)
+ {
+ struct io_ring_ctx *target_ctx = req->file->private_data;
+ struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg);
++ int ret;
+
+ if (msg->src_fd || msg->dst_fd || msg->flags)
+ return -EINVAL;
+ if (target_ctx->flags & IORING_SETUP_R_DISABLED)
+ return -EBADFD;
+
+- if (io_post_aux_cqe(target_ctx, msg->user_data, msg->len, 0, true))
+- return 0;
++ ret = -EOVERFLOW;
++ if (target_ctx->flags & IORING_SETUP_IOPOLL) {
++ if (unlikely(io_double_lock_ctx(target_ctx, issue_flags)))
++ return -EAGAIN;
++ if (io_post_aux_cqe(target_ctx, msg->user_data, msg->len, 0, true))
++ ret = 0;
++ io_double_unlock_ctx(target_ctx);
++ } else {
++ if (io_post_aux_cqe(target_ctx, msg->user_data, msg->len, 0, true))
++ ret = 0;
++ }
+
+- return -EOVERFLOW;
++ return ret;
+ }
+
+ static struct file *io_msg_grab_file(struct io_kiocb *req, unsigned int issue_flags)
+@@ -175,7 +185,7 @@ int io_msg_ring(struct io_kiocb *req, un
+
+ switch (msg->cmd) {
+ case IORING_MSG_DATA:
+- ret = io_msg_ring_data(req);
++ ret = io_msg_ring_data(req, issue_flags);
+ break;
+ case IORING_MSG_SEND_FD:
+ ret = io_msg_send_fd(req, issue_flags);
--- /dev/null
+From 5c3ffa8ec64e0726dcbfe37b5d126701301011ed Mon Sep 17 00:00:00 2001
+From: Jens Axboe <axboe@kernel.dk>
+Date: Thu, 19 Jan 2023 09:01:27 -0700
+Subject: io_uring/msg_ring: move double lock/unlock helpers higher up
+
+From: Jens Axboe <axboe@kernel.dk>
+
+Commit 423d5081d0451faa59a707e57373801da5b40141 upstream.
+
+In preparation for needing them somewhere else, move them and get rid of
+the unused 'issue_flags' for the unlock side.
+
+No functional changes in this patch.
+
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ io_uring/msg_ring.c | 47 +++++++++++++++++++++++------------------------
+ 1 file changed, 23 insertions(+), 24 deletions(-)
+
+--- a/io_uring/msg_ring.c
++++ b/io_uring/msg_ring.c
+@@ -24,6 +24,28 @@ struct io_msg {
+ u32 flags;
+ };
+
++static void io_double_unlock_ctx(struct io_ring_ctx *octx)
++{
++ mutex_unlock(&octx->uring_lock);
++}
++
++static int io_double_lock_ctx(struct io_ring_ctx *octx,
++ unsigned int issue_flags)
++{
++ /*
++ * To ensure proper ordering between the two ctxs, we can only
++ * attempt a trylock on the target. If that fails and we already have
++ * the source ctx lock, punt to io-wq.
++ */
++ if (!(issue_flags & IO_URING_F_UNLOCKED)) {
++ if (!mutex_trylock(&octx->uring_lock))
++ return -EAGAIN;
++ return 0;
++ }
++ mutex_lock(&octx->uring_lock);
++ return 0;
++}
++
+ void io_msg_ring_cleanup(struct io_kiocb *req)
+ {
+ struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg);
+@@ -51,29 +73,6 @@ static int io_msg_ring_data(struct io_ki
+ return -EOVERFLOW;
+ }
+
+-static void io_double_unlock_ctx(struct io_ring_ctx *octx,
+- unsigned int issue_flags)
+-{
+- mutex_unlock(&octx->uring_lock);
+-}
+-
+-static int io_double_lock_ctx(struct io_ring_ctx *octx,
+- unsigned int issue_flags)
+-{
+- /*
+- * To ensure proper ordering between the two ctxs, we can only
+- * attempt a trylock on the target. If that fails and we already have
+- * the source ctx lock, punt to io-wq.
+- */
+- if (!(issue_flags & IO_URING_F_UNLOCKED)) {
+- if (!mutex_trylock(&octx->uring_lock))
+- return -EAGAIN;
+- return 0;
+- }
+- mutex_lock(&octx->uring_lock);
+- return 0;
+-}
+-
+ static struct file *io_msg_grab_file(struct io_kiocb *req, unsigned int issue_flags)
+ {
+ struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg);
+@@ -122,7 +121,7 @@ static int io_msg_install_complete(struc
+ if (!io_post_aux_cqe(target_ctx, msg->user_data, ret, 0, true))
+ ret = -EOVERFLOW;
+ out_unlock:
+- io_double_unlock_ctx(target_ctx, issue_flags);
++ io_double_unlock_ctx(target_ctx);
+ return ret;
+ }
+
--- /dev/null
+From seanjc@google.com Sat Aug 26 18:44:07 2023
+From: Sean Christopherson <seanjc@google.com>
+Date: Wed, 23 Aug 2023 18:01:04 -0700
+Subject: [PATCH 6.1] KVM: x86/mmu: Fix a sign-extension bug with mmu_seq that hangs vCPUs
+To: stable@vger.kernel.org, Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+Cc: Paolo Bonzini <pbonzini@redhat.com>, linux-kernel@vger.kernel.org
+Message-ID: <20230824010104.2714198-1-seanjc@google.com>
+
+From: Sean Christopherson <seanjc@google.com>
+
+Upstream commit ba6e3fe25543 ("KVM: x86/mmu: Grab mmu_invalidate_seq in
+kvm_faultin_pfn()") unknowingly fixed the bug in v6.3 when refactoring
+how KVM tracks the sequence counter snapshot.
+
+Take the vCPU's mmu_seq snapshot as an "unsigned long" instead of an "int"
+when checking to see if a page fault is stale, as the sequence count is
+stored as an "unsigned long" everywhere else in KVM. This fixes a bug
+where KVM will effectively hang vCPUs due to always thinking page faults
+are stale, which results in KVM refusing to "fix" faults.
+
+mmu_invalidate_seq (née mmu_notifier_seq) is a sequence counter used when
+KVM is handling page faults to detect if userspace mappings relevant to
+the guest were invalidated between snapshotting the counter and acquiring
+mmu_lock, i.e. to ensure that the userspace mapping KVM is using to
+resolve the page fault is fresh. If KVM sees that the counter has
+changed, KVM simply resumes the guest without fixing the fault.
+
+What _should_ happen is that the source of the mmu_notifier invalidations
+eventually goes away, mmu_invalidate_seq becomes stable, and KVM can once
+again fix guest page fault(s).
+
+But for a long-lived VM and/or a VM that the host just doesn't particularly
+like, it's possible for a VM to be on the receiving end of 2 billion (with
+a B) mmu_notifier invalidations. When that happens, bit 31 will be set in
+mmu_invalidate_seq. This causes the value to be turned into a 32-bit
+negative value when implicitly cast to an "int" by is_page_fault_stale(),
+and then sign-extended into a 64-bit unsigned when the signed "int" is
+implicitly cast back to an "unsigned long" on the call to
+mmu_invalidate_retry_hva().
+
+As a result of the casting and sign-extension, given a sequence counter of
+e.g. 0x8002dc25, mmu_invalidate_retry_hva() ends up doing
+
+ if (0x8002dc25 != 0xffffffff8002dc25)
+
+and signals that the page fault is stale and needs to be retried even
+though the sequence counter is stable, and KVM effectively hangs any vCPU
+that takes a page fault (EPT violation or #NPF when TDP is enabled).
+
+Reported-by: Brian Rak <brak@vultr.com>
+Reported-by: Amaan Cheval <amaan.cheval@gmail.com>
+Reported-by: Eric Wheeler <kvm@lists.ewheeler.net>
+Closes: https://lore.kernel.org/all/f023d927-52aa-7e08-2ee5-59a2fbc65953@gameservers.com
+Fixes: a955cad84cda ("KVM: x86/mmu: Retry page fault if root is invalidated by memslot update")
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
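+
+The truncation and sign-extension described above can be reproduced in a few
+lines of ordinary userspace C on a 64-bit (LP64) build; the snippet below is
+only an illustration of the integer conversions, not KVM code, and retry()
+merely stands in for the snapshot comparison done by
+mmu_invalidate_retry_hva():
+
+	#include <stdio.h>
+
+	/* Stand-in for the snapshot check: both values as unsigned long. */
+	static int retry(unsigned long cur, unsigned long snapshot)
+	{
+		return cur != snapshot;
+	}
+
+	int main(void)
+	{
+		unsigned long mmu_seq = 0x8002dc25UL;	/* bit 31 set */
+
+		int bad = mmu_seq;		/* wraps to a negative int */
+		unsigned long good = mmu_seq;
+
+		/*
+		 * The int is sign-extended back to 0xffffffff8002dc25 when
+		 * passed to retry(), so the "stale" check never passes.
+		 */
+		printf("int snapshot:           retry=%d\n", retry(mmu_seq, bad));
+		printf("unsigned long snapshot: retry=%d\n", retry(mmu_seq, good));
+		return 0;
+	}
+
+This prints retry=1 for the int snapshot and retry=0 for the unsigned long
+one, matching the behaviour fixed by the hunk below.
+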
+ arch/x86/kvm/mmu/mmu.c | 3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+--- a/arch/x86/kvm/mmu/mmu.c
++++ b/arch/x86/kvm/mmu/mmu.c
+@@ -4212,7 +4212,8 @@ static int kvm_faultin_pfn(struct kvm_vc
+ * root was invalidated by a memslot update or a relevant mmu_notifier fired.
+ */
+ static bool is_page_fault_stale(struct kvm_vcpu *vcpu,
+- struct kvm_page_fault *fault, int mmu_seq)
++ struct kvm_page_fault *fault,
++ unsigned long mmu_seq)
+ {
+ struct kvm_mmu_page *sp = to_shadow_page(vcpu->arch.mmu->root.hpa);
+
--- /dev/null
+From edbdb43fc96b11b3bfa531be306a1993d9fe89ec Mon Sep 17 00:00:00 2001
+From: Sean Christopherson <seanjc@google.com>
+Date: Wed, 26 Apr 2023 15:03:23 -0700
+Subject: KVM: x86: Preserve TDP MMU roots until they are explicitly invalidated
+
+From: Sean Christopherson <seanjc@google.com>
+
+commit edbdb43fc96b11b3bfa531be306a1993d9fe89ec upstream.
+
+Preserve TDP MMU roots until they are explicitly invalidated by gifting
+the TDP MMU itself a reference to a root when it is allocated. Keeping a
+reference in the TDP MMU fixes a flaw where the TDP MMU exhibits terrible
+performance, and can potentially even soft-hang a vCPU, if a vCPU
+frequently unloads its roots, e.g. when KVM is emulating SMI+RSM.
+
+When KVM emulates something that invalidates _all_ TLB entries, e.g. SMI
+and RSM, KVM unloads all of the vCPU's roots (KVM keeps a small per-vCPU
+cache of previous roots). Unloading roots is a simple way to ensure KVM
+flushes and synchronizes all roots for the vCPU, as KVM flushes and syncs
+when allocating a "new" root (from the vCPU's perspective).
+
+In the shadow MMU, KVM keeps track of all shadow pages, roots included, in
+a per-VM hash table. Unloading a shadow MMU root just wipes it from the
+per-vCPU cache; the root is still tracked in the per-VM hash table. When
+KVM loads a "new" root for the vCPU, KVM will find the old, unloaded root
+in the per-VM hash table.
+
+Unlike the shadow MMU, the TDP MMU doesn't track "inactive" roots in a
+per-VM structure, where "active" in this case means a root is either
+in-use or cached as a previous root by at least one vCPU. When a TDP MMU
+root becomes inactive, i.e. the last vCPU reference to the root is put,
+KVM immediately frees the root (asterisk on "immediately" as the actual
+freeing may be done by a worker, but for all intents and purposes the root
+is gone).
+
+The TDP MMU behavior is especially problematic for 1-vCPU setups, as
+unloading all roots effectively frees all roots. The issue is mitigated
+to some degree in multi-vCPU setups as a different vCPU usually holds a
+reference to an unloaded root and thus keeps the root alive, allowing the
+vCPU to reuse its old root after unloading (with a flush+sync).
+
+The TDP MMU flaw has been known for some time, as until very recently,
+KVM's handling of CR0.WP also triggered unloading of all roots. The
+CR0.WP toggling scenario was eventually addressed by not unloading roots
+when _only_ CR0.WP is toggled, but such an approach doesn't Just Work
+for emulating SMM as KVM must emulate a full TLB flush on entry and exit
+to/from SMM. Given that the shadow MMU plays nice with unloading roots
+at will, teaching the TDP MMU to do the same is far less complex than
+modifying KVM to track which roots need to be flushed before reuse.
+
+Note, preserving all possible TDP MMU roots is not a concern with respect
+to memory consumption. Now that the role for direct MMUs doesn't include
+information about the guest, e.g. CR0.PG, CR0.WP, CR4.SMEP, etc., there
+are _at most_ six possible roots (where "guest_mode" here means L2):
+
+ 1. 4-level !SMM !guest_mode
+ 2. 4-level SMM !guest_mode
+ 3. 5-level !SMM !guest_mode
+ 4. 5-level SMM !guest_mode
+ 5. 4-level !SMM guest_mode
+ 6. 5-level !SMM guest_mode
+
+And because each vCPU can track 4 valid roots, a VM can already have all
+6 root combinations live at any given time. Not to mention that, in
+practice, no sane VMM will advertise different guest.MAXPHYADDR values
+across vCPUs, i.e. KVM won't ever use both 4-level and 5-level roots for
+a single VM. Furthermore, the vast majority of modern hypervisors will
+utilize EPT/NPT when available, thus the guest_mode=%true cases are also
+unlikely to be utilized.
+
+Reported-by: Jeremi Piotrowski <jpiotrowski@linux.microsoft.com>
+Link: https://lore.kernel.org/all/959c5bce-beb5-b463-7158-33fc4a4f910c@linux.microsoft.com
+Link: https://lkml.kernel.org/r/20220209170020.1775368-1-pbonzini%40redhat.com
+Link: https://lore.kernel.org/all/20230322013731.102955-1-minipli@grsecurity.net
+Link: https://lore.kernel.org/all/000000000000a0bc2b05f9dd7fab@google.com
+Link: https://lore.kernel.org/all/000000000000eca0b905fa0f7756@google.com
+Cc: Ben Gardon <bgardon@google.com>
+Cc: David Matlack <dmatlack@google.com>
+Cc: stable@vger.kernel.org
+Tested-by: Jeremi Piotrowski <jpiotrowski@linux.microsoft.com>
+Link: https://lore.kernel.org/r/20230426220323.3079789-1-seanjc@google.com
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/mmu/tdp_mmu.c | 121 ++++++++++++++++++++-------------------------
+ 1 file changed, 56 insertions(+), 65 deletions(-)
+
+--- a/arch/x86/kvm/mmu/tdp_mmu.c
++++ b/arch/x86/kvm/mmu/tdp_mmu.c
+@@ -51,7 +51,17 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *
+ if (!kvm->arch.tdp_mmu_enabled)
+ return;
+
+- /* Also waits for any queued work items. */
++ /*
++ * Invalidate all roots, which besides the obvious, schedules all roots
++ * for zapping and thus puts the TDP MMU's reference to each root, i.e.
++ * ultimately frees all roots.
++ */
++ kvm_tdp_mmu_invalidate_all_roots(kvm);
++
++ /*
++ * Destroying a workqueue also first flushes the workqueue, i.e. no
++ * need to invoke kvm_tdp_mmu_zap_invalidated_roots().
++ */
+ destroy_workqueue(kvm->arch.tdp_mmu_zap_wq);
+
+ WARN_ON(!list_empty(&kvm->arch.tdp_mmu_pages));
+@@ -127,16 +137,6 @@ static void tdp_mmu_schedule_zap_root(st
+ queue_work(kvm->arch.tdp_mmu_zap_wq, &root->tdp_mmu_async_work);
+ }
+
+-static inline bool kvm_tdp_root_mark_invalid(struct kvm_mmu_page *page)
+-{
+- union kvm_mmu_page_role role = page->role;
+- role.invalid = true;
+-
+- /* No need to use cmpxchg, only the invalid bit can change. */
+- role.word = xchg(&page->role.word, role.word);
+- return role.invalid;
+-}
+-
+ void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
+ bool shared)
+ {
+@@ -145,45 +145,12 @@ void kvm_tdp_mmu_put_root(struct kvm *kv
+ if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
+ return;
+
+- WARN_ON(!root->tdp_mmu_page);
+-
+ /*
+- * The root now has refcount=0. It is valid, but readers already
+- * cannot acquire a reference to it because kvm_tdp_mmu_get_root()
+- * rejects it. This remains true for the rest of the execution
+- * of this function, because readers visit valid roots only
+- * (except for tdp_mmu_zap_root_work(), which however
+- * does not acquire any reference itself).
+- *
+- * Even though there are flows that need to visit all roots for
+- * correctness, they all take mmu_lock for write, so they cannot yet
+- * run concurrently. The same is true after kvm_tdp_root_mark_invalid,
+- * since the root still has refcount=0.
+- *
+- * However, tdp_mmu_zap_root can yield, and writers do not expect to
+- * see refcount=0 (see for example kvm_tdp_mmu_invalidate_all_roots()).
+- * So the root temporarily gets an extra reference, going to refcount=1
+- * while staying invalid. Readers still cannot acquire any reference;
+- * but writers are now allowed to run if tdp_mmu_zap_root yields and
+- * they might take an extra reference if they themselves yield.
+- * Therefore, when the reference is given back by the worker,
+- * there is no guarantee that the refcount is still 1. If not, whoever
+- * puts the last reference will free the page, but they will not have to
+- * zap the root because a root cannot go from invalid to valid.
++ * The TDP MMU itself holds a reference to each root until the root is
++ * explicitly invalidated, i.e. the final reference should be never be
++ * put for a valid root.
+ */
+- if (!kvm_tdp_root_mark_invalid(root)) {
+- refcount_set(&root->tdp_mmu_root_count, 1);
+-
+- /*
+- * Zapping the root in a worker is not just "nice to have";
+- * it is required because kvm_tdp_mmu_invalidate_all_roots()
+- * skips already-invalid roots. If kvm_tdp_mmu_put_root() did
+- * not add the root to the workqueue, kvm_tdp_mmu_zap_all_fast()
+- * might return with some roots not zapped yet.
+- */
+- tdp_mmu_schedule_zap_root(kvm, root);
+- return;
+- }
++ KVM_BUG_ON(!is_tdp_mmu_page(root) || !root->role.invalid, kvm);
+
+ spin_lock(&kvm->arch.tdp_mmu_pages_lock);
+ list_del_rcu(&root->link);
+@@ -329,7 +296,14 @@ hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(stru
+ root = tdp_mmu_alloc_sp(vcpu);
+ tdp_mmu_init_sp(root, NULL, 0, role);
+
+- refcount_set(&root->tdp_mmu_root_count, 1);
++ /*
++ * TDP MMU roots are kept until they are explicitly invalidated, either
++ * by a memslot update or by the destruction of the VM. Initialize the
++ * refcount to two; one reference for the vCPU, and one reference for
++ * the TDP MMU itself, which is held until the root is invalidated and
++ * is ultimately put by tdp_mmu_zap_root_work().
++ */
++ refcount_set(&root->tdp_mmu_root_count, 2);
+
+ spin_lock(&kvm->arch.tdp_mmu_pages_lock);
+ list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots);
+@@ -1027,32 +1001,49 @@ void kvm_tdp_mmu_zap_invalidated_roots(s
+ /*
+ * Mark each TDP MMU root as invalid to prevent vCPUs from reusing a root that
+ * is about to be zapped, e.g. in response to a memslots update. The actual
+- * zapping is performed asynchronously, so a reference is taken on all roots.
+- * Using a separate workqueue makes it easy to ensure that the destruction is
+- * performed before the "fast zap" completes, without keeping a separate list
+- * of invalidated roots; the list is effectively the list of work items in
+- * the workqueue.
+- *
+- * Get a reference even if the root is already invalid, the asynchronous worker
+- * assumes it was gifted a reference to the root it processes. Because mmu_lock
+- * is held for write, it should be impossible to observe a root with zero refcount,
+- * i.e. the list of roots cannot be stale.
++ * zapping is performed asynchronously. Using a separate workqueue makes it
++ * easy to ensure that the destruction is performed before the "fast zap"
++ * completes, without keeping a separate list of invalidated roots; the list is
++ * effectively the list of work items in the workqueue.
+ *
+- * This has essentially the same effect for the TDP MMU
+- * as updating mmu_valid_gen does for the shadow MMU.
++ * Note, the asynchronous worker is gifted the TDP MMU's reference.
++ * See kvm_tdp_mmu_get_vcpu_root_hpa().
+ */
+ void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)
+ {
+ struct kvm_mmu_page *root;
+
+- lockdep_assert_held_write(&kvm->mmu_lock);
+- list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) {
+- if (!root->role.invalid &&
+- !WARN_ON_ONCE(!kvm_tdp_mmu_get_root(root))) {
++ /*
++ * mmu_lock must be held for write to ensure that a root doesn't become
++ * invalid while there are active readers (invalidating a root while
++ * there are active readers may or may not be problematic in practice,
++ * but it's uncharted territory and not supported).
++ *
++ * Waive the assertion if there are no users of @kvm, i.e. the VM is
++ * being destroyed after all references have been put, or if no vCPUs
++ * have been created (which means there are no roots), i.e. the VM is
++ * being destroyed in an error path of KVM_CREATE_VM.
++ */
++ if (IS_ENABLED(CONFIG_PROVE_LOCKING) &&
++ refcount_read(&kvm->users_count) && kvm->created_vcpus)
++ lockdep_assert_held_write(&kvm->mmu_lock);
++
++ /*
++ * As above, mmu_lock isn't held when destroying the VM! There can't
++ * be other references to @kvm, i.e. nothing else can invalidate roots
++ * or be consuming roots, but walking the list of roots does need to be
++ * guarded against roots being deleted by the asynchronous zap worker.
++ */
++ rcu_read_lock();
++
++ list_for_each_entry_rcu(root, &kvm->arch.tdp_mmu_roots, link) {
++ if (!root->role.invalid) {
+ root->role.invalid = true;
+ tdp_mmu_schedule_zap_root(kvm, root);
+ }
+ }
++
++ rcu_read_unlock();
+ }
+
+ /*
netfilter-nf_tables-fix-out-of-memory-error-handling.patch
rtnetlink-reject-negative-ifindexes-in-rtm_newlink.patch
bonding-fix-macvlan-over-alb-bond-support.patch
+kvm-x86-preserve-tdp-mmu-roots-until-they-are-explicitly-invalidated.patch
+kvm-x86-mmu-fix-an-sign-extension-bug-with-mmu_seq-that-hangs-vcpus.patch
+io_uring-get-rid-of-double-locking.patch
+io_uring-extract-a-io_msg_install_complete-helper.patch
+io_uring-msg_ring-move-double-lock-unlock-helpers-higher-up.patch
+io_uring-msg_ring-fix-missing-lock-on-overflow-for-iopoll.patch