From c63b9fb85d3d75ac074b46f22b75bbd68600b979 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Sat, 26 Aug 2023 18:48:20 +0200
Subject: [PATCH] 6.1-stable patches

added patches:
	io_uring-extract-a-io_msg_install_complete-helper.patch
	io_uring-get-rid-of-double-locking.patch
	io_uring-msg_ring-fix-missing-lock-on-overflow-for-iopoll.patch
	io_uring-msg_ring-move-double-lock-unlock-helpers-higher-up.patch
	kvm-x86-mmu-fix-an-sign-extension-bug-with-mmu_seq-that-hangs-vcpus.patch
	kvm-x86-preserve-tdp-mmu-roots-until-they-are-explicitly-invalidated.patch
---
 ...act-a-io_msg_install_complete-helper.patch |  91 +++++++
 .../io_uring-get-rid-of-double-locking.patch  | 194 +++++++++++++
 ...-missing-lock-on-overflow-for-iopoll.patch |  68 +++++
 ...double-lock-unlock-helpers-higher-up.patch |  90 ++++++
 ...on-bug-with-mmu_seq-that-hangs-vcpus.patch |  72 +++++
 ...ntil-they-are-explicitly-invalidated.patch | 257 ++++++++++++++++++
 queue-6.1/series                              |   6 +
 7 files changed, 778 insertions(+)
 create mode 100644 queue-6.1/io_uring-extract-a-io_msg_install_complete-helper.patch
 create mode 100644 queue-6.1/io_uring-get-rid-of-double-locking.patch
 create mode 100644 queue-6.1/io_uring-msg_ring-fix-missing-lock-on-overflow-for-iopoll.patch
 create mode 100644 queue-6.1/io_uring-msg_ring-move-double-lock-unlock-helpers-higher-up.patch
 create mode 100644 queue-6.1/kvm-x86-mmu-fix-an-sign-extension-bug-with-mmu_seq-that-hangs-vcpus.patch
 create mode 100644 queue-6.1/kvm-x86-preserve-tdp-mmu-roots-until-they-are-explicitly-invalidated.patch

diff --git a/queue-6.1/io_uring-extract-a-io_msg_install_complete-helper.patch b/queue-6.1/io_uring-extract-a-io_msg_install_complete-helper.patch
new file mode 100644
index 00000000000..6bfb2d206fc
--- /dev/null
+++ b/queue-6.1/io_uring-extract-a-io_msg_install_complete-helper.patch
@@ -0,0 +1,91 @@
+From ef7b782465bd6943fbcacb4af4fbe790137a5820 Mon Sep 17 00:00:00 2001
+From: Pavel Begunkov <asml.silence@gmail.com>
+Date: Wed, 7 Dec 2022 03:53:35 +0000
+Subject: io_uring: extract a io_msg_install_complete helper
+
+From: Pavel Begunkov <asml.silence@gmail.com>
+
+Commit 172113101641cf1f9628c528ec790cb809f2b704 upstream.
+
+Extract a helper called io_msg_install_complete() from io_msg_send_fd(),
+will be used later.
+
+Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
+Link: https://lore.kernel.org/r/1500ca1054cc4286a3ee1c60aacead57fcdfa02a.1670384893.git.asml.silence@gmail.com
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ io_uring/msg_ring.c |   38 +++++++++++++++++++++-----------------
+ 1 file changed, 21 insertions(+), 17 deletions(-)
+
+--- a/io_uring/msg_ring.c
++++ b/io_uring/msg_ring.c
+@@ -94,40 +94,25 @@ static struct file *io_msg_grab_file(str
+ 	return file;
+ }
+ 
+-static int io_msg_send_fd(struct io_kiocb *req, unsigned int issue_flags)
++static int io_msg_install_complete(struct io_kiocb *req, unsigned int issue_flags)
+ {
+ 	struct io_ring_ctx *target_ctx = req->file->private_data;
+ 	struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg);
+-	struct io_ring_ctx *ctx = req->ctx;
+ 	struct file *src_file = msg->src_file;
+ 	int ret;
+ 
+-	if (msg->len)
+-		return -EINVAL;
+-	if (target_ctx == ctx)
+-		return -EINVAL;
+-	if (target_ctx->flags & IORING_SETUP_R_DISABLED)
+-		return -EBADFD;
+-	if (!src_file) {
+-		src_file = io_msg_grab_file(req, issue_flags);
+-		if (!src_file)
+-			return -EBADF;
+-		msg->src_file = src_file;
+-		req->flags |= REQ_F_NEED_CLEANUP;
+-	}
+-
+ 	if (unlikely(io_double_lock_ctx(target_ctx, issue_flags)))
+ 		return -EAGAIN;
+ 
+ 	ret = __io_fixed_fd_install(target_ctx, src_file, msg->dst_fd);
+ 	if (ret < 0)
+ 		goto out_unlock;
++
+ 	msg->src_file = NULL;
+ 	req->flags &= ~REQ_F_NEED_CLEANUP;
+ 
+ 	if (msg->flags & IORING_MSG_RING_CQE_SKIP)
+ 		goto out_unlock;
+-
+ 	/*
+ 	 * If this fails, the target still received the file descriptor but
+ 	 * wasn't notified of the fact. This means that if this request
+@@ -141,6 +126,25 @@ out_unlock:
+ 	return ret;
+ }
+ 
++static int io_msg_send_fd(struct io_kiocb *req, unsigned int issue_flags)
++{
++	struct io_ring_ctx *target_ctx = req->file->private_data;
++	struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg);
++	struct io_ring_ctx *ctx = req->ctx;
++	struct file *src_file = msg->src_file;
++
++	if (target_ctx == ctx)
++		return -EINVAL;
++	if (!src_file) {
++		src_file = io_msg_grab_file(req, issue_flags);
++		if (!src_file)
++			return -EBADF;
++		msg->src_file = src_file;
++		req->flags |= REQ_F_NEED_CLEANUP;
++	}
++	return io_msg_install_complete(req, issue_flags);
++}
++
+ int io_msg_ring_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+ {
+ 	struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg);
diff --git a/queue-6.1/io_uring-get-rid-of-double-locking.patch b/queue-6.1/io_uring-get-rid-of-double-locking.patch
new file mode 100644
index 00000000000..8435bf17520
--- /dev/null
+++ b/queue-6.1/io_uring-get-rid-of-double-locking.patch
@@ -0,0 +1,194 @@
+From f10ba483becae0b8c7de0e9fe3e0f6d07a405b7a Mon Sep 17 00:00:00 2001
+From: Pavel Begunkov <asml.silence@gmail.com>
+Date: Wed, 7 Dec 2022 03:53:34 +0000
+Subject: io_uring: get rid of double locking
+
+From: Pavel Begunkov <asml.silence@gmail.com>
+
+Commit 11373026f2960390d5e330df4e92735c4265c440 upstream.
+
+We don't need to take both uring_locks at once, msg_ring can be split in
+two parts, first getting a file from the filetable of the first ring and
+then installing it into the second one.
+
+Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
+Link: https://lore.kernel.org/r/a80ecc2bc99c3b3f2cf20015d618b7c51419a797.1670384893.git.asml.silence@gmail.com
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ io_uring/msg_ring.c |   85 +++++++++++++++++++++++++++++-----------------------
+ io_uring/msg_ring.h |    1 
+ io_uring/opdef.c    |    1 
+ 3 files changed, 51 insertions(+), 36 deletions(-)
+
+--- a/io_uring/msg_ring.c
++++ b/io_uring/msg_ring.c
+@@ -15,6 +15,7 @@
+ 
+ struct io_msg {
+ 	struct file			*file;
++	struct file			*src_file;
+ 	u64 user_data;
+ 	u32 len;
+ 	u32 cmd;
+@@ -23,6 +24,17 @@ struct io_msg {
+ 	u32 flags;
+ };
+ 
++void io_msg_ring_cleanup(struct io_kiocb *req)
++{
++	struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg);
++
++	if (WARN_ON_ONCE(!msg->src_file))
++		return;
++
++	fput(msg->src_file);
++	msg->src_file = NULL;
++}
++
+ static int io_msg_ring_data(struct io_kiocb *req)
+ {
+ 	struct io_ring_ctx *target_ctx = req->file->private_data;
+@@ -39,17 +51,13 @@ static int io_msg_ring_data(struct io_ki
+ 	return -EOVERFLOW;
+ }
+ 
+-static void io_double_unlock_ctx(struct io_ring_ctx *ctx,
+-				 struct io_ring_ctx *octx,
++static void io_double_unlock_ctx(struct io_ring_ctx *octx,
+ 				 unsigned int issue_flags)
+ {
+-	if (issue_flags & IO_URING_F_UNLOCKED)
+-		mutex_unlock(&ctx->uring_lock);
+ 	mutex_unlock(&octx->uring_lock);
+ }
+ 
+-static int io_double_lock_ctx(struct io_ring_ctx *ctx,
+-			      struct io_ring_ctx *octx,
++static int io_double_lock_ctx(struct io_ring_ctx *octx,
+ 			      unsigned int issue_flags)
+ {
+ 	/*
+@@ -62,17 +70,28 @@ static int io_double_lock_ctx(struct io_
+ 			return -EAGAIN;
+ 		return 0;
+ 	}
++	mutex_lock(&octx->uring_lock);
++	return 0;
++}
+ 
+-	/* Always grab smallest value ctx first. We know ctx != octx. */
+-	if (ctx < octx) {
+-		mutex_lock(&ctx->uring_lock);
+-		mutex_lock(&octx->uring_lock);
+-	} else {
+-		mutex_lock(&octx->uring_lock);
+-		mutex_lock(&ctx->uring_lock);
+-	}
++static struct file *io_msg_grab_file(struct io_kiocb *req, unsigned int issue_flags)
++{
++	struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg);
++	struct io_ring_ctx *ctx = req->ctx;
++	struct file *file = NULL;
++	unsigned long file_ptr;
++	int idx = msg->src_fd;
+ 
+-	return 0;
++	io_ring_submit_lock(ctx, issue_flags);
++	if (likely(idx < ctx->nr_user_files)) {
++		idx = array_index_nospec(idx, ctx->nr_user_files);
++		file_ptr = io_fixed_file_slot(&ctx->file_table, idx)->file_ptr;
++		file = (struct file *) (file_ptr & FFS_MASK);
++		if (file)
++			get_file(file);
++	}
++	io_ring_submit_unlock(ctx, issue_flags);
++	return file;
+ }
+ 
+ static int io_msg_send_fd(struct io_kiocb *req, unsigned int issue_flags)
+@@ -80,8 +99,7 @@ static int io_msg_send_fd(struct io_kioc
+ 	struct io_ring_ctx *target_ctx = req->file->private_data;
+ 	struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg);
+ 	struct io_ring_ctx *ctx = req->ctx;
+-	unsigned long file_ptr;
+-	struct file *src_file;
++	struct file *src_file = msg->src_file;
+ 	int ret;
+ 
+ 	if (msg->len)
+@@ -90,28 +108,22 @@ static int io_msg_send_fd(struct io_kioc
+ 		return -EINVAL;
+ 	if (target_ctx->flags & IORING_SETUP_R_DISABLED)
+ 		return -EBADFD;
++	if (!src_file) {
++		src_file = io_msg_grab_file(req, issue_flags);
++		if (!src_file)
++			return -EBADF;
++		msg->src_file = src_file;
++		req->flags |= REQ_F_NEED_CLEANUP;
++	}
+ 
+-	ret = io_double_lock_ctx(ctx, target_ctx, issue_flags);
+-	if (unlikely(ret))
+-		return ret;
+-
+-	ret = -EBADF;
+-	if (unlikely(msg->src_fd >= ctx->nr_user_files))
+-		goto out_unlock;
+-
+-	msg->src_fd = array_index_nospec(msg->src_fd, ctx->nr_user_files);
+-	file_ptr = io_fixed_file_slot(&ctx->file_table, msg->src_fd)->file_ptr;
+-	if (!file_ptr)
+-		goto out_unlock;
+-
+-	src_file = (struct file *) (file_ptr & FFS_MASK);
+-	get_file(src_file);
++	if (unlikely(io_double_lock_ctx(target_ctx, issue_flags)))
++		return -EAGAIN;
+ 
+ 	ret = __io_fixed_fd_install(target_ctx, src_file, msg->dst_fd);
+-	if (ret < 0) {
+-		fput(src_file);
++	if (ret < 0)
+ 		goto out_unlock;
+-	}
++	msg->src_file = NULL;
++	req->flags &= ~REQ_F_NEED_CLEANUP;
+ 
+ 	if (msg->flags & IORING_MSG_RING_CQE_SKIP)
+ 		goto out_unlock;
+@@ -125,7 +137,7 @@ static int io_msg_send_fd(struct io_kioc
+ 	if (!io_post_aux_cqe(target_ctx, msg->user_data, ret, 0, true))
+ 		ret = -EOVERFLOW;
+ out_unlock:
+-	io_double_unlock_ctx(ctx, target_ctx, issue_flags);
++	io_double_unlock_ctx(target_ctx, issue_flags);
+ 	return ret;
+ }
+ 
+@@ -136,6 +148,7 @@ int io_msg_ring_prep(struct io_kiocb *re
+ 	if (unlikely(sqe->buf_index || sqe->personality))
+ 		return -EINVAL;
+ 
++	msg->src_file = NULL;
+ 	msg->user_data = READ_ONCE(sqe->off);
+ 	msg->len = READ_ONCE(sqe->len);
+ 	msg->cmd = READ_ONCE(sqe->addr);
+--- a/io_uring/msg_ring.h
++++ b/io_uring/msg_ring.h
+@@ -2,3 +2,4 @@
+ 
+ int io_msg_ring_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
+ int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags);
++void io_msg_ring_cleanup(struct io_kiocb *req);
+--- a/io_uring/opdef.c
++++ b/io_uring/opdef.c
+@@ -445,6 +445,7 @@ const struct io_op_def io_op_defs[] = {
+ 		.name			= "MSG_RING",
+ 		.prep			= io_msg_ring_prep,
+ 		.issue			= io_msg_ring,
++		.cleanup		= io_msg_ring_cleanup,
+ 	},
+ 	[IORING_OP_FSETXATTR] = {
+ 		.needs_file = 1,
diff --git a/queue-6.1/io_uring-msg_ring-fix-missing-lock-on-overflow-for-iopoll.patch b/queue-6.1/io_uring-msg_ring-fix-missing-lock-on-overflow-for-iopoll.patch
new file mode 100644
index 00000000000..e717705cf64
--- /dev/null
+++ b/queue-6.1/io_uring-msg_ring-fix-missing-lock-on-overflow-for-iopoll.patch
@@ -0,0 +1,68 @@
+From c47db9ca937e232606060077bb09cff49e75dbb7 Mon Sep 17 00:00:00 2001
+From: Jens Axboe <axboe@kernel.dk>
+Date: Tue, 22 Aug 2023 18:00:02 -0600
+Subject: io_uring/msg_ring: fix missing lock on overflow for IOPOLL
+
+From: Jens Axboe <axboe@kernel.dk>
+
+Commit e12d7a46f65ae4b7d58a5e0c1cbfa825cf8d830d upstream.
+
+If the target ring is configured with IOPOLL, then we always need to hold
+the target ring uring_lock before posting CQEs. We could just grab it
+unconditionally, but since we don't expect many target rings to be of this
+type, make grabbing the uring_lock conditional on the ring type.
+
+Link: https://lore.kernel.org/io-uring/Y8krlYa52%2F0YGqkg@ip-172-31-85-199.ec2.internal/
+Reported-by: Xingyuan Mo <hdthky0@gmail.com>
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ io_uring/msg_ring.c |   20 +++++++++++++++-----
+ 1 file changed, 15 insertions(+), 5 deletions(-)
+
+--- a/io_uring/msg_ring.c
++++ b/io_uring/msg_ring.c
+@@ -57,20 +57,30 @@ void io_msg_ring_cleanup(struct io_kiocb
+ 	msg->src_file = NULL;
+ }
+ 
+-static int io_msg_ring_data(struct io_kiocb *req)
++static int io_msg_ring_data(struct io_kiocb *req, unsigned int issue_flags)
+ {
+ 	struct io_ring_ctx *target_ctx = req->file->private_data;
+ 	struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg);
++	int ret;
+ 
+ 	if (msg->src_fd || msg->dst_fd || msg->flags)
+ 		return -EINVAL;
+ 	if (target_ctx->flags & IORING_SETUP_R_DISABLED)
+ 		return -EBADFD;
+ 
+-	if (io_post_aux_cqe(target_ctx, msg->user_data, msg->len, 0, true))
+-		return 0;
++	ret = -EOVERFLOW;
++	if (target_ctx->flags & IORING_SETUP_IOPOLL) {
++		if (unlikely(io_double_lock_ctx(target_ctx, issue_flags)))
++			return -EAGAIN;
++		if (io_post_aux_cqe(target_ctx, msg->user_data, msg->len, 0, true))
++			ret = 0;
++		io_double_unlock_ctx(target_ctx);
++	} else {
++		if (io_post_aux_cqe(target_ctx, msg->user_data, msg->len, 0, true))
++			ret = 0;
++	}
+ 
+-	return -EOVERFLOW;
++	return ret;
+ }
+ 
+ static struct file *io_msg_grab_file(struct io_kiocb *req, unsigned int issue_flags)
+@@ -175,7 +185,7 @@ int io_msg_ring(struct io_kiocb *req, un
+ 
+ 	switch (msg->cmd) {
+ 	case IORING_MSG_DATA:
+-		ret = io_msg_ring_data(req);
++		ret = io_msg_ring_data(req, issue_flags);
+ 		break;
+ 	case IORING_MSG_SEND_FD:
+ 		ret = io_msg_send_fd(req, issue_flags);
diff --git a/queue-6.1/io_uring-msg_ring-move-double-lock-unlock-helpers-higher-up.patch b/queue-6.1/io_uring-msg_ring-move-double-lock-unlock-helpers-higher-up.patch
new file mode 100644
index 00000000000..9ab511558a3
--- /dev/null
+++ b/queue-6.1/io_uring-msg_ring-move-double-lock-unlock-helpers-higher-up.patch
@@ -0,0 +1,90 @@
+From 5c3ffa8ec64e0726dcbfe37b5d126701301011ed Mon Sep 17 00:00:00 2001
+From: Jens Axboe <axboe@kernel.dk>
+Date: Thu, 19 Jan 2023 09:01:27 -0700
+Subject: io_uring/msg_ring: move double lock/unlock helpers higher up
+
+From: Jens Axboe <axboe@kernel.dk>
+
+Commit 423d5081d0451faa59a707e57373801da5b40141 upstream.
+
+In preparation for needing them somewhere else, move them and get rid of
+the unused 'issue_flags' for the unlock side.
+
+No functional changes in this patch.
+
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ io_uring/msg_ring.c |   47 +++++++++++++++++++++++------------------------
+ 1 file changed, 23 insertions(+), 24 deletions(-)
+
+--- a/io_uring/msg_ring.c
++++ b/io_uring/msg_ring.c
+@@ -24,6 +24,28 @@ struct io_msg {
+ 	u32 flags;
+ };
+ 
++static void io_double_unlock_ctx(struct io_ring_ctx *octx)
++{
++	mutex_unlock(&octx->uring_lock);
++}
++
++static int io_double_lock_ctx(struct io_ring_ctx *octx,
++			      unsigned int issue_flags)
++{
++	/*
++	 * To ensure proper ordering between the two ctxs, we can only
++	 * attempt a trylock on the target. If that fails and we already have
++	 * the source ctx lock, punt to io-wq.
++	 */
++	if (!(issue_flags & IO_URING_F_UNLOCKED)) {
++		if (!mutex_trylock(&octx->uring_lock))
++			return -EAGAIN;
++		return 0;
++	}
++	mutex_lock(&octx->uring_lock);
++	return 0;
++}
++
+ void io_msg_ring_cleanup(struct io_kiocb *req)
+ {
+ 	struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg);
+@@ -51,29 +73,6 @@ static int io_msg_ring_data(struct io_ki
+ 	return -EOVERFLOW;
+ }
+ 
+-static void io_double_unlock_ctx(struct io_ring_ctx *octx,
+-				 unsigned int issue_flags)
+-{
+-	mutex_unlock(&octx->uring_lock);
+-}
+-
+-static int io_double_lock_ctx(struct io_ring_ctx *octx,
+-			      unsigned int issue_flags)
+-{
+-	/*
+-	 * To ensure proper ordering between the two ctxs, we can only
+-	 * attempt a trylock on the target. If that fails and we already have
+-	 * the source ctx lock, punt to io-wq.
+-	 */
+-	if (!(issue_flags & IO_URING_F_UNLOCKED)) {
+-		if (!mutex_trylock(&octx->uring_lock))
+-			return -EAGAIN;
+-		return 0;
+-	}
+-	mutex_lock(&octx->uring_lock);
+-	return 0;
+-}
+-
+ static struct file *io_msg_grab_file(struct io_kiocb *req, unsigned int issue_flags)
+ {
+ 	struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg);
+@@ -122,7 +121,7 @@ static int io_msg_install_complete(struc
+ 	if (!io_post_aux_cqe(target_ctx, msg->user_data, ret, 0, true))
+ 		ret = -EOVERFLOW;
+ out_unlock:
+-	io_double_unlock_ctx(target_ctx, issue_flags);
++	io_double_unlock_ctx(target_ctx);
+ 	return ret;
+ }
+ 
diff --git a/queue-6.1/kvm-x86-mmu-fix-an-sign-extension-bug-with-mmu_seq-that-hangs-vcpus.patch b/queue-6.1/kvm-x86-mmu-fix-an-sign-extension-bug-with-mmu_seq-that-hangs-vcpus.patch
new file mode 100644
index 00000000000..ec5c1b78f64
--- /dev/null
+++ b/queue-6.1/kvm-x86-mmu-fix-an-sign-extension-bug-with-mmu_seq-that-hangs-vcpus.patch
@@ -0,0 +1,72 @@
+From seanjc@google.com  Sat Aug 26 18:44:07 2023
+From: Sean Christopherson <seanjc@google.com>
+Date: Wed, 23 Aug 2023 18:01:04 -0700
+Subject: [PATCH 6.1] KVM: x86/mmu: Fix an sign-extension bug with mmu_seq that hangs vCPUs
+To: stable@vger.kernel.org, Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+Cc: Paolo Bonzini <pbonzini@redhat.com>, linux-kernel@vger.kernel.org
+Message-ID: <20230824010104.2714198-1-seanjc@google.com>
+
+From: Sean Christopherson <seanjc@google.com>
+
+Upstream commit ba6e3fe25543 ("KVM: x86/mmu: Grab mmu_invalidate_seq in
+kvm_faultin_pfn()") unknowingly fixed the bug in v6.3 when refactoring
+how KVM tracks the sequence counter snapshot.
+
+Take the vCPU's mmu_seq snapshot as an "unsigned long" instead of an "int"
+when checking to see if a page fault is stale, as the sequence count is
+stored as an "unsigned long" everywhere else in KVM.  This fixes a bug
+where KVM will effectively hang vCPUs due to always thinking page faults
+are stale, which results in KVM refusing to "fix" faults.
+
+mmu_invalidate_seq (nÃ©e mmu_notifier_seq) is a sequence counter used when
+KVM is handling page faults to detect if userspace mappings relevant to
+the guest were invalidated between snapshotting the counter and acquiring
+mmu_lock, i.e. to ensure that the userspace mapping KVM is using to
+resolve the page fault is fresh.  If KVM sees that the counter has
+changed, KVM simply resumes the guest without fixing the fault.
+
+What _should_ happen is that the source of the mmu_notifier invalidations
+eventually goes away, mmu_invalidate_seq becomes stable, and KVM can once
+again fix guest page fault(s).
+
+But for a long-lived VM and/or a VM that the host just doesn't particularly
+like, it's possible for a VM to be on the receiving end of 2 billion (with
+a B) mmu_notifier invalidations.  When that happens, bit 31 will be set in
+mmu_invalidate_seq.  This causes the value to be turned into a 32-bit
+negative value when implicitly cast to an "int" by is_page_fault_stale(),
+and then sign-extended into a 64-bit unsigned when the signed "int" is
+implicitly cast back to an "unsigned long" on the call to
+mmu_invalidate_retry_hva().
+
+As a result of the casting and sign-extension, given a sequence counter of
+e.g. 0x8002dc25, mmu_invalidate_retry_hva() ends up doing
+
+	if (0x8002dc25 != 0xffffffff8002dc25)
+
+and signals that the page fault is stale and needs to be retried even
+though the sequence counter is stable, and KVM effectively hangs any vCPU
+that takes a page fault (EPT violation or #NPF when TDP is enabled).
+
+Reported-by: Brian Rak <brak@vultr.com>
+Reported-by: Amaan Cheval <amaan.cheval@gmail.com>
+Reported-by: Eric Wheeler <kvm@lists.ewheeler.net>
+Closes: https://lore.kernel.org/all/f023d927-52aa-7e08-2ee5-59a2fbc65953@gameservers.com
+Fixes: a955cad84cda ("KVM: x86/mmu: Retry page fault if root is invalidated by memslot update")
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/mmu/mmu.c |    3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+--- a/arch/x86/kvm/mmu/mmu.c
++++ b/arch/x86/kvm/mmu/mmu.c
+@@ -4212,7 +4212,8 @@ static int kvm_faultin_pfn(struct kvm_vc
+  * root was invalidated by a memslot update or a relevant mmu_notifier fired.
+  */
+ static bool is_page_fault_stale(struct kvm_vcpu *vcpu,
+-				struct kvm_page_fault *fault, int mmu_seq)
++				struct kvm_page_fault *fault,
++				unsigned long mmu_seq)
+ {
+ 	struct kvm_mmu_page *sp = to_shadow_page(vcpu->arch.mmu->root.hpa);
+ 
diff --git a/queue-6.1/kvm-x86-preserve-tdp-mmu-roots-until-they-are-explicitly-invalidated.patch b/queue-6.1/kvm-x86-preserve-tdp-mmu-roots-until-they-are-explicitly-invalidated.patch
new file mode 100644
index 00000000000..2e234f8ee9c
--- /dev/null
+++ b/queue-6.1/kvm-x86-preserve-tdp-mmu-roots-until-they-are-explicitly-invalidated.patch
@@ -0,0 +1,257 @@
+From edbdb43fc96b11b3bfa531be306a1993d9fe89ec Mon Sep 17 00:00:00 2001
+From: Sean Christopherson <seanjc@google.com>
+Date: Wed, 26 Apr 2023 15:03:23 -0700
+Subject: KVM: x86: Preserve TDP MMU roots until they are explicitly invalidated
+
+From: Sean Christopherson <seanjc@google.com>
+
+commit edbdb43fc96b11b3bfa531be306a1993d9fe89ec upstream.
+
+Preserve TDP MMU roots until they are explicitly invalidated by gifting
+the TDP MMU itself a reference to a root when it is allocated.  Keeping a
+reference in the TDP MMU fixes a flaw where the TDP MMU exhibits terrible
+performance, and can potentially even soft-hang a vCPU, if a vCPU
+frequently unloads its roots, e.g. when KVM is emulating SMI+RSM.
+
+When KVM emulates something that invalidates _all_ TLB entries, e.g. SMI
+and RSM, KVM unloads all of the vCPUs roots (KVM keeps a small per-vCPU
+cache of previous roots).  Unloading roots is a simple way to ensure KVM
+flushes and synchronizes all roots for the vCPU, as KVM flushes and syncs
+when allocating a "new" root (from the vCPU's perspective).
+
+In the shadow MMU, KVM keeps track of all shadow pages, roots included, in
+a per-VM hash table.  Unloading a shadow MMU root just wipes it from the
+per-vCPU cache; the root is still tracked in the per-VM hash table.  When
+KVM loads a "new" root for the vCPU, KVM will find the old, unloaded root
+in the per-VM hash table.
+
+Unlike the shadow MMU, the TDP MMU doesn't track "inactive" roots in a
+per-VM structure, where "active" in this case means a root is either
+in-use or cached as a previous root by at least one vCPU.  When a TDP MMU
+root becomes inactive, i.e. the last vCPU reference to the root is put,
+KVM immediately frees the root (asterisk on "immediately" as the actual
+freeing may be done by a worker, but for all intents and purposes the root
+is gone).
+
+The TDP MMU behavior is especially problematic for 1-vCPU setups, as
+unloading all roots effectively frees all roots.  The issue is mitigated
+to some degree in multi-vCPU setups as a different vCPU usually holds a
+reference to an unloaded root and thus keeps the root alive, allowing the
+vCPU to reuse its old root after unloading (with a flush+sync).
+
+The TDP MMU flaw has been known for some time, as until very recently,
+KVM's handling of CR0.WP also triggered unloading of all roots.  The
+CR0.WP toggling scenario was eventually addressed by not unloading roots
+when _only_ CR0.WP is toggled, but such an approach doesn't Just Work
+for emulating SMM as KVM must emulate a full TLB flush on entry and exit
+to/from SMM.  Given that the shadow MMU plays nice with unloading roots
+at will, teaching the TDP MMU to do the same is far less complex than
+modifying KVM to track which roots need to be flushed before reuse.
+
+Note, preserving all possible TDP MMU roots is not a concern with respect
+to memory consumption.  Now that the role for direct MMUs doesn't include
+information about the guest, e.g. CR0.PG, CR0.WP, CR4.SMEP, etc., there
+are _at most_ six possible roots (where "guest_mode" here means L2):
+
+  1. 4-level !SMM !guest_mode
+  2. 4-level  SMM !guest_mode
+  3. 5-level !SMM !guest_mode
+  4. 5-level  SMM !guest_mode
+  5. 4-level !SMM guest_mode
+  6. 5-level !SMM guest_mode
+
+And because each vCPU can track 4 valid roots, a VM can already have all
+6 root combinations live at any given time.  Not to mention that, in
+practice, no sane VMM will advertise different guest.MAXPHYADDR values
+across vCPUs, i.e. KVM won't ever use both 4-level and 5-level roots for
+a single VM.  Furthermore, the vast majority of modern hypervisors will
+utilize EPT/NPT when available, thus the guest_mode=%true cases are also
+unlikely to be utilized.
+
+Reported-by: Jeremi Piotrowski <jpiotrowski@linux.microsoft.com>
+Link: https://lore.kernel.org/all/959c5bce-beb5-b463-7158-33fc4a4f910c@linux.microsoft.com
+Link: https://lkml.kernel.org/r/20220209170020.1775368-1-pbonzini%40redhat.com
+Link: https://lore.kernel.org/all/20230322013731.102955-1-minipli@grsecurity.net
+Link: https://lore.kernel.org/all/000000000000a0bc2b05f9dd7fab@google.com
+Link: https://lore.kernel.org/all/000000000000eca0b905fa0f7756@google.com
+Cc: Ben Gardon <bgardon@google.com>
+Cc: David Matlack <dmatlack@google.com>
+Cc: stable@vger.kernel.org
+Tested-by: Jeremi Piotrowski <jpiotrowski@linux.microsoft.com>
+Link: https://lore.kernel.org/r/20230426220323.3079789-1-seanjc@google.com
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/mmu/tdp_mmu.c |  121 ++++++++++++++++++++-------------------------
+ 1 file changed, 56 insertions(+), 65 deletions(-)
+
+--- a/arch/x86/kvm/mmu/tdp_mmu.c
++++ b/arch/x86/kvm/mmu/tdp_mmu.c
+@@ -51,7 +51,17 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *
+ 	if (!kvm->arch.tdp_mmu_enabled)
+ 		return;
+ 
+-	/* Also waits for any queued work items.  */
++	/*
++	 * Invalidate all roots, which besides the obvious, schedules all roots
++	 * for zapping and thus puts the TDP MMU's reference to each root, i.e.
++	 * ultimately frees all roots.
++	 */
++	kvm_tdp_mmu_invalidate_all_roots(kvm);
++
++	/*
++	 * Destroying a workqueue also first flushes the workqueue, i.e. no
++	 * need to invoke kvm_tdp_mmu_zap_invalidated_roots().
++	 */
+ 	destroy_workqueue(kvm->arch.tdp_mmu_zap_wq);
+ 
+ 	WARN_ON(!list_empty(&kvm->arch.tdp_mmu_pages));
+@@ -127,16 +137,6 @@ static void tdp_mmu_schedule_zap_root(st
+ 	queue_work(kvm->arch.tdp_mmu_zap_wq, &root->tdp_mmu_async_work);
+ }
+ 
+-static inline bool kvm_tdp_root_mark_invalid(struct kvm_mmu_page *page)
+-{
+-	union kvm_mmu_page_role role = page->role;
+-	role.invalid = true;
+-
+-	/* No need to use cmpxchg, only the invalid bit can change.  */
+-	role.word = xchg(&page->role.word, role.word);
+-	return role.invalid;
+-}
+-
+ void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
+ 			  bool shared)
+ {
+@@ -145,45 +145,12 @@ void kvm_tdp_mmu_put_root(struct kvm *kv
+ 	if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
+ 		return;
+ 
+-	WARN_ON(!root->tdp_mmu_page);
+-
+ 	/*
+-	 * The root now has refcount=0.  It is valid, but readers already
+-	 * cannot acquire a reference to it because kvm_tdp_mmu_get_root()
+-	 * rejects it.  This remains true for the rest of the execution
+-	 * of this function, because readers visit valid roots only
+-	 * (except for tdp_mmu_zap_root_work(), which however
+-	 * does not acquire any reference itself).
+-	 *
+-	 * Even though there are flows that need to visit all roots for
+-	 * correctness, they all take mmu_lock for write, so they cannot yet
+-	 * run concurrently. The same is true after kvm_tdp_root_mark_invalid,
+-	 * since the root still has refcount=0.
+-	 *
+-	 * However, tdp_mmu_zap_root can yield, and writers do not expect to
+-	 * see refcount=0 (see for example kvm_tdp_mmu_invalidate_all_roots()).
+-	 * So the root temporarily gets an extra reference, going to refcount=1
+-	 * while staying invalid.  Readers still cannot acquire any reference;
+-	 * but writers are now allowed to run if tdp_mmu_zap_root yields and
+-	 * they might take an extra reference if they themselves yield.
+-	 * Therefore, when the reference is given back by the worker,
+-	 * there is no guarantee that the refcount is still 1.  If not, whoever
+-	 * puts the last reference will free the page, but they will not have to
+-	 * zap the root because a root cannot go from invalid to valid.
++	 * The TDP MMU itself holds a reference to each root until the root is
++	 * explicitly invalidated, i.e. the final reference should be never be
++	 * put for a valid root.
+ 	 */
+-	if (!kvm_tdp_root_mark_invalid(root)) {
+-		refcount_set(&root->tdp_mmu_root_count, 1);
+-
+-		/*
+-		 * Zapping the root in a worker is not just "nice to have";
+-		 * it is required because kvm_tdp_mmu_invalidate_all_roots()
+-		 * skips already-invalid roots.  If kvm_tdp_mmu_put_root() did
+-		 * not add the root to the workqueue, kvm_tdp_mmu_zap_all_fast()
+-		 * might return with some roots not zapped yet.
+-		 */
+-		tdp_mmu_schedule_zap_root(kvm, root);
+-		return;
+-	}
++	KVM_BUG_ON(!is_tdp_mmu_page(root) || !root->role.invalid, kvm);
+ 
+ 	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
+ 	list_del_rcu(&root->link);
+@@ -329,7 +296,14 @@ hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(stru
+ 	root = tdp_mmu_alloc_sp(vcpu);
+ 	tdp_mmu_init_sp(root, NULL, 0, role);
+ 
+-	refcount_set(&root->tdp_mmu_root_count, 1);
++	/*
++	 * TDP MMU roots are kept until they are explicitly invalidated, either
++	 * by a memslot update or by the destruction of the VM.  Initialize the
++	 * refcount to two; one reference for the vCPU, and one reference for
++	 * the TDP MMU itself, which is held until the root is invalidated and
++	 * is ultimately put by tdp_mmu_zap_root_work().
++	 */
++	refcount_set(&root->tdp_mmu_root_count, 2);
+ 
+ 	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
+ 	list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots);
+@@ -1027,32 +1001,49 @@ void kvm_tdp_mmu_zap_invalidated_roots(s
+ /*
+  * Mark each TDP MMU root as invalid to prevent vCPUs from reusing a root that
+  * is about to be zapped, e.g. in response to a memslots update.  The actual
+- * zapping is performed asynchronously, so a reference is taken on all roots.
+- * Using a separate workqueue makes it easy to ensure that the destruction is
+- * performed before the "fast zap" completes, without keeping a separate list
+- * of invalidated roots; the list is effectively the list of work items in
+- * the workqueue.
+- *
+- * Get a reference even if the root is already invalid, the asynchronous worker
+- * assumes it was gifted a reference to the root it processes.  Because mmu_lock
+- * is held for write, it should be impossible to observe a root with zero refcount,
+- * i.e. the list of roots cannot be stale.
++ * zapping is performed asynchronously.  Using a separate workqueue makes it
++ * easy to ensure that the destruction is performed before the "fast zap"
++ * completes, without keeping a separate list of invalidated roots; the list is
++ * effectively the list of work items in the workqueue.
+  *
+- * This has essentially the same effect for the TDP MMU
+- * as updating mmu_valid_gen does for the shadow MMU.
++ * Note, the asynchronous worker is gifted the TDP MMU's reference.
++ * See kvm_tdp_mmu_get_vcpu_root_hpa().
+  */
+ void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)
+ {
+ 	struct kvm_mmu_page *root;
+ 
+-	lockdep_assert_held_write(&kvm->mmu_lock);
+-	list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) {
+-		if (!root->role.invalid &&
+-		    !WARN_ON_ONCE(!kvm_tdp_mmu_get_root(root))) {
++	/*
++	 * mmu_lock must be held for write to ensure that a root doesn't become
++	 * invalid while there are active readers (invalidating a root while
++	 * there are active readers may or may not be problematic in practice,
++	 * but it's uncharted territory and not supported).
++	 *
++	 * Waive the assertion if there are no users of @kvm, i.e. the VM is
++	 * being destroyed after all references have been put, or if no vCPUs
++	 * have been created (which means there are no roots), i.e. the VM is
++	 * being destroyed in an error path of KVM_CREATE_VM.
++	 */
++	if (IS_ENABLED(CONFIG_PROVE_LOCKING) &&
++	    refcount_read(&kvm->users_count) && kvm->created_vcpus)
++		lockdep_assert_held_write(&kvm->mmu_lock);
++
++	/*
++	 * As above, mmu_lock isn't held when destroying the VM!  There can't
++	 * be other references to @kvm, i.e. nothing else can invalidate roots
++	 * or be consuming roots, but walking the list of roots does need to be
++	 * guarded against roots being deleted by the asynchronous zap worker.
++	 */
++	rcu_read_lock();
++
++	list_for_each_entry_rcu(root, &kvm->arch.tdp_mmu_roots, link) {
++		if (!root->role.invalid) {
+ 			root->role.invalid = true;
+ 			tdp_mmu_schedule_zap_root(kvm, root);
+ 		}
+ 	}
++
++	rcu_read_unlock();
+ }
+ 
+ /*
diff --git a/queue-6.1/series b/queue-6.1/series
index 009eb82c099..4ec408f0c44 100644
--- a/queue-6.1/series
+++ b/queue-6.1/series
@@ -56,3 +56,9 @@ netfilter-nf_tables-flush-pending-destroy-work-befor.patch
 netfilter-nf_tables-fix-out-of-memory-error-handling.patch
 rtnetlink-reject-negative-ifindexes-in-rtm_newlink.patch
 bonding-fix-macvlan-over-alb-bond-support.patch
+kvm-x86-preserve-tdp-mmu-roots-until-they-are-explicitly-invalidated.patch
+kvm-x86-mmu-fix-an-sign-extension-bug-with-mmu_seq-that-hangs-vcpus.patch
+io_uring-get-rid-of-double-locking.patch
+io_uring-extract-a-io_msg_install_complete-helper.patch
+io_uring-msg_ring-move-double-lock-unlock-helpers-higher-up.patch
+io_uring-msg_ring-fix-missing-lock-on-overflow-for-iopoll.patch
-- 
2.47.3