From c63b9fb85d3d75ac074b46f22b75bbd68600b979 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Sat, 26 Aug 2023 18:48:20 +0200 Subject: [PATCH] 6.1-stable patches added patches: io_uring-extract-a-io_msg_install_complete-helper.patch io_uring-get-rid-of-double-locking.patch io_uring-msg_ring-fix-missing-lock-on-overflow-for-iopoll.patch io_uring-msg_ring-move-double-lock-unlock-helpers-higher-up.patch kvm-x86-mmu-fix-an-sign-extension-bug-with-mmu_seq-that-hangs-vcpus.patch kvm-x86-preserve-tdp-mmu-roots-until-they-are-explicitly-invalidated.patch --- ...act-a-io_msg_install_complete-helper.patch | 91 +++++++ .../io_uring-get-rid-of-double-locking.patch | 194 +++++++++++++ ...-missing-lock-on-overflow-for-iopoll.patch | 68 +++++ ...double-lock-unlock-helpers-higher-up.patch | 90 ++++++ ...on-bug-with-mmu_seq-that-hangs-vcpus.patch | 72 +++++ ...ntil-they-are-explicitly-invalidated.patch | 257 ++++++++++++++++++ queue-6.1/series | 6 + 7 files changed, 778 insertions(+) create mode 100644 queue-6.1/io_uring-extract-a-io_msg_install_complete-helper.patch create mode 100644 queue-6.1/io_uring-get-rid-of-double-locking.patch create mode 100644 queue-6.1/io_uring-msg_ring-fix-missing-lock-on-overflow-for-iopoll.patch create mode 100644 queue-6.1/io_uring-msg_ring-move-double-lock-unlock-helpers-higher-up.patch create mode 100644 queue-6.1/kvm-x86-mmu-fix-an-sign-extension-bug-with-mmu_seq-that-hangs-vcpus.patch create mode 100644 queue-6.1/kvm-x86-preserve-tdp-mmu-roots-until-they-are-explicitly-invalidated.patch diff --git a/queue-6.1/io_uring-extract-a-io_msg_install_complete-helper.patch b/queue-6.1/io_uring-extract-a-io_msg_install_complete-helper.patch new file mode 100644 index 00000000000..6bfb2d206fc --- /dev/null +++ b/queue-6.1/io_uring-extract-a-io_msg_install_complete-helper.patch @@ -0,0 +1,91 @@ +From ef7b782465bd6943fbcacb4af4fbe790137a5820 Mon Sep 17 00:00:00 2001 +From: Pavel Begunkov +Date: Wed, 7 Dec 2022 03:53:35 +0000 +Subject: io_uring: extract a io_msg_install_complete helper + +From: Pavel Begunkov + +Commit 172113101641cf1f9628c528ec790cb809f2b704 upstream. + +Extract a helper called io_msg_install_complete() from io_msg_send_fd(), +will be used later. + +Signed-off-by: Pavel Begunkov +Link: https://lore.kernel.org/r/1500ca1054cc4286a3ee1c60aacead57fcdfa02a.1670384893.git.asml.silence@gmail.com +Signed-off-by: Jens Axboe +Signed-off-by: Greg Kroah-Hartman +--- + io_uring/msg_ring.c | 38 +++++++++++++++++++++----------------- + 1 file changed, 21 insertions(+), 17 deletions(-) + +--- a/io_uring/msg_ring.c ++++ b/io_uring/msg_ring.c +@@ -94,40 +94,25 @@ static struct file *io_msg_grab_file(str + return file; + } + +-static int io_msg_send_fd(struct io_kiocb *req, unsigned int issue_flags) ++static int io_msg_install_complete(struct io_kiocb *req, unsigned int issue_flags) + { + struct io_ring_ctx *target_ctx = req->file->private_data; + struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg); +- struct io_ring_ctx *ctx = req->ctx; + struct file *src_file = msg->src_file; + int ret; + +- if (msg->len) +- return -EINVAL; +- if (target_ctx == ctx) +- return -EINVAL; +- if (target_ctx->flags & IORING_SETUP_R_DISABLED) +- return -EBADFD; +- if (!src_file) { +- src_file = io_msg_grab_file(req, issue_flags); +- if (!src_file) +- return -EBADF; +- msg->src_file = src_file; +- req->flags |= REQ_F_NEED_CLEANUP; +- } +- + if (unlikely(io_double_lock_ctx(target_ctx, issue_flags))) + return -EAGAIN; + + ret = __io_fixed_fd_install(target_ctx, src_file, msg->dst_fd); + if (ret < 0) + goto out_unlock; ++ + msg->src_file = NULL; + req->flags &= ~REQ_F_NEED_CLEANUP; + + if (msg->flags & IORING_MSG_RING_CQE_SKIP) + goto out_unlock; +- + /* + * If this fails, the target still received the file descriptor but + * wasn't notified of the fact. This means that if this request +@@ -141,6 +126,25 @@ out_unlock: + return ret; + } + ++static int io_msg_send_fd(struct io_kiocb *req, unsigned int issue_flags) ++{ ++ struct io_ring_ctx *target_ctx = req->file->private_data; ++ struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg); ++ struct io_ring_ctx *ctx = req->ctx; ++ struct file *src_file = msg->src_file; ++ ++ if (target_ctx == ctx) ++ return -EINVAL; ++ if (!src_file) { ++ src_file = io_msg_grab_file(req, issue_flags); ++ if (!src_file) ++ return -EBADF; ++ msg->src_file = src_file; ++ req->flags |= REQ_F_NEED_CLEANUP; ++ } ++ return io_msg_install_complete(req, issue_flags); ++} ++ + int io_msg_ring_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) + { + struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg); diff --git a/queue-6.1/io_uring-get-rid-of-double-locking.patch b/queue-6.1/io_uring-get-rid-of-double-locking.patch new file mode 100644 index 00000000000..8435bf17520 --- /dev/null +++ b/queue-6.1/io_uring-get-rid-of-double-locking.patch @@ -0,0 +1,194 @@ +From f10ba483becae0b8c7de0e9fe3e0f6d07a405b7a Mon Sep 17 00:00:00 2001 +From: Pavel Begunkov +Date: Wed, 7 Dec 2022 03:53:34 +0000 +Subject: io_uring: get rid of double locking + +From: Pavel Begunkov + +Commit 11373026f2960390d5e330df4e92735c4265c440 upstream. + +We don't need to take both uring_locks at once, msg_ring can be split in +two parts, first getting a file from the filetable of the first ring and +then installing it into the second one. + +Signed-off-by: Pavel Begunkov +Link: https://lore.kernel.org/r/a80ecc2bc99c3b3f2cf20015d618b7c51419a797.1670384893.git.asml.silence@gmail.com +Signed-off-by: Jens Axboe +Signed-off-by: Greg Kroah-Hartman +--- + io_uring/msg_ring.c | 85 +++++++++++++++++++++++++++++----------------------- + io_uring/msg_ring.h | 1 + io_uring/opdef.c | 1 + 3 files changed, 51 insertions(+), 36 deletions(-) + +--- a/io_uring/msg_ring.c ++++ b/io_uring/msg_ring.c +@@ -15,6 +15,7 @@ + + struct io_msg { + struct file *file; ++ struct file *src_file; + u64 user_data; + u32 len; + u32 cmd; +@@ -23,6 +24,17 @@ struct io_msg { + u32 flags; + }; + ++void io_msg_ring_cleanup(struct io_kiocb *req) ++{ ++ struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg); ++ ++ if (WARN_ON_ONCE(!msg->src_file)) ++ return; ++ ++ fput(msg->src_file); ++ msg->src_file = NULL; ++} ++ + static int io_msg_ring_data(struct io_kiocb *req) + { + struct io_ring_ctx *target_ctx = req->file->private_data; +@@ -39,17 +51,13 @@ static int io_msg_ring_data(struct io_ki + return -EOVERFLOW; + } + +-static void io_double_unlock_ctx(struct io_ring_ctx *ctx, +- struct io_ring_ctx *octx, ++static void io_double_unlock_ctx(struct io_ring_ctx *octx, + unsigned int issue_flags) + { +- if (issue_flags & IO_URING_F_UNLOCKED) +- mutex_unlock(&ctx->uring_lock); + mutex_unlock(&octx->uring_lock); + } + +-static int io_double_lock_ctx(struct io_ring_ctx *ctx, +- struct io_ring_ctx *octx, ++static int io_double_lock_ctx(struct io_ring_ctx *octx, + unsigned int issue_flags) + { + /* +@@ -62,17 +70,28 @@ static int io_double_lock_ctx(struct io_ + return -EAGAIN; + return 0; + } ++ mutex_lock(&octx->uring_lock); ++ return 0; ++} + +- /* Always grab smallest value ctx first. We know ctx != octx. */ +- if (ctx < octx) { +- mutex_lock(&ctx->uring_lock); +- mutex_lock(&octx->uring_lock); +- } else { +- mutex_lock(&octx->uring_lock); +- mutex_lock(&ctx->uring_lock); +- } ++static struct file *io_msg_grab_file(struct io_kiocb *req, unsigned int issue_flags) ++{ ++ struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg); ++ struct io_ring_ctx *ctx = req->ctx; ++ struct file *file = NULL; ++ unsigned long file_ptr; ++ int idx = msg->src_fd; + +- return 0; ++ io_ring_submit_lock(ctx, issue_flags); ++ if (likely(idx < ctx->nr_user_files)) { ++ idx = array_index_nospec(idx, ctx->nr_user_files); ++ file_ptr = io_fixed_file_slot(&ctx->file_table, idx)->file_ptr; ++ file = (struct file *) (file_ptr & FFS_MASK); ++ if (file) ++ get_file(file); ++ } ++ io_ring_submit_unlock(ctx, issue_flags); ++ return file; + } + + static int io_msg_send_fd(struct io_kiocb *req, unsigned int issue_flags) +@@ -80,8 +99,7 @@ static int io_msg_send_fd(struct io_kioc + struct io_ring_ctx *target_ctx = req->file->private_data; + struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg); + struct io_ring_ctx *ctx = req->ctx; +- unsigned long file_ptr; +- struct file *src_file; ++ struct file *src_file = msg->src_file; + int ret; + + if (msg->len) +@@ -90,28 +108,22 @@ static int io_msg_send_fd(struct io_kioc + return -EINVAL; + if (target_ctx->flags & IORING_SETUP_R_DISABLED) + return -EBADFD; ++ if (!src_file) { ++ src_file = io_msg_grab_file(req, issue_flags); ++ if (!src_file) ++ return -EBADF; ++ msg->src_file = src_file; ++ req->flags |= REQ_F_NEED_CLEANUP; ++ } + +- ret = io_double_lock_ctx(ctx, target_ctx, issue_flags); +- if (unlikely(ret)) +- return ret; +- +- ret = -EBADF; +- if (unlikely(msg->src_fd >= ctx->nr_user_files)) +- goto out_unlock; +- +- msg->src_fd = array_index_nospec(msg->src_fd, ctx->nr_user_files); +- file_ptr = io_fixed_file_slot(&ctx->file_table, msg->src_fd)->file_ptr; +- if (!file_ptr) +- goto out_unlock; +- +- src_file = (struct file *) (file_ptr & FFS_MASK); +- get_file(src_file); ++ if (unlikely(io_double_lock_ctx(target_ctx, issue_flags))) ++ return -EAGAIN; + + ret = __io_fixed_fd_install(target_ctx, src_file, msg->dst_fd); +- if (ret < 0) { +- fput(src_file); ++ if (ret < 0) + goto out_unlock; +- } ++ msg->src_file = NULL; ++ req->flags &= ~REQ_F_NEED_CLEANUP; + + if (msg->flags & IORING_MSG_RING_CQE_SKIP) + goto out_unlock; +@@ -125,7 +137,7 @@ static int io_msg_send_fd(struct io_kioc + if (!io_post_aux_cqe(target_ctx, msg->user_data, ret, 0, true)) + ret = -EOVERFLOW; + out_unlock: +- io_double_unlock_ctx(ctx, target_ctx, issue_flags); ++ io_double_unlock_ctx(target_ctx, issue_flags); + return ret; + } + +@@ -136,6 +148,7 @@ int io_msg_ring_prep(struct io_kiocb *re + if (unlikely(sqe->buf_index || sqe->personality)) + return -EINVAL; + ++ msg->src_file = NULL; + msg->user_data = READ_ONCE(sqe->off); + msg->len = READ_ONCE(sqe->len); + msg->cmd = READ_ONCE(sqe->addr); +--- a/io_uring/msg_ring.h ++++ b/io_uring/msg_ring.h +@@ -2,3 +2,4 @@ + + int io_msg_ring_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); + int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags); ++void io_msg_ring_cleanup(struct io_kiocb *req); +--- a/io_uring/opdef.c ++++ b/io_uring/opdef.c +@@ -445,6 +445,7 @@ const struct io_op_def io_op_defs[] = { + .name = "MSG_RING", + .prep = io_msg_ring_prep, + .issue = io_msg_ring, ++ .cleanup = io_msg_ring_cleanup, + }, + [IORING_OP_FSETXATTR] = { + .needs_file = 1, diff --git a/queue-6.1/io_uring-msg_ring-fix-missing-lock-on-overflow-for-iopoll.patch b/queue-6.1/io_uring-msg_ring-fix-missing-lock-on-overflow-for-iopoll.patch new file mode 100644 index 00000000000..e717705cf64 --- /dev/null +++ b/queue-6.1/io_uring-msg_ring-fix-missing-lock-on-overflow-for-iopoll.patch @@ -0,0 +1,68 @@ +From c47db9ca937e232606060077bb09cff49e75dbb7 Mon Sep 17 00:00:00 2001 +From: Jens Axboe +Date: Tue, 22 Aug 2023 18:00:02 -0600 +Subject: io_uring/msg_ring: fix missing lock on overflow for IOPOLL + +From: Jens Axboe + +Commit e12d7a46f65ae4b7d58a5e0c1cbfa825cf8d830d upstream. + +If the target ring is configured with IOPOLL, then we always need to hold +the target ring uring_lock before posting CQEs. We could just grab it +unconditionally, but since we don't expect many target rings to be of this +type, make grabbing the uring_lock conditional on the ring type. + +Link: https://lore.kernel.org/io-uring/Y8krlYa52%2F0YGqkg@ip-172-31-85-199.ec2.internal/ +Reported-by: Xingyuan Mo +Signed-off-by: Jens Axboe +Signed-off-by: Greg Kroah-Hartman +--- + io_uring/msg_ring.c | 20 +++++++++++++++----- + 1 file changed, 15 insertions(+), 5 deletions(-) + +--- a/io_uring/msg_ring.c ++++ b/io_uring/msg_ring.c +@@ -57,20 +57,30 @@ void io_msg_ring_cleanup(struct io_kiocb + msg->src_file = NULL; + } + +-static int io_msg_ring_data(struct io_kiocb *req) ++static int io_msg_ring_data(struct io_kiocb *req, unsigned int issue_flags) + { + struct io_ring_ctx *target_ctx = req->file->private_data; + struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg); ++ int ret; + + if (msg->src_fd || msg->dst_fd || msg->flags) + return -EINVAL; + if (target_ctx->flags & IORING_SETUP_R_DISABLED) + return -EBADFD; + +- if (io_post_aux_cqe(target_ctx, msg->user_data, msg->len, 0, true)) +- return 0; ++ ret = -EOVERFLOW; ++ if (target_ctx->flags & IORING_SETUP_IOPOLL) { ++ if (unlikely(io_double_lock_ctx(target_ctx, issue_flags))) ++ return -EAGAIN; ++ if (io_post_aux_cqe(target_ctx, msg->user_data, msg->len, 0, true)) ++ ret = 0; ++ io_double_unlock_ctx(target_ctx); ++ } else { ++ if (io_post_aux_cqe(target_ctx, msg->user_data, msg->len, 0, true)) ++ ret = 0; ++ } + +- return -EOVERFLOW; ++ return ret; + } + + static struct file *io_msg_grab_file(struct io_kiocb *req, unsigned int issue_flags) +@@ -175,7 +185,7 @@ int io_msg_ring(struct io_kiocb *req, un + + switch (msg->cmd) { + case IORING_MSG_DATA: +- ret = io_msg_ring_data(req); ++ ret = io_msg_ring_data(req, issue_flags); + break; + case IORING_MSG_SEND_FD: + ret = io_msg_send_fd(req, issue_flags); diff --git a/queue-6.1/io_uring-msg_ring-move-double-lock-unlock-helpers-higher-up.patch b/queue-6.1/io_uring-msg_ring-move-double-lock-unlock-helpers-higher-up.patch new file mode 100644 index 00000000000..9ab511558a3 --- /dev/null +++ b/queue-6.1/io_uring-msg_ring-move-double-lock-unlock-helpers-higher-up.patch @@ -0,0 +1,90 @@ +From 5c3ffa8ec64e0726dcbfe37b5d126701301011ed Mon Sep 17 00:00:00 2001 +From: Jens Axboe +Date: Thu, 19 Jan 2023 09:01:27 -0700 +Subject: io_uring/msg_ring: move double lock/unlock helpers higher up + +From: Jens Axboe + +Commit 423d5081d0451faa59a707e57373801da5b40141 upstream. + +In preparation for needing them somewhere else, move them and get rid of +the unused 'issue_flags' for the unlock side. + +No functional changes in this patch. + +Signed-off-by: Jens Axboe +Signed-off-by: Greg Kroah-Hartman +--- + io_uring/msg_ring.c | 47 +++++++++++++++++++++++------------------------ + 1 file changed, 23 insertions(+), 24 deletions(-) + +--- a/io_uring/msg_ring.c ++++ b/io_uring/msg_ring.c +@@ -24,6 +24,28 @@ struct io_msg { + u32 flags; + }; + ++static void io_double_unlock_ctx(struct io_ring_ctx *octx) ++{ ++ mutex_unlock(&octx->uring_lock); ++} ++ ++static int io_double_lock_ctx(struct io_ring_ctx *octx, ++ unsigned int issue_flags) ++{ ++ /* ++ * To ensure proper ordering between the two ctxs, we can only ++ * attempt a trylock on the target. If that fails and we already have ++ * the source ctx lock, punt to io-wq. ++ */ ++ if (!(issue_flags & IO_URING_F_UNLOCKED)) { ++ if (!mutex_trylock(&octx->uring_lock)) ++ return -EAGAIN; ++ return 0; ++ } ++ mutex_lock(&octx->uring_lock); ++ return 0; ++} ++ + void io_msg_ring_cleanup(struct io_kiocb *req) + { + struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg); +@@ -51,29 +73,6 @@ static int io_msg_ring_data(struct io_ki + return -EOVERFLOW; + } + +-static void io_double_unlock_ctx(struct io_ring_ctx *octx, +- unsigned int issue_flags) +-{ +- mutex_unlock(&octx->uring_lock); +-} +- +-static int io_double_lock_ctx(struct io_ring_ctx *octx, +- unsigned int issue_flags) +-{ +- /* +- * To ensure proper ordering between the two ctxs, we can only +- * attempt a trylock on the target. If that fails and we already have +- * the source ctx lock, punt to io-wq. +- */ +- if (!(issue_flags & IO_URING_F_UNLOCKED)) { +- if (!mutex_trylock(&octx->uring_lock)) +- return -EAGAIN; +- return 0; +- } +- mutex_lock(&octx->uring_lock); +- return 0; +-} +- + static struct file *io_msg_grab_file(struct io_kiocb *req, unsigned int issue_flags) + { + struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg); +@@ -122,7 +121,7 @@ static int io_msg_install_complete(struc + if (!io_post_aux_cqe(target_ctx, msg->user_data, ret, 0, true)) + ret = -EOVERFLOW; + out_unlock: +- io_double_unlock_ctx(target_ctx, issue_flags); ++ io_double_unlock_ctx(target_ctx); + return ret; + } + diff --git a/queue-6.1/kvm-x86-mmu-fix-an-sign-extension-bug-with-mmu_seq-that-hangs-vcpus.patch b/queue-6.1/kvm-x86-mmu-fix-an-sign-extension-bug-with-mmu_seq-that-hangs-vcpus.patch new file mode 100644 index 00000000000..ec5c1b78f64 --- /dev/null +++ b/queue-6.1/kvm-x86-mmu-fix-an-sign-extension-bug-with-mmu_seq-that-hangs-vcpus.patch @@ -0,0 +1,72 @@ +From seanjc@google.com Sat Aug 26 18:44:07 2023 +From: Sean Christopherson +Date: Wed, 23 Aug 2023 18:01:04 -0700 +Subject: [PATCH 6.1] KVM: x86/mmu: Fix an sign-extension bug with mmu_seq that hangs vCPUs +To: stable@vger.kernel.org, Greg Kroah-Hartman +Cc: Paolo Bonzini , linux-kernel@vger.kernel.org +Message-ID: <20230824010104.2714198-1-seanjc@google.com> + +From: Sean Christopherson + +Upstream commit ba6e3fe25543 ("KVM: x86/mmu: Grab mmu_invalidate_seq in +kvm_faultin_pfn()") unknowingly fixed the bug in v6.3 when refactoring +how KVM tracks the sequence counter snapshot. + +Take the vCPU's mmu_seq snapshot as an "unsigned long" instead of an "int" +when checking to see if a page fault is stale, as the sequence count is +stored as an "unsigned long" everywhere else in KVM. This fixes a bug +where KVM will effectively hang vCPUs due to always thinking page faults +are stale, which results in KVM refusing to "fix" faults. + +mmu_invalidate_seq (née mmu_notifier_seq) is a sequence counter used when +KVM is handling page faults to detect if userspace mappings relevant to +the guest were invalidated between snapshotting the counter and acquiring +mmu_lock, i.e. to ensure that the userspace mapping KVM is using to +resolve the page fault is fresh. If KVM sees that the counter has +changed, KVM simply resumes the guest without fixing the fault. + +What _should_ happen is that the source of the mmu_notifier invalidations +eventually goes away, mmu_invalidate_seq becomes stable, and KVM can once +again fix guest page fault(s). + +But for a long-lived VM and/or a VM that the host just doesn't particularly +like, it's possible for a VM to be on the receiving end of 2 billion (with +a B) mmu_notifier invalidations. When that happens, bit 31 will be set in +mmu_invalidate_seq. This causes the value to be turned into a 32-bit +negative value when implicitly cast to an "int" by is_page_fault_stale(), +and then sign-extended into a 64-bit unsigned when the signed "int" is +implicitly cast back to an "unsigned long" on the call to +mmu_invalidate_retry_hva(). + +As a result of the casting and sign-extension, given a sequence counter of +e.g. 0x8002dc25, mmu_invalidate_retry_hva() ends up doing + + if (0x8002dc25 != 0xffffffff8002dc25) + +and signals that the page fault is stale and needs to be retried even +though the sequence counter is stable, and KVM effectively hangs any vCPU +that takes a page fault (EPT violation or #NPF when TDP is enabled). + +Reported-by: Brian Rak +Reported-by: Amaan Cheval +Reported-by: Eric Wheeler +Closes: https://lore.kernel.org/all/f023d927-52aa-7e08-2ee5-59a2fbc65953@gameservers.com +Fixes: a955cad84cda ("KVM: x86/mmu: Retry page fault if root is invalidated by memslot update") +Signed-off-by: Sean Christopherson +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kvm/mmu/mmu.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +--- a/arch/x86/kvm/mmu/mmu.c ++++ b/arch/x86/kvm/mmu/mmu.c +@@ -4212,7 +4212,8 @@ static int kvm_faultin_pfn(struct kvm_vc + * root was invalidated by a memslot update or a relevant mmu_notifier fired. + */ + static bool is_page_fault_stale(struct kvm_vcpu *vcpu, +- struct kvm_page_fault *fault, int mmu_seq) ++ struct kvm_page_fault *fault, ++ unsigned long mmu_seq) + { + struct kvm_mmu_page *sp = to_shadow_page(vcpu->arch.mmu->root.hpa); + diff --git a/queue-6.1/kvm-x86-preserve-tdp-mmu-roots-until-they-are-explicitly-invalidated.patch b/queue-6.1/kvm-x86-preserve-tdp-mmu-roots-until-they-are-explicitly-invalidated.patch new file mode 100644 index 00000000000..2e234f8ee9c --- /dev/null +++ b/queue-6.1/kvm-x86-preserve-tdp-mmu-roots-until-they-are-explicitly-invalidated.patch @@ -0,0 +1,257 @@ +From edbdb43fc96b11b3bfa531be306a1993d9fe89ec Mon Sep 17 00:00:00 2001 +From: Sean Christopherson +Date: Wed, 26 Apr 2023 15:03:23 -0700 +Subject: KVM: x86: Preserve TDP MMU roots until they are explicitly invalidated + +From: Sean Christopherson + +commit edbdb43fc96b11b3bfa531be306a1993d9fe89ec upstream. + +Preserve TDP MMU roots until they are explicitly invalidated by gifting +the TDP MMU itself a reference to a root when it is allocated. Keeping a +reference in the TDP MMU fixes a flaw where the TDP MMU exhibits terrible +performance, and can potentially even soft-hang a vCPU, if a vCPU +frequently unloads its roots, e.g. when KVM is emulating SMI+RSM. + +When KVM emulates something that invalidates _all_ TLB entries, e.g. SMI +and RSM, KVM unloads all of the vCPUs roots (KVM keeps a small per-vCPU +cache of previous roots). Unloading roots is a simple way to ensure KVM +flushes and synchronizes all roots for the vCPU, as KVM flushes and syncs +when allocating a "new" root (from the vCPU's perspective). + +In the shadow MMU, KVM keeps track of all shadow pages, roots included, in +a per-VM hash table. Unloading a shadow MMU root just wipes it from the +per-vCPU cache; the root is still tracked in the per-VM hash table. When +KVM loads a "new" root for the vCPU, KVM will find the old, unloaded root +in the per-VM hash table. + +Unlike the shadow MMU, the TDP MMU doesn't track "inactive" roots in a +per-VM structure, where "active" in this case means a root is either +in-use or cached as a previous root by at least one vCPU. When a TDP MMU +root becomes inactive, i.e. the last vCPU reference to the root is put, +KVM immediately frees the root (asterisk on "immediately" as the actual +freeing may be done by a worker, but for all intents and purposes the root +is gone). + +The TDP MMU behavior is especially problematic for 1-vCPU setups, as +unloading all roots effectively frees all roots. The issue is mitigated +to some degree in multi-vCPU setups as a different vCPU usually holds a +reference to an unloaded root and thus keeps the root alive, allowing the +vCPU to reuse its old root after unloading (with a flush+sync). + +The TDP MMU flaw has been known for some time, as until very recently, +KVM's handling of CR0.WP also triggered unloading of all roots. The +CR0.WP toggling scenario was eventually addressed by not unloading roots +when _only_ CR0.WP is toggled, but such an approach doesn't Just Work +for emulating SMM as KVM must emulate a full TLB flush on entry and exit +to/from SMM. Given that the shadow MMU plays nice with unloading roots +at will, teaching the TDP MMU to do the same is far less complex than +modifying KVM to track which roots need to be flushed before reuse. + +Note, preserving all possible TDP MMU roots is not a concern with respect +to memory consumption. Now that the role for direct MMUs doesn't include +information about the guest, e.g. CR0.PG, CR0.WP, CR4.SMEP, etc., there +are _at most_ six possible roots (where "guest_mode" here means L2): + + 1. 4-level !SMM !guest_mode + 2. 4-level SMM !guest_mode + 3. 5-level !SMM !guest_mode + 4. 5-level SMM !guest_mode + 5. 4-level !SMM guest_mode + 6. 5-level !SMM guest_mode + +And because each vCPU can track 4 valid roots, a VM can already have all +6 root combinations live at any given time. Not to mention that, in +practice, no sane VMM will advertise different guest.MAXPHYADDR values +across vCPUs, i.e. KVM won't ever use both 4-level and 5-level roots for +a single VM. Furthermore, the vast majority of modern hypervisors will +utilize EPT/NPT when available, thus the guest_mode=%true cases are also +unlikely to be utilized. + +Reported-by: Jeremi Piotrowski +Link: https://lore.kernel.org/all/959c5bce-beb5-b463-7158-33fc4a4f910c@linux.microsoft.com +Link: https://lkml.kernel.org/r/20220209170020.1775368-1-pbonzini%40redhat.com +Link: https://lore.kernel.org/all/20230322013731.102955-1-minipli@grsecurity.net +Link: https://lore.kernel.org/all/000000000000a0bc2b05f9dd7fab@google.com +Link: https://lore.kernel.org/all/000000000000eca0b905fa0f7756@google.com +Cc: Ben Gardon +Cc: David Matlack +Cc: stable@vger.kernel.org +Tested-by: Jeremi Piotrowski +Link: https://lore.kernel.org/r/20230426220323.3079789-1-seanjc@google.com +Signed-off-by: Sean Christopherson +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kvm/mmu/tdp_mmu.c | 121 ++++++++++++++++++++------------------------- + 1 file changed, 56 insertions(+), 65 deletions(-) + +--- a/arch/x86/kvm/mmu/tdp_mmu.c ++++ b/arch/x86/kvm/mmu/tdp_mmu.c +@@ -51,7 +51,17 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm * + if (!kvm->arch.tdp_mmu_enabled) + return; + +- /* Also waits for any queued work items. */ ++ /* ++ * Invalidate all roots, which besides the obvious, schedules all roots ++ * for zapping and thus puts the TDP MMU's reference to each root, i.e. ++ * ultimately frees all roots. ++ */ ++ kvm_tdp_mmu_invalidate_all_roots(kvm); ++ ++ /* ++ * Destroying a workqueue also first flushes the workqueue, i.e. no ++ * need to invoke kvm_tdp_mmu_zap_invalidated_roots(). ++ */ + destroy_workqueue(kvm->arch.tdp_mmu_zap_wq); + + WARN_ON(!list_empty(&kvm->arch.tdp_mmu_pages)); +@@ -127,16 +137,6 @@ static void tdp_mmu_schedule_zap_root(st + queue_work(kvm->arch.tdp_mmu_zap_wq, &root->tdp_mmu_async_work); + } + +-static inline bool kvm_tdp_root_mark_invalid(struct kvm_mmu_page *page) +-{ +- union kvm_mmu_page_role role = page->role; +- role.invalid = true; +- +- /* No need to use cmpxchg, only the invalid bit can change. */ +- role.word = xchg(&page->role.word, role.word); +- return role.invalid; +-} +- + void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root, + bool shared) + { +@@ -145,45 +145,12 @@ void kvm_tdp_mmu_put_root(struct kvm *kv + if (!refcount_dec_and_test(&root->tdp_mmu_root_count)) + return; + +- WARN_ON(!root->tdp_mmu_page); +- + /* +- * The root now has refcount=0. It is valid, but readers already +- * cannot acquire a reference to it because kvm_tdp_mmu_get_root() +- * rejects it. This remains true for the rest of the execution +- * of this function, because readers visit valid roots only +- * (except for tdp_mmu_zap_root_work(), which however +- * does not acquire any reference itself). +- * +- * Even though there are flows that need to visit all roots for +- * correctness, they all take mmu_lock for write, so they cannot yet +- * run concurrently. The same is true after kvm_tdp_root_mark_invalid, +- * since the root still has refcount=0. +- * +- * However, tdp_mmu_zap_root can yield, and writers do not expect to +- * see refcount=0 (see for example kvm_tdp_mmu_invalidate_all_roots()). +- * So the root temporarily gets an extra reference, going to refcount=1 +- * while staying invalid. Readers still cannot acquire any reference; +- * but writers are now allowed to run if tdp_mmu_zap_root yields and +- * they might take an extra reference if they themselves yield. +- * Therefore, when the reference is given back by the worker, +- * there is no guarantee that the refcount is still 1. If not, whoever +- * puts the last reference will free the page, but they will not have to +- * zap the root because a root cannot go from invalid to valid. ++ * The TDP MMU itself holds a reference to each root until the root is ++ * explicitly invalidated, i.e. the final reference should be never be ++ * put for a valid root. + */ +- if (!kvm_tdp_root_mark_invalid(root)) { +- refcount_set(&root->tdp_mmu_root_count, 1); +- +- /* +- * Zapping the root in a worker is not just "nice to have"; +- * it is required because kvm_tdp_mmu_invalidate_all_roots() +- * skips already-invalid roots. If kvm_tdp_mmu_put_root() did +- * not add the root to the workqueue, kvm_tdp_mmu_zap_all_fast() +- * might return with some roots not zapped yet. +- */ +- tdp_mmu_schedule_zap_root(kvm, root); +- return; +- } ++ KVM_BUG_ON(!is_tdp_mmu_page(root) || !root->role.invalid, kvm); + + spin_lock(&kvm->arch.tdp_mmu_pages_lock); + list_del_rcu(&root->link); +@@ -329,7 +296,14 @@ hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(stru + root = tdp_mmu_alloc_sp(vcpu); + tdp_mmu_init_sp(root, NULL, 0, role); + +- refcount_set(&root->tdp_mmu_root_count, 1); ++ /* ++ * TDP MMU roots are kept until they are explicitly invalidated, either ++ * by a memslot update or by the destruction of the VM. Initialize the ++ * refcount to two; one reference for the vCPU, and one reference for ++ * the TDP MMU itself, which is held until the root is invalidated and ++ * is ultimately put by tdp_mmu_zap_root_work(). ++ */ ++ refcount_set(&root->tdp_mmu_root_count, 2); + + spin_lock(&kvm->arch.tdp_mmu_pages_lock); + list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots); +@@ -1027,32 +1001,49 @@ void kvm_tdp_mmu_zap_invalidated_roots(s + /* + * Mark each TDP MMU root as invalid to prevent vCPUs from reusing a root that + * is about to be zapped, e.g. in response to a memslots update. The actual +- * zapping is performed asynchronously, so a reference is taken on all roots. +- * Using a separate workqueue makes it easy to ensure that the destruction is +- * performed before the "fast zap" completes, without keeping a separate list +- * of invalidated roots; the list is effectively the list of work items in +- * the workqueue. +- * +- * Get a reference even if the root is already invalid, the asynchronous worker +- * assumes it was gifted a reference to the root it processes. Because mmu_lock +- * is held for write, it should be impossible to observe a root with zero refcount, +- * i.e. the list of roots cannot be stale. ++ * zapping is performed asynchronously. Using a separate workqueue makes it ++ * easy to ensure that the destruction is performed before the "fast zap" ++ * completes, without keeping a separate list of invalidated roots; the list is ++ * effectively the list of work items in the workqueue. + * +- * This has essentially the same effect for the TDP MMU +- * as updating mmu_valid_gen does for the shadow MMU. ++ * Note, the asynchronous worker is gifted the TDP MMU's reference. ++ * See kvm_tdp_mmu_get_vcpu_root_hpa(). + */ + void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm) + { + struct kvm_mmu_page *root; + +- lockdep_assert_held_write(&kvm->mmu_lock); +- list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) { +- if (!root->role.invalid && +- !WARN_ON_ONCE(!kvm_tdp_mmu_get_root(root))) { ++ /* ++ * mmu_lock must be held for write to ensure that a root doesn't become ++ * invalid while there are active readers (invalidating a root while ++ * there are active readers may or may not be problematic in practice, ++ * but it's uncharted territory and not supported). ++ * ++ * Waive the assertion if there are no users of @kvm, i.e. the VM is ++ * being destroyed after all references have been put, or if no vCPUs ++ * have been created (which means there are no roots), i.e. the VM is ++ * being destroyed in an error path of KVM_CREATE_VM. ++ */ ++ if (IS_ENABLED(CONFIG_PROVE_LOCKING) && ++ refcount_read(&kvm->users_count) && kvm->created_vcpus) ++ lockdep_assert_held_write(&kvm->mmu_lock); ++ ++ /* ++ * As above, mmu_lock isn't held when destroying the VM! There can't ++ * be other references to @kvm, i.e. nothing else can invalidate roots ++ * or be consuming roots, but walking the list of roots does need to be ++ * guarded against roots being deleted by the asynchronous zap worker. ++ */ ++ rcu_read_lock(); ++ ++ list_for_each_entry_rcu(root, &kvm->arch.tdp_mmu_roots, link) { ++ if (!root->role.invalid) { + root->role.invalid = true; + tdp_mmu_schedule_zap_root(kvm, root); + } + } ++ ++ rcu_read_unlock(); + } + + /* diff --git a/queue-6.1/series b/queue-6.1/series index 009eb82c099..4ec408f0c44 100644 --- a/queue-6.1/series +++ b/queue-6.1/series @@ -56,3 +56,9 @@ netfilter-nf_tables-flush-pending-destroy-work-befor.patch netfilter-nf_tables-fix-out-of-memory-error-handling.patch rtnetlink-reject-negative-ifindexes-in-rtm_newlink.patch bonding-fix-macvlan-over-alb-bond-support.patch +kvm-x86-preserve-tdp-mmu-roots-until-they-are-explicitly-invalidated.patch +kvm-x86-mmu-fix-an-sign-extension-bug-with-mmu_seq-that-hangs-vcpus.patch +io_uring-get-rid-of-double-locking.patch +io_uring-extract-a-io_msg_install_complete-helper.patch +io_uring-msg_ring-move-double-lock-unlock-helpers-higher-up.patch +io_uring-msg_ring-fix-missing-lock-on-overflow-for-iopoll.patch -- 2.47.2