From 405c570a2bde3b7d9d1672c9ee829f77acca4fea Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 30 Nov 2022 13:28:05 +0100 Subject: [PATCH] 6.0-stable patches added patches: drm-i915-gvt-get-reference-to-kvm-iff-attachment-to-vm-is-successful.patch fpga-m10bmc-sec-fix-kconfig-dependencies.patch gcov-clang-fix-the-buffer-overflow-issue.patch io_uring-clear-tif_notify_signal-if-set-and-task_work-not-available.patch io_uring-cmpxchg-for-poll-arm-refs-release.patch io_uring-make-poll-refs-more-robust.patch kvm-update-gfn_to_pfn_cache-khva-when-it-moves-within-the-same-page.patch kvm-x86-add-kvm_leave_nested.patch kvm-x86-forcibly-leave-nested-mode-on-vcpu-reset.patch kvm-x86-mmu-fix-race-condition-in-direct_page_fault.patch kvm-x86-nsvm-harden-svm_free_nested-against-freeing-vmcb02-while-still-in-use.patch kvm-x86-nsvm-leave-nested-mode-on-vcpu-free.patch kvm-x86-remove-exit_int_info-warning-in-svm_handle_exit.patch kvm-x86-xen-only-do-in-kernel-acceleration-of-hypercalls-for-guest-cpl0.patch kvm-x86-xen-validate-port-number-in-schedop_poll.patch mm-cgroup-reclaim-fix-dirty-pages-throttling-on-cgroup-v1.patch mm-vmscan-fix-extreme-overreclaim-and-swap-floods.patch nilfs2-fix-nilfs_sufile_mark_dirty-not-set-segment-usage-as-dirty.patch --- ...m-iff-attachment-to-vm-is-successful.patch | 43 ++++++ ...-m10bmc-sec-fix-kconfig-dependencies.patch | 52 +++++++ ...-clang-fix-the-buffer-overflow-issue.patch | 88 +++++++++++ ...l-if-set-and-task_work-not-available.patch | 41 ++++++ ...ng-cmpxchg-for-poll-arm-refs-release.patch | 56 +++++++ .../io_uring-make-poll-refs-more-robust.patch | 101 +++++++++++++ ...a-when-it-moves-within-the-same-page.patch | 41 ++++++ queue-6.0/kvm-x86-add-kvm_leave_nested.patch | 71 +++++++++ ...ibly-leave-nested-mode-on-vcpu-reset.patch | 57 ++++++++ ...-race-condition-in-direct_page_fault.patch | 101 +++++++++++++ ...st-freeing-vmcb02-while-still-in-use.patch | 36 +++++ ...-nsvm-leave-nested-mode-on-vcpu-free.patch | 33 +++++ 
..._int_info-warning-in-svm_handle_exit.patch | 58 ++++++++ ...eration-of-hypercalls-for-guest-cpl0.patch | 64 ++++++++ ...validate-port-number-in-schedop_poll.patch | 68 +++++++++ ...-dirty-pages-throttling-on-cgroup-v1.patch | 65 +++++++++ ...-extreme-overreclaim-and-swap-floods.patch | 137 ++++++++++++++++++ ...dirty-not-set-segment-usage-as-dirty.patch | 77 ++++++++++ queue-6.0/series | 18 +++ 19 files changed, 1207 insertions(+) create mode 100644 queue-6.0/drm-i915-gvt-get-reference-to-kvm-iff-attachment-to-vm-is-successful.patch create mode 100644 queue-6.0/fpga-m10bmc-sec-fix-kconfig-dependencies.patch create mode 100644 queue-6.0/gcov-clang-fix-the-buffer-overflow-issue.patch create mode 100644 queue-6.0/io_uring-clear-tif_notify_signal-if-set-and-task_work-not-available.patch create mode 100644 queue-6.0/io_uring-cmpxchg-for-poll-arm-refs-release.patch create mode 100644 queue-6.0/io_uring-make-poll-refs-more-robust.patch create mode 100644 queue-6.0/kvm-update-gfn_to_pfn_cache-khva-when-it-moves-within-the-same-page.patch create mode 100644 queue-6.0/kvm-x86-add-kvm_leave_nested.patch create mode 100644 queue-6.0/kvm-x86-forcibly-leave-nested-mode-on-vcpu-reset.patch create mode 100644 queue-6.0/kvm-x86-mmu-fix-race-condition-in-direct_page_fault.patch create mode 100644 queue-6.0/kvm-x86-nsvm-harden-svm_free_nested-against-freeing-vmcb02-while-still-in-use.patch create mode 100644 queue-6.0/kvm-x86-nsvm-leave-nested-mode-on-vcpu-free.patch create mode 100644 queue-6.0/kvm-x86-remove-exit_int_info-warning-in-svm_handle_exit.patch create mode 100644 queue-6.0/kvm-x86-xen-only-do-in-kernel-acceleration-of-hypercalls-for-guest-cpl0.patch create mode 100644 queue-6.0/kvm-x86-xen-validate-port-number-in-schedop_poll.patch create mode 100644 queue-6.0/mm-cgroup-reclaim-fix-dirty-pages-throttling-on-cgroup-v1.patch create mode 100644 queue-6.0/mm-vmscan-fix-extreme-overreclaim-and-swap-floods.patch create mode 100644 
queue-6.0/nilfs2-fix-nilfs_sufile_mark_dirty-not-set-segment-usage-as-dirty.patch diff --git a/queue-6.0/drm-i915-gvt-get-reference-to-kvm-iff-attachment-to-vm-is-successful.patch b/queue-6.0/drm-i915-gvt-get-reference-to-kvm-iff-attachment-to-vm-is-successful.patch new file mode 100644 index 00000000000..c9b50b865a8 --- /dev/null +++ b/queue-6.0/drm-i915-gvt-get-reference-to-kvm-iff-attachment-to-vm-is-successful.patch @@ -0,0 +1,43 @@ +From 9ed1fdee9ee324f3505ff066287ee53143caaaa2 Mon Sep 17 00:00:00 2001 +From: Sean Christopherson +Date: Fri, 11 Nov 2022 00:22:24 +0000 +Subject: drm/i915/gvt: Get reference to KVM iff attachment to VM is successful + +From: Sean Christopherson + +commit 9ed1fdee9ee324f3505ff066287ee53143caaaa2 upstream. + +Get a reference to KVM if and only if a vGPU is successfully attached to +the VM to avoid leaking a reference if there's no available vGPU. On +open_device() failure, vfio_device_open() doesn't invoke close_device(). + +Fixes: 421cfe6596f6 ("vfio: remove VFIO_GROUP_NOTIFY_SET_KVM") +Cc: stable@vger.kernel.org +Reviewed-by: Kevin Tian +Signed-off-by: Sean Christopherson +Signed-off-by: Zhenyu Wang +Link: http://patchwork.freedesktop.org/patch/msgid/20221111002225.2418386-2-seanjc@google.com +Signed-off-by: Greg Kroah-Hartman +--- + drivers/gpu/drm/i915/gvt/kvmgt.c | 3 +-- + 1 file changed, 1 insertion(+), 2 deletions(-) + +--- a/drivers/gpu/drm/i915/gvt/kvmgt.c ++++ b/drivers/gpu/drm/i915/gvt/kvmgt.c +@@ -765,8 +765,6 @@ static int intel_vgpu_open_device(struct + return -ESRCH; + } + +- kvm_get_kvm(vgpu->vfio_device.kvm); +- + if (__kvmgt_vgpu_exist(vgpu)) + return -EEXIST; + +@@ -777,6 +775,7 @@ static int intel_vgpu_open_device(struct + + vgpu->track_node.track_write = kvmgt_page_track_write; + vgpu->track_node.track_flush_slot = kvmgt_page_track_flush_slot; ++ kvm_get_kvm(vgpu->vfio_device.kvm); + kvm_page_track_register_notifier(vgpu->vfio_device.kvm, + &vgpu->track_node); + diff --git 
a/queue-6.0/fpga-m10bmc-sec-fix-kconfig-dependencies.patch b/queue-6.0/fpga-m10bmc-sec-fix-kconfig-dependencies.patch new file mode 100644 index 00000000000..77e455cea58 --- /dev/null +++ b/queue-6.0/fpga-m10bmc-sec-fix-kconfig-dependencies.patch @@ -0,0 +1,52 @@ +From dfd10332596ef11ceafd29c4e21b4117be423fc4 Mon Sep 17 00:00:00 2001 +From: Russ Weight +Date: Mon, 14 Nov 2022 16:11:27 -0800 +Subject: fpga: m10bmc-sec: Fix kconfig dependencies + +From: Russ Weight + +commit dfd10332596ef11ceafd29c4e21b4117be423fc4 upstream. + +The secure update driver depends on the firmware-upload functionality of +the firmware-loader. The firmware-loader is carried in the firmware-class +driver which is enabled with the tristate CONFIG_FW_LOADER option. The +firmware-upload functionality is included in the firmware-class driver if +the bool FW_UPLOAD config is set. + +The current dependency statement, "depends on FW_UPLOAD", is not adequate +because it does not implicitly turn on FW_LOADER. Instead of adding a +dependency, follow the convention used by drivers that require the +FW_LOADER_USER_HELPER functionality of the firmware-loader by using +select for both FW_LOADER and FW_UPLOAD. 
+ +Fixes: bdf86d0e6ca3 ("fpga: m10bmc-sec: create max10 bmc secure update") +Reported-by: kernel test robot +Cc: stable@vger.kernel.org +Signed-off-by: Russ Weight +Acked-by: Randy Dunlap +Acked-by: Xu Yilun +Link: https://lore.kernel.org/r/20221115001127.289890-1-russell.h.weight@intel.com +Signed-off-by: Xu Yilun +Signed-off-by: Greg Kroah-Hartman +--- + drivers/fpga/Kconfig | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +diff --git a/drivers/fpga/Kconfig b/drivers/fpga/Kconfig +index 6c416955da53..bbe0a7cabb75 100644 +--- a/drivers/fpga/Kconfig ++++ b/drivers/fpga/Kconfig +@@ -246,7 +246,9 @@ config FPGA_MGR_VERSAL_FPGA + + config FPGA_M10_BMC_SEC_UPDATE + tristate "Intel MAX10 BMC Secure Update driver" +- depends on MFD_INTEL_M10_BMC && FW_UPLOAD ++ depends on MFD_INTEL_M10_BMC ++ select FW_LOADER ++ select FW_UPLOAD + help + Secure update support for the Intel MAX10 board management + controller. +-- +2.38.1 + diff --git a/queue-6.0/gcov-clang-fix-the-buffer-overflow-issue.patch b/queue-6.0/gcov-clang-fix-the-buffer-overflow-issue.patch new file mode 100644 index 00000000000..55dfa6e508c --- /dev/null +++ b/queue-6.0/gcov-clang-fix-the-buffer-overflow-issue.patch @@ -0,0 +1,88 @@ +From a6f810efabfd789d3bbafeacb4502958ec56c5ce Mon Sep 17 00:00:00 2001 +From: Mukesh Ojha +Date: Thu, 10 Nov 2022 00:31:37 +0530 +Subject: gcov: clang: fix the buffer overflow issue + +From: Mukesh Ojha + +commit a6f810efabfd789d3bbafeacb4502958ec56c5ce upstream. + +Currently, in clang version of gcov code when module is getting removed +gcov_info_add() incorrectly adds the sfn_ptr->counter to all the +dst->functions and it result in the kernel panic in below crash report. +Fix this by properly handling it. 
+ +[ 8.899094][ T599] Unable to handle kernel write to read-only memory at virtual address ffffff80461cc000 +[ 8.899100][ T599] Mem abort info: +[ 8.899102][ T599] ESR = 0x9600004f +[ 8.899103][ T599] EC = 0x25: DABT (current EL), IL = 32 bits +[ 8.899105][ T599] SET = 0, FnV = 0 +[ 8.899107][ T599] EA = 0, S1PTW = 0 +[ 8.899108][ T599] FSC = 0x0f: level 3 permission fault +[ 8.899110][ T599] Data abort info: +[ 8.899111][ T599] ISV = 0, ISS = 0x0000004f +[ 8.899113][ T599] CM = 0, WnR = 1 +[ 8.899114][ T599] swapper pgtable: 4k pages, 39-bit VAs, pgdp=00000000ab8de000 +[ 8.899116][ T599] [ffffff80461cc000] pgd=18000009ffcde003, p4d=18000009ffcde003, pud=18000009ffcde003, pmd=18000009ffcad003, pte=00600000c61cc787 +[ 8.899124][ T599] Internal error: Oops: 9600004f [#1] PREEMPT SMP +[ 8.899265][ T599] Skip md ftrace buffer dump for: 0x1609e0 +.... +.., +[ 8.899544][ T599] CPU: 7 PID: 599 Comm: modprobe Tainted: G S OE 5.15.41-android13-8-g38e9b1af6bce #1 +[ 8.899547][ T599] Hardware name: XXX (DT) +[ 8.899549][ T599] pstate: 82400005 (Nzcv daif +PAN -UAO +TCO -DIT -SSBS BTYPE=--) +[ 8.899551][ T599] pc : gcov_info_add+0x9c/0xb8 +[ 8.899557][ T599] lr : gcov_event+0x28c/0x6b8 +[ 8.899559][ T599] sp : ffffffc00e733b00 +[ 8.899560][ T599] x29: ffffffc00e733b00 x28: ffffffc00e733d30 x27: ffffffe8dc297470 +[ 8.899563][ T599] x26: ffffffe8dc297000 x25: ffffffe8dc297000 x24: ffffffe8dc297000 +[ 8.899566][ T599] x23: ffffffe8dc0a6200 x22: ffffff880f68bf20 x21: 0000000000000000 +[ 8.899569][ T599] x20: ffffff880f68bf00 x19: ffffff8801babc00 x18: ffffffc00d7f9058 +[ 8.899572][ T599] x17: 0000000000088793 x16: ffffff80461cbe00 x15: 9100052952800785 +[ 8.899575][ T599] x14: 0000000000000200 x13: 0000000000000041 x12: 9100052952800785 +[ 8.899577][ T599] x11: ffffffe8dc297000 x10: ffffffe8dc297000 x9 : ffffff80461cbc80 +[ 8.899580][ T599] x8 : ffffff8801babe80 x7 : ffffffe8dc2ec000 x6 : ffffffe8dc2ed000 +[ 8.899583][ T599] x5 : 000000008020001f x4 : fffffffe2006eae0 x3 : 
000000008020001f +[ 8.899586][ T599] x2 : ffffff8027c49200 x1 : ffffff8801babc20 x0 : ffffff80461cb3a0 +[ 8.899589][ T599] Call trace: +[ 8.899590][ T599] gcov_info_add+0x9c/0xb8 +[ 8.899592][ T599] gcov_module_notifier+0xbc/0x120 +[ 8.899595][ T599] blocking_notifier_call_chain+0xa0/0x11c +[ 8.899598][ T599] do_init_module+0x2a8/0x33c +[ 8.899600][ T599] load_module+0x23cc/0x261c +[ 8.899602][ T599] __arm64_sys_finit_module+0x158/0x194 +[ 8.899604][ T599] invoke_syscall+0x94/0x2bc +[ 8.899607][ T599] el0_svc_common+0x1d8/0x34c +[ 8.899609][ T599] do_el0_svc+0x40/0x54 +[ 8.899611][ T599] el0_svc+0x94/0x2f0 +[ 8.899613][ T599] el0t_64_sync_handler+0x88/0xec +[ 8.899615][ T599] el0t_64_sync+0x1b4/0x1b8 +[ 8.899618][ T599] Code: f905f56c f86e69ec f86e6a0f 8b0c01ec (f82e6a0c) +[ 8.899620][ T599] ---[ end trace ed5218e9e5b6e2e6 ]--- + +Link: https://lkml.kernel.org/r/1668020497-13142-1-git-send-email-quic_mojha@quicinc.com +Fixes: e178a5beb369 ("gcov: clang support") +Signed-off-by: Mukesh Ojha +Reviewed-by: Peter Oberparleiter +Tested-by: Peter Oberparleiter +Cc: Nathan Chancellor +Cc: Nick Desaulniers +Cc: Tom Rix +Cc: [5.2+] +Signed-off-by: Andrew Morton +Signed-off-by: Greg Kroah-Hartman +--- + kernel/gcov/clang.c | 2 ++ + 1 file changed, 2 insertions(+) + +--- a/kernel/gcov/clang.c ++++ b/kernel/gcov/clang.c +@@ -280,6 +280,8 @@ void gcov_info_add(struct gcov_info *dst + + for (i = 0; i < sfn_ptr->num_counters; i++) + dfn_ptr->counters[i] += sfn_ptr->counters[i]; ++ ++ sfn_ptr = list_next_entry(sfn_ptr, head); + } + } + diff --git a/queue-6.0/io_uring-clear-tif_notify_signal-if-set-and-task_work-not-available.patch b/queue-6.0/io_uring-clear-tif_notify_signal-if-set-and-task_work-not-available.patch new file mode 100644 index 00000000000..c3ef61e7568 --- /dev/null +++ b/queue-6.0/io_uring-clear-tif_notify_signal-if-set-and-task_work-not-available.patch @@ -0,0 +1,41 @@ +From 7cfe7a09489c1cefee7181e07b5f2bcbaebd9f41 Mon Sep 17 00:00:00 2001 +From: Jens Axboe +Date: 
Fri, 25 Nov 2022 09:36:29 -0700 +Subject: io_uring: clear TIF_NOTIFY_SIGNAL if set and task_work not available + +From: Jens Axboe + +commit 7cfe7a09489c1cefee7181e07b5f2bcbaebd9f41 upstream. + +With how task_work is added and signaled, we can have TIF_NOTIFY_SIGNAL +set and no task_work pending as it got run in a previous loop. Treat +TIF_NOTIFY_SIGNAL like get_signal(), always clear it if set regardless +of whether or not task_work is pending to run. + +Cc: stable@vger.kernel.org +Fixes: 46a525e199e4 ("io_uring: don't gate task_work run on TIF_NOTIFY_SIGNAL") +Signed-off-by: Jens Axboe +Signed-off-by: Greg Kroah-Hartman +--- + io_uring/io_uring.h | 9 +++++++-- + 1 file changed, 7 insertions(+), 2 deletions(-) + +--- a/io_uring/io_uring.h ++++ b/io_uring/io_uring.h +@@ -229,9 +229,14 @@ static inline unsigned int io_sqring_ent + + static inline bool io_run_task_work(void) + { ++ /* ++ * Always check-and-clear the task_work notification signal. With how ++ * signaling works for task_work, we can find it set with nothing to ++ * run. We need to clear it for that case, like get_signal() does. ++ */ ++ if (test_thread_flag(TIF_NOTIFY_SIGNAL)) ++ clear_notify_signal(); + if (task_work_pending(current)) { +- if (test_thread_flag(TIF_NOTIFY_SIGNAL)) +- clear_notify_signal(); + __set_current_state(TASK_RUNNING); + task_work_run(); + return 1; diff --git a/queue-6.0/io_uring-cmpxchg-for-poll-arm-refs-release.patch b/queue-6.0/io_uring-cmpxchg-for-poll-arm-refs-release.patch new file mode 100644 index 00000000000..55c72a5e24f --- /dev/null +++ b/queue-6.0/io_uring-cmpxchg-for-poll-arm-refs-release.patch @@ -0,0 +1,56 @@ +From 2f3893437a4ebf2e892ca172e9e122841319d675 Mon Sep 17 00:00:00 2001 +From: Pavel Begunkov +Date: Sun, 20 Nov 2022 16:57:41 +0000 +Subject: io_uring: cmpxchg for poll arm refs release + +From: Pavel Begunkov + +commit 2f3893437a4ebf2e892ca172e9e122841319d675 upstream. 
+ +Replace atomically subtracting the ownership reference at the end of +arming a poll with a cmpxchg. We try to release ownership by setting 0 +assuming that poll_refs didn't change while we were arming. If it did +change, we keep the ownership and use it to queue a tw, which is fully +capable of processing all events (and even tolerates spurious wake ups). + +It's a bit more elegant as we reduce races b/w setting the cancellation +flag and getting refs with this release, and with that we don't have to +worry about any kinds of underflows. It's not the fastest path for +polling. The performance difference b/w cmpxchg and atomic dec is +usually negligible and it's not the fastest path. + +Cc: stable@vger.kernel.org +Fixes: aa43477b04025 ("io_uring: poll rework") +Signed-off-by: Pavel Begunkov +Link: https://lore.kernel.org/r/0c95251624397ea6def568ff040cad2d7926fd51.1668963050.git.asml.silence@gmail.com +Signed-off-by: Jens Axboe +Signed-off-by: Greg Kroah-Hartman +--- + io_uring/poll.c | 8 +++----- + 1 file changed, 3 insertions(+), 5 deletions(-) + +--- a/io_uring/poll.c ++++ b/io_uring/poll.c +@@ -519,7 +519,6 @@ static int __io_arm_poll_handler(struct + unsigned issue_flags) + { + struct io_ring_ctx *ctx = req->ctx; +- int v; + + INIT_HLIST_NODE(&req->hash_node); + req->work.cancel_seq = atomic_read(&ctx->cancel_seq); +@@ -587,11 +586,10 @@ static int __io_arm_poll_handler(struct + + if (ipt->owning) { + /* +- * Release ownership. If someone tried to queue a tw while it was +- * locked, kick it off for them. ++ * Try to release ownership. If we see a change of state, e.g. ++ * poll was waken up, queue up a tw, it'll deal with it. 
+ */ +- v = atomic_dec_return(&req->poll_refs); +- if (unlikely(v & IO_POLL_REF_MASK)) ++ if (atomic_cmpxchg(&req->poll_refs, 1, 0) != 1) + __io_poll_execute(req, 0); + } + return 0; diff --git a/queue-6.0/io_uring-make-poll-refs-more-robust.patch b/queue-6.0/io_uring-make-poll-refs-more-robust.patch new file mode 100644 index 00000000000..44bbaf15b45 --- /dev/null +++ b/queue-6.0/io_uring-make-poll-refs-more-robust.patch @@ -0,0 +1,101 @@ +From a26a35e9019fd70bf3cf647dcfdae87abc7bacea Mon Sep 17 00:00:00 2001 +From: Pavel Begunkov +Date: Sun, 20 Nov 2022 16:57:42 +0000 +Subject: io_uring: make poll refs more robust + +From: Pavel Begunkov + +commit a26a35e9019fd70bf3cf647dcfdae87abc7bacea upstream. + +poll_refs carry two functions, the first is ownership over the request. +The second is notifying the io_poll_check_events() that there was an +event but wake up couldn't grab the ownership, so io_poll_check_events() +should retry. + +We want to make poll_refs more robust against overflows. Instead of +always incrementing it, which covers two purposes with one atomic, check +if poll_refs is elevated enough and if so set a retry flag without +attempts to grab ownership. The gap between the bias check and following +atomics may seem racy, but we don't need it to be strict. Moreover there +might only be maximum 4 parallel updates: by the first and the second +poll entries, __io_arm_poll_handler() and cancellation. From those four, +only poll wake ups may be executed multiple times, but they're protected +by a spin. 
+ +Cc: stable@vger.kernel.org +Reported-by: Lin Ma +Fixes: aa43477b04025 ("io_uring: poll rework") +Signed-off-by: Pavel Begunkov +Link: https://lore.kernel.org/r/c762bc31f8683b3270f3587691348a7119ef9c9d.1668963050.git.asml.silence@gmail.com +Signed-off-by: Jens Axboe +Signed-off-by: Greg Kroah-Hartman +--- + io_uring/poll.c | 36 +++++++++++++++++++++++++++++++++++- + 1 file changed, 35 insertions(+), 1 deletion(-) + +--- a/io_uring/poll.c ++++ b/io_uring/poll.c +@@ -40,7 +40,14 @@ struct io_poll_table { + }; + + #define IO_POLL_CANCEL_FLAG BIT(31) +-#define IO_POLL_REF_MASK GENMASK(30, 0) ++#define IO_POLL_RETRY_FLAG BIT(30) ++#define IO_POLL_REF_MASK GENMASK(29, 0) ++ ++/* ++ * We usually have 1-2 refs taken, 128 is more than enough and we want to ++ * maximise the margin between this amount and the moment when it overflows. ++ */ ++#define IO_POLL_REF_BIAS 128 + + #define IO_WQE_F_DOUBLE 1 + +@@ -58,6 +65,21 @@ static inline bool wqe_is_double(struct + return priv & IO_WQE_F_DOUBLE; + } + ++static bool io_poll_get_ownership_slowpath(struct io_kiocb *req) ++{ ++ int v; ++ ++ /* ++ * poll_refs are already elevated and we don't have much hope for ++ * grabbing the ownership. Instead of incrementing set a retry flag ++ * to notify the loop that there might have been some change. ++ */ ++ v = atomic_fetch_or(IO_POLL_RETRY_FLAG, &req->poll_refs); ++ if (v & IO_POLL_REF_MASK) ++ return false; ++ return !(atomic_fetch_inc(&req->poll_refs) & IO_POLL_REF_MASK); ++} ++ + /* + * If refs part of ->poll_refs (see IO_POLL_REF_MASK) is 0, it's free. We can + * bump it and acquire ownership. 
It's disallowed to modify requests while not +@@ -66,6 +88,8 @@ static inline bool wqe_is_double(struct + */ + static inline bool io_poll_get_ownership(struct io_kiocb *req) + { ++ if (unlikely(atomic_read(&req->poll_refs) >= IO_POLL_REF_BIAS)) ++ return io_poll_get_ownership_slowpath(req); + return !(atomic_fetch_inc(&req->poll_refs) & IO_POLL_REF_MASK); + } + +@@ -235,6 +259,16 @@ static int io_poll_check_events(struct i + */ + if ((v & IO_POLL_REF_MASK) != 1) + req->cqe.res = 0; ++ if (v & IO_POLL_RETRY_FLAG) { ++ req->cqe.res = 0; ++ /* ++ * We won't find new events that came in between ++ * vfs_poll and the ref put unless we clear the flag ++ * in advance. ++ */ ++ atomic_andnot(IO_POLL_RETRY_FLAG, &req->poll_refs); ++ v &= ~IO_POLL_RETRY_FLAG; ++ } + + /* the mask was stashed in __io_poll_execute */ + if (!req->cqe.res) { diff --git a/queue-6.0/kvm-update-gfn_to_pfn_cache-khva-when-it-moves-within-the-same-page.patch b/queue-6.0/kvm-update-gfn_to_pfn_cache-khva-when-it-moves-within-the-same-page.patch new file mode 100644 index 00000000000..dd8f319707f --- /dev/null +++ b/queue-6.0/kvm-update-gfn_to_pfn_cache-khva-when-it-moves-within-the-same-page.patch @@ -0,0 +1,41 @@ +From 8332f0ed4f187c7b700831bd7cc83ce180a944b9 Mon Sep 17 00:00:00 2001 +From: David Woodhouse +Date: Sat, 19 Nov 2022 09:25:39 +0000 +Subject: KVM: Update gfn_to_pfn_cache khva when it moves within the same page + +From: David Woodhouse + +commit 8332f0ed4f187c7b700831bd7cc83ce180a944b9 upstream. + +In the case where a GPC is refreshed to a different location within the +same page, we didn't bother to update it. Mostly we don't need to, but +since the ->khva field also includes the offset within the page, that +does have to be updated. 
+ +Fixes: 3ba2c95ea180 ("KVM: Do not incorporate page offset into gfn=>pfn cache user address") +Signed-off-by: David Woodhouse +Reviewed-by: Paul Durrant +Reviewed-by: Sean Christopherson +Cc: stable@kernel.org +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman +--- + virt/kvm/pfncache.c | 7 ++++++- + 1 file changed, 6 insertions(+), 1 deletion(-) + +--- a/virt/kvm/pfncache.c ++++ b/virt/kvm/pfncache.c +@@ -297,7 +297,12 @@ int kvm_gfn_to_pfn_cache_refresh(struct + if (!gpc->valid || old_uhva != gpc->uhva) { + ret = hva_to_pfn_retry(kvm, gpc); + } else { +- /* If the HVA→PFN mapping was already valid, don't unmap it. */ ++ /* ++ * If the HVA→PFN mapping was already valid, don't unmap it. ++ * But do update gpc->khva because the offset within the page ++ * may have changed. ++ */ ++ gpc->khva = old_khva + page_offset; + old_pfn = KVM_PFN_ERR_FAULT; + old_khva = NULL; + ret = 0; diff --git a/queue-6.0/kvm-x86-add-kvm_leave_nested.patch b/queue-6.0/kvm-x86-add-kvm_leave_nested.patch new file mode 100644 index 00000000000..e5162ab5191 --- /dev/null +++ b/queue-6.0/kvm-x86-add-kvm_leave_nested.patch @@ -0,0 +1,71 @@ +From f9697df251438b0798780900e8b43bdb12a56d64 Mon Sep 17 00:00:00 2001 +From: Maxim Levitsky +Date: Thu, 3 Nov 2022 16:13:45 +0200 +Subject: KVM: x86: add kvm_leave_nested + +From: Maxim Levitsky + +commit f9697df251438b0798780900e8b43bdb12a56d64 upstream. + +add kvm_leave_nested which wraps a call to nested_ops->leave_nested +into a function. 
+ +Cc: stable@vger.kernel.org +Signed-off-by: Maxim Levitsky +Message-Id: <20221103141351.50662-4-mlevitsk@redhat.com> +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kvm/svm/nested.c | 3 --- + arch/x86/kvm/vmx/nested.c | 3 --- + arch/x86/kvm/x86.c | 8 +++++++- + 3 files changed, 7 insertions(+), 7 deletions(-) + +--- a/arch/x86/kvm/svm/nested.c ++++ b/arch/x86/kvm/svm/nested.c +@@ -1164,9 +1164,6 @@ void svm_free_nested(struct vcpu_svm *sv + svm->nested.initialized = false; + } + +-/* +- * Forcibly leave nested mode in order to be able to reset the VCPU later on. +- */ + void svm_leave_nested(struct kvm_vcpu *vcpu) + { + struct vcpu_svm *svm = to_svm(vcpu); +--- a/arch/x86/kvm/vmx/nested.c ++++ b/arch/x86/kvm/vmx/nested.c +@@ -6294,9 +6294,6 @@ out: + return kvm_state.size; + } + +-/* +- * Forcibly leave nested mode in order to be able to reset the VCPU later on. +- */ + void vmx_leave_nested(struct kvm_vcpu *vcpu) + { + if (is_guest_mode(vcpu)) { +--- a/arch/x86/kvm/x86.c ++++ b/arch/x86/kvm/x86.c +@@ -611,6 +611,12 @@ void kvm_deliver_exception_payload(struc + } + EXPORT_SYMBOL_GPL(kvm_deliver_exception_payload); + ++/* Forcibly leave the nested mode in cases like a vCPU reset */ ++static void kvm_leave_nested(struct kvm_vcpu *vcpu) ++{ ++ kvm_x86_ops.nested_ops->leave_nested(vcpu); ++} ++ + static void kvm_multiple_exception(struct kvm_vcpu *vcpu, + unsigned nr, bool has_error, u32 error_code, + bool has_payload, unsigned long payload, bool reinject) +@@ -5154,7 +5160,7 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_e + + if (events->flags & KVM_VCPUEVENT_VALID_SMM) { + if (!!(vcpu->arch.hflags & HF_SMM_MASK) != events->smi.smm) { +- kvm_x86_ops.nested_ops->leave_nested(vcpu); ++ kvm_leave_nested(vcpu); + kvm_smm_changed(vcpu, events->smi.smm); + } + diff --git a/queue-6.0/kvm-x86-forcibly-leave-nested-mode-on-vcpu-reset.patch b/queue-6.0/kvm-x86-forcibly-leave-nested-mode-on-vcpu-reset.patch new file mode 100644 index 
00000000000..7b4de10ce48 --- /dev/null +++ b/queue-6.0/kvm-x86-forcibly-leave-nested-mode-on-vcpu-reset.patch @@ -0,0 +1,57 @@ +From ed129ec9057f89d615ba0c81a4984a90345a1684 Mon Sep 17 00:00:00 2001 +From: Maxim Levitsky +Date: Thu, 3 Nov 2022 16:13:46 +0200 +Subject: KVM: x86: forcibly leave nested mode on vCPU reset + +From: Maxim Levitsky + +commit ed129ec9057f89d615ba0c81a4984a90345a1684 upstream. + +While not obvious, kvm_vcpu_reset() leaves the nested mode by clearing +'vcpu->arch.hflags' but it does so without all the required housekeeping. + +On SVM, it is possible to have a vCPU reset while in guest mode because +unlike VMX, on SVM, INIT's are not latched in SVM non root mode and in +addition to that L1 doesn't have to intercept triple fault, which should +also trigger L1's reset if it happens in L2 while L1 didn't intercept it. + +If one of the above conditions happens, KVM will continue to use vmcb02 +while not being in guest mode. + +Later the IA32_EFER will be cleared which will lead to freeing of the +nested guest state which will (correctly) free the vmcb02, but since +KVM still uses it (incorrectly) this will lead to a use after free +and kernel crash. + +This issue is assigned CVE-2022-3344 + +Cc: stable@vger.kernel.org +Signed-off-by: Maxim Levitsky +Message-Id: <20221103141351.50662-5-mlevitsk@redhat.com> +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kvm/x86.c | 10 ++++++++++ + 1 file changed, 10 insertions(+) + +--- a/arch/x86/kvm/x86.c ++++ b/arch/x86/kvm/x86.c +@@ -11789,8 +11789,18 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcp + WARN_ON_ONCE(!init_event && + (old_cr0 || kvm_read_cr3(vcpu) || kvm_read_cr4(vcpu))); + ++ /* ++ * SVM doesn't unconditionally VM-Exit on INIT and SHUTDOWN, thus it's ++ * possible to INIT the vCPU while L2 is active. Force the vCPU back ++ * into L1 as EFER.SVME is cleared on INIT (along with all other EFER ++ * bits), i.e. virtualization is disabled. 
++ */ ++ if (is_guest_mode(vcpu)) ++ kvm_leave_nested(vcpu); ++ + kvm_lapic_reset(vcpu, init_event); + ++ WARN_ON_ONCE(is_guest_mode(vcpu) || is_smm(vcpu)); + vcpu->arch.hflags = 0; + + vcpu->arch.smi_pending = 0; diff --git a/queue-6.0/kvm-x86-mmu-fix-race-condition-in-direct_page_fault.patch b/queue-6.0/kvm-x86-mmu-fix-race-condition-in-direct_page_fault.patch new file mode 100644 index 00000000000..51ccc94a885 --- /dev/null +++ b/queue-6.0/kvm-x86-mmu-fix-race-condition-in-direct_page_fault.patch @@ -0,0 +1,101 @@ +From 47b0c2e4c220f2251fd8dcfbb44479819c715e15 Mon Sep 17 00:00:00 2001 +From: Kazuki Takiguchi +Date: Wed, 23 Nov 2022 14:36:00 -0500 +Subject: KVM: x86/mmu: Fix race condition in direct_page_fault + +From: Kazuki Takiguchi + +commit 47b0c2e4c220f2251fd8dcfbb44479819c715e15 upstream. + +make_mmu_pages_available() must be called with mmu_lock held for write. +However, if the TDP MMU is used, it will be called with mmu_lock held for +read. +This function does nothing unless shadow pages are used, so there is no +race unless nested TDP is used. +Since nested TDP uses shadow pages, old shadow pages may be zapped by this +function even when the TDP MMU is enabled. +Since shadow pages are never allocated by kvm_tdp_mmu_map(), a race +condition can be avoided by not calling make_mmu_pages_available() if the +TDP MMU is currently in use. + +I encountered this when repeatedly starting and stopping nested VM. +It can be artificially caused by allocating a large number of nested TDP +SPTEs. + +For example, the following BUG and general protection fault are caused in +the host kernel. + +pte_list_remove: 00000000cd54fc10 many->many +------------[ cut here ]------------ +kernel BUG at arch/x86/kvm/mmu/mmu.c:963! 
+invalid opcode: 0000 [#1] PREEMPT SMP NOPTI +RIP: 0010:pte_list_remove.cold+0x16/0x48 [kvm] +Call Trace: + + drop_spte+0xe0/0x180 [kvm] + mmu_page_zap_pte+0x4f/0x140 [kvm] + __kvm_mmu_prepare_zap_page+0x62/0x3e0 [kvm] + kvm_mmu_zap_oldest_mmu_pages+0x7d/0xf0 [kvm] + direct_page_fault+0x3cb/0x9b0 [kvm] + kvm_tdp_page_fault+0x2c/0xa0 [kvm] + kvm_mmu_page_fault+0x207/0x930 [kvm] + npf_interception+0x47/0xb0 [kvm_amd] + svm_invoke_exit_handler+0x13c/0x1a0 [kvm_amd] + svm_handle_exit+0xfc/0x2c0 [kvm_amd] + kvm_arch_vcpu_ioctl_run+0xa79/0x1780 [kvm] + kvm_vcpu_ioctl+0x29b/0x6f0 [kvm] + __x64_sys_ioctl+0x95/0xd0 + do_syscall_64+0x5c/0x90 + +general protection fault, probably for non-canonical address +0xdead000000000122: 0000 [#1] PREEMPT SMP NOPTI +RIP: 0010:kvm_mmu_commit_zap_page.part.0+0x4b/0xe0 [kvm] +Call Trace: + + kvm_mmu_zap_oldest_mmu_pages+0xae/0xf0 [kvm] + direct_page_fault+0x3cb/0x9b0 [kvm] + kvm_tdp_page_fault+0x2c/0xa0 [kvm] + kvm_mmu_page_fault+0x207/0x930 [kvm] + npf_interception+0x47/0xb0 [kvm_amd] + +CVE: CVE-2022-45869 +Fixes: a2855afc7ee8 ("KVM: x86/mmu: Allow parallel page faults for the TDP MMU") +Signed-off-by: Kazuki Takiguchi +Cc: stable@vger.kernel.org +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kvm/mmu/mmu.c | 13 +++++++------ + 1 file changed, 7 insertions(+), 6 deletions(-) + +--- a/arch/x86/kvm/mmu/mmu.c ++++ b/arch/x86/kvm/mmu/mmu.c +@@ -2431,6 +2431,7 @@ static bool __kvm_mmu_prepare_zap_page(s + { + bool list_unstable, zapped_root = false; + ++ lockdep_assert_held_write(&kvm->mmu_lock); + trace_kvm_mmu_prepare_zap_page(sp); + ++kvm->stat.mmu_shadow_zapped; + *nr_zapped = mmu_zap_unsync_children(kvm, sp, invalid_list); +@@ -4250,14 +4251,14 @@ static int direct_page_fault(struct kvm_ + if (is_page_fault_stale(vcpu, fault, mmu_seq)) + goto out_unlock; + +- r = make_mmu_pages_available(vcpu); +- if (r) +- goto out_unlock; +- +- if (is_tdp_mmu_fault) ++ if (is_tdp_mmu_fault) { + r = kvm_tdp_mmu_map(vcpu, 
fault); +- else ++ } else { ++ r = make_mmu_pages_available(vcpu); ++ if (r) ++ goto out_unlock; + r = __direct_map(vcpu, fault); ++ } + + out_unlock: + if (is_tdp_mmu_fault) diff --git a/queue-6.0/kvm-x86-nsvm-harden-svm_free_nested-against-freeing-vmcb02-while-still-in-use.patch b/queue-6.0/kvm-x86-nsvm-harden-svm_free_nested-against-freeing-vmcb02-while-still-in-use.patch new file mode 100644 index 00000000000..0733d91a8f2 --- /dev/null +++ b/queue-6.0/kvm-x86-nsvm-harden-svm_free_nested-against-freeing-vmcb02-while-still-in-use.patch @@ -0,0 +1,36 @@ +From 16ae56d7e0528559bf8dc9070e3bfd8ba3de80df Mon Sep 17 00:00:00 2001 +From: Maxim Levitsky +Date: Thu, 3 Nov 2022 16:13:44 +0200 +Subject: KVM: x86: nSVM: harden svm_free_nested against freeing vmcb02 while still in use + +From: Maxim Levitsky + +commit 16ae56d7e0528559bf8dc9070e3bfd8ba3de80df upstream. + +Make sure that KVM uses vmcb01 before freeing nested state, and warn if +that is not the case. + +This is a minimal fix for CVE-2022-3344 making the kernel print a warning +instead of a kernel panic. 
+ +Cc: stable@vger.kernel.org +Signed-off-by: Maxim Levitsky +Message-Id: <20221103141351.50662-3-mlevitsk@redhat.com> +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kvm/svm/nested.c | 3 +++ + 1 file changed, 3 insertions(+) + +--- a/arch/x86/kvm/svm/nested.c ++++ b/arch/x86/kvm/svm/nested.c +@@ -1143,6 +1143,9 @@ void svm_free_nested(struct vcpu_svm *sv + if (!svm->nested.initialized) + return; + ++ if (WARN_ON_ONCE(svm->vmcb != svm->vmcb01.ptr)) ++ svm_switch_vmcb(svm, &svm->vmcb01); ++ + svm_vcpu_free_msrpm(svm->nested.msrpm); + svm->nested.msrpm = NULL; + diff --git a/queue-6.0/kvm-x86-nsvm-leave-nested-mode-on-vcpu-free.patch b/queue-6.0/kvm-x86-nsvm-leave-nested-mode-on-vcpu-free.patch new file mode 100644 index 00000000000..4c6c703063b --- /dev/null +++ b/queue-6.0/kvm-x86-nsvm-leave-nested-mode-on-vcpu-free.patch @@ -0,0 +1,33 @@ +From 917401f26a6af5756d89b550a8e1bd50cf42b07e Mon Sep 17 00:00:00 2001 +From: Maxim Levitsky +Date: Thu, 3 Nov 2022 16:13:43 +0200 +Subject: KVM: x86: nSVM: leave nested mode on vCPU free + +From: Maxim Levitsky + +commit 917401f26a6af5756d89b550a8e1bd50cf42b07e upstream. + +If the VM was terminated while nested, we free the nested state +while the vCPU still is in nested mode. + +Soon a warning will be added for this condition. 
+ +Cc: stable@vger.kernel.org +Signed-off-by: Maxim Levitsky +Message-Id: <20221103141351.50662-2-mlevitsk@redhat.com> +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kvm/svm/svm.c | 1 + + 1 file changed, 1 insertion(+) + +--- a/arch/x86/kvm/svm/svm.c ++++ b/arch/x86/kvm/svm/svm.c +@@ -1440,6 +1440,7 @@ static void svm_vcpu_free(struct kvm_vcp + */ + svm_clear_current_vmcb(svm->vmcb); + ++ svm_leave_nested(vcpu); + svm_free_nested(svm); + + sev_free_vcpu(vcpu); diff --git a/queue-6.0/kvm-x86-remove-exit_int_info-warning-in-svm_handle_exit.patch b/queue-6.0/kvm-x86-remove-exit_int_info-warning-in-svm_handle_exit.patch new file mode 100644 index 00000000000..73f4e756b4e --- /dev/null +++ b/queue-6.0/kvm-x86-remove-exit_int_info-warning-in-svm_handle_exit.patch @@ -0,0 +1,58 @@ +From 05311ce954aebe75935d9ae7d38ac82b5b796e33 Mon Sep 17 00:00:00 2001 +From: Maxim Levitsky +Date: Thu, 3 Nov 2022 16:13:51 +0200 +Subject: KVM: x86: remove exit_int_info warning in svm_handle_exit + +From: Maxim Levitsky + +commit 05311ce954aebe75935d9ae7d38ac82b5b796e33 upstream. + +It is valid to receive external interrupt and have broken IDT entry, +which will lead to #GP with exit_int_info that will contain the index of +the IDT entry (e.g. any value). + +Other exceptions can happen as well, like #NP or #SS +(if stack switch fails). + +Thus this warning can be user triggered and has very little value.
+ +Cc: stable@vger.kernel.org +Signed-off-by: Maxim Levitsky +Message-Id: <20221103141351.50662-10-mlevitsk@redhat.com> +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kvm/svm/svm.c | 15 --------------- + 1 file changed, 15 deletions(-) + +--- a/arch/x86/kvm/svm/svm.c ++++ b/arch/x86/kvm/svm/svm.c +@@ -346,12 +346,6 @@ int svm_set_efer(struct kvm_vcpu *vcpu, + return 0; + } + +-static int is_external_interrupt(u32 info) +-{ +- info &= SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID; +- return info == (SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR); +-} +- + static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu) + { + struct vcpu_svm *svm = to_svm(vcpu); +@@ -3427,15 +3421,6 @@ static int svm_handle_exit(struct kvm_vc + return 0; + } + +- if (is_external_interrupt(svm->vmcb->control.exit_int_info) && +- exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR && +- exit_code != SVM_EXIT_NPF && exit_code != SVM_EXIT_TASK_SWITCH && +- exit_code != SVM_EXIT_INTR && exit_code != SVM_EXIT_NMI) +- printk(KERN_ERR "%s: unexpected exit_int_info 0x%x " +- "exit_code 0x%x\n", +- __func__, svm->vmcb->control.exit_int_info, +- exit_code); +- + if (exit_fastpath != EXIT_FASTPATH_NONE) + return 1; + diff --git a/queue-6.0/kvm-x86-xen-only-do-in-kernel-acceleration-of-hypercalls-for-guest-cpl0.patch b/queue-6.0/kvm-x86-xen-only-do-in-kernel-acceleration-of-hypercalls-for-guest-cpl0.patch new file mode 100644 index 00000000000..d3969040887 --- /dev/null +++ b/queue-6.0/kvm-x86-xen-only-do-in-kernel-acceleration-of-hypercalls-for-guest-cpl0.patch @@ -0,0 +1,64 @@ +From c2b8cdfaf3a6721afe0c8c060a631b1c67a7f1ee Mon Sep 17 00:00:00 2001 +From: David Woodhouse +Date: Sat, 12 Nov 2022 13:52:25 +0000 +Subject: KVM: x86/xen: Only do in-kernel acceleration of hypercalls for guest CPL0 + +From: David Woodhouse + +commit c2b8cdfaf3a6721afe0c8c060a631b1c67a7f1ee upstream. 
+ +There are almost no hypercalls which are valid from CPL > 0, and definitely +none which are handled by the kernel. + +Fixes: 2fd6df2f2b47 ("KVM: x86/xen: intercept EVTCHNOP_send from guests") +Reported-by: Michal Luczaj +Signed-off-by: David Woodhouse +Reviewed-by: Sean Christopherson +Cc: stable@kernel.org +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kvm/xen.c | 12 +++++++++++- + 1 file changed, 11 insertions(+), 1 deletion(-) + +--- a/arch/x86/kvm/xen.c ++++ b/arch/x86/kvm/xen.c +@@ -1216,6 +1216,7 @@ int kvm_xen_hypercall(struct kvm_vcpu *v + bool longmode; + u64 input, params[6], r = -ENOSYS; + bool handled = false; ++ u8 cpl; + + input = (u64)kvm_register_read(vcpu, VCPU_REGS_RAX); + +@@ -1243,9 +1244,17 @@ int kvm_xen_hypercall(struct kvm_vcpu *v + params[5] = (u64)kvm_r9_read(vcpu); + } + #endif ++ cpl = static_call(kvm_x86_get_cpl)(vcpu); + trace_kvm_xen_hypercall(input, params[0], params[1], params[2], + params[3], params[4], params[5]); + ++ /* ++ * Only allow hypercall acceleration for CPL0. The rare hypercalls that ++ * are permitted in guest userspace can be handled by the VMM. 
++ */ ++ if (unlikely(cpl > 0)) ++ goto handle_in_userspace; ++ + switch (input) { + case __HYPERVISOR_xen_version: + if (params[0] == XENVER_version && vcpu->kvm->arch.xen.xen_version) { +@@ -1280,10 +1289,11 @@ int kvm_xen_hypercall(struct kvm_vcpu *v + if (handled) + return kvm_xen_hypercall_set_result(vcpu, r); + ++handle_in_userspace: + vcpu->run->exit_reason = KVM_EXIT_XEN; + vcpu->run->xen.type = KVM_EXIT_XEN_HCALL; + vcpu->run->xen.u.hcall.longmode = longmode; +- vcpu->run->xen.u.hcall.cpl = static_call(kvm_x86_get_cpl)(vcpu); ++ vcpu->run->xen.u.hcall.cpl = cpl; + vcpu->run->xen.u.hcall.input = input; + vcpu->run->xen.u.hcall.params[0] = params[0]; + vcpu->run->xen.u.hcall.params[1] = params[1]; diff --git a/queue-6.0/kvm-x86-xen-validate-port-number-in-schedop_poll.patch b/queue-6.0/kvm-x86-xen-validate-port-number-in-schedop_poll.patch new file mode 100644 index 00000000000..9aac9e54ded --- /dev/null +++ b/queue-6.0/kvm-x86-xen-validate-port-number-in-schedop_poll.patch @@ -0,0 +1,68 @@ +From 4ea9439fd537313f3381f0af4ebbf05e3f51a58c Mon Sep 17 00:00:00 2001 +From: David Woodhouse +Date: Sat, 12 Nov 2022 13:48:58 +0000 +Subject: KVM: x86/xen: Validate port number in SCHEDOP_poll + +From: David Woodhouse + +commit 4ea9439fd537313f3381f0af4ebbf05e3f51a58c upstream. + +We shouldn't allow guests to poll on arbitrary port numbers off the end +of the event channel table. + +Fixes: 1a65105a5aba ("KVM: x86/xen: handle PV spinlocks slowpath") +[dwmw2: my bug though; the original version did check the validity as a + side-effect of an idr_find() which I ripped out in refactoring.] 
+Reported-by: Michal Luczaj +Signed-off-by: David Woodhouse +Reviewed-by: Sean Christopherson +Cc: stable@kernel.org +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kvm/xen.c | 20 ++++++++++++-------- + 1 file changed, 12 insertions(+), 8 deletions(-) + +--- a/arch/x86/kvm/xen.c ++++ b/arch/x86/kvm/xen.c +@@ -954,6 +954,14 @@ static int kvm_xen_hypercall_complete_us + return kvm_xen_hypercall_set_result(vcpu, run->xen.u.hcall.result); + } + ++static inline int max_evtchn_port(struct kvm *kvm) ++{ ++ if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) ++ return EVTCHN_2L_NR_CHANNELS; ++ else ++ return COMPAT_EVTCHN_2L_NR_CHANNELS; ++} ++ + static bool wait_pending_event(struct kvm_vcpu *vcpu, int nr_ports, + evtchn_port_t *ports) + { +@@ -1042,6 +1050,10 @@ static bool kvm_xen_schedop_poll(struct + *r = -EFAULT; + goto out; + } ++ if (ports[i] >= max_evtchn_port(vcpu->kvm)) { ++ *r = -EINVAL; ++ goto out; ++ } + } + + if (sched_poll.nr_ports == 1) +@@ -1308,14 +1320,6 @@ handle_in_userspace: + return 0; + } + +-static inline int max_evtchn_port(struct kvm *kvm) +-{ +- if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) +- return EVTCHN_2L_NR_CHANNELS; +- else +- return COMPAT_EVTCHN_2L_NR_CHANNELS; +-} +- + static void kvm_xen_check_poller(struct kvm_vcpu *vcpu, int port) + { + int poll_evtchn = vcpu->arch.xen.poll_evtchn; diff --git a/queue-6.0/mm-cgroup-reclaim-fix-dirty-pages-throttling-on-cgroup-v1.patch b/queue-6.0/mm-cgroup-reclaim-fix-dirty-pages-throttling-on-cgroup-v1.patch new file mode 100644 index 00000000000..665354fe117 --- /dev/null +++ b/queue-6.0/mm-cgroup-reclaim-fix-dirty-pages-throttling-on-cgroup-v1.patch @@ -0,0 +1,65 @@ +From 81a70c21d9170de67a45843bdd627f4cce9c4215 Mon Sep 17 00:00:00 2001 +From: "Aneesh Kumar K.V" +Date: Fri, 18 Nov 2022 12:36:03 +0530 +Subject: mm/cgroup/reclaim: fix dirty pages throttling on cgroup v1 + +From: Aneesh Kumar K.V + +commit 81a70c21d9170de67a45843bdd627f4cce9c4215 
upstream. + +balance_dirty_pages doesn't do the required dirty throttling on cgroupv1. +See commit 9badce000e2c ("cgroup, writeback: don't enable cgroup writeback +on traditional hierarchies"). Instead, the kernel depends on writeback +throttling in shrink_folio_list to achieve the same goal. With large +memory systems, the flusher may not be able to writeback quickly enough +such that we will start finding pages in the shrink_folio_list already in +writeback. Hence for cgroupv1 let's do a reclaim throttle after waking up +the flusher. + +The below test which used to fail on a 256GB system completes till the +file system is full with this change. + +root@lp2:/sys/fs/cgroup/memory# mkdir test +root@lp2:/sys/fs/cgroup/memory# cd test/ +root@lp2:/sys/fs/cgroup/memory/test# echo 120M > memory.limit_in_bytes +root@lp2:/sys/fs/cgroup/memory/test# echo $$ > tasks +root@lp2:/sys/fs/cgroup/memory/test# dd if=/dev/zero of=/home/kvaneesh/test bs=1M +Killed + +Link: https://lkml.kernel.org/r/20221118070603.84081-1-aneesh.kumar@linux.ibm.com +Signed-off-by: Aneesh Kumar K.V +Suggested-by: Johannes Weiner +Acked-by: Johannes Weiner +Cc: Tejun Heo +Cc: zefan li +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Greg Kroah-Hartman +--- + mm/vmscan.c | 14 +++++++++++++- + 1 file changed, 13 insertions(+), 1 deletion(-) + +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -2472,8 +2472,20 @@ shrink_inactive_list(unsigned long nr_to + * the flushers simply cannot keep up with the allocation + * rate. Nudge the flusher threads in case they are asleep. + */ +- if (stat.nr_unqueued_dirty == nr_taken) ++ if (stat.nr_unqueued_dirty == nr_taken) { + wakeup_flusher_threads(WB_REASON_VMSCAN); ++ /* ++ * For cgroupv1 dirty throttling is achieved by waking up ++ * the kernel flusher here and later waiting on folios ++ * which are in writeback to finish (see shrink_folio_list()).
++ * ++ * Flusher may not be able to issue writeback quickly ++ * enough for cgroupv1 writeback throttling to work ++ * on a large system. ++ */ ++ if (!writeback_throttling_sane(sc)) ++ reclaim_throttle(pgdat, VMSCAN_THROTTLE_WRITEBACK); ++ } + + sc->nr.dirty += stat.nr_dirty; + sc->nr.congested += stat.nr_congested; diff --git a/queue-6.0/mm-vmscan-fix-extreme-overreclaim-and-swap-floods.patch b/queue-6.0/mm-vmscan-fix-extreme-overreclaim-and-swap-floods.patch new file mode 100644 index 00000000000..254288ba165 --- /dev/null +++ b/queue-6.0/mm-vmscan-fix-extreme-overreclaim-and-swap-floods.patch @@ -0,0 +1,137 @@ +From f53af4285d775cd9a9a146fc438bd0a1bee1838a Mon Sep 17 00:00:00 2001 +From: Johannes Weiner +Date: Tue, 2 Aug 2022 12:28:11 -0400 +Subject: mm: vmscan: fix extreme overreclaim and swap floods + +From: Johannes Weiner + +commit f53af4285d775cd9a9a146fc438bd0a1bee1838a upstream. + +During proactive reclaim, we sometimes observe severe overreclaim, with +several thousand times more pages reclaimed than requested. + +This trace was obtained from shrink_lruvec() during such an instance: + + prio:0 anon_cost:1141521 file_cost:7767 + nr_reclaimed:4387406 nr_to_reclaim:1047 (or_factor:4190) + nr=[7161123 345 578 1111] + +While the reclaimer requested 4M, vmscan reclaimed close to 16G, most of it +by swapping. These requests take over a minute, during which the write() +to memory.reclaim is unkillably stuck inside the kernel. + +Digging into the source, this is caused by the proportional reclaim +bailout logic. This code tries to resolve a fundamental conflict: to +reclaim roughly what was requested, while also aging all LRUs fairly and +in accordance to their size, swappiness, refault rates etc. The way it +attempts fairness is that once the reclaim goal has been reached, it stops +scanning the LRUs with the smaller remaining scan targets, and adjusts the +remainder of the bigger LRUs according to how much of the smaller LRUs was +scanned.
It then finishes scanning that remainder regardless of the +reclaim goal. + +This works fine if priority levels are low and the LRU lists are +comparable in size. However, in this instance, the cgroup that is +targeted by proactive reclaim has almost no files left - they've already +been squeezed out by proactive reclaim earlier - and the remaining anon +pages are hot. Anon rotations cause the priority level to drop to 0, +which results in reclaim targeting all of anon (a lot) and all of file +(almost nothing). By the time reclaim decides to bail, it has scanned +most or all of the file target, and therefore must also scan most or all of +the enormous anon target. This target is thousands of times larger than +the reclaim goal, thus causing the overreclaim. + +The bailout code hasn't changed in years, why is this failing now? The +most likely explanations are two other recent changes in anon reclaim: + +1. Before the series starting with commit 5df741963d52 ("mm: fix LRU + balancing effect of new transparent huge pages"), the VM was + overall relatively reluctant to swap at all, even if swap was + configured. This means the LRU balancing code didn't come into play + as often as it does now, and mostly in high pressure situations + where pronounced swap activity wouldn't be as surprising. + +2. For historic reasons, shrink_lruvec() loops on the scan targets of + all LRU lists except the active anon one, meaning it would bail if + the only remaining pages to scan were active anon - even if there + were a lot of them. + + Before the series starting with commit ccc5dc67340c ("mm/vmscan: + make active/inactive ratio as 1:1 for anon lru"), most anon pages + would live on the active LRU; the inactive one would contain only a + handful of preselected reclaim candidates. After the series, anon + gets aged similarly to file, and the inactive list is the default + for new anon pages as well, making it often the much bigger list.
+ + As a result, the VM is now more likely to actually finish large + anon targets than before. + +Change the code such that only one SWAP_CLUSTER_MAX-sized nudge toward the +larger LRU lists is made before bailing out on a met reclaim goal. + +This fixes the extreme overreclaim problem. + +Fairness is more subtle and harder to evaluate. No obvious misbehavior +was observed on the test workload, in any case. Conceptually, fairness +should primarily be a cumulative effect from regular, lower priority +scans. Once the VM is in trouble and needs to escalate scan targets to +make forward progress, fairness needs to take a backseat. This is also +acknowledged by the myriad exceptions in get_scan_count(). This patch +makes fairness decrease gradually, as it keeps fairness work static over +increasing priority levels with growing scan targets. This should make +more sense - although we may have to re-visit the exact values. + +Link: https://lkml.kernel.org/r/20220802162811.39216-1-hannes@cmpxchg.org +Signed-off-by: Johannes Weiner +Reviewed-by: Rik van Riel +Acked-by: Mel Gorman +Cc: Hugh Dickins +Cc: Joonsoo Kim +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Greg Kroah-Hartman +--- + mm/vmscan.c | 10 ++++------ + 1 file changed, 4 insertions(+), 6 deletions(-) + +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -2967,8 +2967,8 @@ static void shrink_lruvec(struct lruvec + enum lru_list lru; + unsigned long nr_reclaimed = 0; + unsigned long nr_to_reclaim = sc->nr_to_reclaim; ++ bool proportional_reclaim; + struct blk_plug plug; +- bool scan_adjusted; + + get_scan_count(lruvec, sc, nr); + +@@ -2986,8 +2986,8 @@ static void shrink_lruvec(struct lruvec + * abort proportional reclaim if either the file or anon lru has already + * dropped to zero at the first pass. 
+ */ +- scan_adjusted = (!cgroup_reclaim(sc) && !current_is_kswapd() && +- sc->priority == DEF_PRIORITY); ++ proportional_reclaim = (!cgroup_reclaim(sc) && !current_is_kswapd() && ++ sc->priority == DEF_PRIORITY); + + blk_start_plug(&plug); + while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || +@@ -3007,7 +3007,7 @@ static void shrink_lruvec(struct lruvec + + cond_resched(); + +- if (nr_reclaimed < nr_to_reclaim || scan_adjusted) ++ if (nr_reclaimed < nr_to_reclaim || proportional_reclaim) + continue; + + /* +@@ -3058,8 +3058,6 @@ static void shrink_lruvec(struct lruvec + nr_scanned = targets[lru] - nr[lru]; + nr[lru] = targets[lru] * (100 - percentage) / 100; + nr[lru] -= min(nr[lru], nr_scanned); +- +- scan_adjusted = true; + } + blk_finish_plug(&plug); + sc->nr_reclaimed += nr_reclaimed; diff --git a/queue-6.0/nilfs2-fix-nilfs_sufile_mark_dirty-not-set-segment-usage-as-dirty.patch b/queue-6.0/nilfs2-fix-nilfs_sufile_mark_dirty-not-set-segment-usage-as-dirty.patch new file mode 100644 index 00000000000..df1c045df31 --- /dev/null +++ b/queue-6.0/nilfs2-fix-nilfs_sufile_mark_dirty-not-set-segment-usage-as-dirty.patch @@ -0,0 +1,77 @@ +From 512c5ca01a3610ab14ff6309db363de51f1c13a6 Mon Sep 17 00:00:00 2001 +From: Chen Zhongjin +Date: Fri, 18 Nov 2022 14:33:04 +0800 +Subject: nilfs2: fix nilfs_sufile_mark_dirty() not set segment usage as dirty + +From: Chen Zhongjin + +commit 512c5ca01a3610ab14ff6309db363de51f1c13a6 upstream. + +When extending segments, nilfs_sufile_alloc() is called to get an +unassigned segment, then mark it as dirty to avoid accidentally allocating +the same segment in the future. + +But for some special cases such as a corrupted image it can be unreliable. +If such corruption of the dirty state of the segment occurs, nilfs2 may +reallocate a segment that is in use and pick the same segment for writing +twice at the same time. 
+ +This will cause the problem reported by syzkaller: +https://syzkaller.appspot.com/bug?id=c7c4748e11ffcc367cef04f76e02e931833cbd24 + +This case started with segbuf1.segnum = 3, nextnum = 4 when constructed. +It supposed segment 4 has already been allocated and marked as dirty. + +However the dirty state was corrupted and segment 4 usage was not dirty. +For the first time nilfs_segctor_extend_segments() segment 4 was allocated +again, which made segbuf2 and next segbuf3 had same segment 4. + +sb_getblk() will get same bh for segbuf2 and segbuf3, and this bh is added +to both buffer lists of two segbuf. It makes the lists broken which +causes NULL pointer dereference. + +Fix the problem by setting usage as dirty every time in +nilfs_sufile_mark_dirty(), which is called during constructing current +segment to be written out and before allocating next segment. + +[chenzhongjin@huawei.com: add lock protection per Ryusuke] + Link: https://lkml.kernel.org/r/20221121091141.214703-1-chenzhongjin@huawei.com +Link: https://lkml.kernel.org/r/20221118063304.140187-1-chenzhongjin@huawei.com +Fixes: 9ff05123e3bf ("nilfs2: segment constructor") +Signed-off-by: Chen Zhongjin +Reported-by: +Reported-by: Liu Shixin +Acked-by: Ryusuke Konishi +Tested-by: Ryusuke Konishi +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Greg Kroah-Hartman +--- + fs/nilfs2/sufile.c | 8 ++++++++ + 1 file changed, 8 insertions(+) + +--- a/fs/nilfs2/sufile.c ++++ b/fs/nilfs2/sufile.c +@@ -495,14 +495,22 @@ void nilfs_sufile_do_free(struct inode * + int nilfs_sufile_mark_dirty(struct inode *sufile, __u64 segnum) + { + struct buffer_head *bh; ++ void *kaddr; ++ struct nilfs_segment_usage *su; + int ret; + ++ down_write(&NILFS_MDT(sufile)->mi_sem); + ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0, &bh); + if (!ret) { + mark_buffer_dirty(bh); + nilfs_mdt_mark_dirty(sufile); ++ kaddr = kmap_atomic(bh->b_page); ++ su = nilfs_sufile_block_get_segment_usage(sufile, segnum, bh, kaddr); ++ 
nilfs_segment_usage_set_dirty(su); ++ kunmap_atomic(kaddr); + brelse(bh); + } ++ up_write(&NILFS_MDT(sufile)->mi_sem); + return ret; + } + diff --git a/queue-6.0/series b/queue-6.0/series index f633c9eca5b..28a92ec2bac 100644 --- a/queue-6.0/series +++ b/queue-6.0/series @@ -192,3 +192,21 @@ drm-amd-display-fix-calculation-for-cursor-cab-alloc.patch usb-dwc3-gadget-conditionally-remove-requests.patch usb-dwc3-gadget-return-eshutdown-on-ep-disable.patch usb-dwc3-gadget-clear-ep-descriptor-last.patch +io_uring-cmpxchg-for-poll-arm-refs-release.patch +io_uring-make-poll-refs-more-robust.patch +io_uring-clear-tif_notify_signal-if-set-and-task_work-not-available.patch +nilfs2-fix-nilfs_sufile_mark_dirty-not-set-segment-usage-as-dirty.patch +gcov-clang-fix-the-buffer-overflow-issue.patch +mm-cgroup-reclaim-fix-dirty-pages-throttling-on-cgroup-v1.patch +mm-vmscan-fix-extreme-overreclaim-and-swap-floods.patch +fpga-m10bmc-sec-fix-kconfig-dependencies.patch +kvm-x86-mmu-fix-race-condition-in-direct_page_fault.patch +kvm-x86-xen-only-do-in-kernel-acceleration-of-hypercalls-for-guest-cpl0.patch +kvm-x86-xen-validate-port-number-in-schedop_poll.patch +drm-i915-gvt-get-reference-to-kvm-iff-attachment-to-vm-is-successful.patch +kvm-x86-nsvm-leave-nested-mode-on-vcpu-free.patch +kvm-x86-forcibly-leave-nested-mode-on-vcpu-reset.patch +kvm-x86-nsvm-harden-svm_free_nested-against-freeing-vmcb02-while-still-in-use.patch +kvm-x86-add-kvm_leave_nested.patch +kvm-x86-remove-exit_int_info-warning-in-svm_handle_exit.patch +kvm-update-gfn_to_pfn_cache-khva-when-it-moves-within-the-same-page.patch -- 2.47.3