]> git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
6.0-stable patches
authorGreg Kroah-Hartman <gregkh@linuxfoundation.org>
Wed, 30 Nov 2022 12:28:05 +0000 (13:28 +0100)
committerGreg Kroah-Hartman <gregkh@linuxfoundation.org>
Wed, 30 Nov 2022 12:28:05 +0000 (13:28 +0100)
added patches:
drm-i915-gvt-get-reference-to-kvm-iff-attachment-to-vm-is-successful.patch
fpga-m10bmc-sec-fix-kconfig-dependencies.patch
gcov-clang-fix-the-buffer-overflow-issue.patch
io_uring-clear-tif_notify_signal-if-set-and-task_work-not-available.patch
io_uring-cmpxchg-for-poll-arm-refs-release.patch
io_uring-make-poll-refs-more-robust.patch
kvm-update-gfn_to_pfn_cache-khva-when-it-moves-within-the-same-page.patch
kvm-x86-add-kvm_leave_nested.patch
kvm-x86-forcibly-leave-nested-mode-on-vcpu-reset.patch
kvm-x86-mmu-fix-race-condition-in-direct_page_fault.patch
kvm-x86-nsvm-harden-svm_free_nested-against-freeing-vmcb02-while-still-in-use.patch
kvm-x86-nsvm-leave-nested-mode-on-vcpu-free.patch
kvm-x86-remove-exit_int_info-warning-in-svm_handle_exit.patch
kvm-x86-xen-only-do-in-kernel-acceleration-of-hypercalls-for-guest-cpl0.patch
kvm-x86-xen-validate-port-number-in-schedop_poll.patch
mm-cgroup-reclaim-fix-dirty-pages-throttling-on-cgroup-v1.patch
mm-vmscan-fix-extreme-overreclaim-and-swap-floods.patch
nilfs2-fix-nilfs_sufile_mark_dirty-not-set-segment-usage-as-dirty.patch

19 files changed:
queue-6.0/drm-i915-gvt-get-reference-to-kvm-iff-attachment-to-vm-is-successful.patch [new file with mode: 0644]
queue-6.0/fpga-m10bmc-sec-fix-kconfig-dependencies.patch [new file with mode: 0644]
queue-6.0/gcov-clang-fix-the-buffer-overflow-issue.patch [new file with mode: 0644]
queue-6.0/io_uring-clear-tif_notify_signal-if-set-and-task_work-not-available.patch [new file with mode: 0644]
queue-6.0/io_uring-cmpxchg-for-poll-arm-refs-release.patch [new file with mode: 0644]
queue-6.0/io_uring-make-poll-refs-more-robust.patch [new file with mode: 0644]
queue-6.0/kvm-update-gfn_to_pfn_cache-khva-when-it-moves-within-the-same-page.patch [new file with mode: 0644]
queue-6.0/kvm-x86-add-kvm_leave_nested.patch [new file with mode: 0644]
queue-6.0/kvm-x86-forcibly-leave-nested-mode-on-vcpu-reset.patch [new file with mode: 0644]
queue-6.0/kvm-x86-mmu-fix-race-condition-in-direct_page_fault.patch [new file with mode: 0644]
queue-6.0/kvm-x86-nsvm-harden-svm_free_nested-against-freeing-vmcb02-while-still-in-use.patch [new file with mode: 0644]
queue-6.0/kvm-x86-nsvm-leave-nested-mode-on-vcpu-free.patch [new file with mode: 0644]
queue-6.0/kvm-x86-remove-exit_int_info-warning-in-svm_handle_exit.patch [new file with mode: 0644]
queue-6.0/kvm-x86-xen-only-do-in-kernel-acceleration-of-hypercalls-for-guest-cpl0.patch [new file with mode: 0644]
queue-6.0/kvm-x86-xen-validate-port-number-in-schedop_poll.patch [new file with mode: 0644]
queue-6.0/mm-cgroup-reclaim-fix-dirty-pages-throttling-on-cgroup-v1.patch [new file with mode: 0644]
queue-6.0/mm-vmscan-fix-extreme-overreclaim-and-swap-floods.patch [new file with mode: 0644]
queue-6.0/nilfs2-fix-nilfs_sufile_mark_dirty-not-set-segment-usage-as-dirty.patch [new file with mode: 0644]
queue-6.0/series

diff --git a/queue-6.0/drm-i915-gvt-get-reference-to-kvm-iff-attachment-to-vm-is-successful.patch b/queue-6.0/drm-i915-gvt-get-reference-to-kvm-iff-attachment-to-vm-is-successful.patch
new file mode 100644 (file)
index 0000000..c9b50b8
--- /dev/null
@@ -0,0 +1,43 @@
+From 9ed1fdee9ee324f3505ff066287ee53143caaaa2 Mon Sep 17 00:00:00 2001
+From: Sean Christopherson <seanjc@google.com>
+Date: Fri, 11 Nov 2022 00:22:24 +0000
+Subject: drm/i915/gvt: Get reference to KVM iff attachment to VM is successful
+
+From: Sean Christopherson <seanjc@google.com>
+
+commit 9ed1fdee9ee324f3505ff066287ee53143caaaa2 upstream.
+
+Get a reference to KVM if and only if a vGPU is successfully attached to
+the VM to avoid leaking a reference if there's no available vGPU.  On
+open_device() failure, vfio_device_open() doesn't invoke close_device().
+
+Fixes: 421cfe6596f6 ("vfio: remove VFIO_GROUP_NOTIFY_SET_KVM")
+Cc: stable@vger.kernel.org
+Reviewed-by: Kevin Tian <kevin.tian@intel.com>
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Zhenyu Wang <zhenyuw@linux.intel.com>
+Link: http://patchwork.freedesktop.org/patch/msgid/20221111002225.2418386-2-seanjc@google.com
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/gpu/drm/i915/gvt/kvmgt.c |    3 +--
+ 1 file changed, 1 insertion(+), 2 deletions(-)
+
+--- a/drivers/gpu/drm/i915/gvt/kvmgt.c
++++ b/drivers/gpu/drm/i915/gvt/kvmgt.c
+@@ -765,8 +765,6 @@ static int intel_vgpu_open_device(struct
+               return -ESRCH;
+       }
+-      kvm_get_kvm(vgpu->vfio_device.kvm);
+-
+       if (__kvmgt_vgpu_exist(vgpu))
+               return -EEXIST;
+@@ -777,6 +775,7 @@ static int intel_vgpu_open_device(struct
+       vgpu->track_node.track_write = kvmgt_page_track_write;
+       vgpu->track_node.track_flush_slot = kvmgt_page_track_flush_slot;
++      kvm_get_kvm(vgpu->vfio_device.kvm);
+       kvm_page_track_register_notifier(vgpu->vfio_device.kvm,
+                                        &vgpu->track_node);
diff --git a/queue-6.0/fpga-m10bmc-sec-fix-kconfig-dependencies.patch b/queue-6.0/fpga-m10bmc-sec-fix-kconfig-dependencies.patch
new file mode 100644 (file)
index 0000000..77e455c
--- /dev/null
@@ -0,0 +1,52 @@
+From dfd10332596ef11ceafd29c4e21b4117be423fc4 Mon Sep 17 00:00:00 2001
+From: Russ Weight <russell.h.weight@intel.com>
+Date: Mon, 14 Nov 2022 16:11:27 -0800
+Subject: fpga: m10bmc-sec: Fix kconfig dependencies
+
+From: Russ Weight <russell.h.weight@intel.com>
+
+commit dfd10332596ef11ceafd29c4e21b4117be423fc4 upstream.
+
+The secure update driver depends on the firmware-upload functionality of
+the firmware-loader. The firmware-loader is carried in the firmware-class
+driver which is enabled with the tristate CONFIG_FW_LOADER option. The
+firmware-upload functionality is included in the firmware-class driver if
+the bool FW_UPLOAD config is set.
+
+The current dependency statement, "depends on FW_UPLOAD", is not adequate
+because it does not implicitly turn on FW_LOADER. Instead of adding a
+dependency, follow the convention used by drivers that require the
+FW_LOADER_USER_HELPER functionality of the firmware-loader by using
+select for both FW_LOADER and FW_UPLOAD.
+
+Fixes: bdf86d0e6ca3 ("fpga: m10bmc-sec: create max10 bmc secure update")
+Reported-by: kernel test robot <lkp@intel.com>
+Cc: stable@vger.kernel.org
+Signed-off-by: Russ Weight <russell.h.weight@intel.com>
+Acked-by: Randy Dunlap <rdunlap@infradead.org>
+Acked-by: Xu Yilun <yilun.xu@intel.com>
+Link: https://lore.kernel.org/r/20221115001127.289890-1-russell.h.weight@intel.com
+Signed-off-by: Xu Yilun <yilun.xu@intel.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/fpga/Kconfig | 4 +++-
+ 1 file changed, 3 insertions(+), 1 deletion(-)
+
+diff --git a/drivers/fpga/Kconfig b/drivers/fpga/Kconfig
+index 6c416955da53..bbe0a7cabb75 100644
+--- a/drivers/fpga/Kconfig
++++ b/drivers/fpga/Kconfig
+@@ -246,7 +246,9 @@ config FPGA_MGR_VERSAL_FPGA
+ config FPGA_M10_BMC_SEC_UPDATE
+       tristate "Intel MAX10 BMC Secure Update driver"
+-      depends on MFD_INTEL_M10_BMC && FW_UPLOAD
++      depends on MFD_INTEL_M10_BMC
++      select FW_LOADER
++      select FW_UPLOAD
+       help
+         Secure update support for the Intel MAX10 board management
+         controller.
+-- 
+2.38.1
+
diff --git a/queue-6.0/gcov-clang-fix-the-buffer-overflow-issue.patch b/queue-6.0/gcov-clang-fix-the-buffer-overflow-issue.patch
new file mode 100644 (file)
index 0000000..55dfa6e
--- /dev/null
@@ -0,0 +1,88 @@
+From a6f810efabfd789d3bbafeacb4502958ec56c5ce Mon Sep 17 00:00:00 2001
+From: Mukesh Ojha <quic_mojha@quicinc.com>
+Date: Thu, 10 Nov 2022 00:31:37 +0530
+Subject: gcov: clang: fix the buffer overflow issue
+
+From: Mukesh Ojha <quic_mojha@quicinc.com>
+
+commit a6f810efabfd789d3bbafeacb4502958ec56c5ce upstream.
+
+Currently, in clang version of gcov code when module is getting removed
+gcov_info_add() incorrectly adds the sfn_ptr->counter to all the
+dst->functions and it result in the kernel panic in below crash report.
+Fix this by properly handling it.
+
+[    8.899094][  T599] Unable to handle kernel write to read-only memory at virtual address ffffff80461cc000
+[    8.899100][  T599] Mem abort info:
+[    8.899102][  T599]   ESR = 0x9600004f
+[    8.899103][  T599]   EC = 0x25: DABT (current EL), IL = 32 bits
+[    8.899105][  T599]   SET = 0, FnV = 0
+[    8.899107][  T599]   EA = 0, S1PTW = 0
+[    8.899108][  T599]   FSC = 0x0f: level 3 permission fault
+[    8.899110][  T599] Data abort info:
+[    8.899111][  T599]   ISV = 0, ISS = 0x0000004f
+[    8.899113][  T599]   CM = 0, WnR = 1
+[    8.899114][  T599] swapper pgtable: 4k pages, 39-bit VAs, pgdp=00000000ab8de000
+[    8.899116][  T599] [ffffff80461cc000] pgd=18000009ffcde003, p4d=18000009ffcde003, pud=18000009ffcde003, pmd=18000009ffcad003, pte=00600000c61cc787
+[    8.899124][  T599] Internal error: Oops: 9600004f [#1] PREEMPT SMP
+[    8.899265][  T599] Skip md ftrace buffer dump for: 0x1609e0
+....
+..,
+[    8.899544][  T599] CPU: 7 PID: 599 Comm: modprobe Tainted: G S         OE     5.15.41-android13-8-g38e9b1af6bce #1
+[    8.899547][  T599] Hardware name: XXX (DT)
+[    8.899549][  T599] pstate: 82400005 (Nzcv daif +PAN -UAO +TCO -DIT -SSBS BTYPE=--)
+[    8.899551][  T599] pc : gcov_info_add+0x9c/0xb8
+[    8.899557][  T599] lr : gcov_event+0x28c/0x6b8
+[    8.899559][  T599] sp : ffffffc00e733b00
+[    8.899560][  T599] x29: ffffffc00e733b00 x28: ffffffc00e733d30 x27: ffffffe8dc297470
+[    8.899563][  T599] x26: ffffffe8dc297000 x25: ffffffe8dc297000 x24: ffffffe8dc297000
+[    8.899566][  T599] x23: ffffffe8dc0a6200 x22: ffffff880f68bf20 x21: 0000000000000000
+[    8.899569][  T599] x20: ffffff880f68bf00 x19: ffffff8801babc00 x18: ffffffc00d7f9058
+[    8.899572][  T599] x17: 0000000000088793 x16: ffffff80461cbe00 x15: 9100052952800785
+[    8.899575][  T599] x14: 0000000000000200 x13: 0000000000000041 x12: 9100052952800785
+[    8.899577][  T599] x11: ffffffe8dc297000 x10: ffffffe8dc297000 x9 : ffffff80461cbc80
+[    8.899580][  T599] x8 : ffffff8801babe80 x7 : ffffffe8dc2ec000 x6 : ffffffe8dc2ed000
+[    8.899583][  T599] x5 : 000000008020001f x4 : fffffffe2006eae0 x3 : 000000008020001f
+[    8.899586][  T599] x2 : ffffff8027c49200 x1 : ffffff8801babc20 x0 : ffffff80461cb3a0
+[    8.899589][  T599] Call trace:
+[    8.899590][  T599]  gcov_info_add+0x9c/0xb8
+[    8.899592][  T599]  gcov_module_notifier+0xbc/0x120
+[    8.899595][  T599]  blocking_notifier_call_chain+0xa0/0x11c
+[    8.899598][  T599]  do_init_module+0x2a8/0x33c
+[    8.899600][  T599]  load_module+0x23cc/0x261c
+[    8.899602][  T599]  __arm64_sys_finit_module+0x158/0x194
+[    8.899604][  T599]  invoke_syscall+0x94/0x2bc
+[    8.899607][  T599]  el0_svc_common+0x1d8/0x34c
+[    8.899609][  T599]  do_el0_svc+0x40/0x54
+[    8.899611][  T599]  el0_svc+0x94/0x2f0
+[    8.899613][  T599]  el0t_64_sync_handler+0x88/0xec
+[    8.899615][  T599]  el0t_64_sync+0x1b4/0x1b8
+[    8.899618][  T599] Code: f905f56c f86e69ec f86e6a0f 8b0c01ec (f82e6a0c)
+[    8.899620][  T599] ---[ end trace ed5218e9e5b6e2e6 ]---
+
+Link: https://lkml.kernel.org/r/1668020497-13142-1-git-send-email-quic_mojha@quicinc.com
+Fixes: e178a5beb369 ("gcov: clang support")
+Signed-off-by: Mukesh Ojha <quic_mojha@quicinc.com>
+Reviewed-by: Peter Oberparleiter <oberpar@linux.ibm.com>
+Tested-by: Peter Oberparleiter <oberpar@linux.ibm.com>
+Cc: Nathan Chancellor <nathan@kernel.org>
+Cc: Nick Desaulniers <ndesaulniers@google.com>
+Cc: Tom Rix <trix@redhat.com>
+Cc: <stable@vger.kernel.org>   [5.2+]
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/gcov/clang.c |    2 ++
+ 1 file changed, 2 insertions(+)
+
+--- a/kernel/gcov/clang.c
++++ b/kernel/gcov/clang.c
+@@ -280,6 +280,8 @@ void gcov_info_add(struct gcov_info *dst
+               for (i = 0; i < sfn_ptr->num_counters; i++)
+                       dfn_ptr->counters[i] += sfn_ptr->counters[i];
++
++              sfn_ptr = list_next_entry(sfn_ptr, head);
+       }
+ }
diff --git a/queue-6.0/io_uring-clear-tif_notify_signal-if-set-and-task_work-not-available.patch b/queue-6.0/io_uring-clear-tif_notify_signal-if-set-and-task_work-not-available.patch
new file mode 100644 (file)
index 0000000..c3ef61e
--- /dev/null
@@ -0,0 +1,41 @@
+From 7cfe7a09489c1cefee7181e07b5f2bcbaebd9f41 Mon Sep 17 00:00:00 2001
+From: Jens Axboe <axboe@kernel.dk>
+Date: Fri, 25 Nov 2022 09:36:29 -0700
+Subject: io_uring: clear TIF_NOTIFY_SIGNAL if set and task_work not available
+
+From: Jens Axboe <axboe@kernel.dk>
+
+commit 7cfe7a09489c1cefee7181e07b5f2bcbaebd9f41 upstream.
+
+With how task_work is added and signaled, we can have TIF_NOTIFY_SIGNAL
+set and no task_work pending as it got run in a previous loop. Treat
+TIF_NOTIFY_SIGNAL like get_signal(), always clear it if set regardless
+of whether or not task_work is pending to run.
+
+Cc: stable@vger.kernel.org
+Fixes: 46a525e199e4 ("io_uring: don't gate task_work run on TIF_NOTIFY_SIGNAL")
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ io_uring/io_uring.h |    9 +++++++--
+ 1 file changed, 7 insertions(+), 2 deletions(-)
+
+--- a/io_uring/io_uring.h
++++ b/io_uring/io_uring.h
+@@ -229,9 +229,14 @@ static inline unsigned int io_sqring_ent
+ static inline bool io_run_task_work(void)
+ {
++      /*
++       * Always check-and-clear the task_work notification signal. With how
++       * signaling works for task_work, we can find it set with nothing to
++       * run. We need to clear it for that case, like get_signal() does.
++       */
++      if (test_thread_flag(TIF_NOTIFY_SIGNAL))
++              clear_notify_signal();
+       if (task_work_pending(current)) {
+-              if (test_thread_flag(TIF_NOTIFY_SIGNAL))
+-                      clear_notify_signal();
+               __set_current_state(TASK_RUNNING);
+               task_work_run();
+               return 1;
diff --git a/queue-6.0/io_uring-cmpxchg-for-poll-arm-refs-release.patch b/queue-6.0/io_uring-cmpxchg-for-poll-arm-refs-release.patch
new file mode 100644 (file)
index 0000000..55c72a5
--- /dev/null
@@ -0,0 +1,56 @@
+From 2f3893437a4ebf2e892ca172e9e122841319d675 Mon Sep 17 00:00:00 2001
+From: Pavel Begunkov <asml.silence@gmail.com>
+Date: Sun, 20 Nov 2022 16:57:41 +0000
+Subject: io_uring: cmpxchg for poll arm refs release
+
+From: Pavel Begunkov <asml.silence@gmail.com>
+
+commit 2f3893437a4ebf2e892ca172e9e122841319d675 upstream.
+
+Replace atomically substracting the ownership reference at the end of
+arming a poll with a cmpxchg. We try to release ownership by setting 0
+assuming that poll_refs didn't change while we were arming. If it did
+change, we keep the ownership and use it to queue a tw, which is fully
+capable to process all events and (even tolerates spurious wake ups).
+
+It's a bit more elegant as we reduce races b/w setting the cancellation
+flag and getting refs with this release, and with that we don't have to
+worry about any kinds of underflows. It's not the fastest path for
+polling. The performance difference b/w cmpxchg and atomic dec is
+usually negligible and it's not the fastest path.
+
+Cc: stable@vger.kernel.org
+Fixes: aa43477b04025 ("io_uring: poll rework")
+Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
+Link: https://lore.kernel.org/r/0c95251624397ea6def568ff040cad2d7926fd51.1668963050.git.asml.silence@gmail.com
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ io_uring/poll.c |    8 +++-----
+ 1 file changed, 3 insertions(+), 5 deletions(-)
+
+--- a/io_uring/poll.c
++++ b/io_uring/poll.c
+@@ -519,7 +519,6 @@ static int __io_arm_poll_handler(struct
+                                unsigned issue_flags)
+ {
+       struct io_ring_ctx *ctx = req->ctx;
+-      int v;
+       INIT_HLIST_NODE(&req->hash_node);
+       req->work.cancel_seq = atomic_read(&ctx->cancel_seq);
+@@ -587,11 +586,10 @@ static int __io_arm_poll_handler(struct
+       if (ipt->owning) {
+               /*
+-               * Release ownership. If someone tried to queue a tw while it was
+-               * locked, kick it off for them.
++               * Try to release ownership. If we see a change of state, e.g.
++               * poll was waken up, queue up a tw, it'll deal with it.
+                */
+-              v = atomic_dec_return(&req->poll_refs);
+-              if (unlikely(v & IO_POLL_REF_MASK))
++              if (atomic_cmpxchg(&req->poll_refs, 1, 0) != 1)
+                       __io_poll_execute(req, 0);
+       }
+       return 0;
diff --git a/queue-6.0/io_uring-make-poll-refs-more-robust.patch b/queue-6.0/io_uring-make-poll-refs-more-robust.patch
new file mode 100644 (file)
index 0000000..44bbaf1
--- /dev/null
@@ -0,0 +1,101 @@
+From a26a35e9019fd70bf3cf647dcfdae87abc7bacea Mon Sep 17 00:00:00 2001
+From: Pavel Begunkov <asml.silence@gmail.com>
+Date: Sun, 20 Nov 2022 16:57:42 +0000
+Subject: io_uring: make poll refs more robust
+
+From: Pavel Begunkov <asml.silence@gmail.com>
+
+commit a26a35e9019fd70bf3cf647dcfdae87abc7bacea upstream.
+
+poll_refs carry two functions, the first is ownership over the request.
+The second is notifying the io_poll_check_events() that there was an
+event but wake up couldn't grab the ownership, so io_poll_check_events()
+should retry.
+
+We want to make poll_refs more robust against overflows. Instead of
+always incrementing it, which covers two purposes with one atomic, check
+if poll_refs is elevated enough and if so set a retry flag without
+attempts to grab ownership. The gap between the bias check and following
+atomics may seem racy, but we don't need it to be strict. Moreover there
+might only be maximum 4 parallel updates: by the first and the second
+poll entries, __io_arm_poll_handler() and cancellation. From those four,
+only poll wake ups may be executed multiple times, but they're protected
+by a spin.
+
+Cc: stable@vger.kernel.org
+Reported-by: Lin Ma <linma@zju.edu.cn>
+Fixes: aa43477b04025 ("io_uring: poll rework")
+Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
+Link: https://lore.kernel.org/r/c762bc31f8683b3270f3587691348a7119ef9c9d.1668963050.git.asml.silence@gmail.com
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ io_uring/poll.c |   36 +++++++++++++++++++++++++++++++++++-
+ 1 file changed, 35 insertions(+), 1 deletion(-)
+
+--- a/io_uring/poll.c
++++ b/io_uring/poll.c
+@@ -40,7 +40,14 @@ struct io_poll_table {
+ };
+ #define IO_POLL_CANCEL_FLAG   BIT(31)
+-#define IO_POLL_REF_MASK      GENMASK(30, 0)
++#define IO_POLL_RETRY_FLAG    BIT(30)
++#define IO_POLL_REF_MASK      GENMASK(29, 0)
++
++/*
++ * We usually have 1-2 refs taken, 128 is more than enough and we want to
++ * maximise the margin between this amount and the moment when it overflows.
++ */
++#define IO_POLL_REF_BIAS      128
+ #define IO_WQE_F_DOUBLE               1
+@@ -58,6 +65,21 @@ static inline bool wqe_is_double(struct
+       return priv & IO_WQE_F_DOUBLE;
+ }
++static bool io_poll_get_ownership_slowpath(struct io_kiocb *req)
++{
++      int v;
++
++      /*
++       * poll_refs are already elevated and we don't have much hope for
++       * grabbing the ownership. Instead of incrementing set a retry flag
++       * to notify the loop that there might have been some change.
++       */
++      v = atomic_fetch_or(IO_POLL_RETRY_FLAG, &req->poll_refs);
++      if (v & IO_POLL_REF_MASK)
++              return false;
++      return !(atomic_fetch_inc(&req->poll_refs) & IO_POLL_REF_MASK);
++}
++
+ /*
+  * If refs part of ->poll_refs (see IO_POLL_REF_MASK) is 0, it's free. We can
+  * bump it and acquire ownership. It's disallowed to modify requests while not
+@@ -66,6 +88,8 @@ static inline bool wqe_is_double(struct
+  */
+ static inline bool io_poll_get_ownership(struct io_kiocb *req)
+ {
++      if (unlikely(atomic_read(&req->poll_refs) >= IO_POLL_REF_BIAS))
++              return io_poll_get_ownership_slowpath(req);
+       return !(atomic_fetch_inc(&req->poll_refs) & IO_POLL_REF_MASK);
+ }
+@@ -235,6 +259,16 @@ static int io_poll_check_events(struct i
+                */
+               if ((v & IO_POLL_REF_MASK) != 1)
+                       req->cqe.res = 0;
++              if (v & IO_POLL_RETRY_FLAG) {
++                      req->cqe.res = 0;
++                      /*
++                       * We won't find new events that came in between
++                       * vfs_poll and the ref put unless we clear the flag
++                       * in advance.
++                       */
++                      atomic_andnot(IO_POLL_RETRY_FLAG, &req->poll_refs);
++                      v &= ~IO_POLL_RETRY_FLAG;
++              }
+               /* the mask was stashed in __io_poll_execute */
+               if (!req->cqe.res) {
diff --git a/queue-6.0/kvm-update-gfn_to_pfn_cache-khva-when-it-moves-within-the-same-page.patch b/queue-6.0/kvm-update-gfn_to_pfn_cache-khva-when-it-moves-within-the-same-page.patch
new file mode 100644 (file)
index 0000000..dd8f319
--- /dev/null
@@ -0,0 +1,41 @@
+From 8332f0ed4f187c7b700831bd7cc83ce180a944b9 Mon Sep 17 00:00:00 2001
+From: David Woodhouse <dwmw@amazon.co.uk>
+Date: Sat, 19 Nov 2022 09:25:39 +0000
+Subject: KVM: Update gfn_to_pfn_cache khva when it moves within the same page
+
+From: David Woodhouse <dwmw@amazon.co.uk>
+
+commit 8332f0ed4f187c7b700831bd7cc83ce180a944b9 upstream.
+
+In the case where a GPC is refreshed to a different location within the
+same page, we didn't bother to update it. Mostly we don't need to, but
+since the ->khva field also includes the offset within the page, that
+does have to be updated.
+
+Fixes: 3ba2c95ea180 ("KVM: Do not incorporate page offset into gfn=>pfn cache user address")
+Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
+Reviewed-by: Paul Durrant <paul@xen.org>
+Reviewed-by: Sean Christopherson <seanjc@google.com>
+Cc: stable@kernel.org
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ virt/kvm/pfncache.c |    7 ++++++-
+ 1 file changed, 6 insertions(+), 1 deletion(-)
+
+--- a/virt/kvm/pfncache.c
++++ b/virt/kvm/pfncache.c
+@@ -297,7 +297,12 @@ int kvm_gfn_to_pfn_cache_refresh(struct
+       if (!gpc->valid || old_uhva != gpc->uhva) {
+               ret = hva_to_pfn_retry(kvm, gpc);
+       } else {
+-              /* If the HVA→PFN mapping was already valid, don't unmap it. */
++              /*
++               * If the HVA→PFN mapping was already valid, don't unmap it.
++               * But do update gpc->khva because the offset within the page
++               * may have changed.
++               */
++              gpc->khva = old_khva + page_offset;
+               old_pfn = KVM_PFN_ERR_FAULT;
+               old_khva = NULL;
+               ret = 0;
diff --git a/queue-6.0/kvm-x86-add-kvm_leave_nested.patch b/queue-6.0/kvm-x86-add-kvm_leave_nested.patch
new file mode 100644 (file)
index 0000000..e5162ab
--- /dev/null
@@ -0,0 +1,71 @@
+From f9697df251438b0798780900e8b43bdb12a56d64 Mon Sep 17 00:00:00 2001
+From: Maxim Levitsky <mlevitsk@redhat.com>
+Date: Thu, 3 Nov 2022 16:13:45 +0200
+Subject: KVM: x86: add kvm_leave_nested
+
+From: Maxim Levitsky <mlevitsk@redhat.com>
+
+commit f9697df251438b0798780900e8b43bdb12a56d64 upstream.
+
+add kvm_leave_nested which wraps a call to nested_ops->leave_nested
+into a function.
+
+Cc: stable@vger.kernel.org
+Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
+Message-Id: <20221103141351.50662-4-mlevitsk@redhat.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/svm/nested.c |    3 ---
+ arch/x86/kvm/vmx/nested.c |    3 ---
+ arch/x86/kvm/x86.c        |    8 +++++++-
+ 3 files changed, 7 insertions(+), 7 deletions(-)
+
+--- a/arch/x86/kvm/svm/nested.c
++++ b/arch/x86/kvm/svm/nested.c
+@@ -1164,9 +1164,6 @@ void svm_free_nested(struct vcpu_svm *sv
+       svm->nested.initialized = false;
+ }
+-/*
+- * Forcibly leave nested mode in order to be able to reset the VCPU later on.
+- */
+ void svm_leave_nested(struct kvm_vcpu *vcpu)
+ {
+       struct vcpu_svm *svm = to_svm(vcpu);
+--- a/arch/x86/kvm/vmx/nested.c
++++ b/arch/x86/kvm/vmx/nested.c
+@@ -6294,9 +6294,6 @@ out:
+       return kvm_state.size;
+ }
+-/*
+- * Forcibly leave nested mode in order to be able to reset the VCPU later on.
+- */
+ void vmx_leave_nested(struct kvm_vcpu *vcpu)
+ {
+       if (is_guest_mode(vcpu)) {
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -611,6 +611,12 @@ void kvm_deliver_exception_payload(struc
+ }
+ EXPORT_SYMBOL_GPL(kvm_deliver_exception_payload);
++/* Forcibly leave the nested mode in cases like a vCPU reset */
++static void kvm_leave_nested(struct kvm_vcpu *vcpu)
++{
++      kvm_x86_ops.nested_ops->leave_nested(vcpu);
++}
++
+ static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
+               unsigned nr, bool has_error, u32 error_code,
+               bool has_payload, unsigned long payload, bool reinject)
+@@ -5154,7 +5160,7 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_e
+       if (events->flags & KVM_VCPUEVENT_VALID_SMM) {
+               if (!!(vcpu->arch.hflags & HF_SMM_MASK) != events->smi.smm) {
+-                      kvm_x86_ops.nested_ops->leave_nested(vcpu);
++                      kvm_leave_nested(vcpu);
+                       kvm_smm_changed(vcpu, events->smi.smm);
+               }
diff --git a/queue-6.0/kvm-x86-forcibly-leave-nested-mode-on-vcpu-reset.patch b/queue-6.0/kvm-x86-forcibly-leave-nested-mode-on-vcpu-reset.patch
new file mode 100644 (file)
index 0000000..7b4de10
--- /dev/null
@@ -0,0 +1,57 @@
+From ed129ec9057f89d615ba0c81a4984a90345a1684 Mon Sep 17 00:00:00 2001
+From: Maxim Levitsky <mlevitsk@redhat.com>
+Date: Thu, 3 Nov 2022 16:13:46 +0200
+Subject: KVM: x86: forcibly leave nested mode on vCPU reset
+
+From: Maxim Levitsky <mlevitsk@redhat.com>
+
+commit ed129ec9057f89d615ba0c81a4984a90345a1684 upstream.
+
+While not obivous, kvm_vcpu_reset() leaves the nested mode by clearing
+'vcpu->arch.hflags' but it does so without all the required housekeeping.
+
+On SVM,        it is possible to have a vCPU reset while in guest mode because
+unlike VMX, on SVM, INIT's are not latched in SVM non root mode and in
+addition to that L1 doesn't have to intercept triple fault, which should
+also trigger L1's reset if happens in L2 while L1 didn't intercept it.
+
+If one of the above conditions happen, KVM will        continue to use vmcb02
+while not having in the guest mode.
+
+Later the IA32_EFER will be cleared which will lead to freeing of the
+nested guest state which will (correctly) free the vmcb02, but since
+KVM still uses it (incorrectly) this will lead to a use after free
+and kernel crash.
+
+This issue is assigned CVE-2022-3344
+
+Cc: stable@vger.kernel.org
+Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
+Message-Id: <20221103141351.50662-5-mlevitsk@redhat.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/x86.c |   10 ++++++++++
+ 1 file changed, 10 insertions(+)
+
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -11789,8 +11789,18 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcp
+       WARN_ON_ONCE(!init_event &&
+                    (old_cr0 || kvm_read_cr3(vcpu) || kvm_read_cr4(vcpu)));
++      /*
++       * SVM doesn't unconditionally VM-Exit on INIT and SHUTDOWN, thus it's
++       * possible to INIT the vCPU while L2 is active.  Force the vCPU back
++       * into L1 as EFER.SVME is cleared on INIT (along with all other EFER
++       * bits), i.e. virtualization is disabled.
++       */
++      if (is_guest_mode(vcpu))
++              kvm_leave_nested(vcpu);
++
+       kvm_lapic_reset(vcpu, init_event);
++      WARN_ON_ONCE(is_guest_mode(vcpu) || is_smm(vcpu));
+       vcpu->arch.hflags = 0;
+       vcpu->arch.smi_pending = 0;
diff --git a/queue-6.0/kvm-x86-mmu-fix-race-condition-in-direct_page_fault.patch b/queue-6.0/kvm-x86-mmu-fix-race-condition-in-direct_page_fault.patch
new file mode 100644 (file)
index 0000000..51ccc94
--- /dev/null
@@ -0,0 +1,101 @@
+From 47b0c2e4c220f2251fd8dcfbb44479819c715e15 Mon Sep 17 00:00:00 2001
+From: Kazuki Takiguchi <takiguchi.kazuki171@gmail.com>
+Date: Wed, 23 Nov 2022 14:36:00 -0500
+Subject: KVM: x86/mmu: Fix race condition in direct_page_fault
+
+From: Kazuki Takiguchi <takiguchi.kazuki171@gmail.com>
+
+commit 47b0c2e4c220f2251fd8dcfbb44479819c715e15 upstream.
+
+make_mmu_pages_available() must be called with mmu_lock held for write.
+However, if the TDP MMU is used, it will be called with mmu_lock held for
+read.
+This function does nothing unless shadow pages are used, so there is no
+race unless nested TDP is used.
+Since nested TDP uses shadow pages, old shadow pages may be zapped by this
+function even when the TDP MMU is enabled.
+Since shadow pages are never allocated by kvm_tdp_mmu_map(), a race
+condition can be avoided by not calling make_mmu_pages_available() if the
+TDP MMU is currently in use.
+
+I encountered this when repeatedly starting and stopping nested VM.
+It can be artificially caused by allocating a large number of nested TDP
+SPTEs.
+
+For example, the following BUG and general protection fault are caused in
+the host kernel.
+
+pte_list_remove: 00000000cd54fc10 many->many
+------------[ cut here ]------------
+kernel BUG at arch/x86/kvm/mmu/mmu.c:963!
+invalid opcode: 0000 [#1] PREEMPT SMP NOPTI
+RIP: 0010:pte_list_remove.cold+0x16/0x48 [kvm]
+Call Trace:
+ <TASK>
+ drop_spte+0xe0/0x180 [kvm]
+ mmu_page_zap_pte+0x4f/0x140 [kvm]
+ __kvm_mmu_prepare_zap_page+0x62/0x3e0 [kvm]
+ kvm_mmu_zap_oldest_mmu_pages+0x7d/0xf0 [kvm]
+ direct_page_fault+0x3cb/0x9b0 [kvm]
+ kvm_tdp_page_fault+0x2c/0xa0 [kvm]
+ kvm_mmu_page_fault+0x207/0x930 [kvm]
+ npf_interception+0x47/0xb0 [kvm_amd]
+ svm_invoke_exit_handler+0x13c/0x1a0 [kvm_amd]
+ svm_handle_exit+0xfc/0x2c0 [kvm_amd]
+ kvm_arch_vcpu_ioctl_run+0xa79/0x1780 [kvm]
+ kvm_vcpu_ioctl+0x29b/0x6f0 [kvm]
+ __x64_sys_ioctl+0x95/0xd0
+ do_syscall_64+0x5c/0x90
+
+general protection fault, probably for non-canonical address
+0xdead000000000122: 0000 [#1] PREEMPT SMP NOPTI
+RIP: 0010:kvm_mmu_commit_zap_page.part.0+0x4b/0xe0 [kvm]
+Call Trace:
+ <TASK>
+ kvm_mmu_zap_oldest_mmu_pages+0xae/0xf0 [kvm]
+ direct_page_fault+0x3cb/0x9b0 [kvm]
+ kvm_tdp_page_fault+0x2c/0xa0 [kvm]
+ kvm_mmu_page_fault+0x207/0x930 [kvm]
+ npf_interception+0x47/0xb0 [kvm_amd]
+
+CVE: CVE-2022-45869
+Fixes: a2855afc7ee8 ("KVM: x86/mmu: Allow parallel page faults for the TDP MMU")
+Signed-off-by: Kazuki Takiguchi <takiguchi.kazuki171@gmail.com>
+Cc: stable@vger.kernel.org
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/mmu/mmu.c |   13 +++++++------
+ 1 file changed, 7 insertions(+), 6 deletions(-)
+
+--- a/arch/x86/kvm/mmu/mmu.c
++++ b/arch/x86/kvm/mmu/mmu.c
+@@ -2431,6 +2431,7 @@ static bool __kvm_mmu_prepare_zap_page(s
+ {
+       bool list_unstable, zapped_root = false;
++      lockdep_assert_held_write(&kvm->mmu_lock);
+       trace_kvm_mmu_prepare_zap_page(sp);
+       ++kvm->stat.mmu_shadow_zapped;
+       *nr_zapped = mmu_zap_unsync_children(kvm, sp, invalid_list);
+@@ -4250,14 +4251,14 @@ static int direct_page_fault(struct kvm_
+       if (is_page_fault_stale(vcpu, fault, mmu_seq))
+               goto out_unlock;
+-      r = make_mmu_pages_available(vcpu);
+-      if (r)
+-              goto out_unlock;
+-
+-      if (is_tdp_mmu_fault)
++      if (is_tdp_mmu_fault) {
+               r = kvm_tdp_mmu_map(vcpu, fault);
+-      else
++      } else {
++              r = make_mmu_pages_available(vcpu);
++              if (r)
++                      goto out_unlock;
+               r = __direct_map(vcpu, fault);
++      }
+ out_unlock:
+       if (is_tdp_mmu_fault)
diff --git a/queue-6.0/kvm-x86-nsvm-harden-svm_free_nested-against-freeing-vmcb02-while-still-in-use.patch b/queue-6.0/kvm-x86-nsvm-harden-svm_free_nested-against-freeing-vmcb02-while-still-in-use.patch
new file mode 100644 (file)
index 0000000..0733d91
--- /dev/null
@@ -0,0 +1,36 @@
+From 16ae56d7e0528559bf8dc9070e3bfd8ba3de80df Mon Sep 17 00:00:00 2001
+From: Maxim Levitsky <mlevitsk@redhat.com>
+Date: Thu, 3 Nov 2022 16:13:44 +0200
+Subject: KVM: x86: nSVM: harden svm_free_nested against freeing vmcb02 while still in use
+
+From: Maxim Levitsky <mlevitsk@redhat.com>
+
+commit 16ae56d7e0528559bf8dc9070e3bfd8ba3de80df upstream.
+
+Make sure that KVM uses vmcb01 before freeing nested state, and warn if
+that is not the case.
+
+This is a minimal fix for CVE-2022-3344 making the kernel print a warning
+instead of a kernel panic.
+
+Cc: stable@vger.kernel.org
+Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
+Message-Id: <20221103141351.50662-3-mlevitsk@redhat.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/svm/nested.c |    3 +++
+ 1 file changed, 3 insertions(+)
+
+--- a/arch/x86/kvm/svm/nested.c
++++ b/arch/x86/kvm/svm/nested.c
+@@ -1143,6 +1143,9 @@ void svm_free_nested(struct vcpu_svm *sv
+       if (!svm->nested.initialized)
+               return;
++      if (WARN_ON_ONCE(svm->vmcb != svm->vmcb01.ptr))
++              svm_switch_vmcb(svm, &svm->vmcb01);
++
+       svm_vcpu_free_msrpm(svm->nested.msrpm);
+       svm->nested.msrpm = NULL;
diff --git a/queue-6.0/kvm-x86-nsvm-leave-nested-mode-on-vcpu-free.patch b/queue-6.0/kvm-x86-nsvm-leave-nested-mode-on-vcpu-free.patch
new file mode 100644 (file)
index 0000000..4c6c703
--- /dev/null
@@ -0,0 +1,33 @@
+From 917401f26a6af5756d89b550a8e1bd50cf42b07e Mon Sep 17 00:00:00 2001
+From: Maxim Levitsky <mlevitsk@redhat.com>
+Date: Thu, 3 Nov 2022 16:13:43 +0200
+Subject: KVM: x86: nSVM: leave nested mode on vCPU free
+
+From: Maxim Levitsky <mlevitsk@redhat.com>
+
+commit 917401f26a6af5756d89b550a8e1bd50cf42b07e upstream.
+
+If the VM was terminated while nested, we free the nested state
+while the vCPU still is in nested mode.
+
+Soon a warning will be added for this condition.
+
+Cc: stable@vger.kernel.org
+Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
+Message-Id: <20221103141351.50662-2-mlevitsk@redhat.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/svm/svm.c |    1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/arch/x86/kvm/svm/svm.c
++++ b/arch/x86/kvm/svm/svm.c
+@@ -1440,6 +1440,7 @@ static void svm_vcpu_free(struct kvm_vcp
+        */
+       svm_clear_current_vmcb(svm->vmcb);
++      svm_leave_nested(vcpu);
+       svm_free_nested(svm);
+       sev_free_vcpu(vcpu);
diff --git a/queue-6.0/kvm-x86-remove-exit_int_info-warning-in-svm_handle_exit.patch b/queue-6.0/kvm-x86-remove-exit_int_info-warning-in-svm_handle_exit.patch
new file mode 100644 (file)
index 0000000..73f4e75
--- /dev/null
@@ -0,0 +1,58 @@
+From 05311ce954aebe75935d9ae7d38ac82b5b796e33 Mon Sep 17 00:00:00 2001
+From: Maxim Levitsky <mlevitsk@redhat.com>
+Date: Thu, 3 Nov 2022 16:13:51 +0200
+Subject: KVM: x86: remove exit_int_info warning in svm_handle_exit
+
+From: Maxim Levitsky <mlevitsk@redhat.com>
+
+commit 05311ce954aebe75935d9ae7d38ac82b5b796e33 upstream.
+
+It is valid to receive external interrupt and have broken IDT entry,
+which will lead to #GP with exit_int_info that will contain the index of
+the IDT entry (e.g. any value).
+
+Other exceptions can happen as well, like #NP or #SS
+(if stack switch fails).
+
+Thus this warning can be user triggered and has very little value.
+
+Cc: stable@vger.kernel.org
+Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
+Message-Id: <20221103141351.50662-10-mlevitsk@redhat.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/svm/svm.c |   15 ---------------
+ 1 file changed, 15 deletions(-)
+
+--- a/arch/x86/kvm/svm/svm.c
++++ b/arch/x86/kvm/svm/svm.c
+@@ -346,12 +346,6 @@ int svm_set_efer(struct kvm_vcpu *vcpu,
+       return 0;
+ }
+-static int is_external_interrupt(u32 info)
+-{
+-      info &= SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID;
+-      return info == (SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR);
+-}
+-
+ static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu)
+ {
+       struct vcpu_svm *svm = to_svm(vcpu);
+@@ -3427,15 +3421,6 @@ static int svm_handle_exit(struct kvm_vc
+               return 0;
+       }
+-      if (is_external_interrupt(svm->vmcb->control.exit_int_info) &&
+-          exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR &&
+-          exit_code != SVM_EXIT_NPF && exit_code != SVM_EXIT_TASK_SWITCH &&
+-          exit_code != SVM_EXIT_INTR && exit_code != SVM_EXIT_NMI)
+-              printk(KERN_ERR "%s: unexpected exit_int_info 0x%x "
+-                     "exit_code 0x%x\n",
+-                     __func__, svm->vmcb->control.exit_int_info,
+-                     exit_code);
+-
+       if (exit_fastpath != EXIT_FASTPATH_NONE)
+               return 1;
diff --git a/queue-6.0/kvm-x86-xen-only-do-in-kernel-acceleration-of-hypercalls-for-guest-cpl0.patch b/queue-6.0/kvm-x86-xen-only-do-in-kernel-acceleration-of-hypercalls-for-guest-cpl0.patch
new file mode 100644 (file)
index 0000000..d396904
--- /dev/null
@@ -0,0 +1,64 @@
+From c2b8cdfaf3a6721afe0c8c060a631b1c67a7f1ee Mon Sep 17 00:00:00 2001
+From: David Woodhouse <dwmw@amazon.co.uk>
+Date: Sat, 12 Nov 2022 13:52:25 +0000
+Subject: KVM: x86/xen: Only do in-kernel acceleration of hypercalls for guest CPL0
+
+From: David Woodhouse <dwmw@amazon.co.uk>
+
+commit c2b8cdfaf3a6721afe0c8c060a631b1c67a7f1ee upstream.
+
+There are almost no hypercalls which are valid from CPL > 0, and definitely
+none which are handled by the kernel.
+
+Fixes: 2fd6df2f2b47 ("KVM: x86/xen: intercept EVTCHNOP_send from guests")
+Reported-by: Michal Luczaj <mhal@rbox.co>
+Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
+Reviewed-by: Sean Christopherson <seanjc@google.com>
+Cc: stable@kernel.org
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/xen.c |   12 +++++++++++-
+ 1 file changed, 11 insertions(+), 1 deletion(-)
+
+--- a/arch/x86/kvm/xen.c
++++ b/arch/x86/kvm/xen.c
+@@ -1216,6 +1216,7 @@ int kvm_xen_hypercall(struct kvm_vcpu *v
+       bool longmode;
+       u64 input, params[6], r = -ENOSYS;
+       bool handled = false;
++      u8 cpl;
+       input = (u64)kvm_register_read(vcpu, VCPU_REGS_RAX);
+@@ -1243,9 +1244,17 @@ int kvm_xen_hypercall(struct kvm_vcpu *v
+               params[5] = (u64)kvm_r9_read(vcpu);
+       }
+ #endif
++      cpl = static_call(kvm_x86_get_cpl)(vcpu);
+       trace_kvm_xen_hypercall(input, params[0], params[1], params[2],
+                               params[3], params[4], params[5]);
++      /*
++       * Only allow hypercall acceleration for CPL0. The rare hypercalls that
++       * are permitted in guest userspace can be handled by the VMM.
++       */
++      if (unlikely(cpl > 0))
++              goto handle_in_userspace;
++
+       switch (input) {
+       case __HYPERVISOR_xen_version:
+               if (params[0] == XENVER_version && vcpu->kvm->arch.xen.xen_version) {
+@@ -1280,10 +1289,11 @@ int kvm_xen_hypercall(struct kvm_vcpu *v
+       if (handled)
+               return kvm_xen_hypercall_set_result(vcpu, r);
++handle_in_userspace:
+       vcpu->run->exit_reason = KVM_EXIT_XEN;
+       vcpu->run->xen.type = KVM_EXIT_XEN_HCALL;
+       vcpu->run->xen.u.hcall.longmode = longmode;
+-      vcpu->run->xen.u.hcall.cpl = static_call(kvm_x86_get_cpl)(vcpu);
++      vcpu->run->xen.u.hcall.cpl = cpl;
+       vcpu->run->xen.u.hcall.input = input;
+       vcpu->run->xen.u.hcall.params[0] = params[0];
+       vcpu->run->xen.u.hcall.params[1] = params[1];
diff --git a/queue-6.0/kvm-x86-xen-validate-port-number-in-schedop_poll.patch b/queue-6.0/kvm-x86-xen-validate-port-number-in-schedop_poll.patch
new file mode 100644 (file)
index 0000000..9aac9e5
--- /dev/null
@@ -0,0 +1,68 @@
+From 4ea9439fd537313f3381f0af4ebbf05e3f51a58c Mon Sep 17 00:00:00 2001
+From: David Woodhouse <dwmw@amazon.co.uk>
+Date: Sat, 12 Nov 2022 13:48:58 +0000
+Subject: KVM: x86/xen: Validate port number in SCHEDOP_poll
+
+From: David Woodhouse <dwmw@amazon.co.uk>
+
+commit 4ea9439fd537313f3381f0af4ebbf05e3f51a58c upstream.
+
+We shouldn't allow guests to poll on arbitrary port numbers off the end
+of the event channel table.
+
+Fixes: 1a65105a5aba ("KVM: x86/xen: handle PV spinlocks slowpath")
+[dwmw2: my bug though; the original version did check the validity as a
+ side-effect of an idr_find() which I ripped out in refactoring.]
+Reported-by: Michal Luczaj <mhal@rbox.co>
+Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
+Reviewed-by: Sean Christopherson <seanjc@google.com>
+Cc: stable@kernel.org
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/xen.c |   20 ++++++++++++--------
+ 1 file changed, 12 insertions(+), 8 deletions(-)
+
+--- a/arch/x86/kvm/xen.c
++++ b/arch/x86/kvm/xen.c
+@@ -954,6 +954,14 @@ static int kvm_xen_hypercall_complete_us
+       return kvm_xen_hypercall_set_result(vcpu, run->xen.u.hcall.result);
+ }
++static inline int max_evtchn_port(struct kvm *kvm)
++{
++      if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode)
++              return EVTCHN_2L_NR_CHANNELS;
++      else
++              return COMPAT_EVTCHN_2L_NR_CHANNELS;
++}
++
+ static bool wait_pending_event(struct kvm_vcpu *vcpu, int nr_ports,
+                              evtchn_port_t *ports)
+ {
+@@ -1042,6 +1050,10 @@ static bool kvm_xen_schedop_poll(struct
+                       *r = -EFAULT;
+                       goto out;
+               }
++              if (ports[i] >= max_evtchn_port(vcpu->kvm)) {
++                      *r = -EINVAL;
++                      goto out;
++              }
+       }
+       if (sched_poll.nr_ports == 1)
+@@ -1308,14 +1320,6 @@ handle_in_userspace:
+       return 0;
+ }
+-static inline int max_evtchn_port(struct kvm *kvm)
+-{
+-      if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode)
+-              return EVTCHN_2L_NR_CHANNELS;
+-      else
+-              return COMPAT_EVTCHN_2L_NR_CHANNELS;
+-}
+-
+ static void kvm_xen_check_poller(struct kvm_vcpu *vcpu, int port)
+ {
+       int poll_evtchn = vcpu->arch.xen.poll_evtchn;
diff --git a/queue-6.0/mm-cgroup-reclaim-fix-dirty-pages-throttling-on-cgroup-v1.patch b/queue-6.0/mm-cgroup-reclaim-fix-dirty-pages-throttling-on-cgroup-v1.patch
new file mode 100644 (file)
index 0000000..665354f
--- /dev/null
@@ -0,0 +1,65 @@
+From 81a70c21d9170de67a45843bdd627f4cce9c4215 Mon Sep 17 00:00:00 2001
+From: "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com>
+Date: Fri, 18 Nov 2022 12:36:03 +0530
+Subject: mm/cgroup/reclaim: fix dirty pages throttling on cgroup v1
+
+From: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
+
+commit 81a70c21d9170de67a45843bdd627f4cce9c4215 upstream.
+
+balance_dirty_pages doesn't do the required dirty throttling on cgroupv1.
+See commit 9badce000e2c ("cgroup, writeback: don't enable cgroup writeback
+on traditional hierarchies").  Instead, the kernel depends on writeback
+throttling in shrink_folio_list to achieve the same goal.  With large
+memory systems, the flusher may not be able to writeback quickly enough
+such that we will start finding pages in the shrink_folio_list already in
+writeback.  Hence for cgroupv1 let's do a reclaim throttle after waking up
+the flusher.
+
+The below test which used to fail on a 256GB system completes till the
+file system is full with this change.
+
+root@lp2:/sys/fs/cgroup/memory# mkdir test
+root@lp2:/sys/fs/cgroup/memory# cd test/
+root@lp2:/sys/fs/cgroup/memory/test# echo 120M > memory.limit_in_bytes
+root@lp2:/sys/fs/cgroup/memory/test# echo $$ > tasks
+root@lp2:/sys/fs/cgroup/memory/test# dd if=/dev/zero of=/home/kvaneesh/test bs=1M
+Killed
+
+Link: https://lkml.kernel.org/r/20221118070603.84081-1-aneesh.kumar@linux.ibm.com
+Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
+Suggested-by: Johannes Weiner <hannes@cmpxchg.org>
+Acked-by: Johannes Weiner <hannes@cmpxchg.org>
+Cc: Tejun Heo <tj@kernel.org>
+Cc: zefan li <lizefan.x@bytedance.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/vmscan.c |   14 +++++++++++++-
+ 1 file changed, 13 insertions(+), 1 deletion(-)
+
+--- a/mm/vmscan.c
++++ b/mm/vmscan.c
+@@ -2472,8 +2472,20 @@ shrink_inactive_list(unsigned long nr_to
+        * the flushers simply cannot keep up with the allocation
+        * rate. Nudge the flusher threads in case they are asleep.
+        */
+-      if (stat.nr_unqueued_dirty == nr_taken)
++      if (stat.nr_unqueued_dirty == nr_taken) {
+               wakeup_flusher_threads(WB_REASON_VMSCAN);
++              /*
++               * For cgroupv1 dirty throttling is achieved by waking up
++               * the kernel flusher here and later waiting on folios
++               * which are in writeback to finish (see shrink_folio_list()).
++               *
++               * Flusher may not be able to issue writeback quickly
++               * enough for cgroupv1 writeback throttling to work
++               * on a large system.
++               */
++              if (!writeback_throttling_sane(sc))
++                      reclaim_throttle(pgdat, VMSCAN_THROTTLE_WRITEBACK);
++      }
+       sc->nr.dirty += stat.nr_dirty;
+       sc->nr.congested += stat.nr_congested;
diff --git a/queue-6.0/mm-vmscan-fix-extreme-overreclaim-and-swap-floods.patch b/queue-6.0/mm-vmscan-fix-extreme-overreclaim-and-swap-floods.patch
new file mode 100644 (file)
index 0000000..254288b
--- /dev/null
@@ -0,0 +1,137 @@
+From f53af4285d775cd9a9a146fc438bd0a1bee1838a Mon Sep 17 00:00:00 2001
+From: Johannes Weiner <hannes@cmpxchg.org>
+Date: Tue, 2 Aug 2022 12:28:11 -0400
+Subject: mm: vmscan: fix extreme overreclaim and swap floods
+
+From: Johannes Weiner <hannes@cmpxchg.org>
+
+commit f53af4285d775cd9a9a146fc438bd0a1bee1838a upstream.
+
+During proactive reclaim, we sometimes observe severe overreclaim, with
+several thousand times more pages reclaimed than requested.
+
+This trace was obtained from shrink_lruvec() during such an instance:
+
+    prio:0 anon_cost:1141521 file_cost:7767
+    nr_reclaimed:4387406 nr_to_reclaim:1047 (or_factor:4190)
+    nr=[7161123 345 578 1111]
+
+While the reclaimer requested 4M, vmscan reclaimed close to 16G, most of it
+by swapping.  These requests take over a minute, during which the write()
+to memory.reclaim is unkillably stuck inside the kernel.
+
+Digging into the source, this is caused by the proportional reclaim
+bailout logic.  This code tries to resolve a fundamental conflict: to
+reclaim roughly what was requested, while also aging all LRUs fairly and
+in accordance to their size, swappiness, refault rates etc.  The way it
+attempts fairness is that once the reclaim goal has been reached, it stops
+scanning the LRUs with the smaller remaining scan targets, and adjusts the
+remainder of the bigger LRUs according to how much of the smaller LRUs was
+scanned.  It then finishes scanning that remainder regardless of the
+reclaim goal.
+
+This works fine if priority levels are low and the LRU lists are
+comparable in size.  However, in this instance, the cgroup that is
+targeted by proactive reclaim has almost no files left - they've already
+been squeezed out by proactive reclaim earlier - and the remaining anon
+pages are hot.  Anon rotations cause the priority level to drop to 0,
+which results in reclaim targeting all of anon (a lot) and all of file
+(almost nothing).  By the time reclaim decides to bail, it has scanned
+most or all of the file target, and therefore must also scan most or all of
+the enormous anon target.  This target is thousands of times larger than
+the reclaim goal, thus causing the overreclaim.
+
+The bailout code hasn't changed in years, why is this failing now?  The
+most likely explanations are two other recent changes in anon reclaim:
+
+1. Before the series starting with commit 5df741963d52 ("mm: fix LRU
+   balancing effect of new transparent huge pages"), the VM was
+   overall relatively reluctant to swap at all, even if swap was
+   configured. This means the LRU balancing code didn't come into play
+   as often as it does now, and mostly in high pressure situations
+   where pronounced swap activity wouldn't be as surprising.
+
+2. For historic reasons, shrink_lruvec() loops on the scan targets of
+   all LRU lists except the active anon one, meaning it would bail if
+   the only remaining pages to scan were active anon - even if there
+   were a lot of them.
+
+   Before the series starting with commit ccc5dc67340c ("mm/vmscan:
+   make active/inactive ratio as 1:1 for anon lru"), most anon pages
+   would live on the active LRU; the inactive one would contain only a
+   handful of preselected reclaim candidates. After the series, anon
+   gets aged similarly to file, and the inactive list is the default
+   for new anon pages as well, making it often the much bigger list.
+
+   As a result, the VM is now more likely to actually finish large
+   anon targets than before.
+
+Change the code such that only one SWAP_CLUSTER_MAX-sized nudge toward the
+larger LRU lists is made before bailing out on a met reclaim goal.
+
+This fixes the extreme overreclaim problem.
+
+Fairness is more subtle and harder to evaluate.  No obvious misbehavior
+was observed on the test workload, in any case.  Conceptually, fairness
+should primarily be a cumulative effect from regular, lower priority
+scans.  Once the VM is in trouble and needs to escalate scan targets to
+make forward progress, fairness needs to take a backseat.  This is also
+acknowledged by the myriad exceptions in get_scan_count().  This patch
+makes fairness decrease gradually, as it keeps fairness work static over
+increasing priority levels with growing scan targets.  This should make
+more sense - although we may have to re-visit the exact values.
+
+Link: https://lkml.kernel.org/r/20220802162811.39216-1-hannes@cmpxchg.org
+Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
+Reviewed-by: Rik van Riel <riel@surriel.com>
+Acked-by: Mel Gorman <mgorman@techsingularity.net>
+Cc: Hugh Dickins <hughd@google.com>
+Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/vmscan.c |   10 ++++------
+ 1 file changed, 4 insertions(+), 6 deletions(-)
+
+--- a/mm/vmscan.c
++++ b/mm/vmscan.c
+@@ -2967,8 +2967,8 @@ static void shrink_lruvec(struct lruvec
+       enum lru_list lru;
+       unsigned long nr_reclaimed = 0;
+       unsigned long nr_to_reclaim = sc->nr_to_reclaim;
++      bool proportional_reclaim;
+       struct blk_plug plug;
+-      bool scan_adjusted;
+       get_scan_count(lruvec, sc, nr);
+@@ -2986,8 +2986,8 @@ static void shrink_lruvec(struct lruvec
+        * abort proportional reclaim if either the file or anon lru has already
+        * dropped to zero at the first pass.
+        */
+-      scan_adjusted = (!cgroup_reclaim(sc) && !current_is_kswapd() &&
+-                       sc->priority == DEF_PRIORITY);
++      proportional_reclaim = (!cgroup_reclaim(sc) && !current_is_kswapd() &&
++                              sc->priority == DEF_PRIORITY);
+       blk_start_plug(&plug);
+       while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
+@@ -3007,7 +3007,7 @@ static void shrink_lruvec(struct lruvec
+               cond_resched();
+-              if (nr_reclaimed < nr_to_reclaim || scan_adjusted)
++              if (nr_reclaimed < nr_to_reclaim || proportional_reclaim)
+                       continue;
+               /*
+@@ -3058,8 +3058,6 @@ static void shrink_lruvec(struct lruvec
+               nr_scanned = targets[lru] - nr[lru];
+               nr[lru] = targets[lru] * (100 - percentage) / 100;
+               nr[lru] -= min(nr[lru], nr_scanned);
+-
+-              scan_adjusted = true;
+       }
+       blk_finish_plug(&plug);
+       sc->nr_reclaimed += nr_reclaimed;
diff --git a/queue-6.0/nilfs2-fix-nilfs_sufile_mark_dirty-not-set-segment-usage-as-dirty.patch b/queue-6.0/nilfs2-fix-nilfs_sufile_mark_dirty-not-set-segment-usage-as-dirty.patch
new file mode 100644 (file)
index 0000000..df1c045
--- /dev/null
@@ -0,0 +1,77 @@
+From 512c5ca01a3610ab14ff6309db363de51f1c13a6 Mon Sep 17 00:00:00 2001
+From: Chen Zhongjin <chenzhongjin@huawei.com>
+Date: Fri, 18 Nov 2022 14:33:04 +0800
+Subject: nilfs2: fix nilfs_sufile_mark_dirty() not set segment usage as dirty
+
+From: Chen Zhongjin <chenzhongjin@huawei.com>
+
+commit 512c5ca01a3610ab14ff6309db363de51f1c13a6 upstream.
+
+When extending segments, nilfs_sufile_alloc() is called to get an
+unassigned segment, then mark it as dirty to avoid accidentally allocating
+the same segment in the future.
+
+But for some special cases such as a corrupted image it can be unreliable.
+If such corruption of the dirty state of the segment occurs, nilfs2 may
+reallocate a segment that is in use and pick the same segment for writing
+twice at the same time.
+
+This will cause the problem reported by syzkaller:
+https://syzkaller.appspot.com/bug?id=c7c4748e11ffcc367cef04f76e02e931833cbd24
+
+This case started with segbuf1.segnum = 3, nextnum = 4 when constructed.
+It supposed segment 4 has already been allocated and marked as dirty.
+
+However the dirty state was corrupted and segment 4 usage was not dirty.
+For the first time nilfs_segctor_extend_segments() segment 4 was allocated
+again, which made segbuf2 and next segbuf3 had same segment 4.
+
+sb_getblk() will get same bh for segbuf2 and segbuf3, and this bh is added
+to both buffer lists of two segbuf.  It makes the lists broken which
+causes NULL pointer dereference.
+
+Fix the problem by setting usage as dirty every time in
+nilfs_sufile_mark_dirty(), which is called during constructing current
+segment to be written out and before allocating next segment.
+
+[chenzhongjin@huawei.com: add lock protection per Ryusuke]
+  Link: https://lkml.kernel.org/r/20221121091141.214703-1-chenzhongjin@huawei.com
+Link: https://lkml.kernel.org/r/20221118063304.140187-1-chenzhongjin@huawei.com
+Fixes: 9ff05123e3bf ("nilfs2: segment constructor")
+Signed-off-by: Chen Zhongjin <chenzhongjin@huawei.com>
+Reported-by: <syzbot+77e4f0...@syzkaller.appspotmail.com>
+Reported-by: Liu Shixin <liushixin2@huawei.com>
+Acked-by: Ryusuke Konishi <konishi.ryusuke@gmail.com>
+Tested-by: Ryusuke Konishi <konishi.ryusuke@gmail.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/nilfs2/sufile.c |    8 ++++++++
+ 1 file changed, 8 insertions(+)
+
+--- a/fs/nilfs2/sufile.c
++++ b/fs/nilfs2/sufile.c
+@@ -495,14 +495,22 @@ void nilfs_sufile_do_free(struct inode *
+ int nilfs_sufile_mark_dirty(struct inode *sufile, __u64 segnum)
+ {
+       struct buffer_head *bh;
++      void *kaddr;
++      struct nilfs_segment_usage *su;
+       int ret;
++      down_write(&NILFS_MDT(sufile)->mi_sem);
+       ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0, &bh);
+       if (!ret) {
+               mark_buffer_dirty(bh);
+               nilfs_mdt_mark_dirty(sufile);
++              kaddr = kmap_atomic(bh->b_page);
++              su = nilfs_sufile_block_get_segment_usage(sufile, segnum, bh, kaddr);
++              nilfs_segment_usage_set_dirty(su);
++              kunmap_atomic(kaddr);
+               brelse(bh);
+       }
++      up_write(&NILFS_MDT(sufile)->mi_sem);
+       return ret;
+ }
index f633c9eca5b8959c9f95da160ae642ca4561b34d..28a92ec2bac365adba20efc171a48e8185cbfda0 100644 (file)
@@ -192,3 +192,21 @@ drm-amd-display-fix-calculation-for-cursor-cab-alloc.patch
 usb-dwc3-gadget-conditionally-remove-requests.patch
 usb-dwc3-gadget-return-eshutdown-on-ep-disable.patch
 usb-dwc3-gadget-clear-ep-descriptor-last.patch
+io_uring-cmpxchg-for-poll-arm-refs-release.patch
+io_uring-make-poll-refs-more-robust.patch
+io_uring-clear-tif_notify_signal-if-set-and-task_work-not-available.patch
+nilfs2-fix-nilfs_sufile_mark_dirty-not-set-segment-usage-as-dirty.patch
+gcov-clang-fix-the-buffer-overflow-issue.patch
+mm-cgroup-reclaim-fix-dirty-pages-throttling-on-cgroup-v1.patch
+mm-vmscan-fix-extreme-overreclaim-and-swap-floods.patch
+fpga-m10bmc-sec-fix-kconfig-dependencies.patch
+kvm-x86-mmu-fix-race-condition-in-direct_page_fault.patch
+kvm-x86-xen-only-do-in-kernel-acceleration-of-hypercalls-for-guest-cpl0.patch
+kvm-x86-xen-validate-port-number-in-schedop_poll.patch
+drm-i915-gvt-get-reference-to-kvm-iff-attachment-to-vm-is-successful.patch
+kvm-x86-nsvm-leave-nested-mode-on-vcpu-free.patch
+kvm-x86-forcibly-leave-nested-mode-on-vcpu-reset.patch
+kvm-x86-nsvm-harden-svm_free_nested-against-freeing-vmcb02-while-still-in-use.patch
+kvm-x86-add-kvm_leave_nested.patch
+kvm-x86-remove-exit_int_info-warning-in-svm_handle_exit.patch
+kvm-update-gfn_to_pfn_cache-khva-when-it-moves-within-the-same-page.patch