--- /dev/null
+From 9ed1fdee9ee324f3505ff066287ee53143caaaa2 Mon Sep 17 00:00:00 2001
+From: Sean Christopherson <seanjc@google.com>
+Date: Fri, 11 Nov 2022 00:22:24 +0000
+Subject: drm/i915/gvt: Get reference to KVM iff attachment to VM is successful
+
+From: Sean Christopherson <seanjc@google.com>
+
+commit 9ed1fdee9ee324f3505ff066287ee53143caaaa2 upstream.
+
+Get a reference to KVM if and only if a vGPU is successfully attached to
+the VM, to avoid leaking a reference when there's no available vGPU: on
+open_device() failure, vfio_device_open() doesn't invoke close_device(),
+so a reference taken before the failure point would never be put.
+
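+A minimal userspace sketch of the corrected ordering (the refcount model
+and all names here are illustrative, not the kernel's): the reference is
+taken only after the last failure point, because the failure path never
+runs the matching release.
+
+  /* take the reference only once attach can no longer fail */
+  static int kvm_refcount;
+
+  static int open_device(int vgpu_exists, int attach_ok)
+  {
+          if (vgpu_exists)
+                  return -1;      /* -EEXIST: no ref held, nothing leaks */
+          if (!attach_ok)
+                  return -1;      /* close_device() won't run, so no put */
+          kvm_refcount++;         /* the kvm_get_kvm() step, success only */
+          return 0;
+  }
+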
+Fixes: 421cfe6596f6 ("vfio: remove VFIO_GROUP_NOTIFY_SET_KVM")
+Cc: stable@vger.kernel.org
+Reviewed-by: Kevin Tian <kevin.tian@intel.com>
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Zhenyu Wang <zhenyuw@linux.intel.com>
+Link: http://patchwork.freedesktop.org/patch/msgid/20221111002225.2418386-2-seanjc@google.com
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/gpu/drm/i915/gvt/kvmgt.c | 3 +--
+ 1 file changed, 1 insertion(+), 2 deletions(-)
+
+--- a/drivers/gpu/drm/i915/gvt/kvmgt.c
++++ b/drivers/gpu/drm/i915/gvt/kvmgt.c
+@@ -765,8 +765,6 @@ static int intel_vgpu_open_device(struct
+ return -ESRCH;
+ }
+
+- kvm_get_kvm(vgpu->vfio_device.kvm);
+-
+ if (__kvmgt_vgpu_exist(vgpu))
+ return -EEXIST;
+
+@@ -777,6 +775,7 @@ static int intel_vgpu_open_device(struct
+
+ vgpu->track_node.track_write = kvmgt_page_track_write;
+ vgpu->track_node.track_flush_slot = kvmgt_page_track_flush_slot;
++ kvm_get_kvm(vgpu->vfio_device.kvm);
+ kvm_page_track_register_notifier(vgpu->vfio_device.kvm,
+ &vgpu->track_node);
+
--- /dev/null
+From dfd10332596ef11ceafd29c4e21b4117be423fc4 Mon Sep 17 00:00:00 2001
+From: Russ Weight <russell.h.weight@intel.com>
+Date: Mon, 14 Nov 2022 16:11:27 -0800
+Subject: fpga: m10bmc-sec: Fix kconfig dependencies
+
+From: Russ Weight <russell.h.weight@intel.com>
+
+commit dfd10332596ef11ceafd29c4e21b4117be423fc4 upstream.
+
+The secure update driver depends on the firmware-upload functionality of
+the firmware-loader. The firmware-loader is carried in the firmware-class
+driver, which is enabled with the tristate CONFIG_FW_LOADER option. The
+firmware-upload functionality is included in the firmware-class driver if
+the bool FW_UPLOAD config is set.
+
+The current dependency statement, "depends on FW_UPLOAD", is not adequate
+because it does not implicitly turn on FW_LOADER. Instead of adding a
+dependency, follow the convention used by drivers that require the
+FW_LOADER_USER_HELPER functionality of the firmware-loader by using
+select for both FW_LOADER and FW_UPLOAD.
+
+Fixes: bdf86d0e6ca3 ("fpga: m10bmc-sec: create max10 bmc secure update")
+Reported-by: kernel test robot <lkp@intel.com>
+Cc: stable@vger.kernel.org
+Signed-off-by: Russ Weight <russell.h.weight@intel.com>
+Acked-by: Randy Dunlap <rdunlap@infradead.org>
+Acked-by: Xu Yilun <yilun.xu@intel.com>
+Link: https://lore.kernel.org/r/20221115001127.289890-1-russell.h.weight@intel.com
+Signed-off-by: Xu Yilun <yilun.xu@intel.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/fpga/Kconfig | 4 +++-
+ 1 file changed, 3 insertions(+), 1 deletion(-)
+
+diff --git a/drivers/fpga/Kconfig b/drivers/fpga/Kconfig
+index 6c416955da53..bbe0a7cabb75 100644
+--- a/drivers/fpga/Kconfig
++++ b/drivers/fpga/Kconfig
+@@ -246,7 +246,9 @@ config FPGA_MGR_VERSAL_FPGA
+
+ config FPGA_M10_BMC_SEC_UPDATE
+ tristate "Intel MAX10 BMC Secure Update driver"
+- depends on MFD_INTEL_M10_BMC && FW_UPLOAD
++ depends on MFD_INTEL_M10_BMC
++ select FW_LOADER
++ select FW_UPLOAD
+ help
+ Secure update support for the Intel MAX10 board management
+ controller.
+--
+2.38.1
+
--- /dev/null
+From a6f810efabfd789d3bbafeacb4502958ec56c5ce Mon Sep 17 00:00:00 2001
+From: Mukesh Ojha <quic_mojha@quicinc.com>
+Date: Thu, 10 Nov 2022 00:31:37 +0530
+Subject: gcov: clang: fix the buffer overflow issue
+
+From: Mukesh Ojha <quic_mojha@quicinc.com>
+
+commit a6f810efabfd789d3bbafeacb4502958ec56c5ce upstream.
+
+Currently, in the clang version of the gcov code, when a module is being
+removed, gcov_info_add() incorrectly adds sfn_ptr->counters to all of the
+dst->functions, which results in the kernel panic shown in the crash
+report below. Fix this by advancing sfn_ptr in step with dfn_ptr.
+
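+The following standalone C model (types and names are illustrative, not
+the kernel's) shows the nature of the bug: the destination iterator
+advances while the source iterator stays on the first node, so the first
+source function's counters are added to every destination function.
+
+  #include <stdio.h>
+
+  struct fn { long counters[2]; struct fn *next; };
+
+  static void info_add(struct fn *dst, const struct fn *src)
+  {
+          for (; dst; dst = dst->next) {
+                  for (int i = 0; i < 2; i++)
+                          dst->counters[i] += src->counters[i];
+                  src = src->next;        /* the step the fix adds */
+          }
+  }
+
+  int main(void)
+  {
+          struct fn s2 = { {3, 4}, NULL }, s1 = { {1, 2}, &s2 };
+          struct fn d2 = { {0, 0}, NULL }, d1 = { {0, 0}, &d2 };
+
+          info_add(&d1, &s1);
+          printf("d1={%ld,%ld} d2={%ld,%ld}\n", d1.counters[0],
+                 d1.counters[1], d2.counters[0], d2.counters[1]);
+          return 0;
+  }
+
+Without the marked line, d2 would wrongly accumulate s1's counters; in
+the kernel, the mismatched counter sizes also let the writes run past
+the destination buffer, corrupting adjacent memory.
+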
+[ 8.899094][ T599] Unable to handle kernel write to read-only memory at virtual address ffffff80461cc000
+[ 8.899100][ T599] Mem abort info:
+[ 8.899102][ T599] ESR = 0x9600004f
+[ 8.899103][ T599] EC = 0x25: DABT (current EL), IL = 32 bits
+[ 8.899105][ T599] SET = 0, FnV = 0
+[ 8.899107][ T599] EA = 0, S1PTW = 0
+[ 8.899108][ T599] FSC = 0x0f: level 3 permission fault
+[ 8.899110][ T599] Data abort info:
+[ 8.899111][ T599] ISV = 0, ISS = 0x0000004f
+[ 8.899113][ T599] CM = 0, WnR = 1
+[ 8.899114][ T599] swapper pgtable: 4k pages, 39-bit VAs, pgdp=00000000ab8de000
+[ 8.899116][ T599] [ffffff80461cc000] pgd=18000009ffcde003, p4d=18000009ffcde003, pud=18000009ffcde003, pmd=18000009ffcad003, pte=00600000c61cc787
+[ 8.899124][ T599] Internal error: Oops: 9600004f [#1] PREEMPT SMP
+[ 8.899265][ T599] Skip md ftrace buffer dump for: 0x1609e0
+....
+..,
+[ 8.899544][ T599] CPU: 7 PID: 599 Comm: modprobe Tainted: G S OE 5.15.41-android13-8-g38e9b1af6bce #1
+[ 8.899547][ T599] Hardware name: XXX (DT)
+[ 8.899549][ T599] pstate: 82400005 (Nzcv daif +PAN -UAO +TCO -DIT -SSBS BTYPE=--)
+[ 8.899551][ T599] pc : gcov_info_add+0x9c/0xb8
+[ 8.899557][ T599] lr : gcov_event+0x28c/0x6b8
+[ 8.899559][ T599] sp : ffffffc00e733b00
+[ 8.899560][ T599] x29: ffffffc00e733b00 x28: ffffffc00e733d30 x27: ffffffe8dc297470
+[ 8.899563][ T599] x26: ffffffe8dc297000 x25: ffffffe8dc297000 x24: ffffffe8dc297000
+[ 8.899566][ T599] x23: ffffffe8dc0a6200 x22: ffffff880f68bf20 x21: 0000000000000000
+[ 8.899569][ T599] x20: ffffff880f68bf00 x19: ffffff8801babc00 x18: ffffffc00d7f9058
+[ 8.899572][ T599] x17: 0000000000088793 x16: ffffff80461cbe00 x15: 9100052952800785
+[ 8.899575][ T599] x14: 0000000000000200 x13: 0000000000000041 x12: 9100052952800785
+[ 8.899577][ T599] x11: ffffffe8dc297000 x10: ffffffe8dc297000 x9 : ffffff80461cbc80
+[ 8.899580][ T599] x8 : ffffff8801babe80 x7 : ffffffe8dc2ec000 x6 : ffffffe8dc2ed000
+[ 8.899583][ T599] x5 : 000000008020001f x4 : fffffffe2006eae0 x3 : 000000008020001f
+[ 8.899586][ T599] x2 : ffffff8027c49200 x1 : ffffff8801babc20 x0 : ffffff80461cb3a0
+[ 8.899589][ T599] Call trace:
+[ 8.899590][ T599] gcov_info_add+0x9c/0xb8
+[ 8.899592][ T599] gcov_module_notifier+0xbc/0x120
+[ 8.899595][ T599] blocking_notifier_call_chain+0xa0/0x11c
+[ 8.899598][ T599] do_init_module+0x2a8/0x33c
+[ 8.899600][ T599] load_module+0x23cc/0x261c
+[ 8.899602][ T599] __arm64_sys_finit_module+0x158/0x194
+[ 8.899604][ T599] invoke_syscall+0x94/0x2bc
+[ 8.899607][ T599] el0_svc_common+0x1d8/0x34c
+[ 8.899609][ T599] do_el0_svc+0x40/0x54
+[ 8.899611][ T599] el0_svc+0x94/0x2f0
+[ 8.899613][ T599] el0t_64_sync_handler+0x88/0xec
+[ 8.899615][ T599] el0t_64_sync+0x1b4/0x1b8
+[ 8.899618][ T599] Code: f905f56c f86e69ec f86e6a0f 8b0c01ec (f82e6a0c)
+[ 8.899620][ T599] ---[ end trace ed5218e9e5b6e2e6 ]---
+
+Link: https://lkml.kernel.org/r/1668020497-13142-1-git-send-email-quic_mojha@quicinc.com
+Fixes: e178a5beb369 ("gcov: clang support")
+Signed-off-by: Mukesh Ojha <quic_mojha@quicinc.com>
+Reviewed-by: Peter Oberparleiter <oberpar@linux.ibm.com>
+Tested-by: Peter Oberparleiter <oberpar@linux.ibm.com>
+Cc: Nathan Chancellor <nathan@kernel.org>
+Cc: Nick Desaulniers <ndesaulniers@google.com>
+Cc: Tom Rix <trix@redhat.com>
+Cc: <stable@vger.kernel.org> [5.2+]
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/gcov/clang.c | 2 ++
+ 1 file changed, 2 insertions(+)
+
+--- a/kernel/gcov/clang.c
++++ b/kernel/gcov/clang.c
+@@ -280,6 +280,8 @@ void gcov_info_add(struct gcov_info *dst
+
+ for (i = 0; i < sfn_ptr->num_counters; i++)
+ dfn_ptr->counters[i] += sfn_ptr->counters[i];
++
++ sfn_ptr = list_next_entry(sfn_ptr, head);
+ }
+ }
+
--- /dev/null
+From 7cfe7a09489c1cefee7181e07b5f2bcbaebd9f41 Mon Sep 17 00:00:00 2001
+From: Jens Axboe <axboe@kernel.dk>
+Date: Fri, 25 Nov 2022 09:36:29 -0700
+Subject: io_uring: clear TIF_NOTIFY_SIGNAL if set and task_work not available
+
+From: Jens Axboe <axboe@kernel.dk>
+
+commit 7cfe7a09489c1cefee7181e07b5f2bcbaebd9f41 upstream.
+
+With how task_work is added and signaled, we can have TIF_NOTIFY_SIGNAL
+set with no task_work pending, because it got run in a previous loop.
+Treat TIF_NOTIFY_SIGNAL like get_signal() does: always clear it if set,
+regardless of whether or not task_work is pending to run.
+
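+A rough userspace model of the reordered check (the flag and counter are
+stand-ins for TIF_NOTIFY_SIGNAL and the task_work list): the notification
+is cleared unconditionally, not only when work is found, because a
+previous loop iteration may already have run the work.
+
+  #include <stdbool.h>
+
+  static bool notify_signal;      /* models TIF_NOTIFY_SIGNAL */
+  static int  pending;            /* models task_work_pending() */
+
+  static bool run_task_work(void)
+  {
+          /* clear the notification even if there is nothing to run */
+          if (notify_signal)
+                  notify_signal = false;
+          if (pending) {
+                  pending = 0;    /* models task_work_run() */
+                  return true;
+          }
+          return false;
+  }
+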
+Cc: stable@vger.kernel.org
+Fixes: 46a525e199e4 ("io_uring: don't gate task_work run on TIF_NOTIFY_SIGNAL")
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ io_uring/io_uring.h | 9 +++++++--
+ 1 file changed, 7 insertions(+), 2 deletions(-)
+
+--- a/io_uring/io_uring.h
++++ b/io_uring/io_uring.h
+@@ -229,9 +229,14 @@ static inline unsigned int io_sqring_ent
+
+ static inline bool io_run_task_work(void)
+ {
++ /*
++ * Always check-and-clear the task_work notification signal. With how
++ * signaling works for task_work, we can find it set with nothing to
++ * run. We need to clear it for that case, like get_signal() does.
++ */
++ if (test_thread_flag(TIF_NOTIFY_SIGNAL))
++ clear_notify_signal();
+ if (task_work_pending(current)) {
+- if (test_thread_flag(TIF_NOTIFY_SIGNAL))
+- clear_notify_signal();
+ __set_current_state(TASK_RUNNING);
+ task_work_run();
+ return 1;
--- /dev/null
+From 2f3893437a4ebf2e892ca172e9e122841319d675 Mon Sep 17 00:00:00 2001
+From: Pavel Begunkov <asml.silence@gmail.com>
+Date: Sun, 20 Nov 2022 16:57:41 +0000
+Subject: io_uring: cmpxchg for poll arm refs release
+
+From: Pavel Begunkov <asml.silence@gmail.com>
+
+commit 2f3893437a4ebf2e892ca172e9e122841319d675 upstream.
+
+Replace atomically subtracting the ownership reference at the end of
+arming a poll with a cmpxchg. We try to release ownership by setting 0,
+assuming that poll_refs didn't change while we were arming. If it did
+change, we keep the ownership and use it to queue a tw, which is fully
+capable of processing all events and even tolerates spurious wake ups.
+
+It's a bit more elegant as we reduce races b/w setting the cancellation
+flag and getting refs with this release, and with that we don't have to
+worry about any kind of underflow. This is not the fastest path for
+polling anyway, and the performance difference b/w cmpxchg and an atomic
+dec is usually negligible.
+
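+A compact C11 sketch of the release scheme (names are illustrative):
+ownership is dropped only if poll_refs is still exactly 1; any
+concurrent bump means someone raced with us, so the reference is kept
+and the request is handed to task_work instead.
+
+  #include <stdatomic.h>
+
+  static atomic_int poll_refs = 1;        /* we hold the ownership ref */
+
+  /* returns 1 if ownership was released, 0 if a tw must be queued */
+  static int release_ownership(void)
+  {
+          int expected = 1;
+
+          return atomic_compare_exchange_strong(&poll_refs,
+                                                &expected, 0);
+  }
+
+Unlike an atomic decrement, a failed cmpxchg leaves the reference
+intact, which is why no underflow is possible.
+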
+Cc: stable@vger.kernel.org
+Fixes: aa43477b04025 ("io_uring: poll rework")
+Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
+Link: https://lore.kernel.org/r/0c95251624397ea6def568ff040cad2d7926fd51.1668963050.git.asml.silence@gmail.com
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ io_uring/poll.c | 8 +++-----
+ 1 file changed, 3 insertions(+), 5 deletions(-)
+
+--- a/io_uring/poll.c
++++ b/io_uring/poll.c
+@@ -519,7 +519,6 @@ static int __io_arm_poll_handler(struct
+ unsigned issue_flags)
+ {
+ struct io_ring_ctx *ctx = req->ctx;
+- int v;
+
+ INIT_HLIST_NODE(&req->hash_node);
+ req->work.cancel_seq = atomic_read(&ctx->cancel_seq);
+@@ -587,11 +586,10 @@ static int __io_arm_poll_handler(struct
+
+ if (ipt->owning) {
+ /*
+- * Release ownership. If someone tried to queue a tw while it was
+- * locked, kick it off for them.
++ * Try to release ownership. If we see a change of state, e.g.
++ * poll was waken up, queue up a tw, it'll deal with it.
+ */
+- v = atomic_dec_return(&req->poll_refs);
+- if (unlikely(v & IO_POLL_REF_MASK))
++ if (atomic_cmpxchg(&req->poll_refs, 1, 0) != 1)
+ __io_poll_execute(req, 0);
+ }
+ return 0;
--- /dev/null
+From a26a35e9019fd70bf3cf647dcfdae87abc7bacea Mon Sep 17 00:00:00 2001
+From: Pavel Begunkov <asml.silence@gmail.com>
+Date: Sun, 20 Nov 2022 16:57:42 +0000
+Subject: io_uring: make poll refs more robust
+
+From: Pavel Begunkov <asml.silence@gmail.com>
+
+commit a26a35e9019fd70bf3cf647dcfdae87abc7bacea upstream.
+
+poll_refs carries two functions: the first is ownership over the request.
+The second is notifying io_poll_check_events() that there was an event,
+but the wake up couldn't grab the ownership, so io_poll_check_events()
+should retry.
+
+We want to make poll_refs more robust against overflows. Instead of
+always incrementing it, which covers two purposes with one atomic, check
+whether poll_refs is already elevated enough, and if so set a retry flag
+without attempting to grab ownership. The gap between the bias check and
+the following atomics may seem racy, but we don't need it to be strict.
+Moreover, there can be at most 4 parallel updates: by the first and the
+second poll entries, __io_arm_poll_handler() and cancellation. Of those
+four, only poll wake ups may be executed multiple times, but they're
+protected by a spinlock.
+
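+A self-contained sketch of the scheme using C11 atomics (the bit layout
+and bias mirror the patch; the function names are illustrative):
+
+  #include <stdatomic.h>
+
+  #define CANCEL_FLAG (1u << 31)          /* top bit: cancellation */
+  #define RETRY_FLAG  (1u << 30)
+  #define REF_MASK    ((1u << 30) - 1)
+  #define REF_BIAS    128
+
+  static atomic_uint poll_refs;
+
+  static int get_ownership_slowpath(void)
+  {
+          /* refs are elevated: don't inflate them further, just ask
+           * the current owner to rescan for events */
+          unsigned int v = atomic_fetch_or(&poll_refs, RETRY_FLAG);
+
+          if (v & REF_MASK)
+                  return 0;
+          return !(atomic_fetch_add(&poll_refs, 1) & REF_MASK);
+  }
+
+  static int get_ownership(void)
+  {
+          if (atomic_load(&poll_refs) >= REF_BIAS)
+                  return get_ownership_slowpath();
+          return !(atomic_fetch_add(&poll_refs, 1) & REF_MASK);
+  }
+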
+Cc: stable@vger.kernel.org
+Reported-by: Lin Ma <linma@zju.edu.cn>
+Fixes: aa43477b04025 ("io_uring: poll rework")
+Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
+Link: https://lore.kernel.org/r/c762bc31f8683b3270f3587691348a7119ef9c9d.1668963050.git.asml.silence@gmail.com
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ io_uring/poll.c | 36 +++++++++++++++++++++++++++++++++++-
+ 1 file changed, 35 insertions(+), 1 deletion(-)
+
+--- a/io_uring/poll.c
++++ b/io_uring/poll.c
+@@ -40,7 +40,14 @@ struct io_poll_table {
+ };
+
+ #define IO_POLL_CANCEL_FLAG BIT(31)
+-#define IO_POLL_REF_MASK GENMASK(30, 0)
++#define IO_POLL_RETRY_FLAG BIT(30)
++#define IO_POLL_REF_MASK GENMASK(29, 0)
++
++/*
++ * We usually have 1-2 refs taken, 128 is more than enough and we want to
++ * maximise the margin between this amount and the moment when it overflows.
++ */
++#define IO_POLL_REF_BIAS 128
+
+ #define IO_WQE_F_DOUBLE 1
+
+@@ -58,6 +65,21 @@ static inline bool wqe_is_double(struct
+ return priv & IO_WQE_F_DOUBLE;
+ }
+
++static bool io_poll_get_ownership_slowpath(struct io_kiocb *req)
++{
++ int v;
++
++ /*
++ * poll_refs are already elevated and we don't have much hope for
++ * grabbing the ownership. Instead of incrementing set a retry flag
++ * to notify the loop that there might have been some change.
++ */
++ v = atomic_fetch_or(IO_POLL_RETRY_FLAG, &req->poll_refs);
++ if (v & IO_POLL_REF_MASK)
++ return false;
++ return !(atomic_fetch_inc(&req->poll_refs) & IO_POLL_REF_MASK);
++}
++
+ /*
+ * If refs part of ->poll_refs (see IO_POLL_REF_MASK) is 0, it's free. We can
+ * bump it and acquire ownership. It's disallowed to modify requests while not
+@@ -66,6 +88,8 @@ static inline bool wqe_is_double(struct
+ */
+ static inline bool io_poll_get_ownership(struct io_kiocb *req)
+ {
++ if (unlikely(atomic_read(&req->poll_refs) >= IO_POLL_REF_BIAS))
++ return io_poll_get_ownership_slowpath(req);
+ return !(atomic_fetch_inc(&req->poll_refs) & IO_POLL_REF_MASK);
+ }
+
+@@ -235,6 +259,16 @@ static int io_poll_check_events(struct i
+ */
+ if ((v & IO_POLL_REF_MASK) != 1)
+ req->cqe.res = 0;
++ if (v & IO_POLL_RETRY_FLAG) {
++ req->cqe.res = 0;
++ /*
++ * We won't find new events that came in between
++ * vfs_poll and the ref put unless we clear the flag
++ * in advance.
++ */
++ atomic_andnot(IO_POLL_RETRY_FLAG, &req->poll_refs);
++ v &= ~IO_POLL_RETRY_FLAG;
++ }
+
+ /* the mask was stashed in __io_poll_execute */
+ if (!req->cqe.res) {
--- /dev/null
+From 8332f0ed4f187c7b700831bd7cc83ce180a944b9 Mon Sep 17 00:00:00 2001
+From: David Woodhouse <dwmw@amazon.co.uk>
+Date: Sat, 19 Nov 2022 09:25:39 +0000
+Subject: KVM: Update gfn_to_pfn_cache khva when it moves within the same page
+
+From: David Woodhouse <dwmw@amazon.co.uk>
+
+commit 8332f0ed4f187c7b700831bd7cc83ce180a944b9 upstream.
+
+In the case where a GPC is refreshed to a different location within the
+same page, we didn't bother to update it. Mostly we don't need to, but
+since the ->khva field also includes the offset within the page, that
+does have to be updated.
+
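+A one-function sketch of the update (standard page-mask idiom; the
+names are illustrative): the cached kernel mapping of the page is kept,
+but the in-page offset is recomputed from the new gpa.
+
+  #include <stdint.h>
+
+  #define PAGE_SIZE 4096u
+  #define PAGE_MASK (~((uintptr_t)PAGE_SIZE - 1))
+
+  static void *update_khva(void *old_khva, uint64_t gpa)
+  {
+          uintptr_t page_offset = (uintptr_t)(gpa & (PAGE_SIZE - 1));
+          uintptr_t page_base   = (uintptr_t)old_khva & PAGE_MASK;
+
+          /* same page mapping, refreshed offset */
+          return (void *)(page_base + page_offset);
+  }
+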
+Fixes: 3ba2c95ea180 ("KVM: Do not incorporate page offset into gfn=>pfn cache user address")
+Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
+Reviewed-by: Paul Durrant <paul@xen.org>
+Reviewed-by: Sean Christopherson <seanjc@google.com>
+Cc: stable@kernel.org
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ virt/kvm/pfncache.c | 7 ++++++-
+ 1 file changed, 6 insertions(+), 1 deletion(-)
+
+--- a/virt/kvm/pfncache.c
++++ b/virt/kvm/pfncache.c
+@@ -297,7 +297,12 @@ int kvm_gfn_to_pfn_cache_refresh(struct
+ if (!gpc->valid || old_uhva != gpc->uhva) {
+ ret = hva_to_pfn_retry(kvm, gpc);
+ } else {
+- /* If the HVA→PFN mapping was already valid, don't unmap it. */
++ /*
++ * If the HVA→PFN mapping was already valid, don't unmap it.
++ * But do update gpc->khva because the offset within the page
++ * may have changed.
++ */
++ gpc->khva = old_khva + page_offset;
+ old_pfn = KVM_PFN_ERR_FAULT;
+ old_khva = NULL;
+ ret = 0;
--- /dev/null
+From f9697df251438b0798780900e8b43bdb12a56d64 Mon Sep 17 00:00:00 2001
+From: Maxim Levitsky <mlevitsk@redhat.com>
+Date: Thu, 3 Nov 2022 16:13:45 +0200
+Subject: KVM: x86: add kvm_leave_nested
+
+From: Maxim Levitsky <mlevitsk@redhat.com>
+
+commit f9697df251438b0798780900e8b43bdb12a56d64 upstream.
+
+Add kvm_leave_nested(), which wraps the call to
+nested_ops->leave_nested() in a helper function.
+
+Cc: stable@vger.kernel.org
+Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
+Message-Id: <20221103141351.50662-4-mlevitsk@redhat.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/svm/nested.c | 3 ---
+ arch/x86/kvm/vmx/nested.c | 3 ---
+ arch/x86/kvm/x86.c | 8 +++++++-
+ 3 files changed, 7 insertions(+), 7 deletions(-)
+
+--- a/arch/x86/kvm/svm/nested.c
++++ b/arch/x86/kvm/svm/nested.c
+@@ -1164,9 +1164,6 @@ void svm_free_nested(struct vcpu_svm *sv
+ svm->nested.initialized = false;
+ }
+
+-/*
+- * Forcibly leave nested mode in order to be able to reset the VCPU later on.
+- */
+ void svm_leave_nested(struct kvm_vcpu *vcpu)
+ {
+ struct vcpu_svm *svm = to_svm(vcpu);
+--- a/arch/x86/kvm/vmx/nested.c
++++ b/arch/x86/kvm/vmx/nested.c
+@@ -6294,9 +6294,6 @@ out:
+ return kvm_state.size;
+ }
+
+-/*
+- * Forcibly leave nested mode in order to be able to reset the VCPU later on.
+- */
+ void vmx_leave_nested(struct kvm_vcpu *vcpu)
+ {
+ if (is_guest_mode(vcpu)) {
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -611,6 +611,12 @@ void kvm_deliver_exception_payload(struc
+ }
+ EXPORT_SYMBOL_GPL(kvm_deliver_exception_payload);
+
++/* Forcibly leave the nested mode in cases like a vCPU reset */
++static void kvm_leave_nested(struct kvm_vcpu *vcpu)
++{
++ kvm_x86_ops.nested_ops->leave_nested(vcpu);
++}
++
+ static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
+ unsigned nr, bool has_error, u32 error_code,
+ bool has_payload, unsigned long payload, bool reinject)
+@@ -5154,7 +5160,7 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_e
+
+ if (events->flags & KVM_VCPUEVENT_VALID_SMM) {
+ if (!!(vcpu->arch.hflags & HF_SMM_MASK) != events->smi.smm) {
+- kvm_x86_ops.nested_ops->leave_nested(vcpu);
++ kvm_leave_nested(vcpu);
+ kvm_smm_changed(vcpu, events->smi.smm);
+ }
+
--- /dev/null
+From ed129ec9057f89d615ba0c81a4984a90345a1684 Mon Sep 17 00:00:00 2001
+From: Maxim Levitsky <mlevitsk@redhat.com>
+Date: Thu, 3 Nov 2022 16:13:46 +0200
+Subject: KVM: x86: forcibly leave nested mode on vCPU reset
+
+From: Maxim Levitsky <mlevitsk@redhat.com>
+
+commit ed129ec9057f89d615ba0c81a4984a90345a1684 upstream.
+
+While not obvious, kvm_vcpu_reset() leaves nested mode by clearing
+'vcpu->arch.hflags', but it does so without all the required housekeeping.
+
+On SVM, it is possible to have a vCPU reset while in guest mode because,
+unlike on VMX, INITs are not latched in SVM non-root mode. In addition,
+L1 doesn't have to intercept triple fault, which should also trigger
+L1's reset if it happens in L2 while L1 didn't intercept it.
+
+If one of the above conditions happens, KVM will continue to use vmcb02
+even though it is no longer in guest mode.
+
+Later, IA32_EFER will be cleared, which leads to freeing of the nested
+guest state; this (correctly) frees vmcb02, but since KVM still
+(incorrectly) uses it, the result is a use-after-free and a kernel
+crash.
+
+This issue is assigned CVE-2022-3344.
+
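+A minimal model of the required ordering (the type and helpers are
+stand-ins for the KVM internals): the reset path must leave L2 through
+the full helper while the guest-mode flag still reflects reality,
+rather than silently wiping it.
+
+  struct vcpu { unsigned long hflags; };
+  #define HF_GUEST_MASK (1UL << 0)
+
+  static int is_guest_mode(struct vcpu *v)
+  {
+          return v->hflags & HF_GUEST_MASK;
+  }
+
+  static void leave_nested(struct vcpu *v)
+  {
+          /* switch back to vmcb01 and retire the L2 state first */
+          v->hflags &= ~HF_GUEST_MASK;
+  }
+
+  static void vcpu_reset(struct vcpu *v)
+  {
+          if (is_guest_mode(v))
+                  leave_nested(v);        /* housekeeping happens here */
+          v->hflags = 0;                  /* blind clear is now harmless */
+  }
+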
+Cc: stable@vger.kernel.org
+Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
+Message-Id: <20221103141351.50662-5-mlevitsk@redhat.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/x86.c | 10 ++++++++++
+ 1 file changed, 10 insertions(+)
+
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -11789,8 +11789,18 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcp
+ WARN_ON_ONCE(!init_event &&
+ (old_cr0 || kvm_read_cr3(vcpu) || kvm_read_cr4(vcpu)));
+
++ /*
++ * SVM doesn't unconditionally VM-Exit on INIT and SHUTDOWN, thus it's
++ * possible to INIT the vCPU while L2 is active. Force the vCPU back
++ * into L1 as EFER.SVME is cleared on INIT (along with all other EFER
++ * bits), i.e. virtualization is disabled.
++ */
++ if (is_guest_mode(vcpu))
++ kvm_leave_nested(vcpu);
++
+ kvm_lapic_reset(vcpu, init_event);
+
++ WARN_ON_ONCE(is_guest_mode(vcpu) || is_smm(vcpu));
+ vcpu->arch.hflags = 0;
+
+ vcpu->arch.smi_pending = 0;
--- /dev/null
+From 47b0c2e4c220f2251fd8dcfbb44479819c715e15 Mon Sep 17 00:00:00 2001
+From: Kazuki Takiguchi <takiguchi.kazuki171@gmail.com>
+Date: Wed, 23 Nov 2022 14:36:00 -0500
+Subject: KVM: x86/mmu: Fix race condition in direct_page_fault
+
+From: Kazuki Takiguchi <takiguchi.kazuki171@gmail.com>
+
+commit 47b0c2e4c220f2251fd8dcfbb44479819c715e15 upstream.
+
+make_mmu_pages_available() must be called with mmu_lock held for write.
+However, if the TDP MMU is used, it will be called with mmu_lock held for
+read. This function does nothing unless shadow pages are used, so there
+is no race unless nested TDP is used. Since nested TDP uses shadow pages,
+old shadow pages may be zapped by this function even when the TDP MMU is
+enabled. Since shadow pages are never allocated by kvm_tdp_mmu_map(), the
+race condition can be avoided by not calling make_mmu_pages_available()
+when the TDP MMU is currently in use.
+
+I encountered this when repeatedly starting and stopping nested VMs. It
+can be artificially triggered by allocating a large number of nested TDP
+SPTEs.
+
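+Structurally, the fix confines the shadow-page zapping to the path that
+holds mmu_lock for write. A compilable sketch with stubbed helpers (all
+names illustrative):
+
+  static int tdp_mmu_map(void)              { return 0; }
+  static int make_mmu_pages_available(void) { return 0; }
+  static int direct_map(void)               { return 0; }
+
+  static int page_fault(int is_tdp_mmu_fault)
+  {
+          int r;
+
+          if (is_tdp_mmu_fault) {
+                  /* mmu_lock held for read: must not zap shadow pages */
+                  r = tdp_mmu_map();
+          } else {
+                  /* mmu_lock held for write: zapping is safe here */
+                  r = make_mmu_pages_available();
+                  if (r)
+                          return r;
+                  r = direct_map();
+          }
+          return r;
+  }
+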
+For example, the following BUG and general protection fault are caused in
+the host kernel.
+
+pte_list_remove: 00000000cd54fc10 many->many
+------------[ cut here ]------------
+kernel BUG at arch/x86/kvm/mmu/mmu.c:963!
+invalid opcode: 0000 [#1] PREEMPT SMP NOPTI
+RIP: 0010:pte_list_remove.cold+0x16/0x48 [kvm]
+Call Trace:
+ <TASK>
+ drop_spte+0xe0/0x180 [kvm]
+ mmu_page_zap_pte+0x4f/0x140 [kvm]
+ __kvm_mmu_prepare_zap_page+0x62/0x3e0 [kvm]
+ kvm_mmu_zap_oldest_mmu_pages+0x7d/0xf0 [kvm]
+ direct_page_fault+0x3cb/0x9b0 [kvm]
+ kvm_tdp_page_fault+0x2c/0xa0 [kvm]
+ kvm_mmu_page_fault+0x207/0x930 [kvm]
+ npf_interception+0x47/0xb0 [kvm_amd]
+ svm_invoke_exit_handler+0x13c/0x1a0 [kvm_amd]
+ svm_handle_exit+0xfc/0x2c0 [kvm_amd]
+ kvm_arch_vcpu_ioctl_run+0xa79/0x1780 [kvm]
+ kvm_vcpu_ioctl+0x29b/0x6f0 [kvm]
+ __x64_sys_ioctl+0x95/0xd0
+ do_syscall_64+0x5c/0x90
+
+general protection fault, probably for non-canonical address
+0xdead000000000122: 0000 [#1] PREEMPT SMP NOPTI
+RIP: 0010:kvm_mmu_commit_zap_page.part.0+0x4b/0xe0 [kvm]
+Call Trace:
+ <TASK>
+ kvm_mmu_zap_oldest_mmu_pages+0xae/0xf0 [kvm]
+ direct_page_fault+0x3cb/0x9b0 [kvm]
+ kvm_tdp_page_fault+0x2c/0xa0 [kvm]
+ kvm_mmu_page_fault+0x207/0x930 [kvm]
+ npf_interception+0x47/0xb0 [kvm_amd]
+
+CVE: CVE-2022-45869
+Fixes: a2855afc7ee8 ("KVM: x86/mmu: Allow parallel page faults for the TDP MMU")
+Signed-off-by: Kazuki Takiguchi <takiguchi.kazuki171@gmail.com>
+Cc: stable@vger.kernel.org
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/mmu/mmu.c | 13 +++++++------
+ 1 file changed, 7 insertions(+), 6 deletions(-)
+
+--- a/arch/x86/kvm/mmu/mmu.c
++++ b/arch/x86/kvm/mmu/mmu.c
+@@ -2431,6 +2431,7 @@ static bool __kvm_mmu_prepare_zap_page(s
+ {
+ bool list_unstable, zapped_root = false;
+
++ lockdep_assert_held_write(&kvm->mmu_lock);
+ trace_kvm_mmu_prepare_zap_page(sp);
+ ++kvm->stat.mmu_shadow_zapped;
+ *nr_zapped = mmu_zap_unsync_children(kvm, sp, invalid_list);
+@@ -4250,14 +4251,14 @@ static int direct_page_fault(struct kvm_
+ if (is_page_fault_stale(vcpu, fault, mmu_seq))
+ goto out_unlock;
+
+- r = make_mmu_pages_available(vcpu);
+- if (r)
+- goto out_unlock;
+-
+- if (is_tdp_mmu_fault)
++ if (is_tdp_mmu_fault) {
+ r = kvm_tdp_mmu_map(vcpu, fault);
+- else
++ } else {
++ r = make_mmu_pages_available(vcpu);
++ if (r)
++ goto out_unlock;
+ r = __direct_map(vcpu, fault);
++ }
+
+ out_unlock:
+ if (is_tdp_mmu_fault)
--- /dev/null
+From 16ae56d7e0528559bf8dc9070e3bfd8ba3de80df Mon Sep 17 00:00:00 2001
+From: Maxim Levitsky <mlevitsk@redhat.com>
+Date: Thu, 3 Nov 2022 16:13:44 +0200
+Subject: KVM: x86: nSVM: harden svm_free_nested against freeing vmcb02 while still in use
+
+From: Maxim Levitsky <mlevitsk@redhat.com>
+
+commit 16ae56d7e0528559bf8dc9070e3bfd8ba3de80df upstream.
+
+Make sure that KVM uses vmcb01 before freeing nested state, and warn if
+that is not the case.
+
+This is a minimal fix for CVE-2022-3344 making the kernel print a warning
+instead of a kernel panic.
+
+Cc: stable@vger.kernel.org
+Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
+Message-Id: <20221103141351.50662-3-mlevitsk@redhat.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/svm/nested.c | 3 +++
+ 1 file changed, 3 insertions(+)
+
+--- a/arch/x86/kvm/svm/nested.c
++++ b/arch/x86/kvm/svm/nested.c
+@@ -1143,6 +1143,9 @@ void svm_free_nested(struct vcpu_svm *sv
+ if (!svm->nested.initialized)
+ return;
+
++ if (WARN_ON_ONCE(svm->vmcb != svm->vmcb01.ptr))
++ svm_switch_vmcb(svm, &svm->vmcb01);
++
+ svm_vcpu_free_msrpm(svm->nested.msrpm);
+ svm->nested.msrpm = NULL;
+
--- /dev/null
+From 917401f26a6af5756d89b550a8e1bd50cf42b07e Mon Sep 17 00:00:00 2001
+From: Maxim Levitsky <mlevitsk@redhat.com>
+Date: Thu, 3 Nov 2022 16:13:43 +0200
+Subject: KVM: x86: nSVM: leave nested mode on vCPU free
+
+From: Maxim Levitsky <mlevitsk@redhat.com>
+
+commit 917401f26a6af5756d89b550a8e1bd50cf42b07e upstream.
+
+If the VM was terminated while nested, we free the nested state while
+the vCPU is still in nested mode.
+
+A warning will soon be added for this condition.
+
+Cc: stable@vger.kernel.org
+Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
+Message-Id: <20221103141351.50662-2-mlevitsk@redhat.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/svm/svm.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/arch/x86/kvm/svm/svm.c
++++ b/arch/x86/kvm/svm/svm.c
+@@ -1440,6 +1440,7 @@ static void svm_vcpu_free(struct kvm_vcp
+ */
+ svm_clear_current_vmcb(svm->vmcb);
+
++ svm_leave_nested(vcpu);
+ svm_free_nested(svm);
+
+ sev_free_vcpu(vcpu);
--- /dev/null
+From 05311ce954aebe75935d9ae7d38ac82b5b796e33 Mon Sep 17 00:00:00 2001
+From: Maxim Levitsky <mlevitsk@redhat.com>
+Date: Thu, 3 Nov 2022 16:13:51 +0200
+Subject: KVM: x86: remove exit_int_info warning in svm_handle_exit
+
+From: Maxim Levitsky <mlevitsk@redhat.com>
+
+commit 05311ce954aebe75935d9ae7d38ac82b5b796e33 upstream.
+
+It is valid to receive an external interrupt and have a broken IDT
+entry, which will lead to a #GP with an exit_int_info that contains the
+index of the IDT entry (i.e. any value).
+
+Other exceptions can happen as well, like #NP or #SS
+(if the stack switch fails).
+
+Thus this warning can be user triggered and has very little value.
+
+Cc: stable@vger.kernel.org
+Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
+Message-Id: <20221103141351.50662-10-mlevitsk@redhat.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/svm/svm.c | 15 ---------------
+ 1 file changed, 15 deletions(-)
+
+--- a/arch/x86/kvm/svm/svm.c
++++ b/arch/x86/kvm/svm/svm.c
+@@ -346,12 +346,6 @@ int svm_set_efer(struct kvm_vcpu *vcpu,
+ return 0;
+ }
+
+-static int is_external_interrupt(u32 info)
+-{
+- info &= SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID;
+- return info == (SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR);
+-}
+-
+ static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu)
+ {
+ struct vcpu_svm *svm = to_svm(vcpu);
+@@ -3427,15 +3421,6 @@ static int svm_handle_exit(struct kvm_vc
+ return 0;
+ }
+
+- if (is_external_interrupt(svm->vmcb->control.exit_int_info) &&
+- exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR &&
+- exit_code != SVM_EXIT_NPF && exit_code != SVM_EXIT_TASK_SWITCH &&
+- exit_code != SVM_EXIT_INTR && exit_code != SVM_EXIT_NMI)
+- printk(KERN_ERR "%s: unexpected exit_int_info 0x%x "
+- "exit_code 0x%x\n",
+- __func__, svm->vmcb->control.exit_int_info,
+- exit_code);
+-
+ if (exit_fastpath != EXIT_FASTPATH_NONE)
+ return 1;
+
--- /dev/null
+From c2b8cdfaf3a6721afe0c8c060a631b1c67a7f1ee Mon Sep 17 00:00:00 2001
+From: David Woodhouse <dwmw@amazon.co.uk>
+Date: Sat, 12 Nov 2022 13:52:25 +0000
+Subject: KVM: x86/xen: Only do in-kernel acceleration of hypercalls for guest CPL0
+
+From: David Woodhouse <dwmw@amazon.co.uk>
+
+commit c2b8cdfaf3a6721afe0c8c060a631b1c67a7f1ee upstream.
+
+There are almost no hypercalls which are valid from CPL > 0, and definitely
+none which are handled by the kernel.
+
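+The shape of the fix, as a sketch with stubbed helpers (illustrative
+names): read the CPL once, gate all in-kernel handling on CPL0, and
+reuse the same value when punting to the userspace VMM.
+
+  static int guest_cpl(void)                 { return 0; }
+  static int exit_to_userspace_vmm(int cpl)  { (void)cpl; return 0; }
+
+  static int handle_hypercall(void)
+  {
+          int cpl = guest_cpl();          /* fetched once, used twice */
+
+          if (cpl > 0)
+                  return exit_to_userspace_vmm(cpl);
+          /* ... in-kernel acceleration, CPL0 only ... */
+          return 0;
+  }
+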
+Fixes: 2fd6df2f2b47 ("KVM: x86/xen: intercept EVTCHNOP_send from guests")
+Reported-by: Michal Luczaj <mhal@rbox.co>
+Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
+Reviewed-by: Sean Christopherson <seanjc@google.com>
+Cc: stable@kernel.org
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/xen.c | 12 +++++++++++-
+ 1 file changed, 11 insertions(+), 1 deletion(-)
+
+--- a/arch/x86/kvm/xen.c
++++ b/arch/x86/kvm/xen.c
+@@ -1216,6 +1216,7 @@ int kvm_xen_hypercall(struct kvm_vcpu *v
+ bool longmode;
+ u64 input, params[6], r = -ENOSYS;
+ bool handled = false;
++ u8 cpl;
+
+ input = (u64)kvm_register_read(vcpu, VCPU_REGS_RAX);
+
+@@ -1243,9 +1244,17 @@ int kvm_xen_hypercall(struct kvm_vcpu *v
+ params[5] = (u64)kvm_r9_read(vcpu);
+ }
+ #endif
++ cpl = static_call(kvm_x86_get_cpl)(vcpu);
+ trace_kvm_xen_hypercall(input, params[0], params[1], params[2],
+ params[3], params[4], params[5]);
+
++ /*
++ * Only allow hypercall acceleration for CPL0. The rare hypercalls that
++ * are permitted in guest userspace can be handled by the VMM.
++ */
++ if (unlikely(cpl > 0))
++ goto handle_in_userspace;
++
+ switch (input) {
+ case __HYPERVISOR_xen_version:
+ if (params[0] == XENVER_version && vcpu->kvm->arch.xen.xen_version) {
+@@ -1280,10 +1289,11 @@ int kvm_xen_hypercall(struct kvm_vcpu *v
+ if (handled)
+ return kvm_xen_hypercall_set_result(vcpu, r);
+
++handle_in_userspace:
+ vcpu->run->exit_reason = KVM_EXIT_XEN;
+ vcpu->run->xen.type = KVM_EXIT_XEN_HCALL;
+ vcpu->run->xen.u.hcall.longmode = longmode;
+- vcpu->run->xen.u.hcall.cpl = static_call(kvm_x86_get_cpl)(vcpu);
++ vcpu->run->xen.u.hcall.cpl = cpl;
+ vcpu->run->xen.u.hcall.input = input;
+ vcpu->run->xen.u.hcall.params[0] = params[0];
+ vcpu->run->xen.u.hcall.params[1] = params[1];
--- /dev/null
+From 4ea9439fd537313f3381f0af4ebbf05e3f51a58c Mon Sep 17 00:00:00 2001
+From: David Woodhouse <dwmw@amazon.co.uk>
+Date: Sat, 12 Nov 2022 13:48:58 +0000
+Subject: KVM: x86/xen: Validate port number in SCHEDOP_poll
+
+From: David Woodhouse <dwmw@amazon.co.uk>
+
+commit 4ea9439fd537313f3381f0af4ebbf05e3f51a58c upstream.
+
+We shouldn't allow guests to poll on arbitrary port numbers off the end
+of the event channel table.
+
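+The added check is a plain bounds test on each guest-supplied port
+before it is used; a sketch (names illustrative):
+
+  #include <stdint.h>
+  #include <errno.h>
+
+  static int validate_ports(const uint32_t *ports, unsigned int nr,
+                            uint32_t max_port)
+  {
+          for (unsigned int i = 0; i < nr; i++) {
+                  if (ports[i] >= max_port)
+                          return -EINVAL; /* off the end of the table */
+          }
+          return 0;
+  }
+
+Here max_port plays the role of max_evtchn_port(), i.e. the event
+channel table size for the guest's ABI (64-bit vs. compat).
+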
+Fixes: 1a65105a5aba ("KVM: x86/xen: handle PV spinlocks slowpath")
+[dwmw2: my bug though; the original version did check the validity as a
+ side-effect of an idr_find() which I ripped out in refactoring.]
+Reported-by: Michal Luczaj <mhal@rbox.co>
+Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
+Reviewed-by: Sean Christopherson <seanjc@google.com>
+Cc: stable@kernel.org
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/xen.c | 20 ++++++++++++--------
+ 1 file changed, 12 insertions(+), 8 deletions(-)
+
+--- a/arch/x86/kvm/xen.c
++++ b/arch/x86/kvm/xen.c
+@@ -954,6 +954,14 @@ static int kvm_xen_hypercall_complete_us
+ return kvm_xen_hypercall_set_result(vcpu, run->xen.u.hcall.result);
+ }
+
++static inline int max_evtchn_port(struct kvm *kvm)
++{
++ if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode)
++ return EVTCHN_2L_NR_CHANNELS;
++ else
++ return COMPAT_EVTCHN_2L_NR_CHANNELS;
++}
++
+ static bool wait_pending_event(struct kvm_vcpu *vcpu, int nr_ports,
+ evtchn_port_t *ports)
+ {
+@@ -1042,6 +1050,10 @@ static bool kvm_xen_schedop_poll(struct
+ *r = -EFAULT;
+ goto out;
+ }
++ if (ports[i] >= max_evtchn_port(vcpu->kvm)) {
++ *r = -EINVAL;
++ goto out;
++ }
+ }
+
+ if (sched_poll.nr_ports == 1)
+@@ -1308,14 +1320,6 @@ handle_in_userspace:
+ return 0;
+ }
+
+-static inline int max_evtchn_port(struct kvm *kvm)
+-{
+- if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode)
+- return EVTCHN_2L_NR_CHANNELS;
+- else
+- return COMPAT_EVTCHN_2L_NR_CHANNELS;
+-}
+-
+ static void kvm_xen_check_poller(struct kvm_vcpu *vcpu, int port)
+ {
+ int poll_evtchn = vcpu->arch.xen.poll_evtchn;
--- /dev/null
+From 81a70c21d9170de67a45843bdd627f4cce9c4215 Mon Sep 17 00:00:00 2001
+From: "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com>
+Date: Fri, 18 Nov 2022 12:36:03 +0530
+Subject: mm/cgroup/reclaim: fix dirty pages throttling on cgroup v1
+
+From: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
+
+commit 81a70c21d9170de67a45843bdd627f4cce9c4215 upstream.
+
+balance_dirty_pages doesn't do the required dirty throttling on cgroupv1.
+See commit 9badce000e2c ("cgroup, writeback: don't enable cgroup writeback
+on traditional hierarchies"). Instead, the kernel depends on writeback
+throttling in shrink_folio_list to achieve the same goal. With large
+memory systems, the flusher may not be able to writeback quickly enough
+such that we will start finding pages in the shrink_folio_list already in
+writeback. Hence for cgroupv1 let's do a reclaim throttle after waking up
+the flusher.
+
+With this change, the test below, which used to fail on a 256GB system,
+runs until the file system is full.
+
+root@lp2:/sys/fs/cgroup/memory# mkdir test
+root@lp2:/sys/fs/cgroup/memory# cd test/
+root@lp2:/sys/fs/cgroup/memory/test# echo 120M > memory.limit_in_bytes
+root@lp2:/sys/fs/cgroup/memory/test# echo $$ > tasks
+root@lp2:/sys/fs/cgroup/memory/test# dd if=/dev/zero of=/home/kvaneesh/test bs=1M
+Killed
+
+Link: https://lkml.kernel.org/r/20221118070603.84081-1-aneesh.kumar@linux.ibm.com
+Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
+Suggested-by: Johannes Weiner <hannes@cmpxchg.org>
+Acked-by: Johannes Weiner <hannes@cmpxchg.org>
+Cc: Tejun Heo <tj@kernel.org>
+Cc: zefan li <lizefan.x@bytedance.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/vmscan.c | 14 +++++++++++++-
+ 1 file changed, 13 insertions(+), 1 deletion(-)
+
+--- a/mm/vmscan.c
++++ b/mm/vmscan.c
+@@ -2472,8 +2472,20 @@ shrink_inactive_list(unsigned long nr_to
+ * the flushers simply cannot keep up with the allocation
+ * rate. Nudge the flusher threads in case they are asleep.
+ */
+- if (stat.nr_unqueued_dirty == nr_taken)
++ if (stat.nr_unqueued_dirty == nr_taken) {
+ wakeup_flusher_threads(WB_REASON_VMSCAN);
++ /*
++ * For cgroupv1 dirty throttling is achieved by waking up
++ * the kernel flusher here and later waiting on folios
++ * which are in writeback to finish (see shrink_folio_list()).
++ *
++ * Flusher may not be able to issue writeback quickly
++ * enough for cgroupv1 writeback throttling to work
++ * on a large system.
++ */
++ if (!writeback_throttling_sane(sc))
++ reclaim_throttle(pgdat, VMSCAN_THROTTLE_WRITEBACK);
++ }
+
+ sc->nr.dirty += stat.nr_dirty;
+ sc->nr.congested += stat.nr_congested;
--- /dev/null
+From f53af4285d775cd9a9a146fc438bd0a1bee1838a Mon Sep 17 00:00:00 2001
+From: Johannes Weiner <hannes@cmpxchg.org>
+Date: Tue, 2 Aug 2022 12:28:11 -0400
+Subject: mm: vmscan: fix extreme overreclaim and swap floods
+
+From: Johannes Weiner <hannes@cmpxchg.org>
+
+commit f53af4285d775cd9a9a146fc438bd0a1bee1838a upstream.
+
+During proactive reclaim, we sometimes observe severe overreclaim, with
+several thousand times more pages reclaimed than requested.
+
+This trace was obtained from shrink_lruvec() during such an instance:
+
+ prio:0 anon_cost:1141521 file_cost:7767
+ nr_reclaimed:4387406 nr_to_reclaim:1047 (or_factor:4190)
+ nr=[7161123 345 578 1111]
+
+While the reclaimer requested 4M, vmscan reclaimed close to 16G, most of it
+by swapping. These requests take over a minute, during which the write()
+to memory.reclaim is unkillably stuck inside the kernel.
+
+Digging into the source, this is caused by the proportional reclaim
+bailout logic. This code tries to resolve a fundamental conflict: to
+reclaim roughly what was requested, while also aging all LRUs fairly and
+in accordance to their size, swappiness, refault rates etc. The way it
+attempts fairness is that once the reclaim goal has been reached, it stops
+scanning the LRUs with the smaller remaining scan targets, and adjusts the
+remainder of the bigger LRUs according to how much of the smaller LRUs was
+scanned. It then finishes scanning that remainder regardless of the
+reclaim goal.
+
+This works fine if priority levels are low and the LRU lists are
+comparable in size. However, in this instance, the cgroup that is
+targeted by proactive reclaim has almost no files left - they've already
+been squeezed out by proactive reclaim earlier - and the remaining anon
+pages are hot. Anon rotations cause the priority level to drop to 0,
+which results in reclaim targeting all of anon (a lot) and all of file
+(almost nothing). By the time reclaim decides to bail, it has scanned
+most or all of the file target, and therefore must also scan most or all of
+the enormous anon target. This target is thousands of times larger than
+the reclaim goal, thus causing the overreclaim.
+
+The bailout code hasn't changed in years; why is this failing now? The
+most likely explanations are two other recent changes in anon reclaim:
+
+1. Before the series starting with commit 5df741963d52 ("mm: fix LRU
+ balancing effect of new transparent huge pages"), the VM was
+ overall relatively reluctant to swap at all, even if swap was
+ configured. This means the LRU balancing code didn't come into play
+ as often as it does now, and mostly in high pressure situations
+ where pronounced swap activity wouldn't be as surprising.
+
+2. For historic reasons, shrink_lruvec() loops on the scan targets of
+ all LRU lists except the active anon one, meaning it would bail if
+ the only remaining pages to scan were active anon - even if there
+ were a lot of them.
+
+ Before the series starting with commit ccc5dc67340c ("mm/vmscan:
+ make active/inactive ratio as 1:1 for anon lru"), most anon pages
+ would live on the active LRU; the inactive one would contain only a
+ handful of preselected reclaim candidates. After the series, anon
+ gets aged similarly to file, and the inactive list is the default
+ for new anon pages as well, making it often the much bigger list.
+
+ As a result, the VM is now more likely to actually finish large
+ anon targets than before.
+
+Change the code such that only one SWAP_CLUSTER_MAX-sized nudge toward the
+larger LRU lists is made before bailing out on a met reclaim goal.
+
+This fixes the extreme overreclaim problem.
+
+Fairness is more subtle and harder to evaluate. No obvious misbehavior
+was observed on the test workload, in any case. Conceptually, fairness
+should primarily be a cumulative effect from regular, lower priority
+scans. Once the VM is in trouble and needs to escalate scan targets to
+make forward progress, fairness needs to take a backseat. This is also
+acknowledged by the myriad exceptions in get_scan_count(). This patch
+makes fairness decrease gradually, as it keeps fairness work static over
+increasing priority levels with growing scan targets. This should make
+more sense - although we may have to re-visit the exact values.
+
+Link: https://lkml.kernel.org/r/20220802162811.39216-1-hannes@cmpxchg.org
+Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
+Reviewed-by: Rik van Riel <riel@surriel.com>
+Acked-by: Mel Gorman <mgorman@techsingularity.net>
+Cc: Hugh Dickins <hughd@google.com>
+Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/vmscan.c | 10 ++++------
+ 1 file changed, 4 insertions(+), 6 deletions(-)
+
+--- a/mm/vmscan.c
++++ b/mm/vmscan.c
+@@ -2967,8 +2967,8 @@ static void shrink_lruvec(struct lruvec
+ enum lru_list lru;
+ unsigned long nr_reclaimed = 0;
+ unsigned long nr_to_reclaim = sc->nr_to_reclaim;
++ bool proportional_reclaim;
+ struct blk_plug plug;
+- bool scan_adjusted;
+
+ get_scan_count(lruvec, sc, nr);
+
+@@ -2986,8 +2986,8 @@ static void shrink_lruvec(struct lruvec
+ * abort proportional reclaim if either the file or anon lru has already
+ * dropped to zero at the first pass.
+ */
+- scan_adjusted = (!cgroup_reclaim(sc) && !current_is_kswapd() &&
+- sc->priority == DEF_PRIORITY);
++ proportional_reclaim = (!cgroup_reclaim(sc) && !current_is_kswapd() &&
++ sc->priority == DEF_PRIORITY);
+
+ blk_start_plug(&plug);
+ while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
+@@ -3007,7 +3007,7 @@ static void shrink_lruvec(struct lruvec
+
+ cond_resched();
+
+- if (nr_reclaimed < nr_to_reclaim || scan_adjusted)
++ if (nr_reclaimed < nr_to_reclaim || proportional_reclaim)
+ continue;
+
+ /*
+@@ -3058,8 +3058,6 @@ static void shrink_lruvec(struct lruvec
+ nr_scanned = targets[lru] - nr[lru];
+ nr[lru] = targets[lru] * (100 - percentage) / 100;
+ nr[lru] -= min(nr[lru], nr_scanned);
+-
+- scan_adjusted = true;
+ }
+ blk_finish_plug(&plug);
+ sc->nr_reclaimed += nr_reclaimed;
--- /dev/null
+From 512c5ca01a3610ab14ff6309db363de51f1c13a6 Mon Sep 17 00:00:00 2001
+From: Chen Zhongjin <chenzhongjin@huawei.com>
+Date: Fri, 18 Nov 2022 14:33:04 +0800
+Subject: nilfs2: fix nilfs_sufile_mark_dirty() not set segment usage as dirty
+
+From: Chen Zhongjin <chenzhongjin@huawei.com>
+
+commit 512c5ca01a3610ab14ff6309db363de51f1c13a6 upstream.
+
+When extending segments, nilfs_sufile_alloc() is called to get an
+unassigned segment, then mark it as dirty to avoid accidentally allocating
+the same segment in the future.
+
+But in some special cases, such as a corrupted image, this can be
+unreliable. If the dirty state of a segment is corrupted in this way,
+nilfs2 may reallocate a segment that is in use and pick the same segment
+for writing twice at the same time.
+
+This will cause the problem reported by syzkaller:
+https://syzkaller.appspot.com/bug?id=c7c4748e11ffcc367cef04f76e02e931833cbd24
+
+This case started with segbuf1.segnum = 3, nextnum = 4 when constructed,
+on the assumption that segment 4 had already been allocated and marked
+as dirty.
+
+However, the dirty state was corrupted and segment 4's usage was not
+dirty. The first time nilfs_segctor_extend_segments() ran, segment 4 was
+allocated again, leaving segbuf2 and the following segbuf3 with the same
+segment 4.
+
+sb_getblk() will return the same bh for segbuf2 and segbuf3, and this bh
+is added to the buffer lists of both segbufs. This breaks the lists and
+causes a NULL pointer dereference.
+
+Fix the problem by setting usage as dirty every time in
+nilfs_sufile_mark_dirty(), which is called during constructing current
+segment to be written out and before allocating next segment.
+
+[chenzhongjin@huawei.com: add lock protection per Ryusuke]
+ Link: https://lkml.kernel.org/r/20221121091141.214703-1-chenzhongjin@huawei.com
+Link: https://lkml.kernel.org/r/20221118063304.140187-1-chenzhongjin@huawei.com
+Fixes: 9ff05123e3bf ("nilfs2: segment constructor")
+Signed-off-by: Chen Zhongjin <chenzhongjin@huawei.com>
+Reported-by: <syzbot+77e4f0...@syzkaller.appspotmail.com>
+Reported-by: Liu Shixin <liushixin2@huawei.com>
+Acked-by: Ryusuke Konishi <konishi.ryusuke@gmail.com>
+Tested-by: Ryusuke Konishi <konishi.ryusuke@gmail.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/nilfs2/sufile.c | 8 ++++++++
+ 1 file changed, 8 insertions(+)
+
+--- a/fs/nilfs2/sufile.c
++++ b/fs/nilfs2/sufile.c
+@@ -495,14 +495,22 @@ void nilfs_sufile_do_free(struct inode *
+ int nilfs_sufile_mark_dirty(struct inode *sufile, __u64 segnum)
+ {
+ struct buffer_head *bh;
++ void *kaddr;
++ struct nilfs_segment_usage *su;
+ int ret;
+
++ down_write(&NILFS_MDT(sufile)->mi_sem);
+ ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0, &bh);
+ if (!ret) {
+ mark_buffer_dirty(bh);
+ nilfs_mdt_mark_dirty(sufile);
++ kaddr = kmap_atomic(bh->b_page);
++ su = nilfs_sufile_block_get_segment_usage(sufile, segnum, bh, kaddr);
++ nilfs_segment_usage_set_dirty(su);
++ kunmap_atomic(kaddr);
+ brelse(bh);
+ }
++ up_write(&NILFS_MDT(sufile)->mi_sem);
+ return ret;
+ }
+
usb-dwc3-gadget-conditionally-remove-requests.patch
usb-dwc3-gadget-return-eshutdown-on-ep-disable.patch
usb-dwc3-gadget-clear-ep-descriptor-last.patch
+io_uring-cmpxchg-for-poll-arm-refs-release.patch
+io_uring-make-poll-refs-more-robust.patch
+io_uring-clear-tif_notify_signal-if-set-and-task_work-not-available.patch
+nilfs2-fix-nilfs_sufile_mark_dirty-not-set-segment-usage-as-dirty.patch
+gcov-clang-fix-the-buffer-overflow-issue.patch
+mm-cgroup-reclaim-fix-dirty-pages-throttling-on-cgroup-v1.patch
+mm-vmscan-fix-extreme-overreclaim-and-swap-floods.patch
+fpga-m10bmc-sec-fix-kconfig-dependencies.patch
+kvm-x86-mmu-fix-race-condition-in-direct_page_fault.patch
+kvm-x86-xen-only-do-in-kernel-acceleration-of-hypercalls-for-guest-cpl0.patch
+kvm-x86-xen-validate-port-number-in-schedop_poll.patch
+drm-i915-gvt-get-reference-to-kvm-iff-attachment-to-vm-is-successful.patch
+kvm-x86-nsvm-leave-nested-mode-on-vcpu-free.patch
+kvm-x86-forcibly-leave-nested-mode-on-vcpu-reset.patch
+kvm-x86-nsvm-harden-svm_free_nested-against-freeing-vmcb02-while-still-in-use.patch
+kvm-x86-add-kvm_leave_nested.patch
+kvm-x86-remove-exit_int_info-warning-in-svm_handle_exit.patch
+kvm-update-gfn_to_pfn_cache-khva-when-it-moves-within-the-same-page.patch