5.15-stable patches
Author:     Greg Kroah-Hartman <gregkh@linuxfoundation.org>
AuthorDate: Wed, 30 Nov 2022 12:27:55 +0000 (13:27 +0100)
Commit:     Greg Kroah-Hartman <gregkh@linuxfoundation.org>
CommitDate: Wed, 30 Nov 2022 12:27:55 +0000 (13:27 +0100)
added patches:
gcov-clang-fix-the-buffer-overflow-issue.patch
kvm-x86-add-kvm_leave_nested.patch
kvm-x86-forcibly-leave-nested-mode-on-vcpu-reset.patch
kvm-x86-nsvm-harden-svm_free_nested-against-freeing-vmcb02-while-still-in-use.patch
kvm-x86-nsvm-leave-nested-mode-on-vcpu-free.patch
kvm-x86-remove-exit_int_info-warning-in-svm_handle_exit.patch
mm-vmscan-fix-extreme-overreclaim-and-swap-floods.patch
nilfs2-fix-nilfs_sufile_mark_dirty-not-set-segment-usage-as-dirty.patch

queue-5.15/gcov-clang-fix-the-buffer-overflow-issue.patch [new file with mode: 0644]
queue-5.15/kvm-x86-add-kvm_leave_nested.patch [new file with mode: 0644]
queue-5.15/kvm-x86-forcibly-leave-nested-mode-on-vcpu-reset.patch [new file with mode: 0644]
queue-5.15/kvm-x86-nsvm-harden-svm_free_nested-against-freeing-vmcb02-while-still-in-use.patch [new file with mode: 0644]
queue-5.15/kvm-x86-nsvm-leave-nested-mode-on-vcpu-free.patch [new file with mode: 0644]
queue-5.15/kvm-x86-remove-exit_int_info-warning-in-svm_handle_exit.patch [new file with mode: 0644]
queue-5.15/mm-vmscan-fix-extreme-overreclaim-and-swap-floods.patch [new file with mode: 0644]
queue-5.15/nilfs2-fix-nilfs_sufile_mark_dirty-not-set-segment-usage-as-dirty.patch [new file with mode: 0644]
queue-5.15/series

diff --git a/queue-5.15/gcov-clang-fix-the-buffer-overflow-issue.patch b/queue-5.15/gcov-clang-fix-the-buffer-overflow-issue.patch
new file mode 100644
index 0000000..55dfa6e
--- /dev/null
+++ b/queue-5.15/gcov-clang-fix-the-buffer-overflow-issue.patch
@@ -0,0 +1,88 @@
+From a6f810efabfd789d3bbafeacb4502958ec56c5ce Mon Sep 17 00:00:00 2001
+From: Mukesh Ojha <quic_mojha@quicinc.com>
+Date: Thu, 10 Nov 2022 00:31:37 +0530
+Subject: gcov: clang: fix the buffer overflow issue
+
+From: Mukesh Ojha <quic_mojha@quicinc.com>
+
+commit a6f810efabfd789d3bbafeacb4502958ec56c5ce upstream.
+
+Currently, in the clang version of the gcov code, when a module is being
+removed, gcov_info_add() incorrectly adds sfn_ptr->counter to all of the
+dst->functions, which results in the kernel panic shown in the crash
+report below.  Fix this by advancing the source function pointer in step
+with the destination.
+
+[    8.899094][  T599] Unable to handle kernel write to read-only memory at virtual address ffffff80461cc000
+[    8.899100][  T599] Mem abort info:
+[    8.899102][  T599]   ESR = 0x9600004f
+[    8.899103][  T599]   EC = 0x25: DABT (current EL), IL = 32 bits
+[    8.899105][  T599]   SET = 0, FnV = 0
+[    8.899107][  T599]   EA = 0, S1PTW = 0
+[    8.899108][  T599]   FSC = 0x0f: level 3 permission fault
+[    8.899110][  T599] Data abort info:
+[    8.899111][  T599]   ISV = 0, ISS = 0x0000004f
+[    8.899113][  T599]   CM = 0, WnR = 1
+[    8.899114][  T599] swapper pgtable: 4k pages, 39-bit VAs, pgdp=00000000ab8de000
+[    8.899116][  T599] [ffffff80461cc000] pgd=18000009ffcde003, p4d=18000009ffcde003, pud=18000009ffcde003, pmd=18000009ffcad003, pte=00600000c61cc787
+[    8.899124][  T599] Internal error: Oops: 9600004f [#1] PREEMPT SMP
+[    8.899265][  T599] Skip md ftrace buffer dump for: 0x1609e0
+....
+[    8.899544][  T599] CPU: 7 PID: 599 Comm: modprobe Tainted: G S         OE     5.15.41-android13-8-g38e9b1af6bce #1
+[    8.899547][  T599] Hardware name: XXX (DT)
+[    8.899549][  T599] pstate: 82400005 (Nzcv daif +PAN -UAO +TCO -DIT -SSBS BTYPE=--)
+[    8.899551][  T599] pc : gcov_info_add+0x9c/0xb8
+[    8.899557][  T599] lr : gcov_event+0x28c/0x6b8
+[    8.899559][  T599] sp : ffffffc00e733b00
+[    8.899560][  T599] x29: ffffffc00e733b00 x28: ffffffc00e733d30 x27: ffffffe8dc297470
+[    8.899563][  T599] x26: ffffffe8dc297000 x25: ffffffe8dc297000 x24: ffffffe8dc297000
+[    8.899566][  T599] x23: ffffffe8dc0a6200 x22: ffffff880f68bf20 x21: 0000000000000000
+[    8.899569][  T599] x20: ffffff880f68bf00 x19: ffffff8801babc00 x18: ffffffc00d7f9058
+[    8.899572][  T599] x17: 0000000000088793 x16: ffffff80461cbe00 x15: 9100052952800785
+[    8.899575][  T599] x14: 0000000000000200 x13: 0000000000000041 x12: 9100052952800785
+[    8.899577][  T599] x11: ffffffe8dc297000 x10: ffffffe8dc297000 x9 : ffffff80461cbc80
+[    8.899580][  T599] x8 : ffffff8801babe80 x7 : ffffffe8dc2ec000 x6 : ffffffe8dc2ed000
+[    8.899583][  T599] x5 : 000000008020001f x4 : fffffffe2006eae0 x3 : 000000008020001f
+[    8.899586][  T599] x2 : ffffff8027c49200 x1 : ffffff8801babc20 x0 : ffffff80461cb3a0
+[    8.899589][  T599] Call trace:
+[    8.899590][  T599]  gcov_info_add+0x9c/0xb8
+[    8.899592][  T599]  gcov_module_notifier+0xbc/0x120
+[    8.899595][  T599]  blocking_notifier_call_chain+0xa0/0x11c
+[    8.899598][  T599]  do_init_module+0x2a8/0x33c
+[    8.899600][  T599]  load_module+0x23cc/0x261c
+[    8.899602][  T599]  __arm64_sys_finit_module+0x158/0x194
+[    8.899604][  T599]  invoke_syscall+0x94/0x2bc
+[    8.899607][  T599]  el0_svc_common+0x1d8/0x34c
+[    8.899609][  T599]  do_el0_svc+0x40/0x54
+[    8.899611][  T599]  el0_svc+0x94/0x2f0
+[    8.899613][  T599]  el0t_64_sync_handler+0x88/0xec
+[    8.899615][  T599]  el0t_64_sync+0x1b4/0x1b8
+[    8.899618][  T599] Code: f905f56c f86e69ec f86e6a0f 8b0c01ec (f82e6a0c)
+[    8.899620][  T599] ---[ end trace ed5218e9e5b6e2e6 ]---
+
+Link: https://lkml.kernel.org/r/1668020497-13142-1-git-send-email-quic_mojha@quicinc.com
+Fixes: e178a5beb369 ("gcov: clang support")
+Signed-off-by: Mukesh Ojha <quic_mojha@quicinc.com>
+Reviewed-by: Peter Oberparleiter <oberpar@linux.ibm.com>
+Tested-by: Peter Oberparleiter <oberpar@linux.ibm.com>
+Cc: Nathan Chancellor <nathan@kernel.org>
+Cc: Nick Desaulniers <ndesaulniers@google.com>
+Cc: Tom Rix <trix@redhat.com>
+Cc: <stable@vger.kernel.org>   [5.2+]
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/gcov/clang.c |    2 ++
+ 1 file changed, 2 insertions(+)
+
+--- a/kernel/gcov/clang.c
++++ b/kernel/gcov/clang.c
+@@ -280,6 +280,8 @@ void gcov_info_add(struct gcov_info *dst
+               for (i = 0; i < sfn_ptr->num_counters; i++)
+                       dfn_ptr->counters[i] += sfn_ptr->counters[i];
++
++              sfn_ptr = list_next_entry(sfn_ptr, head);
+       }
+ }
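
The two added lines are easy to miss: before them, the loop advanced only
the destination function iterator, so every destination function
accumulated the first source function's counters, and the walk could run
past the end of the source data.  Below is a standalone userspace sketch
of the corrected lockstep iteration (not the kernel code: plain next
pointers stand in for the kernel's list_head, and all names and values
are illustrative):

#include <stdio.h>

struct gcov_fn_info {
	struct gcov_fn_info *next;	/* stand-in for the kernel's list_head */
	unsigned int num_counters;
	long counters[4];
};

/*
 * Fixed logic: advance src in lockstep with dst, as the patch now does
 * with sfn_ptr = list_next_entry(sfn_ptr, head).
 */
static void gcov_info_add_model(struct gcov_fn_info *dst,
				struct gcov_fn_info *src)
{
	unsigned int i;

	for (; dst && src; dst = dst->next, src = src->next)
		for (i = 0; i < src->num_counters; i++)
			dst->counters[i] += src->counters[i];
}

int main(void)
{
	struct gcov_fn_info s2 = { NULL, 2, { 5, 6 } };
	struct gcov_fn_info s1 = { &s2, 2, { 1, 2 } };
	struct gcov_fn_info d2 = { NULL, 2, { 0 } };
	struct gcov_fn_info d1 = { &d2, 2, { 0 } };

	gcov_info_add_model(&d1, &s1);
	printf("d1=[%ld %ld] d2=[%ld %ld]\n",
	       d1.counters[0], d1.counters[1],
	       d2.counters[0], d2.counters[1]);	/* d1=[1 2] d2=[5 6] */
	return 0;
}

With the lockstep advance, each destination picks up exactly its own
source's counters; dropping the src = src->next step reproduces the shape
of the bug, with every destination receiving the first source function's
counters.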
diff --git a/queue-5.15/kvm-x86-add-kvm_leave_nested.patch b/queue-5.15/kvm-x86-add-kvm_leave_nested.patch
new file mode 100644
index 0000000..c464ec3
--- /dev/null
+++ b/queue-5.15/kvm-x86-add-kvm_leave_nested.patch
@@ -0,0 +1,71 @@
+From f9697df251438b0798780900e8b43bdb12a56d64 Mon Sep 17 00:00:00 2001
+From: Maxim Levitsky <mlevitsk@redhat.com>
+Date: Thu, 3 Nov 2022 16:13:45 +0200
+Subject: KVM: x86: add kvm_leave_nested
+
+From: Maxim Levitsky <mlevitsk@redhat.com>
+
+commit f9697df251438b0798780900e8b43bdb12a56d64 upstream.
+
+Add kvm_leave_nested(), which wraps the call to
+nested_ops->leave_nested() in a function.
+
+Cc: stable@vger.kernel.org
+Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
+Message-Id: <20221103141351.50662-4-mlevitsk@redhat.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/svm/nested.c |    3 ---
+ arch/x86/kvm/vmx/nested.c |    3 ---
+ arch/x86/kvm/x86.c        |    8 +++++++-
+ 3 files changed, 7 insertions(+), 7 deletions(-)
+
+--- a/arch/x86/kvm/svm/nested.c
++++ b/arch/x86/kvm/svm/nested.c
+@@ -940,9 +940,6 @@ void svm_free_nested(struct vcpu_svm *sv
+       svm->nested.initialized = false;
+ }
+-/*
+- * Forcibly leave nested mode in order to be able to reset the VCPU later on.
+- */
+ void svm_leave_nested(struct kvm_vcpu *vcpu)
+ {
+       struct vcpu_svm *svm = to_svm(vcpu);
+--- a/arch/x86/kvm/vmx/nested.c
++++ b/arch/x86/kvm/vmx/nested.c
+@@ -6276,9 +6276,6 @@ out:
+       return kvm_state.size;
+ }
+-/*
+- * Forcibly leave nested mode in order to be able to reset the VCPU later on.
+- */
+ void vmx_leave_nested(struct kvm_vcpu *vcpu)
+ {
+       if (is_guest_mode(vcpu)) {
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -608,6 +608,12 @@ void kvm_deliver_exception_payload(struc
+ }
+ EXPORT_SYMBOL_GPL(kvm_deliver_exception_payload);
++/* Forcibly leave the nested mode in cases like a vCPU reset */
++static void kvm_leave_nested(struct kvm_vcpu *vcpu)
++{
++      kvm_x86_ops.nested_ops->leave_nested(vcpu);
++}
++
+ static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
+               unsigned nr, bool has_error, u32 error_code,
+               bool has_payload, unsigned long payload, bool reinject)
+@@ -4775,7 +4781,7 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_e
+       if (events->flags & KVM_VCPUEVENT_VALID_SMM) {
+               if (!!(vcpu->arch.hflags & HF_SMM_MASK) != events->smi.smm) {
+-                      kvm_x86_ops.nested_ops->leave_nested(vcpu);
++                      kvm_leave_nested(vcpu);
+                       kvm_smm_changed(vcpu, events->smi.smm);
+               }
diff --git a/queue-5.15/kvm-x86-forcibly-leave-nested-mode-on-vcpu-reset.patch b/queue-5.15/kvm-x86-forcibly-leave-nested-mode-on-vcpu-reset.patch
new file mode 100644
index 0000000..1c7c127
--- /dev/null
+++ b/queue-5.15/kvm-x86-forcibly-leave-nested-mode-on-vcpu-reset.patch
@@ -0,0 +1,57 @@
+From ed129ec9057f89d615ba0c81a4984a90345a1684 Mon Sep 17 00:00:00 2001
+From: Maxim Levitsky <mlevitsk@redhat.com>
+Date: Thu, 3 Nov 2022 16:13:46 +0200
+Subject: KVM: x86: forcibly leave nested mode on vCPU reset
+
+From: Maxim Levitsky <mlevitsk@redhat.com>
+
+commit ed129ec9057f89d615ba0c81a4984a90345a1684 upstream.
+
+While not obvious, kvm_vcpu_reset() leaves nested mode by clearing
+'vcpu->arch.hflags', but it does so without all the required housekeeping.
+
+On SVM, it is possible to have a vCPU reset while in guest mode because,
+unlike on VMX, INITs are not latched in SVM non-root mode, and in
+addition L1 doesn't have to intercept triple fault, which should also
+trigger L1's reset if it happens in L2 while L1 didn't intercept it.
+
+If one of the above conditions happens, KVM will continue to use vmcb02
+even though the vCPU is no longer in guest mode.
+
+Later, IA32_EFER will be cleared, which will lead to freeing of the
+nested guest state; this will (correctly) free the vmcb02, but since KVM
+still uses it (incorrectly), this will lead to a use-after-free and a
+kernel crash.
+
+This issue has been assigned CVE-2022-3344.
+
+Cc: stable@vger.kernel.org
+Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
+Message-Id: <20221103141351.50662-5-mlevitsk@redhat.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/x86.c |   10 ++++++++++
+ 1 file changed, 10 insertions(+)
+
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -11111,8 +11111,18 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcp
+       unsigned long new_cr0;
+       u32 eax, dummy;
++      /*
++       * SVM doesn't unconditionally VM-Exit on INIT and SHUTDOWN, thus it's
++       * possible to INIT the vCPU while L2 is active.  Force the vCPU back
++       * into L1 as EFER.SVME is cleared on INIT (along with all other EFER
++       * bits), i.e. virtualization is disabled.
++       */
++      if (is_guest_mode(vcpu))
++              kvm_leave_nested(vcpu);
++
+       kvm_lapic_reset(vcpu, init_event);
++      WARN_ON_ONCE(is_guest_mode(vcpu) || is_smm(vcpu));
+       vcpu->arch.hflags = 0;
+       vcpu->arch.smi_pending = 0;
diff --git a/queue-5.15/kvm-x86-nsvm-harden-svm_free_nested-against-freeing-vmcb02-while-still-in-use.patch b/queue-5.15/kvm-x86-nsvm-harden-svm_free_nested-against-freeing-vmcb02-while-still-in-use.patch
new file mode 100644
index 0000000..a8ba9e7
--- /dev/null
+++ b/queue-5.15/kvm-x86-nsvm-harden-svm_free_nested-against-freeing-vmcb02-while-still-in-use.patch
@@ -0,0 +1,36 @@
+From 16ae56d7e0528559bf8dc9070e3bfd8ba3de80df Mon Sep 17 00:00:00 2001
+From: Maxim Levitsky <mlevitsk@redhat.com>
+Date: Thu, 3 Nov 2022 16:13:44 +0200
+Subject: KVM: x86: nSVM: harden svm_free_nested against freeing vmcb02 while still in use
+
+From: Maxim Levitsky <mlevitsk@redhat.com>
+
+commit 16ae56d7e0528559bf8dc9070e3bfd8ba3de80df upstream.
+
+Make sure that KVM uses vmcb01 before freeing nested state, and warn if
+that is not the case.
+
+This is a minimal fix for CVE-2022-3344 making the kernel print a warning
+instead of a kernel panic.
+
+Cc: stable@vger.kernel.org
+Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
+Message-Id: <20221103141351.50662-3-mlevitsk@redhat.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/svm/nested.c |    3 +++
+ 1 file changed, 3 insertions(+)
+
+--- a/arch/x86/kvm/svm/nested.c
++++ b/arch/x86/kvm/svm/nested.c
+@@ -919,6 +919,9 @@ void svm_free_nested(struct vcpu_svm *sv
+       if (!svm->nested.initialized)
+               return;
++      if (WARN_ON_ONCE(svm->vmcb != svm->vmcb01.ptr))
++              svm_switch_vmcb(svm, &svm->vmcb01);
++
+       svm_vcpu_free_msrpm(svm->nested.msrpm);
+       svm->nested.msrpm = NULL;
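
Taken together with the vCPU-reset patch above, the failure mode behind
CVE-2022-3344 is a classic dangling pointer: the vCPU's current-VMCB
pointer keeps referencing vmcb02 after the nested state is freed.  A
minimal userspace model of the check added here (not KVM code; the struct
and function names are invented for illustration):

#include <stdio.h>
#include <stdlib.h>

struct vmcb { int dummy; };

struct vcpu_model {
	struct vmcb vmcb01;		/* L1 state, lives as long as the vCPU  */
	struct vmcb *vmcb02;		/* nested (L2) state, freed on teardown */
	struct vmcb *current_vmcb;	/* what the vCPU actually runs with     */
};

/* Analogue of the hardened svm_free_nested(): never free the block the
 * vCPU is still running with; switch back to the L1 block first. */
static void free_nested_model(struct vcpu_model *v)
{
	if (v->current_vmcb != &v->vmcb01) {
		fprintf(stderr, "warn: still on vmcb02, switching back\n");
		v->current_vmcb = &v->vmcb01;
	}
	free(v->vmcb02);
	v->vmcb02 = NULL;
}

int main(void)
{
	struct vcpu_model v = { .vmcb02 = malloc(sizeof(struct vmcb)) };

	v.current_vmcb = v.vmcb02;	/* reset arrives while L2 is active    */
	free_nested_model(&v);		/* without the check: dangling pointer */
	v.current_vmcb->dummy = 1;	/* safe: points at vmcb01 again        */
	printf("ok\n");
	return 0;
}

In the real code the check is a WARN_ON_ONCE() plus
svm_switch_vmcb(svm, &svm->vmcb01), turning a potential use-after-free
into a recoverable warning.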
diff --git a/queue-5.15/kvm-x86-nsvm-leave-nested-mode-on-vcpu-free.patch b/queue-5.15/kvm-x86-nsvm-leave-nested-mode-on-vcpu-free.patch
new file mode 100644
index 0000000..4fa4d35
--- /dev/null
+++ b/queue-5.15/kvm-x86-nsvm-leave-nested-mode-on-vcpu-free.patch
@@ -0,0 +1,33 @@
+From 917401f26a6af5756d89b550a8e1bd50cf42b07e Mon Sep 17 00:00:00 2001
+From: Maxim Levitsky <mlevitsk@redhat.com>
+Date: Thu, 3 Nov 2022 16:13:43 +0200
+Subject: KVM: x86: nSVM: leave nested mode on vCPU free
+
+From: Maxim Levitsky <mlevitsk@redhat.com>
+
+commit 917401f26a6af5756d89b550a8e1bd50cf42b07e upstream.
+
+If the VM is terminated while nested, we free the nested state while the
+vCPU is still in nested mode.
+
+Soon a warning will be added for this condition.
+
+Cc: stable@vger.kernel.org
+Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
+Message-Id: <20221103141351.50662-2-mlevitsk@redhat.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/svm/svm.c |    1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/arch/x86/kvm/svm/svm.c
++++ b/arch/x86/kvm/svm/svm.c
+@@ -1427,6 +1427,7 @@ static void svm_free_vcpu(struct kvm_vcp
+        */
+       svm_clear_current_vmcb(svm->vmcb);
++      svm_leave_nested(vcpu);
+       svm_free_nested(svm);
+       sev_free_vcpu(vcpu);
diff --git a/queue-5.15/kvm-x86-remove-exit_int_info-warning-in-svm_handle_exit.patch b/queue-5.15/kvm-x86-remove-exit_int_info-warning-in-svm_handle_exit.patch
new file mode 100644
index 0000000..d01192b
--- /dev/null
+++ b/queue-5.15/kvm-x86-remove-exit_int_info-warning-in-svm_handle_exit.patch
@@ -0,0 +1,58 @@
+From 05311ce954aebe75935d9ae7d38ac82b5b796e33 Mon Sep 17 00:00:00 2001
+From: Maxim Levitsky <mlevitsk@redhat.com>
+Date: Thu, 3 Nov 2022 16:13:51 +0200
+Subject: KVM: x86: remove exit_int_info warning in svm_handle_exit
+
+From: Maxim Levitsky <mlevitsk@redhat.com>
+
+commit 05311ce954aebe75935d9ae7d38ac82b5b796e33 upstream.
+
+It is valid to receive an external interrupt while having a broken IDT
+entry, which will lead to a #GP with exit_int_info containing the index
+of the IDT entry (i.e. any value).
+
+Other exceptions can happen as well, like #NP or #SS (if the stack
+switch fails).
+
+Thus this warning can be user-triggered and has very little value.
+
+Cc: stable@vger.kernel.org
+Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
+Message-Id: <20221103141351.50662-10-mlevitsk@redhat.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/svm/svm.c |   15 ---------------
+ 1 file changed, 15 deletions(-)
+
+--- a/arch/x86/kvm/svm/svm.c
++++ b/arch/x86/kvm/svm/svm.c
+@@ -317,12 +317,6 @@ int svm_set_efer(struct kvm_vcpu *vcpu,
+       return 0;
+ }
+-static int is_external_interrupt(u32 info)
+-{
+-      info &= SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID;
+-      return info == (SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR);
+-}
+-
+ static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu)
+ {
+       struct vcpu_svm *svm = to_svm(vcpu);
+@@ -3360,15 +3354,6 @@ static int handle_exit(struct kvm_vcpu *
+               return 0;
+       }
+-      if (is_external_interrupt(svm->vmcb->control.exit_int_info) &&
+-          exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR &&
+-          exit_code != SVM_EXIT_NPF && exit_code != SVM_EXIT_TASK_SWITCH &&
+-          exit_code != SVM_EXIT_INTR && exit_code != SVM_EXIT_NMI)
+-              printk(KERN_ERR "%s: unexpected exit_int_info 0x%x "
+-                     "exit_code 0x%x\n",
+-                     __func__, svm->vmcb->control.exit_int_info,
+-                     exit_code);
+-
+       if (exit_fastpath != EXIT_FASTPATH_NONE)
+               return 1;
diff --git a/queue-5.15/mm-vmscan-fix-extreme-overreclaim-and-swap-floods.patch b/queue-5.15/mm-vmscan-fix-extreme-overreclaim-and-swap-floods.patch
new file mode 100644
index 0000000..b7e9919
--- /dev/null
+++ b/queue-5.15/mm-vmscan-fix-extreme-overreclaim-and-swap-floods.patch
@@ -0,0 +1,137 @@
+From f53af4285d775cd9a9a146fc438bd0a1bee1838a Mon Sep 17 00:00:00 2001
+From: Johannes Weiner <hannes@cmpxchg.org>
+Date: Tue, 2 Aug 2022 12:28:11 -0400
+Subject: mm: vmscan: fix extreme overreclaim and swap floods
+
+From: Johannes Weiner <hannes@cmpxchg.org>
+
+commit f53af4285d775cd9a9a146fc438bd0a1bee1838a upstream.
+
+During proactive reclaim, we sometimes observe severe overreclaim, with
+several thousand times more pages reclaimed than requested.
+
+This trace was obtained from shrink_lruvec() during such an instance:
+
+    prio:0 anon_cost:1141521 file_cost:7767
+    nr_reclaimed:4387406 nr_to_reclaim:1047 (or_factor:4190)
+    nr=[7161123 345 578 1111]
+
+While the reclaimer requested 4M, vmscan reclaimed close to 16G, most of it
+by swapping.  These requests take over a minute, during which the write()
+to memory.reclaim is unkillably stuck inside the kernel.
+
+Digging into the source, this is caused by the proportional reclaim
+bailout logic.  This code tries to resolve a fundamental conflict: to
+reclaim roughly what was requested, while also aging all LRUs fairly and
+in accordance to their size, swappiness, refault rates etc.  The way it
+attempts fairness is that once the reclaim goal has been reached, it stops
+scanning the LRUs with the smaller remaining scan targets, and adjusts the
+remainder of the bigger LRUs according to how much of the smaller LRUs was
+scanned.  It then finishes scanning that remainder regardless of the
+reclaim goal.
+
+This works fine if priority levels are low and the LRU lists are
+comparable in size.  However, in this instance, the cgroup that is
+targeted by proactive reclaim has almost no files left - they've already
+been squeezed out by proactive reclaim earlier - and the remaining anon
+pages are hot.  Anon rotations cause the priority level to drop to 0,
+which results in reclaim targeting all of anon (a lot) and all of file
+(almost nothing).  By the time reclaim decides to bail, it has scanned
+most or all of the file target, and therefore must also scan most or all of
+the enormous anon target.  This target is thousands of times larger than
+the reclaim goal, thus causing the overreclaim.
+
+The bailout code hasn't changed in years, why is this failing now?  The
+most likely explanations are two other recent changes in anon reclaim:
+
+1. Before the series starting with commit 5df741963d52 ("mm: fix LRU
+   balancing effect of new transparent huge pages"), the VM was
+   overall relatively reluctant to swap at all, even if swap was
+   configured. This means the LRU balancing code didn't come into play
+   as often as it does now, and mostly in high pressure situations
+   where pronounced swap activity wouldn't be as surprising.
+
+2. For historic reasons, shrink_lruvec() loops on the scan targets of
+   all LRU lists except the active anon one, meaning it would bail if
+   the only remaining pages to scan were active anon - even if there
+   were a lot of them.
+
+   Before the series starting with commit ccc5dc67340c ("mm/vmscan:
+   make active/inactive ratio as 1:1 for anon lru"), most anon pages
+   would live on the active LRU; the inactive one would contain only a
+   handful of preselected reclaim candidates. After the series, anon
+   gets aged similarly to file, and the inactive list is the default
+   for new anon pages as well, making it often the much bigger list.
+
+   As a result, the VM is now more likely to actually finish large
+   anon targets than before.
+
+Change the code such that only one SWAP_CLUSTER_MAX-sized nudge toward the
+larger LRU lists is made before bailing out on a met reclaim goal.
+
+This fixes the extreme overreclaim problem.
+
+Fairness is more subtle and harder to evaluate.  No obvious misbehavior
+was observed on the test workload, in any case.  Conceptually, fairness
+should primarily be a cumulative effect from regular, lower priority
+scans.  Once the VM is in trouble and needs to escalate scan targets to
+make forward progress, fairness needs to take a backseat.  This is also
+acknowledged by the myriad exceptions in get_scan_count().  This patch
+makes fairness decrease gradually, as it keeps fairness work static over
+increasing priority levels with growing scan targets.  This should make
+more sense - although we may have to re-visit the exact values.
+
+Link: https://lkml.kernel.org/r/20220802162811.39216-1-hannes@cmpxchg.org
+Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
+Reviewed-by: Rik van Riel <riel@surriel.com>
+Acked-by: Mel Gorman <mgorman@techsingularity.net>
+Cc: Hugh Dickins <hughd@google.com>
+Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/vmscan.c |   10 ++++------
+ 1 file changed, 4 insertions(+), 6 deletions(-)
+
+--- a/mm/vmscan.c
++++ b/mm/vmscan.c
+@@ -2791,8 +2791,8 @@ static void shrink_lruvec(struct lruvec
+       enum lru_list lru;
+       unsigned long nr_reclaimed = 0;
+       unsigned long nr_to_reclaim = sc->nr_to_reclaim;
++      bool proportional_reclaim;
+       struct blk_plug plug;
+-      bool scan_adjusted;
+       get_scan_count(lruvec, sc, nr);
+@@ -2810,8 +2810,8 @@ static void shrink_lruvec(struct lruvec
+        * abort proportional reclaim if either the file or anon lru has already
+        * dropped to zero at the first pass.
+        */
+-      scan_adjusted = (!cgroup_reclaim(sc) && !current_is_kswapd() &&
+-                       sc->priority == DEF_PRIORITY);
++      proportional_reclaim = (!cgroup_reclaim(sc) && !current_is_kswapd() &&
++                              sc->priority == DEF_PRIORITY);
+       blk_start_plug(&plug);
+       while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
+@@ -2831,7 +2831,7 @@ static void shrink_lruvec(struct lruvec
+               cond_resched();
+-              if (nr_reclaimed < nr_to_reclaim || scan_adjusted)
++              if (nr_reclaimed < nr_to_reclaim || proportional_reclaim)
+                       continue;
+               /*
+@@ -2882,8 +2882,6 @@ static void shrink_lruvec(struct lruvec
+               nr_scanned = targets[lru] - nr[lru];
+               nr[lru] = targets[lru] * (100 - percentage) / 100;
+               nr[lru] -= min(nr[lru], nr_scanned);
+-
+-              scan_adjusted = true;
+       }
+       blk_finish_plug(&plug);
+       sc->nr_reclaimed += nr_reclaimed;
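
The behaviour difference is easy to reproduce with a toy model of the
scan loop (userspace C, not the kernel code; it pretends everything
scanned is reclaimed and collapses the proportional rescaling into a
plain bail-out, but it keeps the shape of the old latched scan_adjusted
logic against the new one):

#include <stdbool.h>
#include <stdio.h>

#define SWAP_CLUSTER_MAX 32UL

/* nr[0] is the huge anon target, nr[1] the tiny file target (numbers
 * taken from the trace above). */
static unsigned long shrink_model(unsigned long nr[2], unsigned long goal,
				  bool old_behaviour)
{
	unsigned long reclaimed = 0;
	bool adjusted = false;
	int i;

	while (nr[0] || nr[1]) {
		for (i = 0; i < 2; i++) {
			unsigned long n = nr[i] < SWAP_CLUSTER_MAX ?
					  nr[i] : SWAP_CLUSTER_MAX;
			nr[i] -= n;
			reclaimed += n;
		}
		if (reclaimed < goal || (old_behaviour && adjusted))
			continue;
		if (!old_behaviour)
			break;	/* new: goal met, stop */
		/* old: adjust once, then the latched flag keeps the loop
		 * scanning the huge remainder to the very end */
		nr[1] = 0;
		adjusted = true;
	}
	return reclaimed;
}

int main(void)
{
	unsigned long a[2] = { 7161123, 345 };
	unsigned long b[2] = { 7161123, 345 };

	printf("old: %lu  new: %lu  (goal 1047)\n",
	       shrink_model(a, 1047, true), shrink_model(b, 1047, false));
	return 0;
}

With the targets from the trace, the old latched variant finishes the
~7.1M anon target long after the goal of 1047 is met, while the new
variant stops within one scan chunk of the goal.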
diff --git a/queue-5.15/nilfs2-fix-nilfs_sufile_mark_dirty-not-set-segment-usage-as-dirty.patch b/queue-5.15/nilfs2-fix-nilfs_sufile_mark_dirty-not-set-segment-usage-as-dirty.patch
new file mode 100644
index 0000000..df1c045
--- /dev/null
+++ b/queue-5.15/nilfs2-fix-nilfs_sufile_mark_dirty-not-set-segment-usage-as-dirty.patch
@@ -0,0 +1,77 @@
+From 512c5ca01a3610ab14ff6309db363de51f1c13a6 Mon Sep 17 00:00:00 2001
+From: Chen Zhongjin <chenzhongjin@huawei.com>
+Date: Fri, 18 Nov 2022 14:33:04 +0800
+Subject: nilfs2: fix nilfs_sufile_mark_dirty() not set segment usage as dirty
+
+From: Chen Zhongjin <chenzhongjin@huawei.com>
+
+commit 512c5ca01a3610ab14ff6309db363de51f1c13a6 upstream.
+
+When extending segments, nilfs_sufile_alloc() is called to get an
+unassigned segment, which is then marked as dirty to avoid accidentally
+allocating the same segment again in the future.
+
+But in some special cases, such as with a corrupted image, this can be
+unreliable.  If the dirty state of a segment is corrupted, nilfs2 may
+reallocate a segment that is in use and pick the same segment for
+writing twice at the same time.
+
+This will cause the problem reported by syzkaller:
+https://syzkaller.appspot.com/bug?id=c7c4748e11ffcc367cef04f76e02e931833cbd24
+
+This case started with segbuf1.segnum = 3, nextnum = 4 when constructed.
+It assumed segment 4 had already been allocated and marked as dirty.
+
+However, the dirty state was corrupted and segment 4's usage was not
+dirty.  The first time nilfs_segctor_extend_segments() ran, segment 4
+was allocated again, which left segbuf2 and the following segbuf3 with
+the same segment 4.
+
+sb_getblk() will return the same bh for segbuf2 and segbuf3, and this bh
+is added to the buffer lists of both segbufs.  This breaks the lists and
+causes a NULL pointer dereference.
+
+Fix the problem by setting the usage as dirty every time in
+nilfs_sufile_mark_dirty(), which is called while constructing the
+current segment to be written out and before allocating the next
+segment.
+
+[chenzhongjin@huawei.com: add lock protection per Ryusuke]
+  Link: https://lkml.kernel.org/r/20221121091141.214703-1-chenzhongjin@huawei.com
+Link: https://lkml.kernel.org/r/20221118063304.140187-1-chenzhongjin@huawei.com
+Fixes: 9ff05123e3bf ("nilfs2: segment constructor")
+Signed-off-by: Chen Zhongjin <chenzhongjin@huawei.com>
+Reported-by: <syzbot+77e4f0...@syzkaller.appspotmail.com>
+Reported-by: Liu Shixin <liushixin2@huawei.com>
+Acked-by: Ryusuke Konishi <konishi.ryusuke@gmail.com>
+Tested-by: Ryusuke Konishi <konishi.ryusuke@gmail.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/nilfs2/sufile.c |    8 ++++++++
+ 1 file changed, 8 insertions(+)
+
+--- a/fs/nilfs2/sufile.c
++++ b/fs/nilfs2/sufile.c
+@@ -495,14 +495,22 @@ void nilfs_sufile_do_free(struct inode *
+ int nilfs_sufile_mark_dirty(struct inode *sufile, __u64 segnum)
+ {
+       struct buffer_head *bh;
++      void *kaddr;
++      struct nilfs_segment_usage *su;
+       int ret;
++      down_write(&NILFS_MDT(sufile)->mi_sem);
+       ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0, &bh);
+       if (!ret) {
+               mark_buffer_dirty(bh);
+               nilfs_mdt_mark_dirty(sufile);
++              kaddr = kmap_atomic(bh->b_page);
++              su = nilfs_sufile_block_get_segment_usage(sufile, segnum, bh, kaddr);
++              nilfs_segment_usage_set_dirty(su);
++              kunmap_atomic(kaddr);
+               brelse(bh);
+       }
++      up_write(&NILFS_MDT(sufile)->mi_sem);
+       return ret;
+ }
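
The underlying invariant is simple: the allocator must never hand out a
segment whose usage is not marked dirty while the segment is still in
use.  A toy userspace model of the double allocation this patch prevents
(not nilfs2 code; the bool array stands in for the on-disk segment-usage
entries, and all names are illustrative):

#include <stdbool.h>
#include <stdio.h>

#define NSEGS 8

static bool usage_dirty[NSEGS];	/* stand-in for on-disk segment usage */

/* The allocator hands out the first segment whose usage is clean. */
static int alloc_segment(void)
{
	int i;

	for (i = 0; i < NSEGS; i++) {
		if (!usage_dirty[i]) {
			usage_dirty[i] = true;	/* reserve it */
			return i;
		}
	}
	return -1;
}

/* Fixed nilfs_sufile_mark_dirty() behaviour: re-assert the usage bit
 * unconditionally every time the segment is marked dirty for writing. */
static void mark_dirty_model(int segnum)
{
	usage_dirty[segnum] = true;
}

int main(void)
{
	int s1 = alloc_segment();	/* segment 0 */

	usage_dirty[s1] = false;	/* simulate on-disk corruption */
	mark_dirty_model(s1);		/* fix: re-assert before the next alloc */
	printf("s1=%d s2=%d\n", s1, alloc_segment());	/* s1=0 s2=1 */
	return 0;
}

Without the mark_dirty_model() re-assertion, the second alloc_segment()
would return segment 0 again — the double allocation that broke the
segbuf buffer lists.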
diff --git a/queue-5.15/series b/queue-5.15/series
index 1743fb3a107ab185a7b9f5563f9c9412aadaff2c..182c7a957dce4d8a2cfeceb5eae33f535b1096da 100644
--- a/queue-5.15/series
+++ b/queue-5.15/series
@@ -150,3 +150,11 @@ bus-ixp4xx-don-t-touch-bit-7-on-ixp42x.patch
 usb-dwc3-gadget-conditionally-remove-requests.patch
 usb-dwc3-gadget-return-eshutdown-on-ep-disable.patch
 usb-dwc3-gadget-clear-ep-descriptor-last.patch
+nilfs2-fix-nilfs_sufile_mark_dirty-not-set-segment-usage-as-dirty.patch
+gcov-clang-fix-the-buffer-overflow-issue.patch
+mm-vmscan-fix-extreme-overreclaim-and-swap-floods.patch
+kvm-x86-nsvm-leave-nested-mode-on-vcpu-free.patch
+kvm-x86-forcibly-leave-nested-mode-on-vcpu-reset.patch
+kvm-x86-nsvm-harden-svm_free_nested-against-freeing-vmcb02-while-still-in-use.patch
+kvm-x86-add-kvm_leave_nested.patch
+kvm-x86-remove-exit_int_info-warning-in-svm_handle_exit.patch