From e8ea4aafefaa509443497459267ad7d7e073e861 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman
Date: Wed, 30 Nov 2022 13:27:55 +0100
Subject: [PATCH] 5.15-stable patches

added patches:
        gcov-clang-fix-the-buffer-overflow-issue.patch
        kvm-x86-add-kvm_leave_nested.patch
        kvm-x86-forcibly-leave-nested-mode-on-vcpu-reset.patch
        kvm-x86-nsvm-harden-svm_free_nested-against-freeing-vmcb02-while-still-in-use.patch
        kvm-x86-nsvm-leave-nested-mode-on-vcpu-free.patch
        kvm-x86-remove-exit_int_info-warning-in-svm_handle_exit.patch
        mm-vmscan-fix-extreme-overreclaim-and-swap-floods.patch
        nilfs2-fix-nilfs_sufile_mark_dirty-not-set-segment-usage-as-dirty.patch
---
 ...-clang-fix-the-buffer-overflow-issue.patch |  88 +++++++++++
 queue-5.15/kvm-x86-add-kvm_leave_nested.patch |  71 +++++++++
 ...ibly-leave-nested-mode-on-vcpu-reset.patch |  57 ++++++++
 ...st-freeing-vmcb02-while-still-in-use.patch |  36 +++++
 ...-nsvm-leave-nested-mode-on-vcpu-free.patch |  33 +++++
 ..._int_info-warning-in-svm_handle_exit.patch |  58 ++++++++
 ...-extreme-overreclaim-and-swap-floods.patch | 137 ++++++++++++++++++
 ...dirty-not-set-segment-usage-as-dirty.patch |  77 ++++++++++
 queue-5.15/series                             |   8 +
 9 files changed, 565 insertions(+)
 create mode 100644 queue-5.15/gcov-clang-fix-the-buffer-overflow-issue.patch
 create mode 100644 queue-5.15/kvm-x86-add-kvm_leave_nested.patch
 create mode 100644 queue-5.15/kvm-x86-forcibly-leave-nested-mode-on-vcpu-reset.patch
 create mode 100644 queue-5.15/kvm-x86-nsvm-harden-svm_free_nested-against-freeing-vmcb02-while-still-in-use.patch
 create mode 100644 queue-5.15/kvm-x86-nsvm-leave-nested-mode-on-vcpu-free.patch
 create mode 100644 queue-5.15/kvm-x86-remove-exit_int_info-warning-in-svm_handle_exit.patch
 create mode 100644 queue-5.15/mm-vmscan-fix-extreme-overreclaim-and-swap-floods.patch
 create mode 100644 queue-5.15/nilfs2-fix-nilfs_sufile_mark_dirty-not-set-segment-usage-as-dirty.patch

diff --git a/queue-5.15/gcov-clang-fix-the-buffer-overflow-issue.patch b/queue-5.15/gcov-clang-fix-the-buffer-overflow-issue.patch
new file mode 100644
index 00000000000..55dfa6e508c
--- /dev/null
+++ b/queue-5.15/gcov-clang-fix-the-buffer-overflow-issue.patch
@@ -0,0 +1,88 @@
+From a6f810efabfd789d3bbafeacb4502958ec56c5ce Mon Sep 17 00:00:00 2001
+From: Mukesh Ojha
+Date: Thu, 10 Nov 2022 00:31:37 +0530
+Subject: gcov: clang: fix the buffer overflow issue
+
+From: Mukesh Ojha
+
+commit a6f810efabfd789d3bbafeacb4502958ec56c5ce upstream.
+
+Currently, in the clang version of the gcov code, when a module is being
+removed, gcov_info_add() incorrectly adds sfn_ptr->counter to all of the
+dst->functions, which results in the kernel panic shown in the crash
+report below.  Fix this by properly handling it.
+
+[ 8.899094][ T599] Unable to handle kernel write to read-only memory at virtual address ffffff80461cc000
+[ 8.899100][ T599] Mem abort info:
+[ 8.899102][ T599] ESR = 0x9600004f
+[ 8.899103][ T599] EC = 0x25: DABT (current EL), IL = 32 bits
+[ 8.899105][ T599] SET = 0, FnV = 0
+[ 8.899107][ T599] EA = 0, S1PTW = 0
+[ 8.899108][ T599] FSC = 0x0f: level 3 permission fault
+[ 8.899110][ T599] Data abort info:
+[ 8.899111][ T599] ISV = 0, ISS = 0x0000004f
+[ 8.899113][ T599] CM = 0, WnR = 1
+[ 8.899114][ T599] swapper pgtable: 4k pages, 39-bit VAs, pgdp=00000000ab8de000
+[ 8.899116][ T599] [ffffff80461cc000] pgd=18000009ffcde003, p4d=18000009ffcde003, pud=18000009ffcde003, pmd=18000009ffcad003, pte=00600000c61cc787
+[ 8.899124][ T599] Internal error: Oops: 9600004f [#1] PREEMPT SMP
+[ 8.899265][ T599] Skip md ftrace buffer dump for: 0x1609e0
+....
+..,
+[ 8.899544][ T599] CPU: 7 PID: 599 Comm: modprobe Tainted: G S OE 5.15.41-android13-8-g38e9b1af6bce #1
+[ 8.899547][ T599] Hardware name: XXX (DT)
+[ 8.899549][ T599] pstate: 82400005 (Nzcv daif +PAN -UAO +TCO -DIT -SSBS BTYPE=--)
+[ 8.899551][ T599] pc : gcov_info_add+0x9c/0xb8
+[ 8.899557][ T599] lr : gcov_event+0x28c/0x6b8
+[ 8.899559][ T599] sp : ffffffc00e733b00
+[ 8.899560][ T599] x29: ffffffc00e733b00 x28: ffffffc00e733d30 x27: ffffffe8dc297470
+[ 8.899563][ T599] x26: ffffffe8dc297000 x25: ffffffe8dc297000 x24: ffffffe8dc297000
+[ 8.899566][ T599] x23: ffffffe8dc0a6200 x22: ffffff880f68bf20 x21: 0000000000000000
+[ 8.899569][ T599] x20: ffffff880f68bf00 x19: ffffff8801babc00 x18: ffffffc00d7f9058
+[ 8.899572][ T599] x17: 0000000000088793 x16: ffffff80461cbe00 x15: 9100052952800785
+[ 8.899575][ T599] x14: 0000000000000200 x13: 0000000000000041 x12: 9100052952800785
+[ 8.899577][ T599] x11: ffffffe8dc297000 x10: ffffffe8dc297000 x9 : ffffff80461cbc80
+[ 8.899580][ T599] x8 : ffffff8801babe80 x7 : ffffffe8dc2ec000 x6 : ffffffe8dc2ed000
+[ 8.899583][ T599] x5 : 000000008020001f x4 : fffffffe2006eae0 x3 : 000000008020001f
+[ 8.899586][ T599] x2 : ffffff8027c49200 x1 : ffffff8801babc20 x0 : ffffff80461cb3a0
+[ 8.899589][ T599] Call trace:
+[ 8.899590][ T599] gcov_info_add+0x9c/0xb8
+[ 8.899592][ T599] gcov_module_notifier+0xbc/0x120
+[ 8.899595][ T599] blocking_notifier_call_chain+0xa0/0x11c
+[ 8.899598][ T599] do_init_module+0x2a8/0x33c
+[ 8.899600][ T599] load_module+0x23cc/0x261c
+[ 8.899602][ T599] __arm64_sys_finit_module+0x158/0x194
+[ 8.899604][ T599] invoke_syscall+0x94/0x2bc
+[ 8.899607][ T599] el0_svc_common+0x1d8/0x34c
+[ 8.899609][ T599] do_el0_svc+0x40/0x54
+[ 8.899611][ T599] el0_svc+0x94/0x2f0
+[ 8.899613][ T599] el0t_64_sync_handler+0x88/0xec
+[ 8.899615][ T599] el0t_64_sync+0x1b4/0x1b8
+[ 8.899618][ T599] Code: f905f56c f86e69ec f86e6a0f 8b0c01ec (f82e6a0c)
+[ 8.899620][ T599] ---[ end trace ed5218e9e5b6e2e6 ]---
+
+Link: https://lkml.kernel.org/r/1668020497-13142-1-git-send-email-quic_mojha@quicinc.com
+Fixes: e178a5beb369 ("gcov: clang support")
+Signed-off-by: Mukesh Ojha
+Reviewed-by: Peter Oberparleiter
+Tested-by: Peter Oberparleiter
+Cc: Nathan Chancellor
+Cc: Nick Desaulniers
+Cc: Tom Rix
+Cc: [5.2+]
+Signed-off-by: Andrew Morton
+Signed-off-by: Greg Kroah-Hartman
+---
+ kernel/gcov/clang.c | 2 ++
+ 1 file changed, 2 insertions(+)
+
+--- a/kernel/gcov/clang.c
++++ b/kernel/gcov/clang.c
+@@ -280,6 +280,8 @@ void gcov_info_add(struct gcov_info *dst
+ 
+ 		for (i = 0; i < sfn_ptr->num_counters; i++)
+ 			dfn_ptr->counters[i] += sfn_ptr->counters[i];
++
++		sfn_ptr = list_next_entry(sfn_ptr, head);
+ 	}
+ }
+
diff --git a/queue-5.15/kvm-x86-add-kvm_leave_nested.patch b/queue-5.15/kvm-x86-add-kvm_leave_nested.patch
new file mode 100644
index 00000000000..c464ec380dc
--- /dev/null
+++ b/queue-5.15/kvm-x86-add-kvm_leave_nested.patch
@@ -0,0 +1,71 @@
+From f9697df251438b0798780900e8b43bdb12a56d64 Mon Sep 17 00:00:00 2001
+From: Maxim Levitsky
+Date: Thu, 3 Nov 2022 16:13:45 +0200
+Subject: KVM: x86: add kvm_leave_nested
+
+From: Maxim Levitsky
+
+commit f9697df251438b0798780900e8b43bdb12a56d64 upstream.
+
+add kvm_leave_nested which wraps a call to nested_ops->leave_nested
+into a function.
+
+Cc: stable@vger.kernel.org
+Signed-off-by: Maxim Levitsky
+Message-Id: <20221103141351.50662-4-mlevitsk@redhat.com>
+Signed-off-by: Paolo Bonzini
+Signed-off-by: Greg Kroah-Hartman
+---
+ arch/x86/kvm/svm/nested.c | 3 ---
+ arch/x86/kvm/vmx/nested.c | 3 ---
+ arch/x86/kvm/x86.c        | 8 +++++++-
+ 3 files changed, 7 insertions(+), 7 deletions(-)
+
+--- a/arch/x86/kvm/svm/nested.c
++++ b/arch/x86/kvm/svm/nested.c
+@@ -940,9 +940,6 @@ void svm_free_nested(struct vcpu_svm *sv
+ 	svm->nested.initialized = false;
+ }
+ 
+-/*
+- * Forcibly leave nested mode in order to be able to reset the VCPU later on.
+- */
+ void svm_leave_nested(struct kvm_vcpu *vcpu)
+ {
+ 	struct vcpu_svm *svm = to_svm(vcpu);
+--- a/arch/x86/kvm/vmx/nested.c
++++ b/arch/x86/kvm/vmx/nested.c
+@@ -6276,9 +6276,6 @@ out:
+ 	return kvm_state.size;
+ }
+ 
+-/*
+- * Forcibly leave nested mode in order to be able to reset the VCPU later on.
+- */
+ void vmx_leave_nested(struct kvm_vcpu *vcpu)
+ {
+ 	if (is_guest_mode(vcpu)) {
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -608,6 +608,12 @@ void kvm_deliver_exception_payload(struc
+ }
+ EXPORT_SYMBOL_GPL(kvm_deliver_exception_payload);
+ 
++/* Forcibly leave the nested mode in cases like a vCPU reset */
++static void kvm_leave_nested(struct kvm_vcpu *vcpu)
++{
++	kvm_x86_ops.nested_ops->leave_nested(vcpu);
++}
++
+ static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
+ 				   unsigned nr, bool has_error, u32 error_code,
+ 				   bool has_payload, unsigned long payload, bool reinject)
+@@ -4775,7 +4781,7 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_e
+ 
+ 	if (events->flags & KVM_VCPUEVENT_VALID_SMM) {
+ 		if (!!(vcpu->arch.hflags & HF_SMM_MASK) != events->smi.smm) {
+-			kvm_x86_ops.nested_ops->leave_nested(vcpu);
++			kvm_leave_nested(vcpu);
+ 			kvm_smm_changed(vcpu, events->smi.smm);
+ 		}
+ 
diff --git a/queue-5.15/kvm-x86-forcibly-leave-nested-mode-on-vcpu-reset.patch b/queue-5.15/kvm-x86-forcibly-leave-nested-mode-on-vcpu-reset.patch
new file mode 100644
index 00000000000..1c7c1271f43
--- /dev/null
+++ b/queue-5.15/kvm-x86-forcibly-leave-nested-mode-on-vcpu-reset.patch
@@ -0,0 +1,57 @@
+From ed129ec9057f89d615ba0c81a4984a90345a1684 Mon Sep 17 00:00:00 2001
+From: Maxim Levitsky
+Date: Thu, 3 Nov 2022 16:13:46 +0200
+Subject: KVM: x86: forcibly leave nested mode on vCPU reset
+
+From: Maxim Levitsky
+
+commit ed129ec9057f89d615ba0c81a4984a90345a1684 upstream.
+
+While not obvious, kvm_vcpu_reset() leaves nested mode by clearing
+'vcpu->arch.hflags', but it does so without all the required
+housekeeping.
+
+On SVM, it is possible to have a vCPU reset while in guest mode because,
+unlike VMX, on SVM INITs are not latched in non-root mode. In addition,
+L1 doesn't have to intercept triple fault, which should also trigger
+L1's reset if it happens in L2 while L1 didn't intercept it.
+
+If one of the above conditions happens, KVM will continue to use vmcb02
+while not actually being in guest mode.
+
+Later, IA32_EFER will be cleared, which will lead to freeing of the
+nested guest state; this will (correctly) free the vmcb02, but since
+KVM still (incorrectly) uses it, the result is a use-after-free and a
+kernel crash.
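+
+In other words, the broken sequence is roughly the following (an
+illustrative sketch of the ordering described above, not the literal
+KVM code paths):
+
+	/* INIT arrives while the vCPU is in L2, i.e. running on vmcb02 */
+	vcpu->arch.hflags = 0;		/* "leaves" nested mode without any
+					 * housekeeping; svm->vmcb still
+					 * points at vmcb02 */
+	...
+	/* later in the reset, EFER.SVME is cleared */
+	svm_free_nested(svm);		/* frees vmcb02 while it is still
+					 * in use -> use-after-free */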
+
+This issue was assigned CVE-2022-3344.
+
+Cc: stable@vger.kernel.org
+Signed-off-by: Maxim Levitsky
+Message-Id: <20221103141351.50662-5-mlevitsk@redhat.com>
+Signed-off-by: Paolo Bonzini
+Signed-off-by: Greg Kroah-Hartman
+---
+ arch/x86/kvm/x86.c | 10 ++++++++++
+ 1 file changed, 10 insertions(+)
+
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -11111,8 +11111,18 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcp
+ 	unsigned long new_cr0;
+ 	u32 eax, dummy;
+ 
++	/*
++	 * SVM doesn't unconditionally VM-Exit on INIT and SHUTDOWN, thus it's
++	 * possible to INIT the vCPU while L2 is active.  Force the vCPU back
++	 * into L1 as EFER.SVME is cleared on INIT (along with all other EFER
++	 * bits), i.e. virtualization is disabled.
++	 */
++	if (is_guest_mode(vcpu))
++		kvm_leave_nested(vcpu);
++
+ 	kvm_lapic_reset(vcpu, init_event);
+ 
++	WARN_ON_ONCE(is_guest_mode(vcpu) || is_smm(vcpu));
+ 	vcpu->arch.hflags = 0;
+ 
+ 	vcpu->arch.smi_pending = 0;
diff --git a/queue-5.15/kvm-x86-nsvm-harden-svm_free_nested-against-freeing-vmcb02-while-still-in-use.patch b/queue-5.15/kvm-x86-nsvm-harden-svm_free_nested-against-freeing-vmcb02-while-still-in-use.patch
new file mode 100644
index 00000000000..a8ba9e7e76c
--- /dev/null
+++ b/queue-5.15/kvm-x86-nsvm-harden-svm_free_nested-against-freeing-vmcb02-while-still-in-use.patch
@@ -0,0 +1,36 @@
+From 16ae56d7e0528559bf8dc9070e3bfd8ba3de80df Mon Sep 17 00:00:00 2001
+From: Maxim Levitsky
+Date: Thu, 3 Nov 2022 16:13:44 +0200
+Subject: KVM: x86: nSVM: harden svm_free_nested against freeing vmcb02 while still in use
+
+From: Maxim Levitsky
+
+commit 16ae56d7e0528559bf8dc9070e3bfd8ba3de80df upstream.
+
+Make sure that KVM uses vmcb01 before freeing nested state, and warn if
+that is not the case.
+
+This is a minimal fix for CVE-2022-3344, making the kernel print a
+warning instead of a kernel panic.
+
+Cc: stable@vger.kernel.org
+Signed-off-by: Maxim Levitsky
+Message-Id: <20221103141351.50662-3-mlevitsk@redhat.com>
+Signed-off-by: Paolo Bonzini
+Signed-off-by: Greg Kroah-Hartman
+---
+ arch/x86/kvm/svm/nested.c | 3 +++
+ 1 file changed, 3 insertions(+)
+
+--- a/arch/x86/kvm/svm/nested.c
++++ b/arch/x86/kvm/svm/nested.c
+@@ -919,6 +919,9 @@ void svm_free_nested(struct vcpu_svm *sv
+ 	if (!svm->nested.initialized)
+ 		return;
+ 
++	if (WARN_ON_ONCE(svm->vmcb != svm->vmcb01.ptr))
++		svm_switch_vmcb(svm, &svm->vmcb01);
++
+ 	svm_vcpu_free_msrpm(svm->nested.msrpm);
+ 	svm->nested.msrpm = NULL;
+ 
diff --git a/queue-5.15/kvm-x86-nsvm-leave-nested-mode-on-vcpu-free.patch b/queue-5.15/kvm-x86-nsvm-leave-nested-mode-on-vcpu-free.patch
new file mode 100644
index 00000000000..4fa4d35fd02
--- /dev/null
+++ b/queue-5.15/kvm-x86-nsvm-leave-nested-mode-on-vcpu-free.patch
@@ -0,0 +1,33 @@
+From 917401f26a6af5756d89b550a8e1bd50cf42b07e Mon Sep 17 00:00:00 2001
+From: Maxim Levitsky
+Date: Thu, 3 Nov 2022 16:13:43 +0200
+Subject: KVM: x86: nSVM: leave nested mode on vCPU free
+
+From: Maxim Levitsky
+
+commit 917401f26a6af5756d89b550a8e1bd50cf42b07e upstream.
+
+If the VM was terminated while nested, we free the nested state
+while the vCPU is still in nested mode.
+
+Soon a warning will be added for this condition.
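+
+With this change the teardown order in svm_free_vcpu() becomes, in
+abridged form (a sketch of the ordering only; see the hunk below for
+the actual one-line change):
+
+	svm_clear_current_vmcb(svm->vmcb);
+	svm_leave_nested(vcpu);		/* switch back to vmcb01 first */
+	svm_free_nested(svm);		/* freeing vmcb02 is now safe */
+	sev_free_vcpu(vcpu);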
+
+Cc: stable@vger.kernel.org
+Signed-off-by: Maxim Levitsky
+Message-Id: <20221103141351.50662-2-mlevitsk@redhat.com>
+Signed-off-by: Paolo Bonzini
+Signed-off-by: Greg Kroah-Hartman
+---
+ arch/x86/kvm/svm/svm.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/arch/x86/kvm/svm/svm.c
++++ b/arch/x86/kvm/svm/svm.c
+@@ -1427,6 +1427,7 @@ static void svm_free_vcpu(struct kvm_vcp
+ 	 */
+ 	svm_clear_current_vmcb(svm->vmcb);
+ 
++	svm_leave_nested(vcpu);
+ 	svm_free_nested(svm);
+ 
+ 	sev_free_vcpu(vcpu);
diff --git a/queue-5.15/kvm-x86-remove-exit_int_info-warning-in-svm_handle_exit.patch b/queue-5.15/kvm-x86-remove-exit_int_info-warning-in-svm_handle_exit.patch
new file mode 100644
index 00000000000..d01192b4696
--- /dev/null
+++ b/queue-5.15/kvm-x86-remove-exit_int_info-warning-in-svm_handle_exit.patch
@@ -0,0 +1,58 @@
+From 05311ce954aebe75935d9ae7d38ac82b5b796e33 Mon Sep 17 00:00:00 2001
+From: Maxim Levitsky
+Date: Thu, 3 Nov 2022 16:13:51 +0200
+Subject: KVM: x86: remove exit_int_info warning in svm_handle_exit
+
+From: Maxim Levitsky
+
+commit 05311ce954aebe75935d9ae7d38ac82b5b796e33 upstream.
+
+It is valid to receive an external interrupt and have a broken IDT
+entry, which will lead to a #GP with exit_int_info that will contain
+the index of the IDT entry (e.g. any value).
+
+Other exceptions can happen as well, like #NP or #SS
+(if stack switch fails).
+
+Thus this warning can be user triggered and has very little value.
+
+Cc: stable@vger.kernel.org
+Signed-off-by: Maxim Levitsky
+Message-Id: <20221103141351.50662-10-mlevitsk@redhat.com>
+Signed-off-by: Paolo Bonzini
+Signed-off-by: Greg Kroah-Hartman
+---
+ arch/x86/kvm/svm/svm.c | 15 ---------------
+ 1 file changed, 15 deletions(-)
+
+--- a/arch/x86/kvm/svm/svm.c
++++ b/arch/x86/kvm/svm/svm.c
+@@ -317,12 +317,6 @@ int svm_set_efer(struct kvm_vcpu *vcpu,
+ 	return 0;
+ }
+ 
+-static int is_external_interrupt(u32 info)
+-{
+-	info &= SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID;
+-	return info == (SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR);
+-}
+-
+ static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu)
+ {
+ 	struct vcpu_svm *svm = to_svm(vcpu);
+@@ -3360,15 +3354,6 @@ static int handle_exit(struct kvm_vcpu *
+ 		return 0;
+ 	}
+ 
+-	if (is_external_interrupt(svm->vmcb->control.exit_int_info) &&
+-	    exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR &&
+-	    exit_code != SVM_EXIT_NPF && exit_code != SVM_EXIT_TASK_SWITCH &&
+-	    exit_code != SVM_EXIT_INTR && exit_code != SVM_EXIT_NMI)
+-		printk(KERN_ERR "%s: unexpected exit_int_info 0x%x "
+-		       "exit_code 0x%x\n",
+-		       __func__, svm->vmcb->control.exit_int_info,
+-		       exit_code);
+-
+ 	if (exit_fastpath != EXIT_FASTPATH_NONE)
+ 		return 1;
+ 
diff --git a/queue-5.15/mm-vmscan-fix-extreme-overreclaim-and-swap-floods.patch b/queue-5.15/mm-vmscan-fix-extreme-overreclaim-and-swap-floods.patch
new file mode 100644
index 00000000000..b7e9919666b
--- /dev/null
+++ b/queue-5.15/mm-vmscan-fix-extreme-overreclaim-and-swap-floods.patch
@@ -0,0 +1,137 @@
+From f53af4285d775cd9a9a146fc438bd0a1bee1838a Mon Sep 17 00:00:00 2001
+From: Johannes Weiner
+Date: Tue, 2 Aug 2022 12:28:11 -0400
+Subject: mm: vmscan: fix extreme overreclaim and swap floods
+
+From: Johannes Weiner
+
+commit f53af4285d775cd9a9a146fc438bd0a1bee1838a upstream.
+
+During proactive reclaim, we sometimes observe severe overreclaim, with
+several thousand times more pages reclaimed than requested.
+
+This trace was obtained from shrink_lruvec() during such an instance:
+
+	prio:0 anon_cost:1141521 file_cost:7767
+	nr_reclaimed:4387406 nr_to_reclaim:1047 (or_factor:4190)
+	nr=[7161123 345 578 1111]
+
+While the reclaimer requested 4M, vmscan reclaimed close to 16G, most
+of it by swapping.  These requests take over a minute, during which the
+write() to memory.reclaim is unkillably stuck inside the kernel.
+
+Digging into the source, this is caused by the proportional reclaim
+bailout logic.  This code tries to resolve a fundamental conflict: to
+reclaim roughly what was requested, while also aging all LRUs fairly and
+in accordance with their size, swappiness, refault rates etc.  The way it
+attempts fairness is that once the reclaim goal has been reached, it stops
+scanning the LRUs with the smaller remaining scan targets, and adjusts the
+remainder of the bigger LRUs according to how much of the smaller LRUs was
+scanned.  It then finishes scanning that remainder regardless of the
+reclaim goal.
+
+This works fine if priority levels are low and the LRU lists are
+comparable in size.  However, in this instance, the cgroup that is
+targeted by proactive reclaim has almost no files left - they've already
+been squeezed out by proactive reclaim earlier - and the remaining anon
+pages are hot.  Anon rotations cause the priority level to drop to 0,
+which results in reclaim targeting all of anon (a lot) and all of file
+(almost nothing).  By the time reclaim decides to bail, it has scanned
+most or all of the file target, and therefore must also scan most or all
+of the enormous anon target.  This target is thousands of times larger
+than the reclaim goal, thus causing the overreclaim.
+
+The bailout code hasn't changed in years; why is this failing now?  The
+most likely explanations are two other recent changes in anon reclaim:
+
+1. Before the series starting with commit 5df741963d52 ("mm: fix LRU
+   balancing effect of new transparent huge pages"), the VM was
+   overall relatively reluctant to swap at all, even if swap was
+   configured.  This means the LRU balancing code didn't come into play
+   as often as it does now, and mostly in high pressure situations
+   where pronounced swap activity wouldn't be as surprising.
+
+2. For historic reasons, shrink_lruvec() loops on the scan targets of
+   all LRU lists except the active anon one, meaning it would bail if
+   the only remaining pages to scan were active anon - even if there
+   were a lot of them.
+
+   Before the series starting with commit ccc5dc67340c ("mm/vmscan:
+   make active/inactive ratio as 1:1 for anon lru"), most anon pages
+   would live on the active LRU; the inactive one would contain only a
+   handful of preselected reclaim candidates.  After the series, anon
+   gets aged similarly to file, and the inactive list is the default
+   for new anon pages as well, making it often the much bigger list.
+
+   As a result, the VM is now more likely to actually finish large
+   anon targets than before.
+
+Change the code such that only one SWAP_CLUSTER_MAX-sized nudge toward the
+larger LRU lists is made before bailing out on a met reclaim goal.
+
+This fixes the extreme overreclaim problem.
+
+Fairness is more subtle and harder to evaluate.  No obvious misbehavior
+was observed on the test workload, in any case.  Conceptually, fairness
+should primarily be a cumulative effect from regular, lower priority
+scans.  Once the VM is in trouble and needs to escalate scan targets to
+make forward progress, fairness needs to take a backseat.  This is also
+acknowledged by the myriad exceptions in get_scan_count().  This patch
+makes fairness decrease gradually, as it keeps fairness work static over
+increasing priority levels with growing scan targets.  This should make
+more sense - although we may have to re-visit the exact values.
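+
+The rescaling step at the heart of this can be sketched as follows (an
+illustrative, simplified extract; see the shrink_lruvec() hunks below
+for the real code):
+
+	/*
+	 * Once the goal is met, the smaller LRU is stopped and the
+	 * bigger one is only rescaled by the share of the smaller one
+	 * that was left unscanned ("percentage"):
+	 */
+	nr_scanned = targets[lru] - nr[lru];
+	nr[lru] = targets[lru] * (100 - percentage) / 100;
+	nr[lru] -= min(nr[lru], nr_scanned);
+
+	/*
+	 * In the trace above the tiny file list gets scanned almost
+	 * fully, so percentage ~= 0 and nearly the whole 7161123-page
+	 * anon target survives the rescale - versus a reclaim goal of
+	 * only 1047 pages.
+	 */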
+
+Link: https://lkml.kernel.org/r/20220802162811.39216-1-hannes@cmpxchg.org
+Signed-off-by: Johannes Weiner
+Reviewed-by: Rik van Riel
+Acked-by: Mel Gorman
+Cc: Hugh Dickins
+Cc: Joonsoo Kim
+Cc:
+Signed-off-by: Andrew Morton
+Signed-off-by: Greg Kroah-Hartman
+---
+ mm/vmscan.c | 10 ++++------
+ 1 file changed, 4 insertions(+), 6 deletions(-)
+
+--- a/mm/vmscan.c
++++ b/mm/vmscan.c
+@@ -2791,8 +2791,8 @@ static void shrink_lruvec(struct lruvec
+ 	enum lru_list lru;
+ 	unsigned long nr_reclaimed = 0;
+ 	unsigned long nr_to_reclaim = sc->nr_to_reclaim;
++	bool proportional_reclaim;
+ 	struct blk_plug plug;
+-	bool scan_adjusted;
+ 
+ 	get_scan_count(lruvec, sc, nr);
+ 
+@@ -2810,8 +2810,8 @@ static void shrink_lruvec(struct lruvec
+ 	 * abort proportional reclaim if either the file or anon lru has already
+ 	 * dropped to zero at the first pass.
+ 	 */
+-	scan_adjusted = (!cgroup_reclaim(sc) && !current_is_kswapd() &&
+-			 sc->priority == DEF_PRIORITY);
++	proportional_reclaim = (!cgroup_reclaim(sc) && !current_is_kswapd() &&
++				sc->priority == DEF_PRIORITY);
+ 
+ 	blk_start_plug(&plug);
+ 	while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
+@@ -2831,7 +2831,7 @@ static void shrink_lruvec(struct lruvec
+ 
+ 		cond_resched();
+ 
+-		if (nr_reclaimed < nr_to_reclaim || scan_adjusted)
++		if (nr_reclaimed < nr_to_reclaim || proportional_reclaim)
+ 			continue;
+ 
+ 		/*
+@@ -2882,8 +2882,6 @@ static void shrink_lruvec(struct lruvec
+ 		nr_scanned = targets[lru] - nr[lru];
+ 		nr[lru] = targets[lru] * (100 - percentage) / 100;
+ 		nr[lru] -= min(nr[lru], nr_scanned);
+-
+-		scan_adjusted = true;
+ 	}
+ 	blk_finish_plug(&plug);
+ 	sc->nr_reclaimed += nr_reclaimed;
diff --git a/queue-5.15/nilfs2-fix-nilfs_sufile_mark_dirty-not-set-segment-usage-as-dirty.patch b/queue-5.15/nilfs2-fix-nilfs_sufile_mark_dirty-not-set-segment-usage-as-dirty.patch
new file mode 100644
index 00000000000..df1c045df31
--- /dev/null
+++ b/queue-5.15/nilfs2-fix-nilfs_sufile_mark_dirty-not-set-segment-usage-as-dirty.patch
@@ -0,0 +1,77 @@
+From 512c5ca01a3610ab14ff6309db363de51f1c13a6 Mon Sep 17 00:00:00 2001
+From: Chen Zhongjin
+Date: Fri, 18 Nov 2022 14:33:04 +0800
+Subject: nilfs2: fix nilfs_sufile_mark_dirty() not set segment usage as dirty
+
+From: Chen Zhongjin
+
+commit 512c5ca01a3610ab14ff6309db363de51f1c13a6 upstream.
+
+When extending segments, nilfs_sufile_alloc() is called to get an
+unassigned segment, which is then marked as dirty to avoid accidentally
+allocating the same segment in the future.
+
+But in some special cases, such as a corrupted image, this can be
+unreliable.  If such corruption of the dirty state of the segment
+occurs, nilfs2 may reallocate a segment that is in use and pick the same
+segment for writing twice at the same time.
+
+This will cause the problem reported by syzkaller:
+https://syzkaller.appspot.com/bug?id=c7c4748e11ffcc367cef04f76e02e931833cbd24
+
+This case started with segbuf1.segnum = 3, nextnum = 4 when constructed.
+It assumed segment 4 had already been allocated and marked as dirty.
+
+However, the dirty state was corrupted and segment 4's usage was not
+dirty.  The first time nilfs_segctor_extend_segments() ran, segment 4
+was allocated again, which left segbuf2 and the following segbuf3 with
+the same segment 4.
+
+sb_getblk() will return the same bh for segbuf2 and segbuf3, and this bh
+is added to the buffer lists of both segbufs.  This breaks the lists,
+which causes a NULL pointer dereference.
+
+Fix the problem by always setting the segment usage as dirty in
+nilfs_sufile_mark_dirty(), which is called while constructing the
+current segment to be written out and before allocating the next
+segment.
+
+[chenzhongjin@huawei.com: add lock protection per Ryusuke]
+  Link: https://lkml.kernel.org/r/20221121091141.214703-1-chenzhongjin@huawei.com
+Link: https://lkml.kernel.org/r/20221118063304.140187-1-chenzhongjin@huawei.com
+Fixes: 9ff05123e3bf ("nilfs2: segment constructor")
+Signed-off-by: Chen Zhongjin
+Reported-by:
+Reported-by: Liu Shixin
+Acked-by: Ryusuke Konishi
+Tested-by: Ryusuke Konishi
+Cc:
+Signed-off-by: Andrew Morton
+Signed-off-by: Greg Kroah-Hartman
+---
+ fs/nilfs2/sufile.c | 8 ++++++++
+ 1 file changed, 8 insertions(+)
+
+--- a/fs/nilfs2/sufile.c
++++ b/fs/nilfs2/sufile.c
+@@ -495,14 +495,22 @@ void nilfs_sufile_do_free(struct inode *
+ int nilfs_sufile_mark_dirty(struct inode *sufile, __u64 segnum)
+ {
+ 	struct buffer_head *bh;
++	void *kaddr;
++	struct nilfs_segment_usage *su;
+ 	int ret;
+ 
++	down_write(&NILFS_MDT(sufile)->mi_sem);
+ 	ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0, &bh);
+ 	if (!ret) {
+ 		mark_buffer_dirty(bh);
+ 		nilfs_mdt_mark_dirty(sufile);
++		kaddr = kmap_atomic(bh->b_page);
++		su = nilfs_sufile_block_get_segment_usage(sufile, segnum, bh, kaddr);
++		nilfs_segment_usage_set_dirty(su);
++		kunmap_atomic(kaddr);
+ 		brelse(bh);
+ 	}
++	up_write(&NILFS_MDT(sufile)->mi_sem);
+ 	return ret;
+ }
+ 
diff --git a/queue-5.15/series b/queue-5.15/series
index 1743fb3a107..182c7a957dc 100644
--- a/queue-5.15/series
+++ b/queue-5.15/series
@@ -150,3 +150,11 @@ bus-ixp4xx-don-t-touch-bit-7-on-ixp42x.patch
 usb-dwc3-gadget-conditionally-remove-requests.patch
 usb-dwc3-gadget-return-eshutdown-on-ep-disable.patch
 usb-dwc3-gadget-clear-ep-descriptor-last.patch
+nilfs2-fix-nilfs_sufile_mark_dirty-not-set-segment-usage-as-dirty.patch
+gcov-clang-fix-the-buffer-overflow-issue.patch
+mm-vmscan-fix-extreme-overreclaim-and-swap-floods.patch
+kvm-x86-nsvm-leave-nested-mode-on-vcpu-free.patch
+kvm-x86-forcibly-leave-nested-mode-on-vcpu-reset.patch
+kvm-x86-nsvm-harden-svm_free_nested-against-freeing-vmcb02-while-still-in-use.patch
+kvm-x86-add-kvm_leave_nested.patch
+kvm-x86-remove-exit_int_info-warning-in-svm_handle_exit.patch
-- 
2.47.3