From: Greg Kroah-Hartman Date: Thu, 8 Jan 2026 12:38:40 +0000 (+0100) Subject: 6.6-stable patches X-Git-Tag: v6.1.160~49 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=ecd45099fd5d0fa77daa7f7aba455eed04763a84;p=thirdparty%2Fkernel%2Fstable-queue.git 6.6-stable patches added patches: kvm-nsvm-always-recalculate-lbr-msr-intercepts-in-svm_update_lbrv.patch kvm-nsvm-fix-and-simplify-lbr-virtualization-handling-with-nested.patch kvm-svm-fix-redundant-updates-of-lbr-msr-intercepts.patch kvm-svm-introduce-svm_recalc_lbr_msr_intercepts.patch mm-damon-tests-core-kunit-handle-alloc-failres-in-damon_test_new_filter.patch mm-damon-tests-core-kunit-handle-alloc-failures-on-damon_test_merge_two.patch mm-damon-tests-core-kunit-handle-alloc-failures-on-damon_test_split_at.patch mm-damon-tests-core-kunit-handle-alloc-failures-on-dasmon_test_merge_regions_of.patch mm-damon-tests-core-kunit-handle-allocation-failures-in-damon_test_regions.patch mm-damon-tests-vaddr-kunit-handle-alloc-failures-in-damon_test_split_evenly_fail.patch mm-damon-tests-vaddr-kunit-handle-alloc-failures-on-damon_do_test_apply_three_regions.patch mm-damon-tests-vaddr-kunit-handle-alloc-failures-on-damon_test_split_evenly_succ.patch net-allow-to-use-smp-threads-for-backlog-napi.patch net-remove-conditional-threaded-napi-wakeup-based-on-task-state.patch rdma-core-fix-kasan-slab-use-after-free-read-in-ib_register_device-problem.patch rdma-rxe-fix-the-failure-of-ibv_query_device-and-ibv_query_device_ex-tests.patch rdma-rxe-remove-the-direct-link-to-net_device.patch sched-fair-proportional-newidle-balance.patch sched-fair-small-cleanup-to-sched_balance_newidle.patch sched-fair-small-cleanup-to-update_newidle_cost.patch --- diff --git a/queue-6.6/kvm-nsvm-always-recalculate-lbr-msr-intercepts-in-svm_update_lbrv.patch b/queue-6.6/kvm-nsvm-always-recalculate-lbr-msr-intercepts-in-svm_update_lbrv.patch new file mode 100644 index 0000000000..27366f8e91 --- /dev/null +++ b/queue-6.6/kvm-nsvm-always-recalculate-lbr-msr-intercepts-in-svm_update_lbrv.patch @@ -0,0 +1,100 @@ +From fbe5e5f030c22ae717ee422aaab0e00ea84fab5e Mon Sep 17 00:00:00 2001 +From: Yosry Ahmed +Date: Sat, 8 Nov 2025 00:45:20 +0000 +Subject: KVM: nSVM: Always recalculate LBR MSR intercepts in svm_update_lbrv() + +From: Yosry Ahmed + +commit fbe5e5f030c22ae717ee422aaab0e00ea84fab5e upstream. + +svm_update_lbrv() is called when MSR_IA32_DEBUGCTLMSR is updated, and on +nested transitions where LBRV is used. It checks whether LBRV enablement +needs to be changed in the current VMCB, and if it does, it also +recalculate intercepts to LBR MSRs. + +However, there are cases where intercepts need to be updated even when +LBRV enablement doesn't. Example scenario: +- L1 has MSR_IA32_DEBUGCTLMSR cleared. +- L1 runs L2 without LBR_CTL_ENABLE (no LBRV). +- L2 sets DEBUGCTLMSR_LBR in MSR_IA32_DEBUGCTLMSR, svm_update_lbrv() + sets LBR_CTL_ENABLE in VMCB02 and disables intercepts to LBR MSRs. +- L2 exits to L1, svm_update_lbrv() is not called on this transition. +- L1 clears MSR_IA32_DEBUGCTLMSR, svm_update_lbrv() finds that + LBR_CTL_ENABLE is already cleared in VMCB01 and does nothing. +- Intercepts remain disabled, L1 reads to LBR MSRs read the host MSRs. + +Fix it by always recalculating intercepts in svm_update_lbrv(). 
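For quick review, the resulting shape of svm_update_lbrv() after this change is roughly the following. This is a simplified sketch reconstructed from the hunks below (the computation of enable_lbrv and current_enable_lbrv is elided), not the verbatim upstream code:

	void svm_update_lbrv(struct kvm_vcpu *vcpu)
	{
		/* current_enable_lbrv: LBR_CTL_ENABLE_MASK set in the current VMCB */
		/* enable_lbrv: guest DEBUGCTL[LBR] set, or L2 running with LBRV from L1 */

		if (enable_lbrv && !current_enable_lbrv)
			__svm_enable_lbrv(vcpu);
		else if (!enable_lbrv && current_enable_lbrv)
			__svm_disable_lbrv(vcpu);

		/*
		 * Recalculated unconditionally: after a nested transition the
		 * intercepts can be stale even though LBR_CTL in the current
		 * VMCB does not change.
		 */
		svm_recalc_lbr_msr_intercepts(vcpu);
	}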
+ +Fixes: 1d5a1b5860ed ("KVM: x86: nSVM: correctly virtualize LBR msrs when L2 is running") +Cc: stable@vger.kernel.org +Signed-off-by: Yosry Ahmed +Link: https://patch.msgid.link/20251108004524.1600006-3-yosry.ahmed@linux.dev +Signed-off-by: Paolo Bonzini +Signed-off-by: Yosry Ahmed +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kvm/svm/svm.c | 29 +++++++++++++++++++---------- + 1 file changed, 19 insertions(+), 10 deletions(-) + +--- a/arch/x86/kvm/svm/svm.c ++++ b/arch/x86/kvm/svm/svm.c +@@ -1031,26 +1031,30 @@ static void svm_recalc_lbr_msr_intercept + !intercept, !intercept); + } + +-void svm_enable_lbrv(struct kvm_vcpu *vcpu) ++static void __svm_enable_lbrv(struct kvm_vcpu *vcpu) + { + struct vcpu_svm *svm = to_svm(vcpu); + + svm->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK; +- svm_recalc_lbr_msr_intercepts(vcpu); + + /* Move the LBR msrs to the vmcb02 so that the guest can see them. */ + if (is_guest_mode(vcpu)) + svm_copy_lbrs(svm->vmcb, svm->vmcb01.ptr); + } + +-static void svm_disable_lbrv(struct kvm_vcpu *vcpu) ++void svm_enable_lbrv(struct kvm_vcpu *vcpu) ++{ ++ __svm_enable_lbrv(vcpu); ++ svm_recalc_lbr_msr_intercepts(vcpu); ++} ++ ++static void __svm_disable_lbrv(struct kvm_vcpu *vcpu) + { + struct vcpu_svm *svm = to_svm(vcpu); + + KVM_BUG_ON(sev_es_guest(vcpu->kvm), vcpu->kvm); + + svm->vmcb->control.virt_ext &= ~LBR_CTL_ENABLE_MASK; +- svm_recalc_lbr_msr_intercepts(vcpu); + + /* + * Move the LBR msrs back to the vmcb01 to avoid copying them +@@ -1079,13 +1083,18 @@ void svm_update_lbrv(struct kvm_vcpu *vc + (is_guest_mode(vcpu) && guest_can_use(vcpu, X86_FEATURE_LBRV) && + (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK)); + +- if (enable_lbrv == current_enable_lbrv) +- return; ++ if (enable_lbrv && !current_enable_lbrv) ++ __svm_enable_lbrv(vcpu); ++ else if (!enable_lbrv && current_enable_lbrv) ++ __svm_disable_lbrv(vcpu); + +- if (enable_lbrv) +- svm_enable_lbrv(vcpu); +- else +- svm_disable_lbrv(vcpu); ++ /* ++ * During nested transitions, it is possible that the current VMCB has ++ * LBR_CTL set, but the previous LBR_CTL had it cleared (or vice versa). ++ * In this case, even though LBR_CTL does not need an update, intercepts ++ * do, so always recalculate the intercepts here. ++ */ ++ svm_recalc_lbr_msr_intercepts(vcpu); + } + + void disable_nmi_singlestep(struct vcpu_svm *svm) diff --git a/queue-6.6/kvm-nsvm-fix-and-simplify-lbr-virtualization-handling-with-nested.patch b/queue-6.6/kvm-nsvm-fix-and-simplify-lbr-virtualization-handling-with-nested.patch new file mode 100644 index 0000000000..c05070559c --- /dev/null +++ b/queue-6.6/kvm-nsvm-fix-and-simplify-lbr-virtualization-handling-with-nested.patch @@ -0,0 +1,193 @@ +From 8a4821412cf2c1429fffa07c012dd150f2edf78c Mon Sep 17 00:00:00 2001 +From: Yosry Ahmed +Date: Sat, 8 Nov 2025 00:45:21 +0000 +Subject: KVM: nSVM: Fix and simplify LBR virtualization handling with nested + +From: Yosry Ahmed + +commit 8a4821412cf2c1429fffa07c012dd150f2edf78c upstream. + +The current scheme for handling LBRV when nested is used is very +complicated, especially when L1 does not enable LBRV (i.e. does not set +LBR_CTL_ENABLE_MASK). + +To avoid copying LBRs between VMCB01 and VMCB02 on every nested +transition, the current implementation switches between using VMCB01 or +VMCB02 as the source of truth for the LBRs while L2 is running. If L2 +enables LBR, VMCB02 is used as the source of truth. When L2 disables +LBR, the LBRs are copied to VMCB01 and VMCB01 is used as the source of +truth. 
This introduces significant complexity, and incorrect behavior in +some cases. + +For example, on a nested #VMEXIT, the LBRs are only copied from VMCB02 +to VMCB01 if LBRV is enabled in VMCB01. This is because L2's writes to +MSR_IA32_DEBUGCTLMSR to enable LBR are intercepted and propagated to +VMCB01 instead of VMCB02. However, LBRV is only enabled in VMCB02 when +L2 is running. + +This means that if L2 enables LBR and exits to L1, the LBRs will not be +propagated from VMCB02 to VMCB01, because LBRV is disabled in VMCB01. + +There is no meaningful difference in CPUID rate in L2 when copying LBRs +on every nested transition vs. the current approach, so do the simple +and correct thing and always copy LBRs between VMCB01 and VMCB02 on +nested transitions (when LBRV is disabled by L1). Drop the conditional +LBRs copying in __svm_{enable/disable}_lbrv() as it is now unnecessary. + +VMCB02 becomes the only source of truth for LBRs when L2 is running, +regardless of LBRV being enabled by L1, drop svm_get_lbr_vmcb() and use +svm->vmcb directly in its place. + +Fixes: 1d5a1b5860ed ("KVM: x86: nSVM: correctly virtualize LBR msrs when L2 is running") +Cc: stable@vger.kernel.org +Signed-off-by: Yosry Ahmed +Link: https://patch.msgid.link/20251108004524.1600006-4-yosry.ahmed@linux.dev +Signed-off-by: Paolo Bonzini +Signed-off-by: Yosry Ahmed +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kvm/svm/nested.c | 20 ++++++------------- + arch/x86/kvm/svm/svm.c | 47 +++++++++------------------------------------- + 2 files changed, 17 insertions(+), 50 deletions(-) + +--- a/arch/x86/kvm/svm/nested.c ++++ b/arch/x86/kvm/svm/nested.c +@@ -601,11 +601,10 @@ static void nested_vmcb02_prepare_save(s + */ + svm_copy_lbrs(vmcb02, vmcb12); + vmcb02->save.dbgctl &= ~DEBUGCTL_RESERVED_BITS; +- svm_update_lbrv(&svm->vcpu); +- +- } else if (unlikely(vmcb01->control.virt_ext & LBR_CTL_ENABLE_MASK)) { ++ } else { + svm_copy_lbrs(vmcb02, vmcb01); + } ++ svm_update_lbrv(&svm->vcpu); + } + + static inline bool is_evtinj_soft(u32 evtinj) +@@ -731,11 +730,7 @@ static void nested_vmcb02_prepare_contro + svm->soft_int_next_rip = vmcb12_rip; + } + +- vmcb02->control.virt_ext = vmcb01->control.virt_ext & +- LBR_CTL_ENABLE_MASK; +- if (guest_can_use(vcpu, X86_FEATURE_LBRV)) +- vmcb02->control.virt_ext |= +- (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK); ++ /* LBR_CTL_ENABLE_MASK is controlled by svm_update_lbrv() */ + + if (!nested_vmcb_needs_vls_intercept(svm)) + vmcb02->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK; +@@ -1066,13 +1061,12 @@ int nested_svm_vmexit(struct vcpu_svm *s + kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); + + if (unlikely(guest_can_use(vcpu, X86_FEATURE_LBRV) && +- (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))) { ++ (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))) + svm_copy_lbrs(vmcb12, vmcb02); +- svm_update_lbrv(vcpu); +- } else if (unlikely(vmcb01->control.virt_ext & LBR_CTL_ENABLE_MASK)) { ++ else + svm_copy_lbrs(vmcb01, vmcb02); +- svm_update_lbrv(vcpu); +- } ++ ++ svm_update_lbrv(vcpu); + + if (vnmi) { + if (vmcb02->control.int_ctl & V_NMI_BLOCKING_MASK) +--- a/arch/x86/kvm/svm/svm.c ++++ b/arch/x86/kvm/svm/svm.c +@@ -1033,13 +1033,7 @@ static void svm_recalc_lbr_msr_intercept + + static void __svm_enable_lbrv(struct kvm_vcpu *vcpu) + { +- struct vcpu_svm *svm = to_svm(vcpu); +- +- svm->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK; +- +- /* Move the LBR msrs to the vmcb02 so that the guest can see them. 
*/ +- if (is_guest_mode(vcpu)) +- svm_copy_lbrs(svm->vmcb, svm->vmcb01.ptr); ++ to_svm(vcpu)->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK; + } + + void svm_enable_lbrv(struct kvm_vcpu *vcpu) +@@ -1050,36 +1044,15 @@ void svm_enable_lbrv(struct kvm_vcpu *vc + + static void __svm_disable_lbrv(struct kvm_vcpu *vcpu) + { +- struct vcpu_svm *svm = to_svm(vcpu); +- + KVM_BUG_ON(sev_es_guest(vcpu->kvm), vcpu->kvm); +- +- svm->vmcb->control.virt_ext &= ~LBR_CTL_ENABLE_MASK; +- +- /* +- * Move the LBR msrs back to the vmcb01 to avoid copying them +- * on nested guest entries. +- */ +- if (is_guest_mode(vcpu)) +- svm_copy_lbrs(svm->vmcb01.ptr, svm->vmcb); +-} +- +-static struct vmcb *svm_get_lbr_vmcb(struct vcpu_svm *svm) +-{ +- /* +- * If LBR virtualization is disabled, the LBR MSRs are always kept in +- * vmcb01. If LBR virtualization is enabled and L1 is running VMs of +- * its own, the MSRs are moved between vmcb01 and vmcb02 as needed. +- */ +- return svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK ? svm->vmcb : +- svm->vmcb01.ptr; ++ to_svm(vcpu)->vmcb->control.virt_ext &= ~LBR_CTL_ENABLE_MASK; + } + + void svm_update_lbrv(struct kvm_vcpu *vcpu) + { + struct vcpu_svm *svm = to_svm(vcpu); + bool current_enable_lbrv = svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK; +- bool enable_lbrv = (svm_get_lbr_vmcb(svm)->save.dbgctl & DEBUGCTLMSR_LBR) || ++ bool enable_lbrv = (svm->vmcb->save.dbgctl & DEBUGCTLMSR_LBR) || + (is_guest_mode(vcpu) && guest_can_use(vcpu, X86_FEATURE_LBRV) && + (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK)); + +@@ -2925,19 +2898,19 @@ static int svm_get_msr(struct kvm_vcpu * + msr_info->data = svm->tsc_aux; + break; + case MSR_IA32_DEBUGCTLMSR: +- msr_info->data = svm_get_lbr_vmcb(svm)->save.dbgctl; ++ msr_info->data = svm->vmcb->save.dbgctl; + break; + case MSR_IA32_LASTBRANCHFROMIP: +- msr_info->data = svm_get_lbr_vmcb(svm)->save.br_from; ++ msr_info->data = svm->vmcb->save.br_from; + break; + case MSR_IA32_LASTBRANCHTOIP: +- msr_info->data = svm_get_lbr_vmcb(svm)->save.br_to; ++ msr_info->data = svm->vmcb->save.br_to; + break; + case MSR_IA32_LASTINTFROMIP: +- msr_info->data = svm_get_lbr_vmcb(svm)->save.last_excp_from; ++ msr_info->data = svm->vmcb->save.last_excp_from; + break; + case MSR_IA32_LASTINTTOIP: +- msr_info->data = svm_get_lbr_vmcb(svm)->save.last_excp_to; ++ msr_info->data = svm->vmcb->save.last_excp_to; + break; + case MSR_VM_HSAVE_PA: + msr_info->data = svm->nested.hsave_msr; +@@ -3206,10 +3179,10 @@ static int svm_set_msr(struct kvm_vcpu * + if (data & DEBUGCTL_RESERVED_BITS) + return 1; + +- if (svm_get_lbr_vmcb(svm)->save.dbgctl == data) ++ if (svm->vmcb->save.dbgctl == data) + break; + +- svm_get_lbr_vmcb(svm)->save.dbgctl = data; ++ svm->vmcb->save.dbgctl = data; + vmcb_mark_dirty(svm->vmcb, VMCB_LBR); + svm_update_lbrv(vcpu); + break; diff --git a/queue-6.6/kvm-svm-fix-redundant-updates-of-lbr-msr-intercepts.patch b/queue-6.6/kvm-svm-fix-redundant-updates-of-lbr-msr-intercepts.patch new file mode 100644 index 0000000000..4d4a7cbb90 --- /dev/null +++ b/queue-6.6/kvm-svm-fix-redundant-updates-of-lbr-msr-intercepts.patch @@ -0,0 +1,95 @@ +From 3fa05f96fc08dff5e846c2cc283a249c1bf029a1 Mon Sep 17 00:00:00 2001 +From: Yosry Ahmed +Date: Wed, 12 Nov 2025 01:30:17 +0000 +Subject: KVM: SVM: Fix redundant updates of LBR MSR intercepts + +From: Yosry Ahmed + +commit 3fa05f96fc08dff5e846c2cc283a249c1bf029a1 upstream. 
+ +Don't update the LBR MSR intercept bitmaps if they're already up-to-date, +as unconditionally updating the intercepts forces KVM to recalculate the +MSR bitmaps for vmcb02 on every nested VMRUN. The redundant updates are +functionally okay; however, they neuter an optimization in Hyper-V +nested virtualization enlightenments and this manifests as a self-test +failure. + +In particular, Hyper-V lets L1 mark "nested enlightenments" as clean, i.e. +tell KVM that no changes were made to the MSR bitmap since the last VMRUN. +The hyperv_svm_test KVM selftest intentionally changes the MSR bitmap +"without telling KVM about it" to verify that KVM honors the clean hint, +correctly fails because KVM notices the changed bitmap anyway: + + ==== Test Assertion Failure ==== + x86/hyperv_svm_test.c:120: vmcb->control.exit_code == 0x081 + pid=193558 tid=193558 errno=4 - Interrupted system call + 1 0x0000000000411361: assert_on_unhandled_exception at processor.c:659 + 2 0x0000000000406186: _vcpu_run at kvm_util.c:1699 + 3 (inlined by) vcpu_run at kvm_util.c:1710 + 4 0x0000000000401f2a: main at hyperv_svm_test.c:175 + 5 0x000000000041d0d3: __libc_start_call_main at libc-start.o:? + 6 0x000000000041f27c: __libc_start_main_impl at ??:? + 7 0x00000000004021a0: _start at ??:? + vmcb->control.exit_code == SVM_EXIT_VMMCALL + +Do *not* fix this by skipping svm_hv_vmcb_dirty_nested_enlightenments() +when svm_set_intercept_for_msr() performs a no-op change. changes to +the L0 MSR interception bitmap are only triggered by full CPUID updates +and MSR filter updates, both of which should be rare. Changing +svm_set_intercept_for_msr() risks hiding unintended pessimizations +like this one, and is actually more complex than this change. + +Fixes: fbe5e5f030c2 ("KVM: nSVM: Always recalculate LBR MSR intercepts in svm_update_lbrv()") +Cc: stable@vger.kernel.org +Signed-off-by: Yosry Ahmed +Link: https://patch.msgid.link/20251112013017.1836863-1-yosry.ahmed@linux.dev +[Rewritten commit message based on mailing list discussion. 
- Paolo] +Reviewed-by: Sean Christopherson +Tested-by: Sean Christopherson +Signed-off-by: Paolo Bonzini +Signed-off-by: Yosry Ahmed +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kvm/svm/svm.c | 6 ++++++ + arch/x86/kvm/svm/svm.h | 1 + + 2 files changed, 7 insertions(+) + +--- a/arch/x86/kvm/svm/svm.c ++++ b/arch/x86/kvm/svm/svm.c +@@ -1017,6 +1017,9 @@ static void svm_recalc_lbr_msr_intercept + struct vcpu_svm *svm = to_svm(vcpu); + bool intercept = !(svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK); + ++ if (intercept == svm->lbr_msrs_intercepted) ++ return; ++ + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, + !intercept, !intercept); + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, +@@ -1029,6 +1032,8 @@ static void svm_recalc_lbr_msr_intercept + if (sev_es_guest(vcpu->kvm)) + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_DEBUGCTLMSR, + !intercept, !intercept); ++ ++ svm->lbr_msrs_intercepted = intercept; + } + + static void __svm_enable_lbrv(struct kvm_vcpu *vcpu) +@@ -1473,6 +1478,7 @@ static int svm_vcpu_create(struct kvm_vc + } + + svm->x2avic_msrs_intercepted = true; ++ svm->lbr_msrs_intercepted = true; + + svm->vmcb01.ptr = page_address(vmcb01_page); + svm->vmcb01.pa = __sme_set(page_to_pfn(vmcb01_page) << PAGE_SHIFT); +--- a/arch/x86/kvm/svm/svm.h ++++ b/arch/x86/kvm/svm/svm.h +@@ -288,6 +288,7 @@ struct vcpu_svm { + bool guest_state_loaded; + + bool x2avic_msrs_intercepted; ++ bool lbr_msrs_intercepted; + + /* Guest GIF value, used when vGIF is not enabled */ + bool guest_gif; diff --git a/queue-6.6/kvm-svm-introduce-svm_recalc_lbr_msr_intercepts.patch b/queue-6.6/kvm-svm-introduce-svm_recalc_lbr_msr_intercepts.patch new file mode 100644 index 0000000000..e2005bdcb0 --- /dev/null +++ b/queue-6.6/kvm-svm-introduce-svm_recalc_lbr_msr_intercepts.patch @@ -0,0 +1,78 @@ +From yosry.ahmed@linux.dev Thu Jan 8 13:20:08 2026 +From: Yosry Ahmed +Date: Wed, 3 Dec 2025 18:42:17 +0000 +Subject: KVM: SVM: Introduce svm_recalc_lbr_msr_intercepts() +To: stable@vger.kernel.org +Cc: Paolo Bonzini , Sean Christopherson , Yosry Ahmed +Message-ID: <20251203184220.2693264-1-yosry.ahmed@linux.dev> + +From: Yosry Ahmed + +Introduce a helper updating the intercepts for LBR MSRs, similar to the +one introduced upstream by commit 160f143cc131 ("KVM: SVM: Manually +recalc all MSR intercepts on userspace MSR filter change"). The main +difference is that this version uses set_msr_interception(), which has +inverted polarity compared to svm_set_intercept_for_msr(). + +This is intended to simplify incoming backports. No functional changes +intended. 
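Note on polarity when reading the hunks below: the 6.6 helper set_msr_interception() takes "allow" flags, i.e. a non-zero read/write argument disables the intercept, which is why the new helper passes !intercept; the upstream helper built on svm_set_intercept_for_msr() passes the intercept decision directly. Illustration lifted from the hunk below, not new code:

	bool intercept = !(svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK);

	/* non-zero read/write means "do not intercept" in this tree */
	set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP,
			     !intercept, !intercept);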
+ +Signed-off-by: Yosry Ahmed +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kvm/svm/svm.c | 32 +++++++++++++++++++++----------- + 1 file changed, 21 insertions(+), 11 deletions(-) + +--- a/arch/x86/kvm/svm/svm.c ++++ b/arch/x86/kvm/svm/svm.c +@@ -1012,18 +1012,31 @@ void svm_copy_lbrs(struct vmcb *to_vmcb, + vmcb_mark_dirty(to_vmcb, VMCB_LBR); + } + +-void svm_enable_lbrv(struct kvm_vcpu *vcpu) ++static void svm_recalc_lbr_msr_intercepts(struct kvm_vcpu *vcpu) + { + struct vcpu_svm *svm = to_svm(vcpu); ++ bool intercept = !(svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK); + +- svm->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK; +- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1); +- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1); +- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 1, 1); +- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 1, 1); ++ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, ++ !intercept, !intercept); ++ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, ++ !intercept, !intercept); ++ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, ++ !intercept, !intercept); ++ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, ++ !intercept, !intercept); + + if (sev_es_guest(vcpu->kvm)) +- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_DEBUGCTLMSR, 1, 1); ++ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_DEBUGCTLMSR, ++ !intercept, !intercept); ++} ++ ++void svm_enable_lbrv(struct kvm_vcpu *vcpu) ++{ ++ struct vcpu_svm *svm = to_svm(vcpu); ++ ++ svm->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK; ++ svm_recalc_lbr_msr_intercepts(vcpu); + + /* Move the LBR msrs to the vmcb02 so that the guest can see them. */ + if (is_guest_mode(vcpu)) +@@ -1037,10 +1050,7 @@ static void svm_disable_lbrv(struct kvm_ + KVM_BUG_ON(sev_es_guest(vcpu->kvm), vcpu->kvm); + + svm->vmcb->control.virt_ext &= ~LBR_CTL_ENABLE_MASK; +- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, 0, 0); +- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0); +- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 0, 0); +- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 0, 0); ++ svm_recalc_lbr_msr_intercepts(vcpu); + + /* + * Move the LBR msrs back to the vmcb01 to avoid copying them diff --git a/queue-6.6/mm-damon-tests-core-kunit-handle-alloc-failres-in-damon_test_new_filter.patch b/queue-6.6/mm-damon-tests-core-kunit-handle-alloc-failres-in-damon_test_new_filter.patch new file mode 100644 index 0000000000..6f6bc40d78 --- /dev/null +++ b/queue-6.6/mm-damon-tests-core-kunit-handle-alloc-failres-in-damon_test_new_filter.patch @@ -0,0 +1,40 @@ +From 28ab2265e9422ccd81e4beafc0ace90f78de04c4 Mon Sep 17 00:00:00 2001 +From: SeongJae Park +Date: Sat, 1 Nov 2025 11:20:07 -0700 +Subject: mm/damon/tests/core-kunit: handle alloc failres in damon_test_new_filter() + +From: SeongJae Park + +commit 28ab2265e9422ccd81e4beafc0ace90f78de04c4 upstream. + +damon_test_new_filter() is assuming all dynamic memory allocation in it +will succeed. Those are indeed likely in the real use cases since those +allocations are too small to fail, but theoretically those could fail. In +the case, inappropriate memory access can happen. Fix it by appropriately +cleanup pre-allocated memory and skip the execution of the remaining tests +in the failure cases. 
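The same pattern repeats across all of the DAMON KUnit fixes queued below: check every small allocation, release whatever was already allocated, and kunit_skip() instead of dereferencing a NULL pointer. A generic sketch of the shape, using the DAMON helpers that appear in the later diffs (the exact cleanup depends on the individual test):

	t = damon_new_target();
	if (!t)
		kunit_skip(test, "target alloc fail");

	r = damon_new_region(0, 100);
	if (!r) {
		/* free the earlier allocation before skipping */
		damon_free_target(t);
		kunit_skip(test, "region alloc fail");
	}
	damon_add_region(r, t);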
+ +Link: https://lkml.kernel.org/r/20251101182021.74868-14-sj@kernel.org +Fixes: 2a158e956b98 ("mm/damon/core-test: add a test for damos_new_filter()") +Signed-off-by: SeongJae Park +Cc: Brendan Higgins +Cc: David Gow +Cc: Kefeng Wang +Cc: [6.6+] +Signed-off-by: Andrew Morton +Signed-off-by: Greg Kroah-Hartman +--- + mm/damon/core-test.h | 2 ++ + 1 file changed, 2 insertions(+) + +--- a/mm/damon/core-test.h ++++ b/mm/damon/core-test.h +@@ -346,6 +346,8 @@ static void damos_test_new_filter(struct + struct damos_filter *filter; + + filter = damos_new_filter(DAMOS_FILTER_TYPE_ANON, true); ++ if (!filter) ++ kunit_skip(test, "filter alloc fail"); + KUNIT_EXPECT_EQ(test, filter->type, DAMOS_FILTER_TYPE_ANON); + KUNIT_EXPECT_EQ(test, filter->matching, true); + KUNIT_EXPECT_PTR_EQ(test, filter->list.prev, &filter->list); diff --git a/queue-6.6/mm-damon-tests-core-kunit-handle-alloc-failures-on-damon_test_merge_two.patch b/queue-6.6/mm-damon-tests-core-kunit-handle-alloc-failures-on-damon_test_merge_two.patch new file mode 100644 index 0000000000..b99c1e9b86 --- /dev/null +++ b/queue-6.6/mm-damon-tests-core-kunit-handle-alloc-failures-on-damon_test_merge_two.patch @@ -0,0 +1,52 @@ +From 3d443dd29a1db7efa587a4bb0c06a497e13ca9e4 Mon Sep 17 00:00:00 2001 +From: SeongJae Park +Date: Sat, 1 Nov 2025 11:20:00 -0700 +Subject: mm/damon/tests/core-kunit: handle alloc failures on damon_test_merge_two() + +From: SeongJae Park + +commit 3d443dd29a1db7efa587a4bb0c06a497e13ca9e4 upstream. + +damon_test_merge_two() is assuming all dynamic memory allocation in it +will succeed. Those are indeed likely in the real use cases since those +allocations are too small to fail, but theoretically those could fail. In +the case, inappropriate memory access can happen. Fix it by appropriately +cleanup pre-allocated memory and skip the execution of the remaining tests +in the failure cases. + +Link: https://lkml.kernel.org/r/20251101182021.74868-7-sj@kernel.org +Fixes: 17ccae8bb5c9 ("mm/damon: add kunit tests") +Signed-off-by: SeongJae Park +Cc: Brendan Higgins +Cc: David Gow +Cc: Kefeng Wang +Cc: [5.15+] +Signed-off-by: Andrew Morton +Signed-off-by: Greg Kroah-Hartman +--- + mm/damon/core-test.h | 10 ++++++++++ + 1 file changed, 10 insertions(+) + +--- a/mm/damon/core-test.h ++++ b/mm/damon/core-test.h +@@ -162,10 +162,20 @@ static void damon_test_merge_two(struct + int i; + + t = damon_new_target(); ++ if (!t) ++ kunit_skip(test, "target alloc fail"); + r = damon_new_region(0, 100); ++ if (!r) { ++ damon_free_target(t); ++ kunit_skip(test, "region alloc fail"); ++ } + r->nr_accesses = 10; + damon_add_region(r, t); + r2 = damon_new_region(100, 300); ++ if (!r2) { ++ damon_free_target(t); ++ kunit_skip(test, "second region alloc fail"); ++ } + r2->nr_accesses = 20; + damon_add_region(r2, t); + diff --git a/queue-6.6/mm-damon-tests-core-kunit-handle-alloc-failures-on-damon_test_split_at.patch b/queue-6.6/mm-damon-tests-core-kunit-handle-alloc-failures-on-damon_test_split_at.patch new file mode 100644 index 0000000000..f9d8719795 --- /dev/null +++ b/queue-6.6/mm-damon-tests-core-kunit-handle-alloc-failures-on-damon_test_split_at.patch @@ -0,0 +1,51 @@ +From 5e80d73f22043c59c8ad36452a3253937ed77955 Mon Sep 17 00:00:00 2001 +From: SeongJae Park +Date: Sat, 1 Nov 2025 11:19:59 -0700 +Subject: mm/damon/tests/core-kunit: handle alloc failures on damon_test_split_at() + +From: SeongJae Park + +commit 5e80d73f22043c59c8ad36452a3253937ed77955 upstream. 
+ +damon_test_split_at() is assuming all dynamic memory allocation in it will +succeed. Those are indeed likely in the real use cases since those +allocations are too small to fail, but theoretically those could fail. In +the case, inappropriate memory access can happen. Fix it by appropriately +cleanup pre-allocated memory and skip the execution of the remaining tests +in the failure cases. + +Link: https://lkml.kernel.org/r/20251101182021.74868-6-sj@kernel.org +Fixes: 17ccae8bb5c9 ("mm/damon: add kunit tests") +Signed-off-by: SeongJae Park +Cc: Brendan Higgins +Cc: David Gow +Cc: Kefeng Wang +Cc: [5.15+] +Signed-off-by: Andrew Morton +Signed-off-by: Greg Kroah-Hartman +--- + mm/damon/core-test.h | 11 +++++++++++ + 1 file changed, 11 insertions(+) + +--- a/mm/damon/core-test.h ++++ b/mm/damon/core-test.h +@@ -129,8 +129,19 @@ static void damon_test_split_at(struct k + struct damon_target *t; + struct damon_region *r; + ++ if (!c) ++ kunit_skip(test, "ctx alloc fail"); + t = damon_new_target(); ++ if (!t) { ++ damon_destroy_ctx(c); ++ kunit_skip(test, "target alloc fail"); ++ } + r = damon_new_region(0, 100); ++ if (!r) { ++ damon_destroy_ctx(c); ++ damon_free_target(t); ++ kunit_skip(test, "region alloc fail"); ++ } + damon_add_region(r, t); + damon_split_region_at(t, r, 25); + KUNIT_EXPECT_EQ(test, r->ar.start, 0ul); diff --git a/queue-6.6/mm-damon-tests-core-kunit-handle-alloc-failures-on-dasmon_test_merge_regions_of.patch b/queue-6.6/mm-damon-tests-core-kunit-handle-alloc-failures-on-dasmon_test_merge_regions_of.patch new file mode 100644 index 0000000000..65733cf0bf --- /dev/null +++ b/queue-6.6/mm-damon-tests-core-kunit-handle-alloc-failures-on-dasmon_test_merge_regions_of.patch @@ -0,0 +1,46 @@ +From 0998d2757218771c59d5ca59ccf13d1542a38f17 Mon Sep 17 00:00:00 2001 +From: SeongJae Park +Date: Sat, 1 Nov 2025 11:20:01 -0700 +Subject: mm/damon/tests/core-kunit: handle alloc failures on dasmon_test_merge_regions_of() + +From: SeongJae Park + +commit 0998d2757218771c59d5ca59ccf13d1542a38f17 upstream. + +damon_test_merge_regions_of() is assuming all dynamic memory allocation in +it will succeed. Those are indeed likely in the real use cases since +those allocations are too small to fail, but theoretically those could +fail. In the case, inappropriate memory access can happen. Fix it by +appropriately cleanup pre-allocated memory and skip the execution of the +remaining tests in the failure cases. 
+ +Link: https://lkml.kernel.org/r/20251101182021.74868-8-sj@kernel.org +Fixes: 17ccae8bb5c9 ("mm/damon: add kunit tests") +Signed-off-by: SeongJae Park +Cc: Brendan Higgins +Cc: David Gow +Cc: Kefeng Wang +Cc: [5.15+] +Signed-off-by: Andrew Morton +Signed-off-by: Greg Kroah-Hartman +--- + mm/damon/core-test.h | 6 ++++++ + 1 file changed, 6 insertions(+) + +--- a/mm/damon/core-test.h ++++ b/mm/damon/core-test.h +@@ -210,8 +210,14 @@ static void damon_test_merge_regions_of( + int i; + + t = damon_new_target(); ++ if (!t) ++ kunit_skip(test, "target alloc fail"); + for (i = 0; i < ARRAY_SIZE(sa); i++) { + r = damon_new_region(sa[i], ea[i]); ++ if (!r) { ++ damon_free_target(t); ++ kunit_skip(test, "region alloc fail"); ++ } + r->nr_accesses = nrs[i]; + damon_add_region(r, t); + } diff --git a/queue-6.6/mm-damon-tests-core-kunit-handle-allocation-failures-in-damon_test_regions.patch b/queue-6.6/mm-damon-tests-core-kunit-handle-allocation-failures-in-damon_test_regions.patch new file mode 100644 index 0000000000..f9bf68b84b --- /dev/null +++ b/queue-6.6/mm-damon-tests-core-kunit-handle-allocation-failures-in-damon_test_regions.patch @@ -0,0 +1,49 @@ +From e16fdd4f754048d6e23c56bd8d920b71e41e3777 Mon Sep 17 00:00:00 2001 +From: SeongJae Park +Date: Sat, 1 Nov 2025 11:19:56 -0700 +Subject: mm/damon/tests/core-kunit: handle allocation failures in damon_test_regions() + +From: SeongJae Park + +commit e16fdd4f754048d6e23c56bd8d920b71e41e3777 upstream. + +damon_test_regions() is assuming all dynamic memory allocation in it will +succeed. Those are indeed likely in the real use cases since those +allocations are too small to fail, but theoretically those could fail. In +the case, inappropriate memory access can happen. Fix it by appropriately +cleanup pre-allocated memory and skip the execution of the remaining tests +in the failure cases. + +Link: https://lkml.kernel.org/r/20251101182021.74868-3-sj@kernel.org +Fixes: 17ccae8bb5c9 ("mm/damon: add kunit tests") +Signed-off-by: SeongJae Park +Cc: Brendan Higgins +Cc: David Gow +Cc: Kefeng Wang +Cc: [5.15+] +Signed-off-by: Andrew Morton +Signed-off-by: Greg Kroah-Hartman +--- + mm/damon/core-test.h | 6 ++++++ + 1 file changed, 6 insertions(+) + +--- a/mm/damon/core-test.h ++++ b/mm/damon/core-test.h +@@ -20,11 +20,17 @@ static void damon_test_regions(struct ku + struct damon_target *t; + + r = damon_new_region(1, 2); ++ if (!r) ++ kunit_skip(test, "region alloc fail"); + KUNIT_EXPECT_EQ(test, 1ul, r->ar.start); + KUNIT_EXPECT_EQ(test, 2ul, r->ar.end); + KUNIT_EXPECT_EQ(test, 0u, r->nr_accesses); + + t = damon_new_target(); ++ if (!t) { ++ damon_free_region(r); ++ kunit_skip(test, "target alloc fail"); ++ } + KUNIT_EXPECT_EQ(test, 0u, damon_nr_regions(t)); + + damon_add_region(r, t); diff --git a/queue-6.6/mm-damon-tests-vaddr-kunit-handle-alloc-failures-in-damon_test_split_evenly_fail.patch b/queue-6.6/mm-damon-tests-vaddr-kunit-handle-alloc-failures-in-damon_test_split_evenly_fail.patch new file mode 100644 index 0000000000..856ebc31e6 --- /dev/null +++ b/queue-6.6/mm-damon-tests-vaddr-kunit-handle-alloc-failures-in-damon_test_split_evenly_fail.patch @@ -0,0 +1,49 @@ +From 7890e5b5bb6e386155c6e755fe70e0cdcc77f18e Mon Sep 17 00:00:00 2001 +From: SeongJae Park +Date: Sat, 1 Nov 2025 11:20:12 -0700 +Subject: mm/damon/tests/vaddr-kunit: handle alloc failures in damon_test_split_evenly_fail() + +From: SeongJae Park + +commit 7890e5b5bb6e386155c6e755fe70e0cdcc77f18e upstream. 
+ +damon_test_split_evenly_fail() is assuming all dynamic memory allocation +in it will succeed. Those are indeed likely in the real use cases since +those allocations are too small to fail, but theoretically those could +fail. In the case, inappropriate memory access can happen. Fix it by +appropriately cleanup pre-allocated memory and skip the execution of the +remaining tests in the failure cases. + +Link: https://lkml.kernel.org/r/20251101182021.74868-19-sj@kernel.org +Fixes: 17ccae8bb5c9 ("mm/damon: add kunit tests") +Signed-off-by: SeongJae Park +Cc: Brendan Higgins +Cc: David Gow +Cc: Kefeng Wang +Cc: [5.15+] +Signed-off-by: Andrew Morton +Signed-off-by: Greg Kroah-Hartman +--- + mm/damon/vaddr-test.h | 11 ++++++++++- + 1 file changed, 10 insertions(+), 1 deletion(-) + +--- a/mm/damon/vaddr-test.h ++++ b/mm/damon/vaddr-test.h +@@ -250,7 +250,16 @@ static void damon_test_split_evenly_fail + unsigned long start, unsigned long end, unsigned int nr_pieces) + { + struct damon_target *t = damon_new_target(); +- struct damon_region *r = damon_new_region(start, end); ++ struct damon_region *r; ++ ++ if (!t) ++ kunit_skip(test, "target alloc fail"); ++ ++ r = damon_new_region(start, end); ++ if (!r) { ++ damon_free_target(t); ++ kunit_skip(test, "region alloc fail"); ++ } + + damon_add_region(r, t); + KUNIT_EXPECT_EQ(test, diff --git a/queue-6.6/mm-damon-tests-vaddr-kunit-handle-alloc-failures-on-damon_do_test_apply_three_regions.patch b/queue-6.6/mm-damon-tests-vaddr-kunit-handle-alloc-failures-on-damon_do_test_apply_three_regions.patch new file mode 100644 index 0000000000..a989848ce8 --- /dev/null +++ b/queue-6.6/mm-damon-tests-vaddr-kunit-handle-alloc-failures-on-damon_do_test_apply_three_regions.patch @@ -0,0 +1,47 @@ +From 2b22d0fcc6320ba29b2122434c1d2f0785fb0a25 Mon Sep 17 00:00:00 2001 +From: SeongJae Park +Date: Sat, 1 Nov 2025 11:20:11 -0700 +Subject: mm/damon/tests/vaddr-kunit: handle alloc failures on damon_do_test_apply_three_regions() + +From: SeongJae Park + +commit 2b22d0fcc6320ba29b2122434c1d2f0785fb0a25 upstream. + +damon_do_test_apply_three_regions() is assuming all dynamic memory +allocation in it will succeed. Those are indeed likely in the real use +cases since those allocations are too small to fail, but theoretically +those could fail. In the case, inappropriate memory access can happen. +Fix it by appropriately cleanup pre-allocated memory and skip the +execution of the remaining tests in the failure cases. 
+ +Link: https://lkml.kernel.org/r/20251101182021.74868-18-sj@kernel.org +Fixes: 17ccae8bb5c9 ("mm/damon: add kunit tests") +Signed-off-by: SeongJae Park +Cc: Brendan Higgins +Cc: David Gow +Cc: Kefeng Wang +Cc: [5.15+] +Signed-off-by: Andrew Morton +Signed-off-by: SeongJae Park +Signed-off-by: Greg Kroah-Hartman +--- + mm/damon/vaddr-test.h | 6 ++++++ + 1 file changed, 6 insertions(+) + +--- a/mm/damon/vaddr-test.h ++++ b/mm/damon/vaddr-test.h +@@ -136,8 +136,14 @@ static void damon_do_test_apply_three_re + int i; + + t = damon_new_target(); ++ if (!t) ++ kunit_skip(test, "target alloc fail"); + for (i = 0; i < nr_regions / 2; i++) { + r = damon_new_region(regions[i * 2], regions[i * 2 + 1]); ++ if (!r) { ++ damon_destroy_target(t); ++ kunit_skip(test, "region alloc fail"); ++ } + damon_add_region(r, t); + } + diff --git a/queue-6.6/mm-damon-tests-vaddr-kunit-handle-alloc-failures-on-damon_test_split_evenly_succ.patch b/queue-6.6/mm-damon-tests-vaddr-kunit-handle-alloc-failures-on-damon_test_split_evenly_succ.patch new file mode 100644 index 0000000000..c3e32d0b67 --- /dev/null +++ b/queue-6.6/mm-damon-tests-vaddr-kunit-handle-alloc-failures-on-damon_test_split_evenly_succ.patch @@ -0,0 +1,50 @@ +From 0a63a0e7570b9b2631dfb8d836dc572709dce39e Mon Sep 17 00:00:00 2001 +From: SeongJae Park +Date: Sat, 1 Nov 2025 11:20:13 -0700 +Subject: mm/damon/tests/vaddr-kunit: handle alloc failures on damon_test_split_evenly_succ() + +From: SeongJae Park + +commit 0a63a0e7570b9b2631dfb8d836dc572709dce39e upstream. + +damon_test_split_evenly_succ() is assuming all dynamic memory allocation +in it will succeed. Those are indeed likely in the real use cases since +those allocations are too small to fail, but theoretically those could +fail. In the case, inappropriate memory access can happen. Fix it by +appropriately cleanup pre-allocated memory and skip the execution of the +remaining tests in the failure cases. + +Link: https://lkml.kernel.org/r/20251101182021.74868-20-sj@kernel.org +Fixes: 17ccae8bb5c9 ("mm/damon: add kunit tests") +Signed-off-by: SeongJae Park +Cc: Brendan Higgins +Cc: David Gow +Cc: Kefeng Wang +Cc: [5.15+] +Signed-off-by: Andrew Morton +Signed-off-by: Greg Kroah-Hartman +--- + mm/damon/vaddr-test.h | 9 ++++++++- + 1 file changed, 8 insertions(+), 1 deletion(-) + +--- a/mm/damon/vaddr-test.h ++++ b/mm/damon/vaddr-test.h +@@ -284,10 +284,17 @@ static void damon_test_split_evenly_succ + unsigned long start, unsigned long end, unsigned int nr_pieces) + { + struct damon_target *t = damon_new_target(); +- struct damon_region *r = damon_new_region(start, end); ++ struct damon_region *r; + unsigned long expected_width = (end - start) / nr_pieces; + unsigned long i = 0; + ++ if (!t) ++ kunit_skip(test, "target alloc fail"); ++ r = damon_new_region(start, end); ++ if (!r) { ++ damon_free_target(t); ++ kunit_skip(test, "region alloc fail"); ++ } + damon_add_region(r, t); + KUNIT_EXPECT_EQ(test, + damon_va_evenly_split_region(t, r, nr_pieces), 0); diff --git a/queue-6.6/net-allow-to-use-smp-threads-for-backlog-napi.patch b/queue-6.6/net-allow-to-use-smp-threads-for-backlog-napi.patch new file mode 100644 index 0000000000..4893514627 --- /dev/null +++ b/queue-6.6/net-allow-to-use-smp-threads-for-backlog-napi.patch @@ -0,0 +1,339 @@ +From wen.yang@linux.dev Mon Dec 29 08:53:54 2025 +From: wen.yang@linux.dev +Date: Mon, 29 Dec 2025 15:53:17 +0800 +Subject: net: Allow to use SMP threads for backlog NAPI. 
+To: Greg Kroah-Hartman +Cc: stable@vger.kernel.org, linux-kernel@vger.kernel.org, Sebastian Andrzej Siewior , Jakub Kicinski , Paolo Abeni , Wen Yang +Message-ID: <013481655ddb09ae214bc510502efe6cf32b3445.1766987153.git.wen.yang@linux.dev> + +From: Sebastian Andrzej Siewior + +commit dad6b97702639fba27a2bd3e986982ad6f0db3a7 upstream. + +Backlog NAPI is a per-CPU NAPI struct only (with no device behind it) +used by drivers which don't do NAPI them self, RPS and parts of the +stack which need to avoid recursive deadlocks while processing a packet. + +The non-NAPI driver use the CPU local backlog NAPI. If RPS is enabled +then a flow for the skb is computed and based on the flow the skb can be +enqueued on a remote CPU. Scheduling/ raising the softirq (for backlog's +NAPI) on the remote CPU isn't trivial because the softirq is only +scheduled on the local CPU and performed after the hardirq is done. +In order to schedule a softirq on the remote CPU, an IPI is sent to the +remote CPU which schedules the backlog-NAPI on the then local CPU. + +On PREEMPT_RT interrupts are force-threaded. The soft interrupts are +raised within the interrupt thread and processed after the interrupt +handler completed still within the context of the interrupt thread. The +softirq is handled in the context where it originated. + +With force-threaded interrupts enabled, ksoftirqd is woken up if a +softirq is raised from hardirq context. This is the case if it is raised +from an IPI. Additionally there is a warning on PREEMPT_RT if the +softirq is raised from the idle thread. +This was done for two reasons: +- With threaded interrupts the processing should happen in thread + context (where it originated) and ksoftirqd is the only thread for + this context if raised from hardirq. Using the currently running task + instead would "punish" a random task. +- Once ksoftirqd is active it consumes all further softirqs until it + stops running. This changed recently and is no longer the case. + +Instead of keeping the backlog NAPI in ksoftirqd (in force-threaded/ +PREEMPT_RT setups) I am proposing NAPI-threads for backlog. +The "proper" setup with threaded-NAPI is not doable because the threads +are not pinned to an individual CPU and can be modified by the user. +Additionally a dummy network device would have to be assigned. Also +CPU-hotplug has to be considered if additional CPUs show up. +All this can be probably done/ solved but the smpboot-threads already +provide this infrastructure. + +Sending UDP packets over loopback expects that the packet is processed +within the call. Delaying it by handing it over to the thread hurts +performance. It is not beneficial to the outcome if the context switch +happens immediately after enqueue or after a while to process a few +packets in a batch. +There is no need to always use the thread if the backlog NAPI is +requested on the local CPU. This restores the loopback throuput. The +performance drops mostly to the same value after enabling RPS on the +loopback comparing the IPI and the tread result. + +Create NAPI-threads for backlog if request during boot. The thread runs +the inner loop from napi_threaded_poll(), the wait part is different. It +checks for NAPI_STATE_SCHED (the backlog NAPI can not be disabled). + +The NAPI threads for backlog are optional, it has to be enabled via the boot +argument "thread_backlog_napi". It is mandatory for PREEMPT_RT to avoid the +wakeup of ksoftirqd from the IPI. 
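On non-PREEMPT_RT kernels the behaviour stays off unless requested at boot. Based on the early_param() added below, enabling it amounts to booting with:

	thread_backlog_napi

on the kernel command line; the per-CPU threads registered through smpboot are then named backlog_napi/<cpu> (per the .thread_comm format in the diff). On PREEMPT_RT builds use_backlog_threads() is hard-wired to true, so no parameter is needed.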
+ +Acked-by: Jakub Kicinski +Signed-off-by: Sebastian Andrzej Siewior +Signed-off-by: Paolo Abeni +Signed-off-by: Wen Yang +Signed-off-by: Greg Kroah-Hartman +--- + net/core/dev.c | 152 +++++++++++++++++++++++++++++++++++++++++++-------------- + 1 file changed, 115 insertions(+), 37 deletions(-) + +--- a/net/core/dev.c ++++ b/net/core/dev.c +@@ -78,6 +78,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -217,6 +218,31 @@ static inline struct hlist_head *dev_ind + return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)]; + } + ++#ifndef CONFIG_PREEMPT_RT ++ ++static DEFINE_STATIC_KEY_FALSE(use_backlog_threads_key); ++ ++static int __init setup_backlog_napi_threads(char *arg) ++{ ++ static_branch_enable(&use_backlog_threads_key); ++ return 0; ++} ++early_param("thread_backlog_napi", setup_backlog_napi_threads); ++ ++static bool use_backlog_threads(void) ++{ ++ return static_branch_unlikely(&use_backlog_threads_key); ++} ++ ++#else ++ ++static bool use_backlog_threads(void) ++{ ++ return true; ++} ++ ++#endif ++ + static inline void rps_lock_irqsave(struct softnet_data *sd, + unsigned long *flags) + { +@@ -4494,6 +4520,7 @@ EXPORT_SYMBOL(__dev_direct_xmit); + /************************************************************************* + * Receiver routines + *************************************************************************/ ++static DEFINE_PER_CPU(struct task_struct *, backlog_napi); + + int netdev_max_backlog __read_mostly = 1000; + EXPORT_SYMBOL(netdev_max_backlog); +@@ -4526,12 +4553,16 @@ static inline void ____napi_schedule(str + */ + thread = READ_ONCE(napi->thread); + if (thread) { ++ if (use_backlog_threads() && thread == raw_cpu_read(backlog_napi)) ++ goto use_local_napi; ++ + set_bit(NAPI_STATE_SCHED_THREADED, &napi->state); + wake_up_process(thread); + return; + } + } + ++use_local_napi: + list_add_tail(&napi->poll_list, &sd->poll_list); + WRITE_ONCE(napi->list_owner, smp_processor_id()); + /* If not called from net_rx_action() +@@ -4777,6 +4808,11 @@ static void napi_schedule_rps(struct sof + + #ifdef CONFIG_RPS + if (sd != mysd) { ++ if (use_backlog_threads()) { ++ __napi_schedule_irqoff(&sd->backlog); ++ return; ++ } ++ + sd->rps_ipi_next = mysd->rps_ipi_list; + mysd->rps_ipi_list = sd; + +@@ -6000,7 +6036,7 @@ static void net_rps_action_and_irq_enabl + #ifdef CONFIG_RPS + struct softnet_data *remsd = sd->rps_ipi_list; + +- if (remsd) { ++ if (!use_backlog_threads() && remsd) { + sd->rps_ipi_list = NULL; + + local_irq_enable(); +@@ -6015,7 +6051,7 @@ static void net_rps_action_and_irq_enabl + static bool sd_has_rps_ipi_waiting(struct softnet_data *sd) + { + #ifdef CONFIG_RPS +- return sd->rps_ipi_list != NULL; ++ return !use_backlog_threads() && sd->rps_ipi_list; + #else + return false; + #endif +@@ -6059,7 +6095,7 @@ static int process_backlog(struct napi_s + * We can use a plain write instead of clear_bit(), + * and we dont need an smp_mb() memory barrier. 
+ */ +- napi->state = 0; ++ napi->state &= NAPIF_STATE_THREADED; + again = false; + } else { + skb_queue_splice_tail_init(&sd->input_pkt_queue, +@@ -6725,43 +6761,48 @@ static void skb_defer_free_flush(struct + } + } + +-static int napi_threaded_poll(void *data) ++static void napi_threaded_poll_loop(struct napi_struct *napi) + { +- struct napi_struct *napi = data; + struct softnet_data *sd; +- void *have; ++ unsigned long last_qs = jiffies; + +- while (!napi_thread_wait(napi)) { +- unsigned long last_qs = jiffies; ++ for (;;) { ++ bool repoll = false; ++ void *have; + +- for (;;) { +- bool repoll = false; ++ local_bh_disable(); ++ sd = this_cpu_ptr(&softnet_data); ++ sd->in_napi_threaded_poll = true; + +- local_bh_disable(); +- sd = this_cpu_ptr(&softnet_data); +- sd->in_napi_threaded_poll = true; +- +- have = netpoll_poll_lock(napi); +- __napi_poll(napi, &repoll); +- netpoll_poll_unlock(have); +- +- sd->in_napi_threaded_poll = false; +- barrier(); +- +- if (sd_has_rps_ipi_waiting(sd)) { +- local_irq_disable(); +- net_rps_action_and_irq_enable(sd); +- } +- skb_defer_free_flush(sd); +- local_bh_enable(); ++ have = netpoll_poll_lock(napi); ++ __napi_poll(napi, &repoll); ++ netpoll_poll_unlock(have); ++ ++ sd->in_napi_threaded_poll = false; ++ barrier(); ++ ++ if (sd_has_rps_ipi_waiting(sd)) { ++ local_irq_disable(); ++ net_rps_action_and_irq_enable(sd); ++ } ++ skb_defer_free_flush(sd); ++ local_bh_enable(); + +- if (!repoll) +- break; ++ if (!repoll) ++ break; + +- rcu_softirq_qs_periodic(last_qs); +- cond_resched(); +- } ++ rcu_softirq_qs_periodic(last_qs); ++ cond_resched(); + } ++} ++ ++static int napi_threaded_poll(void *data) ++{ ++ struct napi_struct *napi = data; ++ ++ while (!napi_thread_wait(napi)) ++ napi_threaded_poll_loop(napi); ++ + return 0; + } + +@@ -11346,7 +11387,7 @@ static int dev_cpu_dead(unsigned int old + + list_del_init(&napi->poll_list); + if (napi->poll == process_backlog) +- napi->state = 0; ++ napi->state &= NAPIF_STATE_THREADED; + else + ____napi_schedule(sd, napi); + } +@@ -11354,12 +11395,14 @@ static int dev_cpu_dead(unsigned int old + raise_softirq_irqoff(NET_TX_SOFTIRQ); + local_irq_enable(); + ++ if (!use_backlog_threads()) { + #ifdef CONFIG_RPS +- remsd = oldsd->rps_ipi_list; +- oldsd->rps_ipi_list = NULL; ++ remsd = oldsd->rps_ipi_list; ++ oldsd->rps_ipi_list = NULL; + #endif +- /* send out pending IPI's on offline CPU */ +- net_rps_send_ipi(remsd); ++ /* send out pending IPI's on offline CPU */ ++ net_rps_send_ipi(remsd); ++ } + + /* Process offline CPU's input_pkt_queue */ + while ((skb = __skb_dequeue(&oldsd->process_queue))) { +@@ -11622,6 +11665,38 @@ static struct pernet_operations __net_in + * + */ + ++static int backlog_napi_should_run(unsigned int cpu) ++{ ++ struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu); ++ struct napi_struct *napi = &sd->backlog; ++ ++ return test_bit(NAPI_STATE_SCHED_THREADED, &napi->state); ++} ++ ++static void run_backlog_napi(unsigned int cpu) ++{ ++ struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu); ++ ++ napi_threaded_poll_loop(&sd->backlog); ++} ++ ++static void backlog_napi_setup(unsigned int cpu) ++{ ++ struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu); ++ struct napi_struct *napi = &sd->backlog; ++ ++ napi->thread = this_cpu_read(backlog_napi); ++ set_bit(NAPI_STATE_THREADED, &napi->state); ++} ++ ++static struct smp_hotplug_thread backlog_threads = { ++ .store = &backlog_napi, ++ .thread_should_run = backlog_napi_should_run, ++ .thread_fn = run_backlog_napi, ++ .thread_comm = 
"backlog_napi/%u", ++ .setup = backlog_napi_setup, ++}; ++ + /* + * This is called single threaded during boot, so no need + * to take the rtnl semaphore. +@@ -11672,7 +11747,10 @@ static int __init net_dev_init(void) + init_gro_hash(&sd->backlog); + sd->backlog.poll = process_backlog; + sd->backlog.weight = weight_p; ++ INIT_LIST_HEAD(&sd->backlog.poll_list); + } ++ if (use_backlog_threads()) ++ smpboot_register_percpu_thread(&backlog_threads); + + dev_boot_phase = 0; + diff --git a/queue-6.6/net-remove-conditional-threaded-napi-wakeup-based-on-task-state.patch b/queue-6.6/net-remove-conditional-threaded-napi-wakeup-based-on-task-state.patch new file mode 100644 index 0000000000..d51b779994 --- /dev/null +++ b/queue-6.6/net-remove-conditional-threaded-napi-wakeup-based-on-task-state.patch @@ -0,0 +1,83 @@ +From stable+bounces-203472-greg=kroah.com@vger.kernel.org Mon Dec 29 08:54:02 2025 +From: wen.yang@linux.dev +Date: Mon, 29 Dec 2025 15:53:16 +0800 +Subject: net: Remove conditional threaded-NAPI wakeup based on task state. +To: Greg Kroah-Hartman +Cc: stable@vger.kernel.org, linux-kernel@vger.kernel.org, Sebastian Andrzej Siewior , Jakub Kicinski , Paolo Abeni , Wen Yang +Message-ID: + +From: Sebastian Andrzej Siewior + +commit 56364c910691f6d10ba88c964c9041b9ab777bd6 upstream. + +A NAPI thread is scheduled by first setting NAPI_STATE_SCHED bit. If +successful (the bit was not yet set) then the NAPI_STATE_SCHED_THREADED +is set but only if thread's state is not TASK_INTERRUPTIBLE (is +TASK_RUNNING) followed by task wakeup. + +If the task is idle (TASK_INTERRUPTIBLE) then the +NAPI_STATE_SCHED_THREADED bit is not set. The thread is no relying on +the bit but always leaving the wait-loop after returning from schedule() +because there must have been a wakeup. + +The smpboot-threads implementation for per-CPU threads requires an +explicit condition and does not support "if we get out of schedule() +then there must be something to do". + +Removing this optimisation simplifies the following integration. + +Set NAPI_STATE_SCHED_THREADED unconditionally on wakeup and rely on it +in the wait path by removing the `woken' condition. + +Acked-by: Jakub Kicinski +Signed-off-by: Sebastian Andrzej Siewior +Signed-off-by: Paolo Abeni +Signed-off-by: Wen Yang +Signed-off-by: Greg Kroah-Hartman +--- + net/core/dev.c | 14 ++------------ + 1 file changed, 2 insertions(+), 12 deletions(-) + +--- a/net/core/dev.c ++++ b/net/core/dev.c +@@ -4526,13 +4526,7 @@ static inline void ____napi_schedule(str + */ + thread = READ_ONCE(napi->thread); + if (thread) { +- /* Avoid doing set_bit() if the thread is in +- * INTERRUPTIBLE state, cause napi_thread_wait() +- * makes sure to proceed with napi polling +- * if the thread is explicitly woken from here. +- */ +- if (READ_ONCE(thread->__state) != TASK_INTERRUPTIBLE) +- set_bit(NAPI_STATE_SCHED_THREADED, &napi->state); ++ set_bit(NAPI_STATE_SCHED_THREADED, &napi->state); + wake_up_process(thread); + return; + } +@@ -6688,8 +6682,6 @@ static int napi_poll(struct napi_struct + + static int napi_thread_wait(struct napi_struct *napi) + { +- bool woken = false; +- + set_current_state(TASK_INTERRUPTIBLE); + + while (!kthread_should_stop()) { +@@ -6698,15 +6690,13 @@ static int napi_thread_wait(struct napi_ + * Testing SCHED bit is not enough because SCHED bit might be + * set by some other busy poll thread or by napi_disable(). 
+ */ +- if (test_bit(NAPI_STATE_SCHED_THREADED, &napi->state) || woken) { ++ if (test_bit(NAPI_STATE_SCHED_THREADED, &napi->state)) { + WARN_ON(!list_empty(&napi->poll_list)); + __set_current_state(TASK_RUNNING); + return 0; + } + + schedule(); +- /* woken being true indicates this thread owns this napi. */ +- woken = true; + set_current_state(TASK_INTERRUPTIBLE); + } + __set_current_state(TASK_RUNNING); diff --git a/queue-6.6/rdma-core-fix-kasan-slab-use-after-free-read-in-ib_register_device-problem.patch b/queue-6.6/rdma-core-fix-kasan-slab-use-after-free-read-in-ib_register_device-problem.patch new file mode 100644 index 0000000000..e268a1cfe3 --- /dev/null +++ b/queue-6.6/rdma-core-fix-kasan-slab-use-after-free-read-in-ib_register_device-problem.patch @@ -0,0 +1,85 @@ +From d0706bfd3ee40923c001c6827b786a309e2a8713 Mon Sep 17 00:00:00 2001 +From: Zhu Yanjun +Date: Tue, 6 May 2025 17:10:08 +0200 +Subject: RDMA/core: Fix "KASAN: slab-use-after-free Read in ib_register_device" problem + +From: Zhu Yanjun + +commit d0706bfd3ee40923c001c6827b786a309e2a8713 upstream. + +Call Trace: + + __dump_stack lib/dump_stack.c:94 [inline] + dump_stack_lvl+0x116/0x1f0 lib/dump_stack.c:120 + print_address_description mm/kasan/report.c:408 [inline] + print_report+0xc3/0x670 mm/kasan/report.c:521 + kasan_report+0xe0/0x110 mm/kasan/report.c:634 + strlen+0x93/0xa0 lib/string.c:420 + __fortify_strlen include/linux/fortify-string.h:268 [inline] + get_kobj_path_length lib/kobject.c:118 [inline] + kobject_get_path+0x3f/0x2a0 lib/kobject.c:158 + kobject_uevent_env+0x289/0x1870 lib/kobject_uevent.c:545 + ib_register_device drivers/infiniband/core/device.c:1472 [inline] + ib_register_device+0x8cf/0xe00 drivers/infiniband/core/device.c:1393 + rxe_register_device+0x275/0x320 drivers/infiniband/sw/rxe/rxe_verbs.c:1552 + rxe_net_add+0x8e/0xe0 drivers/infiniband/sw/rxe/rxe_net.c:550 + rxe_newlink+0x70/0x190 drivers/infiniband/sw/rxe/rxe.c:225 + nldev_newlink+0x3a3/0x680 drivers/infiniband/core/nldev.c:1796 + rdma_nl_rcv_msg+0x387/0x6e0 drivers/infiniband/core/netlink.c:195 + rdma_nl_rcv_skb.constprop.0.isra.0+0x2e5/0x450 + netlink_unicast_kernel net/netlink/af_netlink.c:1313 [inline] + netlink_unicast+0x53a/0x7f0 net/netlink/af_netlink.c:1339 + netlink_sendmsg+0x8d1/0xdd0 net/netlink/af_netlink.c:1883 + sock_sendmsg_nosec net/socket.c:712 [inline] + __sock_sendmsg net/socket.c:727 [inline] + ____sys_sendmsg+0xa95/0xc70 net/socket.c:2566 + ___sys_sendmsg+0x134/0x1d0 net/socket.c:2620 + __sys_sendmsg+0x16d/0x220 net/socket.c:2652 + do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] + do_syscall_64+0xcd/0x260 arch/x86/entry/syscall_64.c:94 + entry_SYSCALL_64_after_hwframe+0x77/0x7f + +This problem is similar to the problem that the +commit 1d6a9e7449e2 ("RDMA/core: Fix use-after-free when rename device name") +fixes. + +The root cause is: the function ib_device_rename() renames the name with +lock. But in the function kobject_uevent(), this name is accessed without +lock protection at the same time. + +The solution is to add the lock protection when this name is accessed in +the function kobject_uevent(). 
+ +Fixes: 779e0bf47632 ("RDMA/core: Do not indicate device ready when device enablement fails") +Link: https://patch.msgid.link/r/20250506151008.75701-1-yanjun.zhu@linux.dev +Reported-by: syzbot+e2ce9e275ecc70a30b72@syzkaller.appspotmail.com +Closes: https://syzkaller.appspot.com/bug?extid=e2ce9e275ecc70a30b72 +Signed-off-by: Zhu Yanjun +Signed-off-by: Jason Gunthorpe +Signed-off-by: Sasha Levin +[ Ajay: Modified to apply on v5.10.y-v6.6.y + ib_device_notify_register() not present in v5.10.y-v6.6.y, + so directly added lock for kobject_uevent() ] +Signed-off-by: Ajay Kaher +Signed-off-by: Shivani Agarwal +Signed-off-by: Greg Kroah-Hartman +--- + drivers/infiniband/core/device.c | 5 +++++ + 1 file changed, 5 insertions(+) + +--- a/drivers/infiniband/core/device.c ++++ b/drivers/infiniband/core/device.c +@@ -1450,8 +1450,13 @@ int ib_register_device(struct ib_device + return ret; + } + dev_set_uevent_suppress(&device->dev, false); ++ ++ down_read(&devices_rwsem); ++ + /* Mark for userspace that device is ready */ + kobject_uevent(&device->dev.kobj, KOBJ_ADD); ++ ++ up_read(&devices_rwsem); + ib_device_put(device); + + return 0; diff --git a/queue-6.6/rdma-rxe-fix-the-failure-of-ibv_query_device-and-ibv_query_device_ex-tests.patch b/queue-6.6/rdma-rxe-fix-the-failure-of-ibv_query_device-and-ibv_query_device_ex-tests.patch new file mode 100644 index 0000000000..236093812c --- /dev/null +++ b/queue-6.6/rdma-rxe-fix-the-failure-of-ibv_query_device-and-ibv_query_device_ex-tests.patch @@ -0,0 +1,138 @@ +From shivani.agarwal@broadcom.com Thu Jan 8 11:26:51 2026 +From: Shivani Agarwal +Date: Thu, 8 Jan 2026 02:05:40 -0800 +Subject: RDMA/rxe: Fix the failure of ibv_query_device() and ibv_query_device_ex() tests +To: stable@vger.kernel.org, gregkh@linuxfoundation.org +Cc: jgg@ziepe.ca, leon@kernel.org, zyjzyj2000@gmail.com, mbloch@nvidia.com, parav@nvidia.com, mrgolin@amazon.com, roman.gushchin@linux.dev, wangliang74@huawei.com, marco.crivellari@suse.com, zhao.xichao@vivo.com, haggaie@mellanox.com, monis@mellanox.com, dledford@redhat.com, amirv@mellanox.com, kamalh@mellanox.com, linux-rdma@vger.kernel.org, linux-kernel@vger.kernel.org, ajay.kaher@broadcom.com, alexey.makhalov@broadcom.com, vamsi-krishna.brahmajosyula@broadcom.com, yin.ding@broadcom.com, tapas.kundu@broadcom.com, Zhu Yanjun , Daisuke Matsuda , Sasha Levin , Shivani Agarwal +Message-ID: <20260108100540.672666-3-shivani.agarwal@broadcom.com> + +From: Zhu Yanjun + +[ Upstream commit 8ce2eb9dfac8743d1c423b86339336a5b6a6069e ] + +In rdma-core, the following failures appear. 
+ +" +$ ./build/bin/run_tests.py -k device +ssssssss....FF........s +====================================================================== +FAIL: test_query_device (tests.test_device.DeviceTest.test_query_device) +Test ibv_query_device() +---------------------------------------------------------------------- +Traceback (most recent call last): + File "/home/ubuntu/rdma-core/tests/test_device.py", line 63, in + test_query_device + self.verify_device_attr(attr, dev) + File "/home/ubuntu/rdma-core/tests/test_device.py", line 200, in + verify_device_attr + assert attr.sys_image_guid != 0 + ^^^^^^^^^^^^^^^^^^^^^^^^ +AssertionError + +====================================================================== +FAIL: test_query_device_ex (tests.test_device.DeviceTest.test_query_device_ex) +Test ibv_query_device_ex() +---------------------------------------------------------------------- +Traceback (most recent call last): + File "/home/ubuntu/rdma-core/tests/test_device.py", line 222, in + test_query_device_ex + self.verify_device_attr(attr_ex.orig_attr, dev) + File "/home/ubuntu/rdma-core/tests/test_device.py", line 200, in + verify_device_attr + assert attr.sys_image_guid != 0 + ^^^^^^^^^^^^^^^^^^^^^^^^ +AssertionError +" + +The root cause is: before a net device is set with rxe, this net device +is used to generate a sys_image_guid. + +Fixes: 2ac5415022d1 ("RDMA/rxe: Remove the direct link to net_device") +Signed-off-by: Zhu Yanjun +Link: https://patch.msgid.link/20250302215444.3742072-1-yanjun.zhu@linux.dev +Reviewed-by: Daisuke Matsuda +Tested-by: Daisuke Matsuda +Signed-off-by: Leon Romanovsky +Signed-off-by: Sasha Levin +[Shivani: Modified to apply on 6.6.y] +Signed-off-by: Shivani Agarwal +Signed-off-by: Greg Kroah-Hartman +--- + drivers/infiniband/sw/rxe/rxe.c | 25 ++++++------------------- + 1 file changed, 6 insertions(+), 19 deletions(-) + +--- a/drivers/infiniband/sw/rxe/rxe.c ++++ b/drivers/infiniband/sw/rxe/rxe.c +@@ -38,10 +38,8 @@ void rxe_dealloc(struct ib_device *ib_de + } + + /* initialize rxe device parameters */ +-static void rxe_init_device_param(struct rxe_dev *rxe) ++static void rxe_init_device_param(struct rxe_dev *rxe, struct net_device *ndev) + { +- struct net_device *ndev; +- + rxe->max_inline_data = RXE_MAX_INLINE_DATA; + + rxe->attr.vendor_id = RXE_VENDOR_ID; +@@ -74,15 +72,9 @@ static void rxe_init_device_param(struct + rxe->attr.max_pkeys = RXE_MAX_PKEYS; + rxe->attr.local_ca_ack_delay = RXE_LOCAL_CA_ACK_DELAY; + +- ndev = rxe_ib_device_get_netdev(&rxe->ib_dev); +- if (!ndev) +- return; +- + addrconf_addr_eui48((unsigned char *)&rxe->attr.sys_image_guid, + ndev->dev_addr); + +- dev_put(ndev); +- + rxe->max_ucontext = RXE_MAX_UCONTEXT; + } + +@@ -115,18 +107,13 @@ static void rxe_init_port_param(struct r + /* initialize port state, note IB convention that HCA ports are always + * numbered from 1 + */ +-static void rxe_init_ports(struct rxe_dev *rxe) ++static void rxe_init_ports(struct rxe_dev *rxe, struct net_device *ndev) + { + struct rxe_port *port = &rxe->port; +- struct net_device *ndev; + + rxe_init_port_param(port); +- ndev = rxe_ib_device_get_netdev(&rxe->ib_dev); +- if (!ndev) +- return; + addrconf_addr_eui48((unsigned char *)&port->port_guid, + ndev->dev_addr); +- dev_put(ndev); + spin_lock_init(&port->port_lock); + } + +@@ -144,12 +131,12 @@ static void rxe_init_pools(struct rxe_de + } + + /* initialize rxe device state */ +-static void rxe_init(struct rxe_dev *rxe) ++static void rxe_init(struct rxe_dev *rxe, struct net_device *ndev) + { + /* init default 
device parameters */ +- rxe_init_device_param(rxe); ++ rxe_init_device_param(rxe, ndev); + +- rxe_init_ports(rxe); ++ rxe_init_ports(rxe, ndev); + rxe_init_pools(rxe); + + /* init pending mmap list */ +@@ -186,7 +173,7 @@ void rxe_set_mtu(struct rxe_dev *rxe, un + int rxe_add(struct rxe_dev *rxe, unsigned int mtu, const char *ibdev_name, + struct net_device *ndev) + { +- rxe_init(rxe); ++ rxe_init(rxe, ndev); + rxe_set_mtu(rxe, mtu); + + return rxe_register_device(rxe, ibdev_name, ndev); diff --git a/queue-6.6/rdma-rxe-remove-the-direct-link-to-net_device.patch b/queue-6.6/rdma-rxe-remove-the-direct-link-to-net_device.patch new file mode 100644 index 0000000000..5f43a9cb25 --- /dev/null +++ b/queue-6.6/rdma-rxe-remove-the-direct-link-to-net_device.patch @@ -0,0 +1,416 @@ +From stable+bounces-206303-greg=kroah.com@vger.kernel.org Thu Jan 8 11:31:56 2026 +From: Shivani Agarwal +Date: Thu, 8 Jan 2026 02:05:39 -0800 +Subject: RDMA/rxe: Remove the direct link to net_device +To: stable@vger.kernel.org, gregkh@linuxfoundation.org +Cc: jgg@ziepe.ca, leon@kernel.org, zyjzyj2000@gmail.com, mbloch@nvidia.com, parav@nvidia.com, mrgolin@amazon.com, roman.gushchin@linux.dev, wangliang74@huawei.com, marco.crivellari@suse.com, zhao.xichao@vivo.com, haggaie@mellanox.com, monis@mellanox.com, dledford@redhat.com, amirv@mellanox.com, kamalh@mellanox.com, linux-rdma@vger.kernel.org, linux-kernel@vger.kernel.org, ajay.kaher@broadcom.com, alexey.makhalov@broadcom.com, vamsi-krishna.brahmajosyula@broadcom.com, yin.ding@broadcom.com, tapas.kundu@broadcom.com, Zhu Yanjun , syzbot+4b87489410b4efd181bf@syzkaller.appspotmail.com, Sasha Levin , Shivani Agarwal +Message-ID: <20260108100540.672666-2-shivani.agarwal@broadcom.com> + +From: Zhu Yanjun + +[ Upstream commit 2ac5415022d16d63d912a39a06f32f1f51140261 ] + +The similar patch in siw is in the link: +https://git.kernel.org/rdma/rdma/c/16b87037b48889 + +This problem also occurred in RXE. The following analyze this problem. +In the following Call Traces: +" +BUG: KASAN: slab-use-after-free in dev_get_flags+0x188/0x1d0 net/core/dev.c:8782 +Read of size 4 at addr ffff8880554640b0 by task kworker/1:4/5295 + +CPU: 1 UID: 0 PID: 5295 Comm: kworker/1:4 Not tainted +6.12.0-rc3-syzkaller-00399-g9197b73fd7bb #0 +Hardware name: Google Compute Engine/Google Compute Engine, +BIOS Google 09/13/2024 +Workqueue: infiniband ib_cache_event_task +Call Trace: + + __dump_stack lib/dump_stack.c:94 [inline] + dump_stack_lvl+0x241/0x360 lib/dump_stack.c:120 + print_address_description mm/kasan/report.c:377 [inline] + print_report+0x169/0x550 mm/kasan/report.c:488 + kasan_report+0x143/0x180 mm/kasan/report.c:601 + dev_get_flags+0x188/0x1d0 net/core/dev.c:8782 + rxe_query_port+0x12d/0x260 drivers/infiniband/sw/rxe/rxe_verbs.c:60 + __ib_query_port drivers/infiniband/core/device.c:2111 [inline] + ib_query_port+0x168/0x7d0 drivers/infiniband/core/device.c:2143 + ib_cache_update+0x1a9/0xb80 drivers/infiniband/core/cache.c:1494 + ib_cache_event_task+0xf3/0x1e0 drivers/infiniband/core/cache.c:1568 + process_one_work kernel/workqueue.c:3229 [inline] + process_scheduled_works+0xa65/0x1850 kernel/workqueue.c:3310 + worker_thread+0x870/0xd30 kernel/workqueue.c:3391 + kthread+0x2f2/0x390 kernel/kthread.c:389 + ret_from_fork+0x4d/0x80 arch/x86/kernel/process.c:147 + ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:244 + +" + +1). In the link [1], + +" + infiniband syz2: set down +" + +This means that on 839.350575, the event ib_cache_event_task was sent andi +queued in ib_wq. + +2). 
In the link [1], + +" + team0 (unregistering): Port device team_slave_0 removed +" + +It indicates that before 843.251853, the net device should be freed. + +3). In the link [1], + +" + BUG: KASAN: slab-use-after-free in dev_get_flags+0x188/0x1d0 +" + +This means that on 850.559070, this slab-use-after-free problem occurred. + +In all, on 839.350575, the event ib_cache_event_task was sent and queued +in ib_wq, + +before 843.251853, the net device veth was freed. + +on 850.559070, this event was executed, and the mentioned freed net device +was called. Thus, the above call trace occurred. + +[1] https://syzkaller.appspot.com/x/log.txt?x=12e7025f980000 + +Reported-by: syzbot+4b87489410b4efd181bf@syzkaller.appspotmail.com +Closes: https://syzkaller.appspot.com/bug?extid=4b87489410b4efd181bf +Fixes: 8700e3e7c485 ("Soft RoCE driver") +Signed-off-by: Zhu Yanjun +Link: https://patch.msgid.link/20241220222325.2487767-1-yanjun.zhu@linux.dev +Signed-off-by: Leon Romanovsky +Signed-off-by: Sasha Levin +[Shivani: - exported ib_device_get_netdev() function. + - added ib_device_get_netdev() to ib_verbs.h.] +Signed-off-by: Shivani Agarwal +Signed-off-by: Greg Kroah-Hartman +--- + drivers/infiniband/core/device.c | 1 + + drivers/infiniband/sw/rxe/rxe.c | 23 +++++++++++++++++++---- + drivers/infiniband/sw/rxe/rxe.h | 3 ++- + drivers/infiniband/sw/rxe/rxe_mcast.c | 22 ++++++++++++++++++++-- + drivers/infiniband/sw/rxe/rxe_net.c | 25 ++++++++++++++++++++----- + drivers/infiniband/sw/rxe/rxe_verbs.c | 26 +++++++++++++++++++++----- + drivers/infiniband/sw/rxe/rxe_verbs.h | 11 ++++++++--- + include/rdma/ib_verbs.h | 2 ++ + 8 files changed, 93 insertions(+), 20 deletions(-) + +--- a/drivers/infiniband/core/device.c ++++ b/drivers/infiniband/core/device.c +@@ -2259,6 +2259,7 @@ struct net_device *ib_device_get_netdev( + + return res; + } ++EXPORT_SYMBOL(ib_device_get_netdev); + + /** + * ib_device_get_by_netdev - Find an IB device associated with a netdev +--- a/drivers/infiniband/sw/rxe/rxe.c ++++ b/drivers/infiniband/sw/rxe/rxe.c +@@ -40,6 +40,8 @@ void rxe_dealloc(struct ib_device *ib_de + /* initialize rxe device parameters */ + static void rxe_init_device_param(struct rxe_dev *rxe) + { ++ struct net_device *ndev; ++ + rxe->max_inline_data = RXE_MAX_INLINE_DATA; + + rxe->attr.vendor_id = RXE_VENDOR_ID; +@@ -71,8 +73,15 @@ static void rxe_init_device_param(struct + rxe->attr.max_fast_reg_page_list_len = RXE_MAX_FMR_PAGE_LIST_LEN; + rxe->attr.max_pkeys = RXE_MAX_PKEYS; + rxe->attr.local_ca_ack_delay = RXE_LOCAL_CA_ACK_DELAY; ++ ++ ndev = rxe_ib_device_get_netdev(&rxe->ib_dev); ++ if (!ndev) ++ return; ++ + addrconf_addr_eui48((unsigned char *)&rxe->attr.sys_image_guid, +- rxe->ndev->dev_addr); ++ ndev->dev_addr); ++ ++ dev_put(ndev); + + rxe->max_ucontext = RXE_MAX_UCONTEXT; + } +@@ -109,10 +118,15 @@ static void rxe_init_port_param(struct r + static void rxe_init_ports(struct rxe_dev *rxe) + { + struct rxe_port *port = &rxe->port; ++ struct net_device *ndev; + + rxe_init_port_param(port); ++ ndev = rxe_ib_device_get_netdev(&rxe->ib_dev); ++ if (!ndev) ++ return; + addrconf_addr_eui48((unsigned char *)&port->port_guid, +- rxe->ndev->dev_addr); ++ ndev->dev_addr); ++ dev_put(ndev); + spin_lock_init(&port->port_lock); + } + +@@ -169,12 +183,13 @@ void rxe_set_mtu(struct rxe_dev *rxe, un + /* called by ifc layer to create new rxe device. + * The caller should allocate memory for rxe by calling ib_alloc_device. 
+ */ +-int rxe_add(struct rxe_dev *rxe, unsigned int mtu, const char *ibdev_name) ++int rxe_add(struct rxe_dev *rxe, unsigned int mtu, const char *ibdev_name, ++ struct net_device *ndev) + { + rxe_init(rxe); + rxe_set_mtu(rxe, mtu); + +- return rxe_register_device(rxe, ibdev_name); ++ return rxe_register_device(rxe, ibdev_name, ndev); + } + + static int rxe_newlink(const char *ibdev_name, struct net_device *ndev) +--- a/drivers/infiniband/sw/rxe/rxe.h ++++ b/drivers/infiniband/sw/rxe/rxe.h +@@ -139,7 +139,8 @@ enum resp_states { + + void rxe_set_mtu(struct rxe_dev *rxe, unsigned int dev_mtu); + +-int rxe_add(struct rxe_dev *rxe, unsigned int mtu, const char *ibdev_name); ++int rxe_add(struct rxe_dev *rxe, unsigned int mtu, const char *ibdev_name, ++ struct net_device *ndev); + + void rxe_rcv(struct sk_buff *skb); + +--- a/drivers/infiniband/sw/rxe/rxe_mcast.c ++++ b/drivers/infiniband/sw/rxe/rxe_mcast.c +@@ -31,10 +31,19 @@ + static int rxe_mcast_add(struct rxe_dev *rxe, union ib_gid *mgid) + { + unsigned char ll_addr[ETH_ALEN]; ++ struct net_device *ndev; ++ int ret; ++ ++ ndev = rxe_ib_device_get_netdev(&rxe->ib_dev); ++ if (!ndev) ++ return -ENODEV; + + ipv6_eth_mc_map((struct in6_addr *)mgid->raw, ll_addr); + +- return dev_mc_add(rxe->ndev, ll_addr); ++ ret = dev_mc_add(ndev, ll_addr); ++ dev_put(ndev); ++ ++ return ret; + } + + /** +@@ -47,10 +56,19 @@ static int rxe_mcast_add(struct rxe_dev + static int rxe_mcast_del(struct rxe_dev *rxe, union ib_gid *mgid) + { + unsigned char ll_addr[ETH_ALEN]; ++ struct net_device *ndev; ++ int ret; ++ ++ ndev = rxe_ib_device_get_netdev(&rxe->ib_dev); ++ if (!ndev) ++ return -ENODEV; + + ipv6_eth_mc_map((struct in6_addr *)mgid->raw, ll_addr); + +- return dev_mc_del(rxe->ndev, ll_addr); ++ ret = dev_mc_del(ndev, ll_addr); ++ dev_put(ndev); ++ ++ return ret; + } + + /** +--- a/drivers/infiniband/sw/rxe/rxe_net.c ++++ b/drivers/infiniband/sw/rxe/rxe_net.c +@@ -509,7 +509,16 @@ out: + */ + const char *rxe_parent_name(struct rxe_dev *rxe, unsigned int port_num) + { +- return rxe->ndev->name; ++ struct net_device *ndev; ++ char *ndev_name; ++ ++ ndev = rxe_ib_device_get_netdev(&rxe->ib_dev); ++ if (!ndev) ++ return NULL; ++ ndev_name = ndev->name; ++ dev_put(ndev); ++ ++ return ndev_name; + } + + int rxe_net_add(const char *ibdev_name, struct net_device *ndev) +@@ -521,9 +530,7 @@ int rxe_net_add(const char *ibdev_name, + if (!rxe) + return -ENOMEM; + +- rxe->ndev = ndev; +- +- err = rxe_add(rxe, ndev->mtu, ibdev_name); ++ err = rxe_add(rxe, ndev->mtu, ibdev_name, ndev); + if (err) { + ib_dealloc_device(&rxe->ib_dev); + return err; +@@ -571,10 +578,18 @@ void rxe_port_down(struct rxe_dev *rxe) + + void rxe_set_port_state(struct rxe_dev *rxe) + { +- if (netif_running(rxe->ndev) && netif_carrier_ok(rxe->ndev)) ++ struct net_device *ndev; ++ ++ ndev = rxe_ib_device_get_netdev(&rxe->ib_dev); ++ if (!ndev) ++ return; ++ ++ if (netif_running(ndev) && netif_carrier_ok(ndev)) + rxe_port_up(rxe); + else + rxe_port_down(rxe); ++ ++ dev_put(ndev); + } + + static int rxe_notify(struct notifier_block *not_blk, +--- a/drivers/infiniband/sw/rxe/rxe_verbs.c ++++ b/drivers/infiniband/sw/rxe/rxe_verbs.c +@@ -41,6 +41,7 @@ static int rxe_query_port(struct ib_devi + u32 port_num, struct ib_port_attr *attr) + { + struct rxe_dev *rxe = to_rdev(ibdev); ++ struct net_device *ndev; + int err, ret; + + if (port_num != 1) { +@@ -51,19 +52,26 @@ static int rxe_query_port(struct ib_devi + + memcpy(attr, &rxe->port.attr, sizeof(*attr)); + ++ ndev = rxe_ib_device_get_netdev(ibdev); ++ 
if (!ndev) { ++ err = -ENODEV; ++ goto err_out; ++ } ++ + mutex_lock(&rxe->usdev_lock); + ret = ib_get_eth_speed(ibdev, port_num, &attr->active_speed, + &attr->active_width); + + if (attr->state == IB_PORT_ACTIVE) + attr->phys_state = IB_PORT_PHYS_STATE_LINK_UP; +- else if (dev_get_flags(rxe->ndev) & IFF_UP) ++ else if (dev_get_flags(ndev) & IFF_UP) + attr->phys_state = IB_PORT_PHYS_STATE_POLLING; + else + attr->phys_state = IB_PORT_PHYS_STATE_DISABLED; + + mutex_unlock(&rxe->usdev_lock); + ++ dev_put(ndev); + return ret; + + err_out: +@@ -1428,9 +1436,16 @@ static const struct attribute_group rxe_ + static int rxe_enable_driver(struct ib_device *ib_dev) + { + struct rxe_dev *rxe = container_of(ib_dev, struct rxe_dev, ib_dev); ++ struct net_device *ndev; ++ ++ ndev = rxe_ib_device_get_netdev(ib_dev); ++ if (!ndev) ++ return -ENODEV; + + rxe_set_port_state(rxe); +- dev_info(&rxe->ib_dev.dev, "added %s\n", netdev_name(rxe->ndev)); ++ dev_info(&rxe->ib_dev.dev, "added %s\n", netdev_name(ndev)); ++ ++ dev_put(ndev); + return 0; + } + +@@ -1498,7 +1513,8 @@ static const struct ib_device_ops rxe_de + INIT_RDMA_OBJ_SIZE(ib_mw, rxe_mw, ibmw), + }; + +-int rxe_register_device(struct rxe_dev *rxe, const char *ibdev_name) ++int rxe_register_device(struct rxe_dev *rxe, const char *ibdev_name, ++ struct net_device *ndev) + { + int err; + struct ib_device *dev = &rxe->ib_dev; +@@ -1510,13 +1526,13 @@ int rxe_register_device(struct rxe_dev * + dev->num_comp_vectors = num_possible_cpus(); + dev->local_dma_lkey = 0; + addrconf_addr_eui48((unsigned char *)&dev->node_guid, +- rxe->ndev->dev_addr); ++ ndev->dev_addr); + + dev->uverbs_cmd_mask |= BIT_ULL(IB_USER_VERBS_CMD_POST_SEND) | + BIT_ULL(IB_USER_VERBS_CMD_REQ_NOTIFY_CQ); + + ib_set_device_ops(dev, &rxe_dev_ops); +- err = ib_device_set_netdev(&rxe->ib_dev, rxe->ndev, 1); ++ err = ib_device_set_netdev(&rxe->ib_dev, ndev, 1); + if (err) + return err; + +--- a/drivers/infiniband/sw/rxe/rxe_verbs.h ++++ b/drivers/infiniband/sw/rxe/rxe_verbs.h +@@ -369,6 +369,7 @@ struct rxe_port { + u32 qp_gsi_index; + }; + ++#define RXE_PORT 1 + struct rxe_dev { + struct ib_device ib_dev; + struct ib_device_attr attr; +@@ -376,8 +377,6 @@ struct rxe_dev { + int max_inline_data; + struct mutex usdev_lock; + +- struct net_device *ndev; +- + struct rxe_pool uc_pool; + struct rxe_pool pd_pool; + struct rxe_pool ah_pool; +@@ -405,6 +404,11 @@ struct rxe_dev { + struct crypto_shash *tfm; + }; + ++static inline struct net_device *rxe_ib_device_get_netdev(struct ib_device *dev) ++{ ++ return ib_device_get_netdev(dev, RXE_PORT); ++} ++ + static inline void rxe_counter_inc(struct rxe_dev *rxe, enum rxe_counters index) + { + atomic64_inc(&rxe->stats_counters[index]); +@@ -470,6 +474,7 @@ static inline struct rxe_pd *rxe_mw_pd(s + return to_rpd(mw->ibmw.pd); + } + +-int rxe_register_device(struct rxe_dev *rxe, const char *ibdev_name); ++int rxe_register_device(struct rxe_dev *rxe, const char *ibdev_name, ++ struct net_device *ndev); + + #endif /* RXE_VERBS_H */ +--- a/include/rdma/ib_verbs.h ++++ b/include/rdma/ib_verbs.h +@@ -4444,6 +4444,8 @@ struct net_device *ib_get_net_dev_by_par + const struct sockaddr *addr); + int ib_device_set_netdev(struct ib_device *ib_dev, struct net_device *ndev, + unsigned int port); ++struct net_device *ib_device_get_netdev(struct ib_device *ib_dev, ++ u32 port); + struct ib_wq *ib_create_wq(struct ib_pd *pd, + struct ib_wq_init_attr *init_attr); + int ib_destroy_wq_user(struct ib_wq *wq, struct ib_udata *udata); diff --git 
a/queue-6.6/sched-fair-proportional-newidle-balance.patch b/queue-6.6/sched-fair-proportional-newidle-balance.patch new file mode 100644 index 0000000000..943c80bec6 --- /dev/null +++ b/queue-6.6/sched-fair-proportional-newidle-balance.patch @@ -0,0 +1,206 @@ +From stable+bounces-198201-greg=kroah.com@vger.kernel.org Wed Dec 3 12:41:03 2025 +From: Ajay Kaher +Date: Wed, 3 Dec 2025 11:22:55 +0000 +Subject: sched/fair: Proportional newidle balance +To: stable@vger.kernel.org, gregkh@linuxfoundation.org +Cc: mingo@redhat.com, peterz@infradead.org, juri.lelli@redhat.com, vincent.guittot@linaro.org, dietmar.eggemann@arm.com, rostedt@goodmis.org, bsegall@google.com, mgorman@suse.de, vschneid@redhat.com, linux-kernel@vger.kernel.org, ajay.kaher@broadcom.com, alexey.makhalov@broadcom.com, yin.ding@broadcom.com, tapas.kundu@broadcom.com, Chris Mason +Message-ID: <20251203112255.1738272-5-ajay.kaher@broadcom.com> + +From: Peter Zijlstra (Intel) + +commit 33cf66d88306663d16e4759e9d24766b0aaa2e17 upstream. + +Add a randomized algorithm that runs newidle balancing proportional to +its success rate. + +This improves schbench significantly: + + 6.18-rc4: 2.22 Mrps/s + 6.18-rc4+revert: 2.04 Mrps/s + 6.18-rc4+revert+random: 2.18 Mrps/S + +Conversely, per Adam Li this affects SpecJBB slightly, reducing it by 1%: + + 6.17: -6% + 6.17+revert: 0% + 6.17+revert+random: -1% + +Signed-off-by: Peter Zijlstra (Intel) +Reviewed-by: Dietmar Eggemann +Tested-by: Dietmar Eggemann +Tested-by: Chris Mason +Link: https://lkml.kernel.org/r/6825c50d-7fa7-45d8-9b81-c6e7e25738e2@meta.com +Link: https://patch.msgid.link/20251107161739.770122091@infradead.org +[ Ajay: Modified to apply on v6.6 ] +Signed-off-by: Ajay Kaher +Signed-off-by: Greg Kroah-Hartman +--- + include/linux/sched/topology.h | 3 ++ + kernel/sched/core.c | 3 ++ + kernel/sched/fair.c | 44 +++++++++++++++++++++++++++++++++++++---- + kernel/sched/features.h | 5 ++++ + kernel/sched/sched.h | 7 ++++++ + kernel/sched/topology.c | 6 +++++ + 6 files changed, 64 insertions(+), 4 deletions(-) + +--- a/include/linux/sched/topology.h ++++ b/include/linux/sched/topology.h +@@ -106,6 +106,9 @@ struct sched_domain { + unsigned int nr_balance_failed; /* initialise to 0 */ + + /* idle_balance() stats */ ++ unsigned int newidle_call; ++ unsigned int newidle_success; ++ unsigned int newidle_ratio; + u64 max_newidle_lb_cost; + unsigned long last_decay_max_lb_cost; + +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -116,6 +116,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_ + EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp); + + DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); ++DEFINE_PER_CPU(struct rnd_state, sched_rnd_state); + + #ifdef CONFIG_SCHED_DEBUG + /* +@@ -9872,6 +9873,8 @@ void __init sched_init_smp(void) + { + sched_init_numa(NUMA_NO_NODE); + ++ prandom_init_once(&sched_rnd_state); ++ + /* + * There's no userspace yet to cause hotplug operations; hence all the + * CPU masks are stable and all blatant races in the below code cannot +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -11716,11 +11716,27 @@ void update_max_interval(void) + max_load_balance_interval = HZ*num_online_cpus()/10; + } + +-static inline bool update_newidle_cost(struct sched_domain *sd, u64 cost) ++static inline void update_newidle_stats(struct sched_domain *sd, unsigned int success) ++{ ++ sd->newidle_call++; ++ sd->newidle_success += success; ++ ++ if (sd->newidle_call >= 1024) { ++ sd->newidle_ratio = sd->newidle_success; ++ sd->newidle_call /= 2; ++ 
sd->newidle_success /= 2; ++ } ++} ++ ++static inline bool ++update_newidle_cost(struct sched_domain *sd, u64 cost, unsigned int success) + { + unsigned long next_decay = sd->last_decay_max_lb_cost + HZ; + unsigned long now = jiffies; + ++ if (cost) ++ update_newidle_stats(sd, success); ++ + if (cost > sd->max_newidle_lb_cost) { + /* + * Track max cost of a domain to make sure to not delay the +@@ -11768,7 +11784,7 @@ static void rebalance_domains(struct rq + * Decay the newidle max times here because this is a regular + * visit to all the domains. + */ +- need_decay = update_newidle_cost(sd, 0); ++ need_decay = update_newidle_cost(sd, 0, 0); + max_cost += sd->max_newidle_lb_cost; + + /* +@@ -12406,6 +12422,22 @@ static int sched_balance_newidle(struct + break; + + if (sd->flags & SD_BALANCE_NEWIDLE) { ++ unsigned int weight = 1; ++ ++ if (sched_feat(NI_RANDOM)) { ++ /* ++ * Throw a 1k sided dice; and only run ++ * newidle_balance according to the success ++ * rate. ++ */ ++ u32 d1k = sched_rng() % 1024; ++ weight = 1 + sd->newidle_ratio; ++ if (d1k > weight) { ++ update_newidle_stats(sd, 0); ++ continue; ++ } ++ weight = (1024 + weight/2) / weight; ++ } + + pulled_task = load_balance(this_cpu, this_rq, + sd, CPU_NEWLY_IDLE, +@@ -12413,10 +12445,14 @@ static int sched_balance_newidle(struct + + t1 = sched_clock_cpu(this_cpu); + domain_cost = t1 - t0; +- update_newidle_cost(sd, domain_cost); +- + curr_cost += domain_cost; + t0 = t1; ++ ++ /* ++ * Track max cost of a domain to make sure to not delay the ++ * next wakeup on the CPU. ++ */ ++ update_newidle_cost(sd, domain_cost, weight * !!pulled_task); + } + + /* +--- a/kernel/sched/features.h ++++ b/kernel/sched/features.h +@@ -88,4 +88,9 @@ SCHED_FEAT(UTIL_EST_FASTUP, true) + + SCHED_FEAT(LATENCY_WARN, false) + ++/* ++ * Do newidle balancing proportional to its success rate using randomization. 
++ */ ++SCHED_FEAT(NI_RANDOM, true) ++ + SCHED_FEAT(HZ_BW, true) +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -5,6 +5,7 @@ + #ifndef _KERNEL_SCHED_SCHED_H + #define _KERNEL_SCHED_SCHED_H + ++#include + #include + #include + #include +@@ -1205,6 +1206,12 @@ static inline bool is_migration_disabled + } + + DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); ++DECLARE_PER_CPU(struct rnd_state, sched_rnd_state); ++ ++static inline u32 sched_rng(void) ++{ ++ return prandom_u32_state(this_cpu_ptr(&sched_rnd_state)); ++} + + #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) + #define this_rq() this_cpu_ptr(&runqueues) +--- a/kernel/sched/topology.c ++++ b/kernel/sched/topology.c +@@ -1600,6 +1600,12 @@ sd_init(struct sched_domain_topology_lev + + .last_balance = jiffies, + .balance_interval = sd_weight, ++ ++ /* 50% success rate */ ++ .newidle_call = 512, ++ .newidle_success = 256, ++ .newidle_ratio = 512, ++ + .max_newidle_lb_cost = 0, + .last_decay_max_lb_cost = jiffies, + .child = child, diff --git a/queue-6.6/sched-fair-small-cleanup-to-sched_balance_newidle.patch b/queue-6.6/sched-fair-small-cleanup-to-sched_balance_newidle.patch new file mode 100644 index 0000000000..50ca624dad --- /dev/null +++ b/queue-6.6/sched-fair-small-cleanup-to-sched_balance_newidle.patch @@ -0,0 +1,49 @@ +From stable+bounces-198199-greg=kroah.com@vger.kernel.org Wed Dec 3 12:40:53 2025 +From: Ajay Kaher +Date: Wed, 3 Dec 2025 11:22:53 +0000 +Subject: sched/fair: Small cleanup to sched_balance_newidle() +To: stable@vger.kernel.org, gregkh@linuxfoundation.org +Cc: mingo@redhat.com, peterz@infradead.org, juri.lelli@redhat.com, vincent.guittot@linaro.org, dietmar.eggemann@arm.com, rostedt@goodmis.org, bsegall@google.com, mgorman@suse.de, vschneid@redhat.com, linux-kernel@vger.kernel.org, ajay.kaher@broadcom.com, alexey.makhalov@broadcom.com, yin.ding@broadcom.com, tapas.kundu@broadcom.com, Chris Mason +Message-ID: <20251203112255.1738272-3-ajay.kaher@broadcom.com> + +From: Peter Zijlstra + +commit e78e70dbf603c1425f15f32b455ca148c932f6c1 upstream. + +Pull out the !sd check to simplify code. 
+ +Signed-off-by: Peter Zijlstra (Intel) +Reviewed-by: Dietmar Eggemann +Tested-by: Dietmar Eggemann +Tested-by: Chris Mason +Link: https://patch.msgid.link/20251107161739.525916173@infradead.org +[ Ajay: Modified to apply on v6.6 ] +Signed-off-by: Ajay Kaher +Signed-off-by: Greg Kroah-Hartman +--- + kernel/sched/fair.c | 11 ++++++----- + 1 file changed, 6 insertions(+), 5 deletions(-) + +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -12374,14 +12374,15 @@ static int sched_balance_newidle(struct + + rcu_read_lock(); + sd = rcu_dereference_check_sched_domain(this_rq->sd); ++ if (!sd) { ++ rcu_read_unlock(); ++ goto out; ++ } + + if (!READ_ONCE(this_rq->rd->overload) || +- (sd && this_rq->avg_idle < sd->max_newidle_lb_cost)) { +- +- if (sd) +- update_next_balance(sd, &next_balance); ++ this_rq->avg_idle < sd->max_newidle_lb_cost) { ++ update_next_balance(sd, &next_balance); + rcu_read_unlock(); +- + goto out; + } + rcu_read_unlock(); diff --git a/queue-6.6/sched-fair-small-cleanup-to-update_newidle_cost.patch b/queue-6.6/sched-fair-small-cleanup-to-update_newidle_cost.patch new file mode 100644 index 0000000000..e82a34ce59 --- /dev/null +++ b/queue-6.6/sched-fair-small-cleanup-to-update_newidle_cost.patch @@ -0,0 +1,58 @@ +From stable+bounces-198200-greg=kroah.com@vger.kernel.org Wed Dec 3 12:40:49 2025 +From: Ajay Kaher +Date: Wed, 3 Dec 2025 11:22:54 +0000 +Subject: sched/fair: Small cleanup to update_newidle_cost() +To: stable@vger.kernel.org, gregkh@linuxfoundation.org +Cc: mingo@redhat.com, peterz@infradead.org, juri.lelli@redhat.com, vincent.guittot@linaro.org, dietmar.eggemann@arm.com, rostedt@goodmis.org, bsegall@google.com, mgorman@suse.de, vschneid@redhat.com, linux-kernel@vger.kernel.org, ajay.kaher@broadcom.com, alexey.makhalov@broadcom.com, yin.ding@broadcom.com, tapas.kundu@broadcom.com, Chris Mason +Message-ID: <20251203112255.1738272-4-ajay.kaher@broadcom.com> + +From: Peter Zijlstra + +commit 08d473dd8718e4a4d698b1113a14a40ad64a909b upstream. + +Simplify code by adding a few variables. + +Signed-off-by: Peter Zijlstra (Intel) +Reviewed-by: Dietmar Eggemann +Tested-by: Dietmar Eggemann +Tested-by: Chris Mason +Link: https://patch.msgid.link/20251107161739.655208666@infradead.org +[ Ajay: Modified to apply on v6.6 ] +Signed-off-by: Ajay Kaher +Signed-off-by: Greg Kroah-Hartman +--- + kernel/sched/fair.c | 11 +++++++---- + 1 file changed, 7 insertions(+), 4 deletions(-) + +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -11718,22 +11718,25 @@ void update_max_interval(void) + + static inline bool update_newidle_cost(struct sched_domain *sd, u64 cost) + { ++ unsigned long next_decay = sd->last_decay_max_lb_cost + HZ; ++ unsigned long now = jiffies; ++ + if (cost > sd->max_newidle_lb_cost) { + /* + * Track max cost of a domain to make sure to not delay the + * next wakeup on the CPU. + */ + sd->max_newidle_lb_cost = cost; +- sd->last_decay_max_lb_cost = jiffies; +- } else if (time_after(jiffies, sd->last_decay_max_lb_cost + HZ)) { ++ sd->last_decay_max_lb_cost = now; ++ ++ } else if (time_after(now, next_decay)) { + /* + * Decay the newidle max times by ~1% per second to ensure that + * it is not outdated and the current max cost is actually + * shorter. 
+ */ + sd->max_newidle_lb_cost = (sd->max_newidle_lb_cost * 253) / 256; +- sd->last_decay_max_lb_cost = jiffies; +- ++ sd->last_decay_max_lb_cost = now; + return true; + } + diff --git a/queue-6.6/series b/queue-6.6/series index 5443c67f89..2b6e83b3e4 100644 --- a/queue-6.6/series +++ b/queue-6.6/series @@ -699,3 +699,23 @@ media-amphion-add-a-frame-flush-mode-for-decoder.patch media-amphion-make-some-vpu_v4l2-functions-static.patch media-amphion-remove-vpu_vb_is_codecconfig.patch media-mediatek-vcodec-use-spinlock-for-context-list-protection-lock.patch +kvm-svm-introduce-svm_recalc_lbr_msr_intercepts.patch +kvm-nsvm-always-recalculate-lbr-msr-intercepts-in-svm_update_lbrv.patch +kvm-nsvm-fix-and-simplify-lbr-virtualization-handling-with-nested.patch +kvm-svm-fix-redundant-updates-of-lbr-msr-intercepts.patch +mm-damon-tests-vaddr-kunit-handle-alloc-failures-in-damon_test_split_evenly_fail.patch +mm-damon-tests-vaddr-kunit-handle-alloc-failures-on-damon_do_test_apply_three_regions.patch +rdma-core-fix-kasan-slab-use-after-free-read-in-ib_register_device-problem.patch +sched-fair-small-cleanup-to-sched_balance_newidle.patch +sched-fair-small-cleanup-to-update_newidle_cost.patch +sched-fair-proportional-newidle-balance.patch +net-remove-conditional-threaded-napi-wakeup-based-on-task-state.patch +net-allow-to-use-smp-threads-for-backlog-napi.patch +rdma-rxe-remove-the-direct-link-to-net_device.patch +rdma-rxe-fix-the-failure-of-ibv_query_device-and-ibv_query_device_ex-tests.patch +mm-damon-tests-vaddr-kunit-handle-alloc-failures-on-damon_test_split_evenly_succ.patch +mm-damon-tests-core-kunit-handle-alloc-failres-in-damon_test_new_filter.patch +mm-damon-tests-core-kunit-handle-allocation-failures-in-damon_test_regions.patch +mm-damon-tests-core-kunit-handle-alloc-failures-on-damon_test_split_at.patch +mm-damon-tests-core-kunit-handle-alloc-failures-on-dasmon_test_merge_regions_of.patch +mm-damon-tests-core-kunit-handle-alloc-failures-on-damon_test_merge_two.patch