--- /dev/null
+From fbe5e5f030c22ae717ee422aaab0e00ea84fab5e Mon Sep 17 00:00:00 2001
+From: Yosry Ahmed <yosry.ahmed@linux.dev>
+Date: Sat, 8 Nov 2025 00:45:20 +0000
+Subject: KVM: nSVM: Always recalculate LBR MSR intercepts in svm_update_lbrv()
+
+From: Yosry Ahmed <yosry.ahmed@linux.dev>
+
+commit fbe5e5f030c22ae717ee422aaab0e00ea84fab5e upstream.
+
+svm_update_lbrv() is called when MSR_IA32_DEBUGCTLMSR is updated, and on
+nested transitions where LBRV is used. It checks whether LBRV enablement
+needs to be changed in the current VMCB, and if it does, it also
+recalculates the intercepts to LBR MSRs.
+
+However, there are cases where intercepts need to be updated even when
+LBRV enablement doesn't change. Example scenario:
+- L1 has MSR_IA32_DEBUGCTLMSR cleared.
+- L1 runs L2 without LBR_CTL_ENABLE (no LBRV).
+- L2 sets DEBUGCTLMSR_LBR in MSR_IA32_DEBUGCTLMSR, svm_update_lbrv()
+ sets LBR_CTL_ENABLE in VMCB02 and disables intercepts to LBR MSRs.
+- L2 exits to L1, svm_update_lbrv() is not called on this transition.
+- L1 clears MSR_IA32_DEBUGCTLMSR, svm_update_lbrv() finds that
+ LBR_CTL_ENABLE is already cleared in VMCB01 and does nothing.
+- Intercepts remain disabled, so L1 reads of LBR MSRs return the host MSRs.
+
+Fix it by always recalculating intercepts in svm_update_lbrv().
+
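+In essence, the resulting flow is (simplified sketch, eliding the
+enable_lbrv computation shown in the hunk below):
+
+	void svm_update_lbrv(struct kvm_vcpu *vcpu)
+	{
+		...
+		if (enable_lbrv && !current_enable_lbrv)
+			__svm_enable_lbrv(vcpu);
+		else if (!enable_lbrv && current_enable_lbrv)
+			__svm_disable_lbrv(vcpu);
+
+		/* Intercepts may be stale even if LBR_CTL did not change. */
+		svm_recalc_lbr_msr_intercepts(vcpu);
+	}
+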
+Fixes: 1d5a1b5860ed ("KVM: x86: nSVM: correctly virtualize LBR msrs when L2 is running")
+Cc: stable@vger.kernel.org
+Signed-off-by: Yosry Ahmed <yosry.ahmed@linux.dev>
+Link: https://patch.msgid.link/20251108004524.1600006-3-yosry.ahmed@linux.dev
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Yosry Ahmed <yosry.ahmed@linux.dev>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/svm/svm.c | 29 +++++++++++++++++++----------
+ 1 file changed, 19 insertions(+), 10 deletions(-)
+
+--- a/arch/x86/kvm/svm/svm.c
++++ b/arch/x86/kvm/svm/svm.c
+@@ -1031,26 +1031,30 @@ static void svm_recalc_lbr_msr_intercept
+ !intercept, !intercept);
+ }
+
+-void svm_enable_lbrv(struct kvm_vcpu *vcpu)
++static void __svm_enable_lbrv(struct kvm_vcpu *vcpu)
+ {
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ svm->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK;
+- svm_recalc_lbr_msr_intercepts(vcpu);
+
+ /* Move the LBR msrs to the vmcb02 so that the guest can see them. */
+ if (is_guest_mode(vcpu))
+ svm_copy_lbrs(svm->vmcb, svm->vmcb01.ptr);
+ }
+
+-static void svm_disable_lbrv(struct kvm_vcpu *vcpu)
++void svm_enable_lbrv(struct kvm_vcpu *vcpu)
++{
++ __svm_enable_lbrv(vcpu);
++ svm_recalc_lbr_msr_intercepts(vcpu);
++}
++
++static void __svm_disable_lbrv(struct kvm_vcpu *vcpu)
+ {
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ KVM_BUG_ON(sev_es_guest(vcpu->kvm), vcpu->kvm);
+
+ svm->vmcb->control.virt_ext &= ~LBR_CTL_ENABLE_MASK;
+- svm_recalc_lbr_msr_intercepts(vcpu);
+
+ /*
+ * Move the LBR msrs back to the vmcb01 to avoid copying them
+@@ -1079,13 +1083,18 @@ void svm_update_lbrv(struct kvm_vcpu *vc
+ (is_guest_mode(vcpu) && guest_can_use(vcpu, X86_FEATURE_LBRV) &&
+ (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK));
+
+- if (enable_lbrv == current_enable_lbrv)
+- return;
++ if (enable_lbrv && !current_enable_lbrv)
++ __svm_enable_lbrv(vcpu);
++ else if (!enable_lbrv && current_enable_lbrv)
++ __svm_disable_lbrv(vcpu);
+
+- if (enable_lbrv)
+- svm_enable_lbrv(vcpu);
+- else
+- svm_disable_lbrv(vcpu);
++ /*
++ * During nested transitions, it is possible that the current VMCB has
++ * LBR_CTL set, but the previous LBR_CTL had it cleared (or vice versa).
++ * In this case, even though LBR_CTL does not need an update, intercepts
++ * do, so always recalculate the intercepts here.
++ */
++ svm_recalc_lbr_msr_intercepts(vcpu);
+ }
+
+ void disable_nmi_singlestep(struct vcpu_svm *svm)
--- /dev/null
+From 8a4821412cf2c1429fffa07c012dd150f2edf78c Mon Sep 17 00:00:00 2001
+From: Yosry Ahmed <yosry.ahmed@linux.dev>
+Date: Sat, 8 Nov 2025 00:45:21 +0000
+Subject: KVM: nSVM: Fix and simplify LBR virtualization handling with nested
+
+From: Yosry Ahmed <yosry.ahmed@linux.dev>
+
+commit 8a4821412cf2c1429fffa07c012dd150f2edf78c upstream.
+
+The current scheme for handling LBRV when nested is used is very
+complicated, especially when L1 does not enable LBRV (i.e. does not set
+LBR_CTL_ENABLE_MASK).
+
+To avoid copying LBRs between VMCB01 and VMCB02 on every nested
+transition, the current implementation switches between using VMCB01 or
+VMCB02 as the source of truth for the LBRs while L2 is running. If L2
+enables LBR, VMCB02 is used as the source of truth. When L2 disables
+LBR, the LBRs are copied to VMCB01 and VMCB01 is used as the source of
+truth. This introduces significant complexity, and incorrect behavior in
+some cases.
+
+For example, on a nested #VMEXIT, the LBRs are only copied from VMCB02
+to VMCB01 if LBRV is enabled in VMCB01. This is because L2's writes to
+MSR_IA32_DEBUGCTLMSR to enable LBR are intercepted and propagated to
+VMCB01 instead of VMCB02. However, LBRV is only enabled in VMCB02 when
+L2 is running.
+
+This means that if L2 enables LBR and exits to L1, the LBRs will not be
+propagated from VMCB02 to VMCB01, because LBRV is disabled in VMCB01.
+
+There is no meaningful difference in CPUID rate in L2 when copying LBRs
+on every nested transition vs. the current approach, so do the simple
+and correct thing and always copy LBRs between VMCB01 and VMCB02 on
+nested transitions (when LBRV is disabled by L1). Drop the conditional
+LBR copying in __svm_{enable/disable}_lbrv() as it is now unnecessary.
+
+VMCB02 becomes the only source of truth for LBRs when L2 is running,
+regardless of whether L1 enables LBRV, so drop svm_get_lbr_vmcb() and use
+svm->vmcb directly in its place.
+
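+On nested #VMEXIT this boils down to (simplified sketch of the
+nested_svm_vmexit() hunk below):
+
+	if (guest_can_use(vcpu, X86_FEATURE_LBRV) &&
+	    (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))
+		svm_copy_lbrs(vmcb12, vmcb02);	/* L1 uses LBRV: LBRs go to vmcb12 */
+	else
+		svm_copy_lbrs(vmcb01, vmcb02);	/* otherwise hand them back to vmcb01 */
+
+	svm_update_lbrv(vcpu);
+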
+Fixes: 1d5a1b5860ed ("KVM: x86: nSVM: correctly virtualize LBR msrs when L2 is running")
+Cc: stable@vger.kernel.org
+Signed-off-by: Yosry Ahmed <yosry.ahmed@linux.dev>
+Link: https://patch.msgid.link/20251108004524.1600006-4-yosry.ahmed@linux.dev
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Yosry Ahmed <yosry.ahmed@linux.dev>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/svm/nested.c | 20 ++++++-------------
+ arch/x86/kvm/svm/svm.c | 47 +++++++++-------------------------------------
+ 2 files changed, 17 insertions(+), 50 deletions(-)
+
+--- a/arch/x86/kvm/svm/nested.c
++++ b/arch/x86/kvm/svm/nested.c
+@@ -601,11 +601,10 @@ static void nested_vmcb02_prepare_save(s
+ */
+ svm_copy_lbrs(vmcb02, vmcb12);
+ vmcb02->save.dbgctl &= ~DEBUGCTL_RESERVED_BITS;
+- svm_update_lbrv(&svm->vcpu);
+-
+- } else if (unlikely(vmcb01->control.virt_ext & LBR_CTL_ENABLE_MASK)) {
++ } else {
+ svm_copy_lbrs(vmcb02, vmcb01);
+ }
++ svm_update_lbrv(&svm->vcpu);
+ }
+
+ static inline bool is_evtinj_soft(u32 evtinj)
+@@ -731,11 +730,7 @@ static void nested_vmcb02_prepare_contro
+ svm->soft_int_next_rip = vmcb12_rip;
+ }
+
+- vmcb02->control.virt_ext = vmcb01->control.virt_ext &
+- LBR_CTL_ENABLE_MASK;
+- if (guest_can_use(vcpu, X86_FEATURE_LBRV))
+- vmcb02->control.virt_ext |=
+- (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK);
++ /* LBR_CTL_ENABLE_MASK is controlled by svm_update_lbrv() */
+
+ if (!nested_vmcb_needs_vls_intercept(svm))
+ vmcb02->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
+@@ -1066,13 +1061,12 @@ int nested_svm_vmexit(struct vcpu_svm *s
+ kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
+
+ if (unlikely(guest_can_use(vcpu, X86_FEATURE_LBRV) &&
+- (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))) {
++ (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK)))
+ svm_copy_lbrs(vmcb12, vmcb02);
+- svm_update_lbrv(vcpu);
+- } else if (unlikely(vmcb01->control.virt_ext & LBR_CTL_ENABLE_MASK)) {
++ else
+ svm_copy_lbrs(vmcb01, vmcb02);
+- svm_update_lbrv(vcpu);
+- }
++
++ svm_update_lbrv(vcpu);
+
+ if (vnmi) {
+ if (vmcb02->control.int_ctl & V_NMI_BLOCKING_MASK)
+--- a/arch/x86/kvm/svm/svm.c
++++ b/arch/x86/kvm/svm/svm.c
+@@ -1033,13 +1033,7 @@ static void svm_recalc_lbr_msr_intercept
+
+ static void __svm_enable_lbrv(struct kvm_vcpu *vcpu)
+ {
+- struct vcpu_svm *svm = to_svm(vcpu);
+-
+- svm->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK;
+-
+- /* Move the LBR msrs to the vmcb02 so that the guest can see them. */
+- if (is_guest_mode(vcpu))
+- svm_copy_lbrs(svm->vmcb, svm->vmcb01.ptr);
++ to_svm(vcpu)->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK;
+ }
+
+ void svm_enable_lbrv(struct kvm_vcpu *vcpu)
+@@ -1050,36 +1044,15 @@ void svm_enable_lbrv(struct kvm_vcpu *vc
+
+ static void __svm_disable_lbrv(struct kvm_vcpu *vcpu)
+ {
+- struct vcpu_svm *svm = to_svm(vcpu);
+-
+ KVM_BUG_ON(sev_es_guest(vcpu->kvm), vcpu->kvm);
+-
+- svm->vmcb->control.virt_ext &= ~LBR_CTL_ENABLE_MASK;
+-
+- /*
+- * Move the LBR msrs back to the vmcb01 to avoid copying them
+- * on nested guest entries.
+- */
+- if (is_guest_mode(vcpu))
+- svm_copy_lbrs(svm->vmcb01.ptr, svm->vmcb);
+-}
+-
+-static struct vmcb *svm_get_lbr_vmcb(struct vcpu_svm *svm)
+-{
+- /*
+- * If LBR virtualization is disabled, the LBR MSRs are always kept in
+- * vmcb01. If LBR virtualization is enabled and L1 is running VMs of
+- * its own, the MSRs are moved between vmcb01 and vmcb02 as needed.
+- */
+- return svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK ? svm->vmcb :
+- svm->vmcb01.ptr;
++ to_svm(vcpu)->vmcb->control.virt_ext &= ~LBR_CTL_ENABLE_MASK;
+ }
+
+ void svm_update_lbrv(struct kvm_vcpu *vcpu)
+ {
+ struct vcpu_svm *svm = to_svm(vcpu);
+ bool current_enable_lbrv = svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK;
+- bool enable_lbrv = (svm_get_lbr_vmcb(svm)->save.dbgctl & DEBUGCTLMSR_LBR) ||
++ bool enable_lbrv = (svm->vmcb->save.dbgctl & DEBUGCTLMSR_LBR) ||
+ (is_guest_mode(vcpu) && guest_can_use(vcpu, X86_FEATURE_LBRV) &&
+ (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK));
+
+@@ -2925,19 +2898,19 @@ static int svm_get_msr(struct kvm_vcpu *
+ msr_info->data = svm->tsc_aux;
+ break;
+ case MSR_IA32_DEBUGCTLMSR:
+- msr_info->data = svm_get_lbr_vmcb(svm)->save.dbgctl;
++ msr_info->data = svm->vmcb->save.dbgctl;
+ break;
+ case MSR_IA32_LASTBRANCHFROMIP:
+- msr_info->data = svm_get_lbr_vmcb(svm)->save.br_from;
++ msr_info->data = svm->vmcb->save.br_from;
+ break;
+ case MSR_IA32_LASTBRANCHTOIP:
+- msr_info->data = svm_get_lbr_vmcb(svm)->save.br_to;
++ msr_info->data = svm->vmcb->save.br_to;
+ break;
+ case MSR_IA32_LASTINTFROMIP:
+- msr_info->data = svm_get_lbr_vmcb(svm)->save.last_excp_from;
++ msr_info->data = svm->vmcb->save.last_excp_from;
+ break;
+ case MSR_IA32_LASTINTTOIP:
+- msr_info->data = svm_get_lbr_vmcb(svm)->save.last_excp_to;
++ msr_info->data = svm->vmcb->save.last_excp_to;
+ break;
+ case MSR_VM_HSAVE_PA:
+ msr_info->data = svm->nested.hsave_msr;
+@@ -3206,10 +3179,10 @@ static int svm_set_msr(struct kvm_vcpu *
+ if (data & DEBUGCTL_RESERVED_BITS)
+ return 1;
+
+- if (svm_get_lbr_vmcb(svm)->save.dbgctl == data)
++ if (svm->vmcb->save.dbgctl == data)
+ break;
+
+- svm_get_lbr_vmcb(svm)->save.dbgctl = data;
++ svm->vmcb->save.dbgctl = data;
+ vmcb_mark_dirty(svm->vmcb, VMCB_LBR);
+ svm_update_lbrv(vcpu);
+ break;
--- /dev/null
+From 3fa05f96fc08dff5e846c2cc283a249c1bf029a1 Mon Sep 17 00:00:00 2001
+From: Yosry Ahmed <yosry.ahmed@linux.dev>
+Date: Wed, 12 Nov 2025 01:30:17 +0000
+Subject: KVM: SVM: Fix redundant updates of LBR MSR intercepts
+
+From: Yosry Ahmed <yosry.ahmed@linux.dev>
+
+commit 3fa05f96fc08dff5e846c2cc283a249c1bf029a1 upstream.
+
+Don't update the LBR MSR intercept bitmaps if they're already up-to-date,
+as unconditionally updating the intercepts forces KVM to recalculate the
+MSR bitmaps for vmcb02 on every nested VMRUN. The redundant updates are
+functionally okay; however, they neuter an optimization in Hyper-V
+nested virtualization enlightenments and this manifests as a self-test
+failure.
+
+In particular, Hyper-V lets L1 mark "nested enlightenments" as clean, i.e.
+tell KVM that no changes were made to the MSR bitmap since the last VMRUN.
+The hyperv_svm_test KVM selftest intentionally changes the MSR bitmap
+"without telling KVM about it" to verify that KVM honors the clean hint,
+and now fails because KVM notices the changed bitmap anyway:
+
+ ==== Test Assertion Failure ====
+ x86/hyperv_svm_test.c:120: vmcb->control.exit_code == 0x081
+ pid=193558 tid=193558 errno=4 - Interrupted system call
+ 1 0x0000000000411361: assert_on_unhandled_exception at processor.c:659
+ 2 0x0000000000406186: _vcpu_run at kvm_util.c:1699
+ 3 (inlined by) vcpu_run at kvm_util.c:1710
+ 4 0x0000000000401f2a: main at hyperv_svm_test.c:175
+ 5 0x000000000041d0d3: __libc_start_call_main at libc-start.o:?
+ 6 0x000000000041f27c: __libc_start_main_impl at ??:?
+ 7 0x00000000004021a0: _start at ??:?
+ vmcb->control.exit_code == SVM_EXIT_VMMCALL
+
+Do *not* fix this by skipping svm_hv_vmcb_dirty_nested_enlightenments()
+when svm_set_intercept_for_msr() performs a no-op change. Changes to
+the L0 MSR interception bitmap are only triggered by full CPUID updates
+and MSR filter updates, both of which should be rare. Changing
+svm_set_intercept_for_msr() risks hiding unintended pessimizations
+like this one, and is actually more complex than this change.
+
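+The fix is a cached-state check in the recalculation helper (simplified
+sketch of the change below):
+
+	bool intercept = !(svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK);
+
+	if (intercept == svm->lbr_msrs_intercepted)
+		return;		/* bitmaps already match the desired state */
+
+	/* ... toggle the LBR MSR intercepts ... */
+
+	svm->lbr_msrs_intercepted = intercept;
+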
+Fixes: fbe5e5f030c2 ("KVM: nSVM: Always recalculate LBR MSR intercepts in svm_update_lbrv()")
+Cc: stable@vger.kernel.org
+Signed-off-by: Yosry Ahmed <yosry.ahmed@linux.dev>
+Link: https://patch.msgid.link/20251112013017.1836863-1-yosry.ahmed@linux.dev
+[Rewritten commit message based on mailing list discussion. - Paolo]
+Reviewed-by: Sean Christopherson <seanjc@google.com>
+Tested-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Yosry Ahmed <yosry.ahmed@linux.dev>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/svm/svm.c | 6 ++++++
+ arch/x86/kvm/svm/svm.h | 1 +
+ 2 files changed, 7 insertions(+)
+
+--- a/arch/x86/kvm/svm/svm.c
++++ b/arch/x86/kvm/svm/svm.c
+@@ -1017,6 +1017,9 @@ static void svm_recalc_lbr_msr_intercept
+ struct vcpu_svm *svm = to_svm(vcpu);
+ bool intercept = !(svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK);
+
++ if (intercept == svm->lbr_msrs_intercepted)
++ return;
++
+ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP,
+ !intercept, !intercept);
+ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP,
+@@ -1029,6 +1032,8 @@ static void svm_recalc_lbr_msr_intercept
+ if (sev_es_guest(vcpu->kvm))
+ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_DEBUGCTLMSR,
+ !intercept, !intercept);
++
++ svm->lbr_msrs_intercepted = intercept;
+ }
+
+ static void __svm_enable_lbrv(struct kvm_vcpu *vcpu)
+@@ -1473,6 +1478,7 @@ static int svm_vcpu_create(struct kvm_vc
+ }
+
+ svm->x2avic_msrs_intercepted = true;
++ svm->lbr_msrs_intercepted = true;
+
+ svm->vmcb01.ptr = page_address(vmcb01_page);
+ svm->vmcb01.pa = __sme_set(page_to_pfn(vmcb01_page) << PAGE_SHIFT);
+--- a/arch/x86/kvm/svm/svm.h
++++ b/arch/x86/kvm/svm/svm.h
+@@ -288,6 +288,7 @@ struct vcpu_svm {
+ bool guest_state_loaded;
+
+ bool x2avic_msrs_intercepted;
++ bool lbr_msrs_intercepted;
+
+ /* Guest GIF value, used when vGIF is not enabled */
+ bool guest_gif;
--- /dev/null
+From yosry.ahmed@linux.dev Thu Jan 8 13:20:08 2026
+From: Yosry Ahmed <yosry.ahmed@linux.dev>
+Date: Wed, 3 Dec 2025 18:42:17 +0000
+Subject: KVM: SVM: Introduce svm_recalc_lbr_msr_intercepts()
+To: stable@vger.kernel.org
+Cc: Paolo Bonzini <pbonzini@redhat.com>, Sean Christopherson <seanjc@google.com>, Yosry Ahmed <yosry.ahmed@linux.dev>
+Message-ID: <20251203184220.2693264-1-yosry.ahmed@linux.dev>
+
+From: Yosry Ahmed <yosry.ahmed@linux.dev>
+
+Introduce a helper updating the intercepts for LBR MSRs, similar to the
+one introduced upstream by commit 160f143cc131 ("KVM: SVM: Manually
+recalc all MSR intercepts on userspace MSR filter change"). The main
+difference is that this version uses set_msr_interception(), which has
+inverted polarity compared to svm_set_intercept_for_msr().
+
+This is intended to simplify incoming backports. No functional changes
+intended.
+
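+The polarity difference amounts to passing "allow" flags instead of
+"intercept" flags, hence the negations in the helper (sketch, as in the
+diff below; a value of 1 means the access is not intercepted):
+
+	bool intercept = !(svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK);
+
+	set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP,
+			     !intercept, !intercept);
+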
+Signed-off-by: Yosry Ahmed <yosry.ahmed@linux.dev>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/svm/svm.c | 32 +++++++++++++++++++++-----------
+ 1 file changed, 21 insertions(+), 11 deletions(-)
+
+--- a/arch/x86/kvm/svm/svm.c
++++ b/arch/x86/kvm/svm/svm.c
+@@ -1012,18 +1012,31 @@ void svm_copy_lbrs(struct vmcb *to_vmcb,
+ vmcb_mark_dirty(to_vmcb, VMCB_LBR);
+ }
+
+-void svm_enable_lbrv(struct kvm_vcpu *vcpu)
++static void svm_recalc_lbr_msr_intercepts(struct kvm_vcpu *vcpu)
+ {
+ struct vcpu_svm *svm = to_svm(vcpu);
++ bool intercept = !(svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK);
+
+- svm->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK;
+- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1);
+- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1);
+- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 1, 1);
+- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 1, 1);
++ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP,
++ !intercept, !intercept);
++ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP,
++ !intercept, !intercept);
++ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP,
++ !intercept, !intercept);
++ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP,
++ !intercept, !intercept);
+
+ if (sev_es_guest(vcpu->kvm))
+- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_DEBUGCTLMSR, 1, 1);
++ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_DEBUGCTLMSR,
++ !intercept, !intercept);
++}
++
++void svm_enable_lbrv(struct kvm_vcpu *vcpu)
++{
++ struct vcpu_svm *svm = to_svm(vcpu);
++
++ svm->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK;
++ svm_recalc_lbr_msr_intercepts(vcpu);
+
+ /* Move the LBR msrs to the vmcb02 so that the guest can see them. */
+ if (is_guest_mode(vcpu))
+@@ -1037,10 +1050,7 @@ static void svm_disable_lbrv(struct kvm_
+ KVM_BUG_ON(sev_es_guest(vcpu->kvm), vcpu->kvm);
+
+ svm->vmcb->control.virt_ext &= ~LBR_CTL_ENABLE_MASK;
+- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, 0, 0);
+- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0);
+- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 0, 0);
+- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 0, 0);
++ svm_recalc_lbr_msr_intercepts(vcpu);
+
+ /*
+ * Move the LBR msrs back to the vmcb01 to avoid copying them
--- /dev/null
+From 28ab2265e9422ccd81e4beafc0ace90f78de04c4 Mon Sep 17 00:00:00 2001
+From: SeongJae Park <sj@kernel.org>
+Date: Sat, 1 Nov 2025 11:20:07 -0700
+Subject: mm/damon/tests/core-kunit: handle alloc failures in damon_test_new_filter()
+
+From: SeongJae Park <sj@kernel.org>
+
+commit 28ab2265e9422ccd81e4beafc0ace90f78de04c4 upstream.
+
+damon_test_new_filter() is assuming all dynamic memory allocation in it
+will succeed. Those are indeed likely in the real use cases since those
+allocations are too small to fail, but theoretically those could fail. In
+that case, inappropriate memory access can happen. Fix it by appropriately
+cleaning up pre-allocated memory and skipping the execution of the
+remaining tests in the failure cases.
+
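+The same pattern is applied throughout this series: check each
+allocation, free whatever was already allocated, and skip the test
+(sketch):
+
+	t = damon_new_target();
+	if (!t)
+		kunit_skip(test, "target alloc fail");
+	r = damon_new_region(0, 100);
+	if (!r) {
+		damon_free_target(t);
+		kunit_skip(test, "region alloc fail");
+	}
+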
+Link: https://lkml.kernel.org/r/20251101182021.74868-14-sj@kernel.org
+Fixes: 2a158e956b98 ("mm/damon/core-test: add a test for damos_new_filter()")
+Signed-off-by: SeongJae Park <sj@kernel.org>
+Cc: Brendan Higgins <brendan.higgins@linux.dev>
+Cc: David Gow <davidgow@google.com>
+Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
+Cc: <stable@vger.kernel.org> [6.6+]
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/damon/core-test.h | 2 ++
+ 1 file changed, 2 insertions(+)
+
+--- a/mm/damon/core-test.h
++++ b/mm/damon/core-test.h
+@@ -346,6 +346,8 @@ static void damos_test_new_filter(struct
+ struct damos_filter *filter;
+
+ filter = damos_new_filter(DAMOS_FILTER_TYPE_ANON, true);
++ if (!filter)
++ kunit_skip(test, "filter alloc fail");
+ KUNIT_EXPECT_EQ(test, filter->type, DAMOS_FILTER_TYPE_ANON);
+ KUNIT_EXPECT_EQ(test, filter->matching, true);
+ KUNIT_EXPECT_PTR_EQ(test, filter->list.prev, &filter->list);
--- /dev/null
+From 3d443dd29a1db7efa587a4bb0c06a497e13ca9e4 Mon Sep 17 00:00:00 2001
+From: SeongJae Park <sj@kernel.org>
+Date: Sat, 1 Nov 2025 11:20:00 -0700
+Subject: mm/damon/tests/core-kunit: handle alloc failures on damon_test_merge_two()
+
+From: SeongJae Park <sj@kernel.org>
+
+commit 3d443dd29a1db7efa587a4bb0c06a497e13ca9e4 upstream.
+
+damon_test_merge_two() is assuming all dynamic memory allocation in it
+will succeed. Those are indeed likely in the real use cases since those
+allocations are too small to fail, but theoretically those could fail. In
+that case, inappropriate memory access can happen. Fix it by appropriately
+cleaning up pre-allocated memory and skipping the execution of the
+remaining tests in the failure cases.
+
+Link: https://lkml.kernel.org/r/20251101182021.74868-7-sj@kernel.org
+Fixes: 17ccae8bb5c9 ("mm/damon: add kunit tests")
+Signed-off-by: SeongJae Park <sj@kernel.org>
+Cc: Brendan Higgins <brendan.higgins@linux.dev>
+Cc: David Gow <davidgow@google.com>
+Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
+Cc: <stable@vger.kernel.org> [5.15+]
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/damon/core-test.h | 10 ++++++++++
+ 1 file changed, 10 insertions(+)
+
+--- a/mm/damon/core-test.h
++++ b/mm/damon/core-test.h
+@@ -162,10 +162,20 @@ static void damon_test_merge_two(struct
+ int i;
+
+ t = damon_new_target();
++ if (!t)
++ kunit_skip(test, "target alloc fail");
+ r = damon_new_region(0, 100);
++ if (!r) {
++ damon_free_target(t);
++ kunit_skip(test, "region alloc fail");
++ }
+ r->nr_accesses = 10;
+ damon_add_region(r, t);
+ r2 = damon_new_region(100, 300);
++ if (!r2) {
++ damon_free_target(t);
++ kunit_skip(test, "second region alloc fail");
++ }
+ r2->nr_accesses = 20;
+ damon_add_region(r2, t);
+
--- /dev/null
+From 5e80d73f22043c59c8ad36452a3253937ed77955 Mon Sep 17 00:00:00 2001
+From: SeongJae Park <sj@kernel.org>
+Date: Sat, 1 Nov 2025 11:19:59 -0700
+Subject: mm/damon/tests/core-kunit: handle alloc failures on damon_test_split_at()
+
+From: SeongJae Park <sj@kernel.org>
+
+commit 5e80d73f22043c59c8ad36452a3253937ed77955 upstream.
+
+damon_test_split_at() is assuming all dynamic memory allocation in it will
+succeed. Those are indeed likely in the real use cases since those
+allocations are too small to fail, but theoretically those could fail. In
+that case, inappropriate memory access can happen. Fix it by appropriately
+cleaning up pre-allocated memory and skipping the execution of the
+remaining tests in the failure cases.
+
+Link: https://lkml.kernel.org/r/20251101182021.74868-6-sj@kernel.org
+Fixes: 17ccae8bb5c9 ("mm/damon: add kunit tests")
+Signed-off-by: SeongJae Park <sj@kernel.org>
+Cc: Brendan Higgins <brendan.higgins@linux.dev>
+Cc: David Gow <davidgow@google.com>
+Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
+Cc: <stable@vger.kernel.org> [5.15+]
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/damon/core-test.h | 11 +++++++++++
+ 1 file changed, 11 insertions(+)
+
+--- a/mm/damon/core-test.h
++++ b/mm/damon/core-test.h
+@@ -129,8 +129,19 @@ static void damon_test_split_at(struct k
+ struct damon_target *t;
+ struct damon_region *r;
+
++ if (!c)
++ kunit_skip(test, "ctx alloc fail");
+ t = damon_new_target();
++ if (!t) {
++ damon_destroy_ctx(c);
++ kunit_skip(test, "target alloc fail");
++ }
+ r = damon_new_region(0, 100);
++ if (!r) {
++ damon_destroy_ctx(c);
++ damon_free_target(t);
++ kunit_skip(test, "region alloc fail");
++ }
+ damon_add_region(r, t);
+ damon_split_region_at(t, r, 25);
+ KUNIT_EXPECT_EQ(test, r->ar.start, 0ul);
--- /dev/null
+From 0998d2757218771c59d5ca59ccf13d1542a38f17 Mon Sep 17 00:00:00 2001
+From: SeongJae Park <sj@kernel.org>
+Date: Sat, 1 Nov 2025 11:20:01 -0700
+Subject: mm/damon/tests/core-kunit: handle alloc failures on damon_test_merge_regions_of()
+
+From: SeongJae Park <sj@kernel.org>
+
+commit 0998d2757218771c59d5ca59ccf13d1542a38f17 upstream.
+
+damon_test_merge_regions_of() is assuming all dynamic memory allocation in
+it will succeed. Those are indeed likely in the real use cases since
+those allocations are too small to fail, but theoretically those could
+fail. In that case, inappropriate memory access can happen. Fix it by
+appropriately cleaning up pre-allocated memory and skipping the execution
+of the remaining tests in the failure cases.
+
+Link: https://lkml.kernel.org/r/20251101182021.74868-8-sj@kernel.org
+Fixes: 17ccae8bb5c9 ("mm/damon: add kunit tests")
+Signed-off-by: SeongJae Park <sj@kernel.org>
+Cc: Brendan Higgins <brendan.higgins@linux.dev>
+Cc: David Gow <davidgow@google.com>
+Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
+Cc: <stable@vger.kernel.org> [5.15+]
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/damon/core-test.h | 6 ++++++
+ 1 file changed, 6 insertions(+)
+
+--- a/mm/damon/core-test.h
++++ b/mm/damon/core-test.h
+@@ -210,8 +210,14 @@ static void damon_test_merge_regions_of(
+ int i;
+
+ t = damon_new_target();
++ if (!t)
++ kunit_skip(test, "target alloc fail");
+ for (i = 0; i < ARRAY_SIZE(sa); i++) {
+ r = damon_new_region(sa[i], ea[i]);
++ if (!r) {
++ damon_free_target(t);
++ kunit_skip(test, "region alloc fail");
++ }
+ r->nr_accesses = nrs[i];
+ damon_add_region(r, t);
+ }
--- /dev/null
+From e16fdd4f754048d6e23c56bd8d920b71e41e3777 Mon Sep 17 00:00:00 2001
+From: SeongJae Park <sj@kernel.org>
+Date: Sat, 1 Nov 2025 11:19:56 -0700
+Subject: mm/damon/tests/core-kunit: handle allocation failures in damon_test_regions()
+
+From: SeongJae Park <sj@kernel.org>
+
+commit e16fdd4f754048d6e23c56bd8d920b71e41e3777 upstream.
+
+damon_test_regions() is assuming all dynamic memory allocation in it will
+succeed. Those are indeed likely in the real use cases since those
+allocations are too small to fail, but theoretically those could fail. In
+that case, inappropriate memory access can happen. Fix it by appropriately
+cleaning up pre-allocated memory and skipping the execution of the
+remaining tests in the failure cases.
+
+Link: https://lkml.kernel.org/r/20251101182021.74868-3-sj@kernel.org
+Fixes: 17ccae8bb5c9 ("mm/damon: add kunit tests")
+Signed-off-by: SeongJae Park <sj@kernel.org>
+Cc: Brendan Higgins <brendan.higgins@linux.dev>
+Cc: David Gow <davidgow@google.com>
+Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
+Cc: <stable@vger.kernel.org> [5.15+]
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/damon/core-test.h | 6 ++++++
+ 1 file changed, 6 insertions(+)
+
+--- a/mm/damon/core-test.h
++++ b/mm/damon/core-test.h
+@@ -20,11 +20,17 @@ static void damon_test_regions(struct ku
+ struct damon_target *t;
+
+ r = damon_new_region(1, 2);
++ if (!r)
++ kunit_skip(test, "region alloc fail");
+ KUNIT_EXPECT_EQ(test, 1ul, r->ar.start);
+ KUNIT_EXPECT_EQ(test, 2ul, r->ar.end);
+ KUNIT_EXPECT_EQ(test, 0u, r->nr_accesses);
+
+ t = damon_new_target();
++ if (!t) {
++ damon_free_region(r);
++ kunit_skip(test, "target alloc fail");
++ }
+ KUNIT_EXPECT_EQ(test, 0u, damon_nr_regions(t));
+
+ damon_add_region(r, t);
--- /dev/null
+From 7890e5b5bb6e386155c6e755fe70e0cdcc77f18e Mon Sep 17 00:00:00 2001
+From: SeongJae Park <sj@kernel.org>
+Date: Sat, 1 Nov 2025 11:20:12 -0700
+Subject: mm/damon/tests/vaddr-kunit: handle alloc failures in damon_test_split_evenly_fail()
+
+From: SeongJae Park <sj@kernel.org>
+
+commit 7890e5b5bb6e386155c6e755fe70e0cdcc77f18e upstream.
+
+damon_test_split_evenly_fail() is assuming all dynamic memory allocation
+in it will succeed. Those are indeed likely in the real use cases since
+those allocations are too small to fail, but theoretically those could
+fail. In that case, inappropriate memory access can happen. Fix it by
+appropriately cleaning up pre-allocated memory and skipping the execution
+of the remaining tests in the failure cases.
+
+Link: https://lkml.kernel.org/r/20251101182021.74868-19-sj@kernel.org
+Fixes: 17ccae8bb5c9 ("mm/damon: add kunit tests")
+Signed-off-by: SeongJae Park <sj@kernel.org>
+Cc: Brendan Higgins <brendan.higgins@linux.dev>
+Cc: David Gow <davidgow@google.com>
+Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
+Cc: <stable@vger.kernel.org> [5.15+]
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/damon/vaddr-test.h | 11 ++++++++++-
+ 1 file changed, 10 insertions(+), 1 deletion(-)
+
+--- a/mm/damon/vaddr-test.h
++++ b/mm/damon/vaddr-test.h
+@@ -250,7 +250,16 @@ static void damon_test_split_evenly_fail
+ unsigned long start, unsigned long end, unsigned int nr_pieces)
+ {
+ struct damon_target *t = damon_new_target();
+- struct damon_region *r = damon_new_region(start, end);
++ struct damon_region *r;
++
++ if (!t)
++ kunit_skip(test, "target alloc fail");
++
++ r = damon_new_region(start, end);
++ if (!r) {
++ damon_free_target(t);
++ kunit_skip(test, "region alloc fail");
++ }
+
+ damon_add_region(r, t);
+ KUNIT_EXPECT_EQ(test,
--- /dev/null
+From 2b22d0fcc6320ba29b2122434c1d2f0785fb0a25 Mon Sep 17 00:00:00 2001
+From: SeongJae Park <sj@kernel.org>
+Date: Sat, 1 Nov 2025 11:20:11 -0700
+Subject: mm/damon/tests/vaddr-kunit: handle alloc failures on damon_do_test_apply_three_regions()
+
+From: SeongJae Park <sj@kernel.org>
+
+commit 2b22d0fcc6320ba29b2122434c1d2f0785fb0a25 upstream.
+
+damon_do_test_apply_three_regions() is assuming all dynamic memory
+allocation in it will succeed. Those are indeed likely in the real use
+cases since those allocations are too small to fail, but theoretically
+those could fail. In that case, inappropriate memory access can happen.
+Fix it by appropriately cleaning up pre-allocated memory and skipping the
+execution of the remaining tests in the failure cases.
+
+Link: https://lkml.kernel.org/r/20251101182021.74868-18-sj@kernel.org
+Fixes: 17ccae8bb5c9 ("mm/damon: add kunit tests")
+Signed-off-by: SeongJae Park <sj@kernel.org>
+Cc: Brendan Higgins <brendan.higgins@linux.dev>
+Cc: David Gow <davidgow@google.com>
+Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
+Cc: <stable@vger.kernel.org> [5.15+]
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: SeongJae Park <sj@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/damon/vaddr-test.h | 6 ++++++
+ 1 file changed, 6 insertions(+)
+
+--- a/mm/damon/vaddr-test.h
++++ b/mm/damon/vaddr-test.h
+@@ -136,8 +136,14 @@ static void damon_do_test_apply_three_re
+ int i;
+
+ t = damon_new_target();
++ if (!t)
++ kunit_skip(test, "target alloc fail");
+ for (i = 0; i < nr_regions / 2; i++) {
+ r = damon_new_region(regions[i * 2], regions[i * 2 + 1]);
++ if (!r) {
++ damon_destroy_target(t);
++ kunit_skip(test, "region alloc fail");
++ }
+ damon_add_region(r, t);
+ }
+
--- /dev/null
+From 0a63a0e7570b9b2631dfb8d836dc572709dce39e Mon Sep 17 00:00:00 2001
+From: SeongJae Park <sj@kernel.org>
+Date: Sat, 1 Nov 2025 11:20:13 -0700
+Subject: mm/damon/tests/vaddr-kunit: handle alloc failures on damon_test_split_evenly_succ()
+
+From: SeongJae Park <sj@kernel.org>
+
+commit 0a63a0e7570b9b2631dfb8d836dc572709dce39e upstream.
+
+damon_test_split_evenly_succ() is assuming all dynamic memory allocation
+in it will succeed. Those are indeed likely in the real use cases since
+those allocations are too small to fail, but theoretically those could
+fail. In that case, inappropriate memory access can happen. Fix it by
+appropriately cleaning up pre-allocated memory and skipping the execution
+of the remaining tests in the failure cases.
+
+Link: https://lkml.kernel.org/r/20251101182021.74868-20-sj@kernel.org
+Fixes: 17ccae8bb5c9 ("mm/damon: add kunit tests")
+Signed-off-by: SeongJae Park <sj@kernel.org>
+Cc: Brendan Higgins <brendan.higgins@linux.dev>
+Cc: David Gow <davidgow@google.com>
+Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
+Cc: <stable@vger.kernel.org> [5.15+]
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/damon/vaddr-test.h | 9 ++++++++-
+ 1 file changed, 8 insertions(+), 1 deletion(-)
+
+--- a/mm/damon/vaddr-test.h
++++ b/mm/damon/vaddr-test.h
+@@ -284,10 +284,17 @@ static void damon_test_split_evenly_succ
+ unsigned long start, unsigned long end, unsigned int nr_pieces)
+ {
+ struct damon_target *t = damon_new_target();
+- struct damon_region *r = damon_new_region(start, end);
++ struct damon_region *r;
+ unsigned long expected_width = (end - start) / nr_pieces;
+ unsigned long i = 0;
+
++ if (!t)
++ kunit_skip(test, "target alloc fail");
++ r = damon_new_region(start, end);
++ if (!r) {
++ damon_free_target(t);
++ kunit_skip(test, "region alloc fail");
++ }
+ damon_add_region(r, t);
+ KUNIT_EXPECT_EQ(test,
+ damon_va_evenly_split_region(t, r, nr_pieces), 0);
--- /dev/null
+From wen.yang@linux.dev Mon Dec 29 08:53:54 2025
+From: wen.yang@linux.dev
+Date: Mon, 29 Dec 2025 15:53:17 +0800
+Subject: net: Allow to use SMP threads for backlog NAPI.
+To: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+Cc: stable@vger.kernel.org, linux-kernel@vger.kernel.org, Sebastian Andrzej Siewior <bigeasy@linutronix.de>, Jakub Kicinski <kuba@kernel.org>, Paolo Abeni <pabeni@redhat.com>, Wen Yang <wen.yang@linux.dev>
+Message-ID: <013481655ddb09ae214bc510502efe6cf32b3445.1766987153.git.wen.yang@linux.dev>
+
+From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+
+commit dad6b97702639fba27a2bd3e986982ad6f0db3a7 upstream.
+
+Backlog NAPI is a per-CPU NAPI struct only (with no device behind it)
+used by drivers which don't do NAPI them self, RPS and parts of the
+stack which need to avoid recursive deadlocks while processing a packet.
+
+The non-NAPI drivers use the CPU-local backlog NAPI. If RPS is enabled
+then a flow for the skb is computed and based on the flow the skb can be
+enqueued on a remote CPU. Scheduling/ raising the softirq (for backlog's
+NAPI) on the remote CPU isn't trivial because the softirq is only
+scheduled on the local CPU and performed after the hardirq is done.
+In order to schedule a softirq on the remote CPU, an IPI is sent to the
+remote CPU which schedules the backlog-NAPI on the then local CPU.
+
+On PREEMPT_RT interrupts are force-threaded. The soft interrupts are
+raised within the interrupt thread and processed after the interrupt
+handler completed still within the context of the interrupt thread. The
+softirq is handled in the context where it originated.
+
+With force-threaded interrupts enabled, ksoftirqd is woken up if a
+softirq is raised from hardirq context. This is the case if it is raised
+from an IPI. Additionally there is a warning on PREEMPT_RT if the
+softirq is raised from the idle thread.
+This was done for two reasons:
+- With threaded interrupts the processing should happen in thread
+ context (where it originated) and ksoftirqd is the only thread for
+ this context if raised from hardirq. Using the currently running task
+ instead would "punish" a random task.
+- Once ksoftirqd is active it consumes all further softirqs until it
+ stops running. This changed recently and is no longer the case.
+
+Instead of keeping the backlog NAPI in ksoftirqd (in force-threaded/
+PREEMPT_RT setups) I am proposing NAPI-threads for backlog.
+The "proper" setup with threaded-NAPI is not doable because the threads
+are not pinned to an individual CPU and can be modified by the user.
+Additionally a dummy network device would have to be assigned. Also
+CPU-hotplug has to be considered if additional CPUs show up.
+All this can be probably done/ solved but the smpboot-threads already
+provide this infrastructure.
+
+Sending UDP packets over loopback expects that the packet is processed
+within the call. Delaying it by handing it over to the thread hurts
+performance. It is not beneficial to the outcome if the context switch
+happens immediately after enqueue or after a while to process a few
+packets in a batch.
+There is no need to always use the thread if the backlog NAPI is
+requested on the local CPU. This restores the loopback throughput. The
+performance drops mostly to the same value after enabling RPS on the
+loopback when comparing the IPI and the thread results.
+
+Create NAPI-threads for backlog if requested during boot. The thread runs
+the inner loop from napi_threaded_poll(); the wait part is different. It
+checks for NAPI_STATE_SCHED (the backlog NAPI can not be disabled).
+
+The NAPI threads for backlog are optional; they have to be enabled via the boot
+argument "thread_backlog_napi". It is mandatory for PREEMPT_RT to avoid the
+wakeup of ksoftirqd from the IPI.
+
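+The backlog threads reuse the generic smpboot infrastructure;
+registration is roughly (sketch of the dev.c change below):
+
+	static struct smp_hotplug_thread backlog_threads = {
+		.store			= &backlog_napi,
+		.thread_should_run	= backlog_napi_should_run,
+		.thread_fn		= run_backlog_napi,
+		.thread_comm		= "backlog_napi/%u",
+		.setup			= backlog_napi_setup,
+	};
+
+	if (use_backlog_threads())
+		smpboot_register_percpu_thread(&backlog_threads);
+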
+Acked-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Signed-off-by: Wen Yang <wen.yang@linux.dev>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/core/dev.c | 152 +++++++++++++++++++++++++++++++++++++++++++--------------
+ 1 file changed, 115 insertions(+), 37 deletions(-)
+
+--- a/net/core/dev.c
++++ b/net/core/dev.c
+@@ -78,6 +78,7 @@
+ #include <linux/slab.h>
+ #include <linux/sched.h>
+ #include <linux/sched/mm.h>
++#include <linux/smpboot.h>
+ #include <linux/mutex.h>
+ #include <linux/rwsem.h>
+ #include <linux/string.h>
+@@ -217,6 +218,31 @@ static inline struct hlist_head *dev_ind
+ return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
+ }
+
++#ifndef CONFIG_PREEMPT_RT
++
++static DEFINE_STATIC_KEY_FALSE(use_backlog_threads_key);
++
++static int __init setup_backlog_napi_threads(char *arg)
++{
++ static_branch_enable(&use_backlog_threads_key);
++ return 0;
++}
++early_param("thread_backlog_napi", setup_backlog_napi_threads);
++
++static bool use_backlog_threads(void)
++{
++ return static_branch_unlikely(&use_backlog_threads_key);
++}
++
++#else
++
++static bool use_backlog_threads(void)
++{
++ return true;
++}
++
++#endif
++
+ static inline void rps_lock_irqsave(struct softnet_data *sd,
+ unsigned long *flags)
+ {
+@@ -4494,6 +4520,7 @@ EXPORT_SYMBOL(__dev_direct_xmit);
+ /*************************************************************************
+ * Receiver routines
+ *************************************************************************/
++static DEFINE_PER_CPU(struct task_struct *, backlog_napi);
+
+ int netdev_max_backlog __read_mostly = 1000;
+ EXPORT_SYMBOL(netdev_max_backlog);
+@@ -4526,12 +4553,16 @@ static inline void ____napi_schedule(str
+ */
+ thread = READ_ONCE(napi->thread);
+ if (thread) {
++ if (use_backlog_threads() && thread == raw_cpu_read(backlog_napi))
++ goto use_local_napi;
++
+ set_bit(NAPI_STATE_SCHED_THREADED, &napi->state);
+ wake_up_process(thread);
+ return;
+ }
+ }
+
++use_local_napi:
+ list_add_tail(&napi->poll_list, &sd->poll_list);
+ WRITE_ONCE(napi->list_owner, smp_processor_id());
+ /* If not called from net_rx_action()
+@@ -4777,6 +4808,11 @@ static void napi_schedule_rps(struct sof
+
+ #ifdef CONFIG_RPS
+ if (sd != mysd) {
++ if (use_backlog_threads()) {
++ __napi_schedule_irqoff(&sd->backlog);
++ return;
++ }
++
+ sd->rps_ipi_next = mysd->rps_ipi_list;
+ mysd->rps_ipi_list = sd;
+
+@@ -6000,7 +6036,7 @@ static void net_rps_action_and_irq_enabl
+ #ifdef CONFIG_RPS
+ struct softnet_data *remsd = sd->rps_ipi_list;
+
+- if (remsd) {
++ if (!use_backlog_threads() && remsd) {
+ sd->rps_ipi_list = NULL;
+
+ local_irq_enable();
+@@ -6015,7 +6051,7 @@ static void net_rps_action_and_irq_enabl
+ static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
+ {
+ #ifdef CONFIG_RPS
+- return sd->rps_ipi_list != NULL;
++ return !use_backlog_threads() && sd->rps_ipi_list;
+ #else
+ return false;
+ #endif
+@@ -6059,7 +6095,7 @@ static int process_backlog(struct napi_s
+ * We can use a plain write instead of clear_bit(),
+ * and we dont need an smp_mb() memory barrier.
+ */
+- napi->state = 0;
++ napi->state &= NAPIF_STATE_THREADED;
+ again = false;
+ } else {
+ skb_queue_splice_tail_init(&sd->input_pkt_queue,
+@@ -6725,43 +6761,48 @@ static void skb_defer_free_flush(struct
+ }
+ }
+
+-static int napi_threaded_poll(void *data)
++static void napi_threaded_poll_loop(struct napi_struct *napi)
+ {
+- struct napi_struct *napi = data;
+ struct softnet_data *sd;
+- void *have;
++ unsigned long last_qs = jiffies;
+
+- while (!napi_thread_wait(napi)) {
+- unsigned long last_qs = jiffies;
++ for (;;) {
++ bool repoll = false;
++ void *have;
+
+- for (;;) {
+- bool repoll = false;
++ local_bh_disable();
++ sd = this_cpu_ptr(&softnet_data);
++ sd->in_napi_threaded_poll = true;
+
+- local_bh_disable();
+- sd = this_cpu_ptr(&softnet_data);
+- sd->in_napi_threaded_poll = true;
+-
+- have = netpoll_poll_lock(napi);
+- __napi_poll(napi, &repoll);
+- netpoll_poll_unlock(have);
+-
+- sd->in_napi_threaded_poll = false;
+- barrier();
+-
+- if (sd_has_rps_ipi_waiting(sd)) {
+- local_irq_disable();
+- net_rps_action_and_irq_enable(sd);
+- }
+- skb_defer_free_flush(sd);
+- local_bh_enable();
++ have = netpoll_poll_lock(napi);
++ __napi_poll(napi, &repoll);
++ netpoll_poll_unlock(have);
++
++ sd->in_napi_threaded_poll = false;
++ barrier();
++
++ if (sd_has_rps_ipi_waiting(sd)) {
++ local_irq_disable();
++ net_rps_action_and_irq_enable(sd);
++ }
++ skb_defer_free_flush(sd);
++ local_bh_enable();
+
+- if (!repoll)
+- break;
++ if (!repoll)
++ break;
+
+- rcu_softirq_qs_periodic(last_qs);
+- cond_resched();
+- }
++ rcu_softirq_qs_periodic(last_qs);
++ cond_resched();
+ }
++}
++
++static int napi_threaded_poll(void *data)
++{
++ struct napi_struct *napi = data;
++
++ while (!napi_thread_wait(napi))
++ napi_threaded_poll_loop(napi);
++
+ return 0;
+ }
+
+@@ -11346,7 +11387,7 @@ static int dev_cpu_dead(unsigned int old
+
+ list_del_init(&napi->poll_list);
+ if (napi->poll == process_backlog)
+- napi->state = 0;
++ napi->state &= NAPIF_STATE_THREADED;
+ else
+ ____napi_schedule(sd, napi);
+ }
+@@ -11354,12 +11395,14 @@ static int dev_cpu_dead(unsigned int old
+ raise_softirq_irqoff(NET_TX_SOFTIRQ);
+ local_irq_enable();
+
++ if (!use_backlog_threads()) {
+ #ifdef CONFIG_RPS
+- remsd = oldsd->rps_ipi_list;
+- oldsd->rps_ipi_list = NULL;
++ remsd = oldsd->rps_ipi_list;
++ oldsd->rps_ipi_list = NULL;
+ #endif
+- /* send out pending IPI's on offline CPU */
+- net_rps_send_ipi(remsd);
++ /* send out pending IPI's on offline CPU */
++ net_rps_send_ipi(remsd);
++ }
+
+ /* Process offline CPU's input_pkt_queue */
+ while ((skb = __skb_dequeue(&oldsd->process_queue))) {
+@@ -11622,6 +11665,38 @@ static struct pernet_operations __net_in
+ *
+ */
+
++static int backlog_napi_should_run(unsigned int cpu)
++{
++ struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu);
++ struct napi_struct *napi = &sd->backlog;
++
++ return test_bit(NAPI_STATE_SCHED_THREADED, &napi->state);
++}
++
++static void run_backlog_napi(unsigned int cpu)
++{
++ struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu);
++
++ napi_threaded_poll_loop(&sd->backlog);
++}
++
++static void backlog_napi_setup(unsigned int cpu)
++{
++ struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu);
++ struct napi_struct *napi = &sd->backlog;
++
++ napi->thread = this_cpu_read(backlog_napi);
++ set_bit(NAPI_STATE_THREADED, &napi->state);
++}
++
++static struct smp_hotplug_thread backlog_threads = {
++ .store = &backlog_napi,
++ .thread_should_run = backlog_napi_should_run,
++ .thread_fn = run_backlog_napi,
++ .thread_comm = "backlog_napi/%u",
++ .setup = backlog_napi_setup,
++};
++
+ /*
+ * This is called single threaded during boot, so no need
+ * to take the rtnl semaphore.
+@@ -11672,7 +11747,10 @@ static int __init net_dev_init(void)
+ init_gro_hash(&sd->backlog);
+ sd->backlog.poll = process_backlog;
+ sd->backlog.weight = weight_p;
++ INIT_LIST_HEAD(&sd->backlog.poll_list);
+ }
++ if (use_backlog_threads())
++ smpboot_register_percpu_thread(&backlog_threads);
+
+ dev_boot_phase = 0;
+
--- /dev/null
+From stable+bounces-203472-greg=kroah.com@vger.kernel.org Mon Dec 29 08:54:02 2025
+From: wen.yang@linux.dev
+Date: Mon, 29 Dec 2025 15:53:16 +0800
+Subject: net: Remove conditional threaded-NAPI wakeup based on task state.
+To: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+Cc: stable@vger.kernel.org, linux-kernel@vger.kernel.org, Sebastian Andrzej Siewior <bigeasy@linutronix.de>, Jakub Kicinski <kuba@kernel.org>, Paolo Abeni <pabeni@redhat.com>, Wen Yang <wen.yang@linux.dev>
+Message-ID: <b530eb6ed51ef4ca7940dddd981de2878834fcef.1766987153.git.wen.yang@linux.dev>
+
+From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+
+commit 56364c910691f6d10ba88c964c9041b9ab777bd6 upstream.
+
+A NAPI thread is scheduled by first setting the NAPI_STATE_SCHED bit. If
+successful (the bit was not yet set) then NAPI_STATE_SCHED_THREADED is
+set, but only if the thread's state is not TASK_INTERRUPTIBLE (i.e. it is
+TASK_RUNNING), followed by a task wakeup.
+
+If the task is idle (TASK_INTERRUPTIBLE) then the
+NAPI_STATE_SCHED_THREADED bit is not set. The thread is not relying on
+the bit but always leaving the wait-loop after returning from schedule()
+because there must have been a wakeup.
+
+The smpboot-threads implementation for per-CPU threads requires an
+explicit condition and does not support "if we get out of schedule()
+then there must be something to do".
+
+Removing this optimisation simplifies the following integration.
+
+Set NAPI_STATE_SCHED_THREADED unconditionally on wakeup and rely on it
+in the wait path by removing the `woken' condition.
+
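+After the change, the scheduling side and the wait side pair up on the
+bit alone (simplified sketch):
+
+	/* ____napi_schedule() */
+	set_bit(NAPI_STATE_SCHED_THREADED, &napi->state);
+	wake_up_process(thread);
+
+	/* napi_thread_wait() */
+	set_current_state(TASK_INTERRUPTIBLE);
+	while (!kthread_should_stop()) {
+		if (test_bit(NAPI_STATE_SCHED_THREADED, &napi->state)) {
+			__set_current_state(TASK_RUNNING);
+			return 0;
+		}
+		schedule();
+		set_current_state(TASK_INTERRUPTIBLE);
+	}
+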
+Acked-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Signed-off-by: Wen Yang <wen.yang@linux.dev>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/core/dev.c | 14 ++------------
+ 1 file changed, 2 insertions(+), 12 deletions(-)
+
+--- a/net/core/dev.c
++++ b/net/core/dev.c
+@@ -4526,13 +4526,7 @@ static inline void ____napi_schedule(str
+ */
+ thread = READ_ONCE(napi->thread);
+ if (thread) {
+- /* Avoid doing set_bit() if the thread is in
+- * INTERRUPTIBLE state, cause napi_thread_wait()
+- * makes sure to proceed with napi polling
+- * if the thread is explicitly woken from here.
+- */
+- if (READ_ONCE(thread->__state) != TASK_INTERRUPTIBLE)
+- set_bit(NAPI_STATE_SCHED_THREADED, &napi->state);
++ set_bit(NAPI_STATE_SCHED_THREADED, &napi->state);
+ wake_up_process(thread);
+ return;
+ }
+@@ -6688,8 +6682,6 @@ static int napi_poll(struct napi_struct
+
+ static int napi_thread_wait(struct napi_struct *napi)
+ {
+- bool woken = false;
+-
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ while (!kthread_should_stop()) {
+@@ -6698,15 +6690,13 @@ static int napi_thread_wait(struct napi_
+ * Testing SCHED bit is not enough because SCHED bit might be
+ * set by some other busy poll thread or by napi_disable().
+ */
+- if (test_bit(NAPI_STATE_SCHED_THREADED, &napi->state) || woken) {
++ if (test_bit(NAPI_STATE_SCHED_THREADED, &napi->state)) {
+ WARN_ON(!list_empty(&napi->poll_list));
+ __set_current_state(TASK_RUNNING);
+ return 0;
+ }
+
+ schedule();
+- /* woken being true indicates this thread owns this napi. */
+- woken = true;
+ set_current_state(TASK_INTERRUPTIBLE);
+ }
+ __set_current_state(TASK_RUNNING);
--- /dev/null
+From d0706bfd3ee40923c001c6827b786a309e2a8713 Mon Sep 17 00:00:00 2001
+From: Zhu Yanjun <yanjun.zhu@linux.dev>
+Date: Tue, 6 May 2025 17:10:08 +0200
+Subject: RDMA/core: Fix "KASAN: slab-use-after-free Read in ib_register_device" problem
+
+From: Zhu Yanjun <yanjun.zhu@linux.dev>
+
+commit d0706bfd3ee40923c001c6827b786a309e2a8713 upstream.
+
+Call Trace:
+
+ __dump_stack lib/dump_stack.c:94 [inline]
+ dump_stack_lvl+0x116/0x1f0 lib/dump_stack.c:120
+ print_address_description mm/kasan/report.c:408 [inline]
+ print_report+0xc3/0x670 mm/kasan/report.c:521
+ kasan_report+0xe0/0x110 mm/kasan/report.c:634
+ strlen+0x93/0xa0 lib/string.c:420
+ __fortify_strlen include/linux/fortify-string.h:268 [inline]
+ get_kobj_path_length lib/kobject.c:118 [inline]
+ kobject_get_path+0x3f/0x2a0 lib/kobject.c:158
+ kobject_uevent_env+0x289/0x1870 lib/kobject_uevent.c:545
+ ib_register_device drivers/infiniband/core/device.c:1472 [inline]
+ ib_register_device+0x8cf/0xe00 drivers/infiniband/core/device.c:1393
+ rxe_register_device+0x275/0x320 drivers/infiniband/sw/rxe/rxe_verbs.c:1552
+ rxe_net_add+0x8e/0xe0 drivers/infiniband/sw/rxe/rxe_net.c:550
+ rxe_newlink+0x70/0x190 drivers/infiniband/sw/rxe/rxe.c:225
+ nldev_newlink+0x3a3/0x680 drivers/infiniband/core/nldev.c:1796
+ rdma_nl_rcv_msg+0x387/0x6e0 drivers/infiniband/core/netlink.c:195
+ rdma_nl_rcv_skb.constprop.0.isra.0+0x2e5/0x450
+ netlink_unicast_kernel net/netlink/af_netlink.c:1313 [inline]
+ netlink_unicast+0x53a/0x7f0 net/netlink/af_netlink.c:1339
+ netlink_sendmsg+0x8d1/0xdd0 net/netlink/af_netlink.c:1883
+ sock_sendmsg_nosec net/socket.c:712 [inline]
+ __sock_sendmsg net/socket.c:727 [inline]
+ ____sys_sendmsg+0xa95/0xc70 net/socket.c:2566
+ ___sys_sendmsg+0x134/0x1d0 net/socket.c:2620
+ __sys_sendmsg+0x16d/0x220 net/socket.c:2652
+ do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline]
+ do_syscall_64+0xcd/0x260 arch/x86/entry/syscall_64.c:94
+ entry_SYSCALL_64_after_hwframe+0x77/0x7f
+
+This problem is similar to the problem that the
+commit 1d6a9e7449e2 ("RDMA/core: Fix use-after-free when rename device name")
+fixes.
+
+The root cause is: the function ib_device_rename() renames the name with
+the lock held. But in the function kobject_uevent(), this name is accessed
+without lock protection at the same time.
+
+The solution is to add the lock protection when this name is accessed in
+the function kobject_uevent().
+
+Fixes: 779e0bf47632 ("RDMA/core: Do not indicate device ready when device enablement fails")
+Link: https://patch.msgid.link/r/20250506151008.75701-1-yanjun.zhu@linux.dev
+Reported-by: syzbot+e2ce9e275ecc70a30b72@syzkaller.appspotmail.com
+Closes: https://syzkaller.appspot.com/bug?extid=e2ce9e275ecc70a30b72
+Signed-off-by: Zhu Yanjun <yanjun.zhu@linux.dev>
+Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+[ Ajay: Modified to apply on v5.10.y-v6.6.y
+ ib_device_notify_register() not present in v5.10.y-v6.6.y,
+ so directly added lock for kobject_uevent() ]
+Signed-off-by: Ajay Kaher <ajay.kaher@broadcom.com>
+Signed-off-by: Shivani Agarwal <shivani.agarwal@broadcom.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/infiniband/core/device.c | 5 +++++
+ 1 file changed, 5 insertions(+)
+
+--- a/drivers/infiniband/core/device.c
++++ b/drivers/infiniband/core/device.c
+@@ -1450,8 +1450,13 @@ int ib_register_device(struct ib_device
+ return ret;
+ }
+ dev_set_uevent_suppress(&device->dev, false);
++
++ down_read(&devices_rwsem);
++
+ /* Mark for userspace that device is ready */
+ kobject_uevent(&device->dev.kobj, KOBJ_ADD);
++
++ up_read(&devices_rwsem);
+ ib_device_put(device);
+
+ return 0;
--- /dev/null
+From shivani.agarwal@broadcom.com Thu Jan 8 11:26:51 2026
+From: Shivani Agarwal <shivani.agarwal@broadcom.com>
+Date: Thu, 8 Jan 2026 02:05:40 -0800
+Subject: RDMA/rxe: Fix the failure of ibv_query_device() and ibv_query_device_ex() tests
+To: stable@vger.kernel.org, gregkh@linuxfoundation.org
+Cc: jgg@ziepe.ca, leon@kernel.org, zyjzyj2000@gmail.com, mbloch@nvidia.com, parav@nvidia.com, mrgolin@amazon.com, roman.gushchin@linux.dev, wangliang74@huawei.com, marco.crivellari@suse.com, zhao.xichao@vivo.com, haggaie@mellanox.com, monis@mellanox.com, dledford@redhat.com, amirv@mellanox.com, kamalh@mellanox.com, linux-rdma@vger.kernel.org, linux-kernel@vger.kernel.org, ajay.kaher@broadcom.com, alexey.makhalov@broadcom.com, vamsi-krishna.brahmajosyula@broadcom.com, yin.ding@broadcom.com, tapas.kundu@broadcom.com, Zhu Yanjun <yanjun.zhu@linux.dev>, Daisuke Matsuda <matsuda-daisuke@fujitsu.com>, Sasha Levin <sashal@kernel.org>, Shivani Agarwal <shivani.agarwal@broadcom.com>
+Message-ID: <20260108100540.672666-3-shivani.agarwal@broadcom.com>
+
+From: Zhu Yanjun <yanjun.zhu@linux.dev>
+
+[ Upstream commit 8ce2eb9dfac8743d1c423b86339336a5b6a6069e ]
+
+In rdma-core, the following failures appear.
+
+"
+$ ./build/bin/run_tests.py -k device
+ssssssss....FF........s
+======================================================================
+FAIL: test_query_device (tests.test_device.DeviceTest.test_query_device)
+Test ibv_query_device()
+----------------------------------------------------------------------
+Traceback (most recent call last):
+ File "/home/ubuntu/rdma-core/tests/test_device.py", line 63, in
+ test_query_device
+ self.verify_device_attr(attr, dev)
+ File "/home/ubuntu/rdma-core/tests/test_device.py", line 200, in
+ verify_device_attr
+ assert attr.sys_image_guid != 0
+ ^^^^^^^^^^^^^^^^^^^^^^^^
+AssertionError
+
+======================================================================
+FAIL: test_query_device_ex (tests.test_device.DeviceTest.test_query_device_ex)
+Test ibv_query_device_ex()
+----------------------------------------------------------------------
+Traceback (most recent call last):
+ File "/home/ubuntu/rdma-core/tests/test_device.py", line 222, in
+ test_query_device_ex
+ self.verify_device_attr(attr_ex.orig_attr, dev)
+ File "/home/ubuntu/rdma-core/tests/test_device.py", line 200, in
+ verify_device_attr
+ assert attr.sys_image_guid != 0
+ ^^^^^^^^^^^^^^^^^^^^^^^^
+AssertionError
+"
+
+The root cause is: before a net device is set with rxe, this net device
+is used to generate a sys_image_guid.
+
+Fixes: 2ac5415022d1 ("RDMA/rxe: Remove the direct link to net_device")
+Signed-off-by: Zhu Yanjun <yanjun.zhu@linux.dev>
+Link: https://patch.msgid.link/20250302215444.3742072-1-yanjun.zhu@linux.dev
+Reviewed-by: Daisuke Matsuda <matsuda-daisuke@fujitsu.com>
+Tested-by: Daisuke Matsuda <matsuda-daisuke@fujitsu.com>
+Signed-off-by: Leon Romanovsky <leon@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+[Shivani: Modified to apply on 6.6.y]
+Signed-off-by: Shivani Agarwal <shivani.agarwal@broadcom.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/infiniband/sw/rxe/rxe.c | 25 ++++++-------------------
+ 1 file changed, 6 insertions(+), 19 deletions(-)
+
+--- a/drivers/infiniband/sw/rxe/rxe.c
++++ b/drivers/infiniband/sw/rxe/rxe.c
+@@ -38,10 +38,8 @@ void rxe_dealloc(struct ib_device *ib_de
+ }
+
+ /* initialize rxe device parameters */
+-static void rxe_init_device_param(struct rxe_dev *rxe)
++static void rxe_init_device_param(struct rxe_dev *rxe, struct net_device *ndev)
+ {
+- struct net_device *ndev;
+-
+ rxe->max_inline_data = RXE_MAX_INLINE_DATA;
+
+ rxe->attr.vendor_id = RXE_VENDOR_ID;
+@@ -74,15 +72,9 @@ static void rxe_init_device_param(struct
+ rxe->attr.max_pkeys = RXE_MAX_PKEYS;
+ rxe->attr.local_ca_ack_delay = RXE_LOCAL_CA_ACK_DELAY;
+
+- ndev = rxe_ib_device_get_netdev(&rxe->ib_dev);
+- if (!ndev)
+- return;
+-
+ addrconf_addr_eui48((unsigned char *)&rxe->attr.sys_image_guid,
+ ndev->dev_addr);
+
+- dev_put(ndev);
+-
+ rxe->max_ucontext = RXE_MAX_UCONTEXT;
+ }
+
+@@ -115,18 +107,13 @@ static void rxe_init_port_param(struct r
+ /* initialize port state, note IB convention that HCA ports are always
+ * numbered from 1
+ */
+-static void rxe_init_ports(struct rxe_dev *rxe)
++static void rxe_init_ports(struct rxe_dev *rxe, struct net_device *ndev)
+ {
+ struct rxe_port *port = &rxe->port;
+- struct net_device *ndev;
+
+ rxe_init_port_param(port);
+- ndev = rxe_ib_device_get_netdev(&rxe->ib_dev);
+- if (!ndev)
+- return;
+ addrconf_addr_eui48((unsigned char *)&port->port_guid,
+ ndev->dev_addr);
+- dev_put(ndev);
+ spin_lock_init(&port->port_lock);
+ }
+
+@@ -144,12 +131,12 @@ static void rxe_init_pools(struct rxe_de
+ }
+
+ /* initialize rxe device state */
+-static void rxe_init(struct rxe_dev *rxe)
++static void rxe_init(struct rxe_dev *rxe, struct net_device *ndev)
+ {
+ /* init default device parameters */
+- rxe_init_device_param(rxe);
++ rxe_init_device_param(rxe, ndev);
+
+- rxe_init_ports(rxe);
++ rxe_init_ports(rxe, ndev);
+ rxe_init_pools(rxe);
+
+ /* init pending mmap list */
+@@ -186,7 +173,7 @@ void rxe_set_mtu(struct rxe_dev *rxe, un
+ int rxe_add(struct rxe_dev *rxe, unsigned int mtu, const char *ibdev_name,
+ struct net_device *ndev)
+ {
+- rxe_init(rxe);
++ rxe_init(rxe, ndev);
+ rxe_set_mtu(rxe, mtu);
+
+ return rxe_register_device(rxe, ibdev_name, ndev);
--- /dev/null
+From stable+bounces-206303-greg=kroah.com@vger.kernel.org Thu Jan 8 11:31:56 2026
+From: Shivani Agarwal <shivani.agarwal@broadcom.com>
+Date: Thu, 8 Jan 2026 02:05:39 -0800
+Subject: RDMA/rxe: Remove the direct link to net_device
+To: stable@vger.kernel.org, gregkh@linuxfoundation.org
+Cc: jgg@ziepe.ca, leon@kernel.org, zyjzyj2000@gmail.com, mbloch@nvidia.com, parav@nvidia.com, mrgolin@amazon.com, roman.gushchin@linux.dev, wangliang74@huawei.com, marco.crivellari@suse.com, zhao.xichao@vivo.com, haggaie@mellanox.com, monis@mellanox.com, dledford@redhat.com, amirv@mellanox.com, kamalh@mellanox.com, linux-rdma@vger.kernel.org, linux-kernel@vger.kernel.org, ajay.kaher@broadcom.com, alexey.makhalov@broadcom.com, vamsi-krishna.brahmajosyula@broadcom.com, yin.ding@broadcom.com, tapas.kundu@broadcom.com, Zhu Yanjun <yanjun.zhu@linux.dev>, syzbot+4b87489410b4efd181bf@syzkaller.appspotmail.com, Sasha Levin <sashal@kernel.org>, Shivani Agarwal <shivani.agarwal@broadcom.com>
+Message-ID: <20260108100540.672666-2-shivani.agarwal@broadcom.com>
+
+From: Zhu Yanjun <yanjun.zhu@linux.dev>
+
+[ Upstream commit 2ac5415022d16d63d912a39a06f32f1f51140261 ]
+
+A similar patch for siw is available at:
+https://git.kernel.org/rdma/rdma/c/16b87037b48889
+
+The same problem also occurs in RXE. The following analyzes it, based on
+the call trace below:
+"
+BUG: KASAN: slab-use-after-free in dev_get_flags+0x188/0x1d0 net/core/dev.c:8782
+Read of size 4 at addr ffff8880554640b0 by task kworker/1:4/5295
+
+CPU: 1 UID: 0 PID: 5295 Comm: kworker/1:4 Not tainted
+6.12.0-rc3-syzkaller-00399-g9197b73fd7bb #0
+Hardware name: Google Compute Engine/Google Compute Engine,
+BIOS Google 09/13/2024
+Workqueue: infiniband ib_cache_event_task
+Call Trace:
+ <TASK>
+ __dump_stack lib/dump_stack.c:94 [inline]
+ dump_stack_lvl+0x241/0x360 lib/dump_stack.c:120
+ print_address_description mm/kasan/report.c:377 [inline]
+ print_report+0x169/0x550 mm/kasan/report.c:488
+ kasan_report+0x143/0x180 mm/kasan/report.c:601
+ dev_get_flags+0x188/0x1d0 net/core/dev.c:8782
+ rxe_query_port+0x12d/0x260 drivers/infiniband/sw/rxe/rxe_verbs.c:60
+ __ib_query_port drivers/infiniband/core/device.c:2111 [inline]
+ ib_query_port+0x168/0x7d0 drivers/infiniband/core/device.c:2143
+ ib_cache_update+0x1a9/0xb80 drivers/infiniband/core/cache.c:1494
+ ib_cache_event_task+0xf3/0x1e0 drivers/infiniband/core/cache.c:1568
+ process_one_work kernel/workqueue.c:3229 [inline]
+ process_scheduled_works+0xa65/0x1850 kernel/workqueue.c:3310
+ worker_thread+0x870/0xd30 kernel/workqueue.c:3391
+ kthread+0x2f2/0x390 kernel/kthread.c:389
+ ret_from_fork+0x4d/0x80 arch/x86/kernel/process.c:147
+ ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:244
+ </TASK>
+"
+
+1). In the link [1],
+
+"
+ infiniband syz2: set down
+"
+
+This means that at 839.350575, the event ib_cache_event_task was sent and
+queued in ib_wq.
+
+2). In the link [1],
+
+"
+ team0 (unregistering): Port device team_slave_0 removed
+"
+
+It indicates that the net device had already been freed before 843.251853.
+
+3). In the link [1],
+
+"
+ BUG: KASAN: slab-use-after-free in dev_get_flags+0x188/0x1d0
+"
+
+This means that at 850.559070, the slab-use-after-free occurred.
+
+In summary: at 839.350575 the event ib_cache_event_task was sent and
+queued in ib_wq; before 843.251853 the net device veth was freed; at
+850.559070 the queued event was executed and dereferenced the
+already-freed net device. Thus the call trace above occurred.
+
+[1] https://syzkaller.appspot.com/x/log.txt?x=12e7025f980000
+
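+The core of the fix is to stop caching a raw net_device pointer in
+struct rxe_dev and instead look the netdev up, with a reference held,
+around each access. A minimal sketch of the resulting pattern (the
+helper rxe_example_user() is hypothetical and only illustrates the
+shape; rxe_ib_device_get_netdev() is the wrapper added by this patch):
+
+  static int rxe_example_user(struct rxe_dev *rxe, unsigned char *guid)
+  {
+          struct net_device *ndev;
+
+          /* Takes a reference; returns NULL if the netdev is already gone. */
+          ndev = rxe_ib_device_get_netdev(&rxe->ib_dev);
+          if (!ndev)
+                  return -ENODEV;
+
+          addrconf_addr_eui48(guid, ndev->dev_addr);
+
+          dev_put(ndev);  /* drop the reference taken above */
+          return 0;
+  }
+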
+Reported-by: syzbot+4b87489410b4efd181bf@syzkaller.appspotmail.com
+Closes: https://syzkaller.appspot.com/bug?extid=4b87489410b4efd181bf
+Fixes: 8700e3e7c485 ("Soft RoCE driver")
+Signed-off-by: Zhu Yanjun <yanjun.zhu@linux.dev>
+Link: https://patch.msgid.link/20241220222325.2487767-1-yanjun.zhu@linux.dev
+Signed-off-by: Leon Romanovsky <leon@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+[Shivani: - exported ib_device_get_netdev() function.
+ - added ib_device_get_netdev() to ib_verbs.h.]
+Signed-off-by: Shivani Agarwal <shivani.agarwal@broadcom.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/infiniband/core/device.c | 1 +
+ drivers/infiniband/sw/rxe/rxe.c | 23 +++++++++++++++++++----
+ drivers/infiniband/sw/rxe/rxe.h | 3 ++-
+ drivers/infiniband/sw/rxe/rxe_mcast.c | 22 ++++++++++++++++++++--
+ drivers/infiniband/sw/rxe/rxe_net.c | 25 ++++++++++++++++++++-----
+ drivers/infiniband/sw/rxe/rxe_verbs.c | 26 +++++++++++++++++++++-----
+ drivers/infiniband/sw/rxe/rxe_verbs.h | 11 ++++++++---
+ include/rdma/ib_verbs.h | 2 ++
+ 8 files changed, 93 insertions(+), 20 deletions(-)
+
+--- a/drivers/infiniband/core/device.c
++++ b/drivers/infiniband/core/device.c
+@@ -2259,6 +2259,7 @@ struct net_device *ib_device_get_netdev(
+
+ return res;
+ }
++EXPORT_SYMBOL(ib_device_get_netdev);
+
+ /**
+ * ib_device_get_by_netdev - Find an IB device associated with a netdev
+--- a/drivers/infiniband/sw/rxe/rxe.c
++++ b/drivers/infiniband/sw/rxe/rxe.c
+@@ -40,6 +40,8 @@ void rxe_dealloc(struct ib_device *ib_de
+ /* initialize rxe device parameters */
+ static void rxe_init_device_param(struct rxe_dev *rxe)
+ {
++ struct net_device *ndev;
++
+ rxe->max_inline_data = RXE_MAX_INLINE_DATA;
+
+ rxe->attr.vendor_id = RXE_VENDOR_ID;
+@@ -71,8 +73,15 @@ static void rxe_init_device_param(struct
+ rxe->attr.max_fast_reg_page_list_len = RXE_MAX_FMR_PAGE_LIST_LEN;
+ rxe->attr.max_pkeys = RXE_MAX_PKEYS;
+ rxe->attr.local_ca_ack_delay = RXE_LOCAL_CA_ACK_DELAY;
++
++ ndev = rxe_ib_device_get_netdev(&rxe->ib_dev);
++ if (!ndev)
++ return;
++
+ addrconf_addr_eui48((unsigned char *)&rxe->attr.sys_image_guid,
+- rxe->ndev->dev_addr);
++ ndev->dev_addr);
++
++ dev_put(ndev);
+
+ rxe->max_ucontext = RXE_MAX_UCONTEXT;
+ }
+@@ -109,10 +118,15 @@ static void rxe_init_port_param(struct r
+ static void rxe_init_ports(struct rxe_dev *rxe)
+ {
+ struct rxe_port *port = &rxe->port;
++ struct net_device *ndev;
+
+ rxe_init_port_param(port);
++ ndev = rxe_ib_device_get_netdev(&rxe->ib_dev);
++ if (!ndev)
++ return;
+ addrconf_addr_eui48((unsigned char *)&port->port_guid,
+- rxe->ndev->dev_addr);
++ ndev->dev_addr);
++ dev_put(ndev);
+ spin_lock_init(&port->port_lock);
+ }
+
+@@ -169,12 +183,13 @@ void rxe_set_mtu(struct rxe_dev *rxe, un
+ /* called by ifc layer to create new rxe device.
+ * The caller should allocate memory for rxe by calling ib_alloc_device.
+ */
+-int rxe_add(struct rxe_dev *rxe, unsigned int mtu, const char *ibdev_name)
++int rxe_add(struct rxe_dev *rxe, unsigned int mtu, const char *ibdev_name,
++ struct net_device *ndev)
+ {
+ rxe_init(rxe);
+ rxe_set_mtu(rxe, mtu);
+
+- return rxe_register_device(rxe, ibdev_name);
++ return rxe_register_device(rxe, ibdev_name, ndev);
+ }
+
+ static int rxe_newlink(const char *ibdev_name, struct net_device *ndev)
+--- a/drivers/infiniband/sw/rxe/rxe.h
++++ b/drivers/infiniband/sw/rxe/rxe.h
+@@ -139,7 +139,8 @@ enum resp_states {
+
+ void rxe_set_mtu(struct rxe_dev *rxe, unsigned int dev_mtu);
+
+-int rxe_add(struct rxe_dev *rxe, unsigned int mtu, const char *ibdev_name);
++int rxe_add(struct rxe_dev *rxe, unsigned int mtu, const char *ibdev_name,
++ struct net_device *ndev);
+
+ void rxe_rcv(struct sk_buff *skb);
+
+--- a/drivers/infiniband/sw/rxe/rxe_mcast.c
++++ b/drivers/infiniband/sw/rxe/rxe_mcast.c
+@@ -31,10 +31,19 @@
+ static int rxe_mcast_add(struct rxe_dev *rxe, union ib_gid *mgid)
+ {
+ unsigned char ll_addr[ETH_ALEN];
++ struct net_device *ndev;
++ int ret;
++
++ ndev = rxe_ib_device_get_netdev(&rxe->ib_dev);
++ if (!ndev)
++ return -ENODEV;
+
+ ipv6_eth_mc_map((struct in6_addr *)mgid->raw, ll_addr);
+
+- return dev_mc_add(rxe->ndev, ll_addr);
++ ret = dev_mc_add(ndev, ll_addr);
++ dev_put(ndev);
++
++ return ret;
+ }
+
+ /**
+@@ -47,10 +56,19 @@ static int rxe_mcast_add(struct rxe_dev
+ static int rxe_mcast_del(struct rxe_dev *rxe, union ib_gid *mgid)
+ {
+ unsigned char ll_addr[ETH_ALEN];
++ struct net_device *ndev;
++ int ret;
++
++ ndev = rxe_ib_device_get_netdev(&rxe->ib_dev);
++ if (!ndev)
++ return -ENODEV;
+
+ ipv6_eth_mc_map((struct in6_addr *)mgid->raw, ll_addr);
+
+- return dev_mc_del(rxe->ndev, ll_addr);
++ ret = dev_mc_del(ndev, ll_addr);
++ dev_put(ndev);
++
++ return ret;
+ }
+
+ /**
+--- a/drivers/infiniband/sw/rxe/rxe_net.c
++++ b/drivers/infiniband/sw/rxe/rxe_net.c
+@@ -509,7 +509,16 @@ out:
+ */
+ const char *rxe_parent_name(struct rxe_dev *rxe, unsigned int port_num)
+ {
+- return rxe->ndev->name;
++ struct net_device *ndev;
++ char *ndev_name;
++
++ ndev = rxe_ib_device_get_netdev(&rxe->ib_dev);
++ if (!ndev)
++ return NULL;
++ ndev_name = ndev->name;
++ dev_put(ndev);
++
++ return ndev_name;
+ }
+
+ int rxe_net_add(const char *ibdev_name, struct net_device *ndev)
+@@ -521,9 +530,7 @@ int rxe_net_add(const char *ibdev_name,
+ if (!rxe)
+ return -ENOMEM;
+
+- rxe->ndev = ndev;
+-
+- err = rxe_add(rxe, ndev->mtu, ibdev_name);
++ err = rxe_add(rxe, ndev->mtu, ibdev_name, ndev);
+ if (err) {
+ ib_dealloc_device(&rxe->ib_dev);
+ return err;
+@@ -571,10 +578,18 @@ void rxe_port_down(struct rxe_dev *rxe)
+
+ void rxe_set_port_state(struct rxe_dev *rxe)
+ {
+- if (netif_running(rxe->ndev) && netif_carrier_ok(rxe->ndev))
++ struct net_device *ndev;
++
++ ndev = rxe_ib_device_get_netdev(&rxe->ib_dev);
++ if (!ndev)
++ return;
++
++ if (netif_running(ndev) && netif_carrier_ok(ndev))
+ rxe_port_up(rxe);
+ else
+ rxe_port_down(rxe);
++
++ dev_put(ndev);
+ }
+
+ static int rxe_notify(struct notifier_block *not_blk,
+--- a/drivers/infiniband/sw/rxe/rxe_verbs.c
++++ b/drivers/infiniband/sw/rxe/rxe_verbs.c
+@@ -41,6 +41,7 @@ static int rxe_query_port(struct ib_devi
+ u32 port_num, struct ib_port_attr *attr)
+ {
+ struct rxe_dev *rxe = to_rdev(ibdev);
++ struct net_device *ndev;
+ int err, ret;
+
+ if (port_num != 1) {
+@@ -51,19 +52,26 @@ static int rxe_query_port(struct ib_devi
+
+ memcpy(attr, &rxe->port.attr, sizeof(*attr));
+
++ ndev = rxe_ib_device_get_netdev(ibdev);
++ if (!ndev) {
++ err = -ENODEV;
++ goto err_out;
++ }
++
+ mutex_lock(&rxe->usdev_lock);
+ ret = ib_get_eth_speed(ibdev, port_num, &attr->active_speed,
+ &attr->active_width);
+
+ if (attr->state == IB_PORT_ACTIVE)
+ attr->phys_state = IB_PORT_PHYS_STATE_LINK_UP;
+- else if (dev_get_flags(rxe->ndev) & IFF_UP)
++ else if (dev_get_flags(ndev) & IFF_UP)
+ attr->phys_state = IB_PORT_PHYS_STATE_POLLING;
+ else
+ attr->phys_state = IB_PORT_PHYS_STATE_DISABLED;
+
+ mutex_unlock(&rxe->usdev_lock);
+
++ dev_put(ndev);
+ return ret;
+
+ err_out:
+@@ -1428,9 +1436,16 @@ static const struct attribute_group rxe_
+ static int rxe_enable_driver(struct ib_device *ib_dev)
+ {
+ struct rxe_dev *rxe = container_of(ib_dev, struct rxe_dev, ib_dev);
++ struct net_device *ndev;
++
++ ndev = rxe_ib_device_get_netdev(ib_dev);
++ if (!ndev)
++ return -ENODEV;
+
+ rxe_set_port_state(rxe);
+- dev_info(&rxe->ib_dev.dev, "added %s\n", netdev_name(rxe->ndev));
++ dev_info(&rxe->ib_dev.dev, "added %s\n", netdev_name(ndev));
++
++ dev_put(ndev);
+ return 0;
+ }
+
+@@ -1498,7 +1513,8 @@ static const struct ib_device_ops rxe_de
+ INIT_RDMA_OBJ_SIZE(ib_mw, rxe_mw, ibmw),
+ };
+
+-int rxe_register_device(struct rxe_dev *rxe, const char *ibdev_name)
++int rxe_register_device(struct rxe_dev *rxe, const char *ibdev_name,
++ struct net_device *ndev)
+ {
+ int err;
+ struct ib_device *dev = &rxe->ib_dev;
+@@ -1510,13 +1526,13 @@ int rxe_register_device(struct rxe_dev *
+ dev->num_comp_vectors = num_possible_cpus();
+ dev->local_dma_lkey = 0;
+ addrconf_addr_eui48((unsigned char *)&dev->node_guid,
+- rxe->ndev->dev_addr);
++ ndev->dev_addr);
+
+ dev->uverbs_cmd_mask |= BIT_ULL(IB_USER_VERBS_CMD_POST_SEND) |
+ BIT_ULL(IB_USER_VERBS_CMD_REQ_NOTIFY_CQ);
+
+ ib_set_device_ops(dev, &rxe_dev_ops);
+- err = ib_device_set_netdev(&rxe->ib_dev, rxe->ndev, 1);
++ err = ib_device_set_netdev(&rxe->ib_dev, ndev, 1);
+ if (err)
+ return err;
+
+--- a/drivers/infiniband/sw/rxe/rxe_verbs.h
++++ b/drivers/infiniband/sw/rxe/rxe_verbs.h
+@@ -369,6 +369,7 @@ struct rxe_port {
+ u32 qp_gsi_index;
+ };
+
++#define RXE_PORT 1
+ struct rxe_dev {
+ struct ib_device ib_dev;
+ struct ib_device_attr attr;
+@@ -376,8 +377,6 @@ struct rxe_dev {
+ int max_inline_data;
+ struct mutex usdev_lock;
+
+- struct net_device *ndev;
+-
+ struct rxe_pool uc_pool;
+ struct rxe_pool pd_pool;
+ struct rxe_pool ah_pool;
+@@ -405,6 +404,11 @@ struct rxe_dev {
+ struct crypto_shash *tfm;
+ };
+
++static inline struct net_device *rxe_ib_device_get_netdev(struct ib_device *dev)
++{
++ return ib_device_get_netdev(dev, RXE_PORT);
++}
++
+ static inline void rxe_counter_inc(struct rxe_dev *rxe, enum rxe_counters index)
+ {
+ atomic64_inc(&rxe->stats_counters[index]);
+@@ -470,6 +474,7 @@ static inline struct rxe_pd *rxe_mw_pd(s
+ return to_rpd(mw->ibmw.pd);
+ }
+
+-int rxe_register_device(struct rxe_dev *rxe, const char *ibdev_name);
++int rxe_register_device(struct rxe_dev *rxe, const char *ibdev_name,
++ struct net_device *ndev);
+
+ #endif /* RXE_VERBS_H */
+--- a/include/rdma/ib_verbs.h
++++ b/include/rdma/ib_verbs.h
+@@ -4444,6 +4444,8 @@ struct net_device *ib_get_net_dev_by_par
+ const struct sockaddr *addr);
+ int ib_device_set_netdev(struct ib_device *ib_dev, struct net_device *ndev,
+ unsigned int port);
++struct net_device *ib_device_get_netdev(struct ib_device *ib_dev,
++ u32 port);
+ struct ib_wq *ib_create_wq(struct ib_pd *pd,
+ struct ib_wq_init_attr *init_attr);
+ int ib_destroy_wq_user(struct ib_wq *wq, struct ib_udata *udata);
--- /dev/null
+From stable+bounces-198201-greg=kroah.com@vger.kernel.org Wed Dec 3 12:41:03 2025
+From: Ajay Kaher <ajay.kaher@broadcom.com>
+Date: Wed, 3 Dec 2025 11:22:55 +0000
+Subject: sched/fair: Proportional newidle balance
+To: stable@vger.kernel.org, gregkh@linuxfoundation.org
+Cc: mingo@redhat.com, peterz@infradead.org, juri.lelli@redhat.com, vincent.guittot@linaro.org, dietmar.eggemann@arm.com, rostedt@goodmis.org, bsegall@google.com, mgorman@suse.de, vschneid@redhat.com, linux-kernel@vger.kernel.org, ajay.kaher@broadcom.com, alexey.makhalov@broadcom.com, yin.ding@broadcom.com, tapas.kundu@broadcom.com, Chris Mason <clm@meta.com>
+Message-ID: <20251203112255.1738272-5-ajay.kaher@broadcom.com>
+
+From: Peter Zijlstra (Intel) <peterz@infradead.org>
+
+commit 33cf66d88306663d16e4759e9d24766b0aaa2e17 upstream.
+
+Add a randomized algorithm that runs newidle balancing proportional to
+its success rate.
+
+This improves schbench significantly:
+
+ 6.18-rc4: 2.22 Mrps/s
+ 6.18-rc4+revert: 2.04 Mrps/s
+ 6.18-rc4+revert+random: 2.18 Mrps/s
+
+Conversely, per Adam Li this affects SpecJBB slightly, reducing it by 1%:
+
+ 6.17: -6%
+ 6.17+revert: 0%
+ 6.17+revert+random: -1%
+
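+The idea in condensed form (field and helper names follow the diff
+below; this is an illustrative sketch, not a literal copy of the hunk):
+
+  /* newidle_ratio ~= successes per 1024 recent newidle attempts */
+  static bool newidle_should_run(struct sched_domain *sd)
+  {
+          u32 d1k = sched_rng() % 1024;           /* 1024-sided die */
+          u32 weight = 1 + sd->newidle_ratio;     /* never drops to zero */
+
+          return d1k <= weight;                   /* run ~ratio/1024 of the time */
+  }
+
+When the roll skips the balance, the attempt is recorded as a failure;
+when it does run, a successful pull is weighted up by roughly
+1024/weight so the tracked ratio keeps following the true success rate.
+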
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Reviewed-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
+Tested-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
+Tested-by: Chris Mason <clm@meta.com>
+Link: https://lkml.kernel.org/r/6825c50d-7fa7-45d8-9b81-c6e7e25738e2@meta.com
+Link: https://patch.msgid.link/20251107161739.770122091@infradead.org
+[ Ajay: Modified to apply on v6.6 ]
+Signed-off-by: Ajay Kaher <ajay.kaher@broadcom.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/sched/topology.h | 3 ++
+ kernel/sched/core.c | 3 ++
+ kernel/sched/fair.c | 44 +++++++++++++++++++++++++++++++++++++----
+ kernel/sched/features.h | 5 ++++
+ kernel/sched/sched.h | 7 ++++++
+ kernel/sched/topology.c | 6 +++++
+ 6 files changed, 64 insertions(+), 4 deletions(-)
+
+--- a/include/linux/sched/topology.h
++++ b/include/linux/sched/topology.h
+@@ -106,6 +106,9 @@ struct sched_domain {
+ unsigned int nr_balance_failed; /* initialise to 0 */
+
+ /* idle_balance() stats */
++ unsigned int newidle_call;
++ unsigned int newidle_success;
++ unsigned int newidle_ratio;
+ u64 max_newidle_lb_cost;
+ unsigned long last_decay_max_lb_cost;
+
+--- a/kernel/sched/core.c
++++ b/kernel/sched/core.c
+@@ -116,6 +116,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_
+ EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp);
+
+ DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
++DEFINE_PER_CPU(struct rnd_state, sched_rnd_state);
+
+ #ifdef CONFIG_SCHED_DEBUG
+ /*
+@@ -9872,6 +9873,8 @@ void __init sched_init_smp(void)
+ {
+ sched_init_numa(NUMA_NO_NODE);
+
++ prandom_init_once(&sched_rnd_state);
++
+ /*
+ * There's no userspace yet to cause hotplug operations; hence all the
+ * CPU masks are stable and all blatant races in the below code cannot
+--- a/kernel/sched/fair.c
++++ b/kernel/sched/fair.c
+@@ -11716,11 +11716,27 @@ void update_max_interval(void)
+ max_load_balance_interval = HZ*num_online_cpus()/10;
+ }
+
+-static inline bool update_newidle_cost(struct sched_domain *sd, u64 cost)
++static inline void update_newidle_stats(struct sched_domain *sd, unsigned int success)
++{
++ sd->newidle_call++;
++ sd->newidle_success += success;
++
++ if (sd->newidle_call >= 1024) {
++ sd->newidle_ratio = sd->newidle_success;
++ sd->newidle_call /= 2;
++ sd->newidle_success /= 2;
++ }
++}
++
++static inline bool
++update_newidle_cost(struct sched_domain *sd, u64 cost, unsigned int success)
+ {
+ unsigned long next_decay = sd->last_decay_max_lb_cost + HZ;
+ unsigned long now = jiffies;
+
++ if (cost)
++ update_newidle_stats(sd, success);
++
+ if (cost > sd->max_newidle_lb_cost) {
+ /*
+ * Track max cost of a domain to make sure to not delay the
+@@ -11768,7 +11784,7 @@ static void rebalance_domains(struct rq
+ * Decay the newidle max times here because this is a regular
+ * visit to all the domains.
+ */
+- need_decay = update_newidle_cost(sd, 0);
++ need_decay = update_newidle_cost(sd, 0, 0);
+ max_cost += sd->max_newidle_lb_cost;
+
+ /*
+@@ -12406,6 +12422,22 @@ static int sched_balance_newidle(struct
+ break;
+
+ if (sd->flags & SD_BALANCE_NEWIDLE) {
++ unsigned int weight = 1;
++
++ if (sched_feat(NI_RANDOM)) {
++ /*
++ * Throw a 1k sided dice; and only run
++ * newidle_balance according to the success
++ * rate.
++ */
++ u32 d1k = sched_rng() % 1024;
++ weight = 1 + sd->newidle_ratio;
++ if (d1k > weight) {
++ update_newidle_stats(sd, 0);
++ continue;
++ }
++ weight = (1024 + weight/2) / weight;
++ }
+
+ pulled_task = load_balance(this_cpu, this_rq,
+ sd, CPU_NEWLY_IDLE,
+@@ -12413,10 +12445,14 @@ static int sched_balance_newidle(struct
+
+ t1 = sched_clock_cpu(this_cpu);
+ domain_cost = t1 - t0;
+- update_newidle_cost(sd, domain_cost);
+-
+ curr_cost += domain_cost;
+ t0 = t1;
++
++ /*
++ * Track max cost of a domain to make sure to not delay the
++ * next wakeup on the CPU.
++ */
++ update_newidle_cost(sd, domain_cost, weight * !!pulled_task);
+ }
+
+ /*
+--- a/kernel/sched/features.h
++++ b/kernel/sched/features.h
+@@ -88,4 +88,9 @@ SCHED_FEAT(UTIL_EST_FASTUP, true)
+
+ SCHED_FEAT(LATENCY_WARN, false)
+
++/*
++ * Do newidle balancing proportional to its success rate using randomization.
++ */
++SCHED_FEAT(NI_RANDOM, true)
++
+ SCHED_FEAT(HZ_BW, true)
+--- a/kernel/sched/sched.h
++++ b/kernel/sched/sched.h
+@@ -5,6 +5,7 @@
+ #ifndef _KERNEL_SCHED_SCHED_H
+ #define _KERNEL_SCHED_SCHED_H
+
++#include <linux/prandom.h>
+ #include <linux/sched/affinity.h>
+ #include <linux/sched/autogroup.h>
+ #include <linux/sched/cpufreq.h>
+@@ -1205,6 +1206,12 @@ static inline bool is_migration_disabled
+ }
+
+ DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
++DECLARE_PER_CPU(struct rnd_state, sched_rnd_state);
++
++static inline u32 sched_rng(void)
++{
++ return prandom_u32_state(this_cpu_ptr(&sched_rnd_state));
++}
+
+ #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
+ #define this_rq() this_cpu_ptr(&runqueues)
+--- a/kernel/sched/topology.c
++++ b/kernel/sched/topology.c
+@@ -1600,6 +1600,12 @@ sd_init(struct sched_domain_topology_lev
+
+ .last_balance = jiffies,
+ .balance_interval = sd_weight,
++
++ /* 50% success rate */
++ .newidle_call = 512,
++ .newidle_success = 256,
++ .newidle_ratio = 512,
++
+ .max_newidle_lb_cost = 0,
+ .last_decay_max_lb_cost = jiffies,
+ .child = child,
--- /dev/null
+From stable+bounces-198199-greg=kroah.com@vger.kernel.org Wed Dec 3 12:40:53 2025
+From: Ajay Kaher <ajay.kaher@broadcom.com>
+Date: Wed, 3 Dec 2025 11:22:53 +0000
+Subject: sched/fair: Small cleanup to sched_balance_newidle()
+To: stable@vger.kernel.org, gregkh@linuxfoundation.org
+Cc: mingo@redhat.com, peterz@infradead.org, juri.lelli@redhat.com, vincent.guittot@linaro.org, dietmar.eggemann@arm.com, rostedt@goodmis.org, bsegall@google.com, mgorman@suse.de, vschneid@redhat.com, linux-kernel@vger.kernel.org, ajay.kaher@broadcom.com, alexey.makhalov@broadcom.com, yin.ding@broadcom.com, tapas.kundu@broadcom.com, Chris Mason <clm@meta.com>
+Message-ID: <20251203112255.1738272-3-ajay.kaher@broadcom.com>
+
+From: Peter Zijlstra <peterz@infradead.org>
+
+commit e78e70dbf603c1425f15f32b455ca148c932f6c1 upstream.
+
+Pull out the !sd check to simplify code.
+
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Reviewed-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
+Tested-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
+Tested-by: Chris Mason <clm@meta.com>
+Link: https://patch.msgid.link/20251107161739.525916173@infradead.org
+[ Ajay: Modified to apply on v6.6 ]
+Signed-off-by: Ajay Kaher <ajay.kaher@broadcom.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/sched/fair.c | 11 ++++++-----
+ 1 file changed, 6 insertions(+), 5 deletions(-)
+
+--- a/kernel/sched/fair.c
++++ b/kernel/sched/fair.c
+@@ -12374,14 +12374,15 @@ static int sched_balance_newidle(struct
+
+ rcu_read_lock();
+ sd = rcu_dereference_check_sched_domain(this_rq->sd);
++ if (!sd) {
++ rcu_read_unlock();
++ goto out;
++ }
+
+ if (!READ_ONCE(this_rq->rd->overload) ||
+- (sd && this_rq->avg_idle < sd->max_newidle_lb_cost)) {
+-
+- if (sd)
+- update_next_balance(sd, &next_balance);
++ this_rq->avg_idle < sd->max_newidle_lb_cost) {
++ update_next_balance(sd, &next_balance);
+ rcu_read_unlock();
+-
+ goto out;
+ }
+ rcu_read_unlock();
--- /dev/null
+From stable+bounces-198200-greg=kroah.com@vger.kernel.org Wed Dec 3 12:40:49 2025
+From: Ajay Kaher <ajay.kaher@broadcom.com>
+Date: Wed, 3 Dec 2025 11:22:54 +0000
+Subject: sched/fair: Small cleanup to update_newidle_cost()
+To: stable@vger.kernel.org, gregkh@linuxfoundation.org
+Cc: mingo@redhat.com, peterz@infradead.org, juri.lelli@redhat.com, vincent.guittot@linaro.org, dietmar.eggemann@arm.com, rostedt@goodmis.org, bsegall@google.com, mgorman@suse.de, vschneid@redhat.com, linux-kernel@vger.kernel.org, ajay.kaher@broadcom.com, alexey.makhalov@broadcom.com, yin.ding@broadcom.com, tapas.kundu@broadcom.com, Chris Mason <clm@meta.com>
+Message-ID: <20251203112255.1738272-4-ajay.kaher@broadcom.com>
+
+From: Peter Zijlstra <peterz@infradead.org>
+
+commit 08d473dd8718e4a4d698b1113a14a40ad64a909b upstream.
+
+Simplify code by adding a few variables.
+
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Reviewed-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
+Tested-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
+Tested-by: Chris Mason <clm@meta.com>
+Link: https://patch.msgid.link/20251107161739.655208666@infradead.org
+[ Ajay: Modified to apply on v6.6 ]
+Signed-off-by: Ajay Kaher <ajay.kaher@broadcom.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/sched/fair.c | 11 +++++++----
+ 1 file changed, 7 insertions(+), 4 deletions(-)
+
+--- a/kernel/sched/fair.c
++++ b/kernel/sched/fair.c
+@@ -11718,22 +11718,25 @@ void update_max_interval(void)
+
+ static inline bool update_newidle_cost(struct sched_domain *sd, u64 cost)
+ {
++ unsigned long next_decay = sd->last_decay_max_lb_cost + HZ;
++ unsigned long now = jiffies;
++
+ if (cost > sd->max_newidle_lb_cost) {
+ /*
+ * Track max cost of a domain to make sure to not delay the
+ * next wakeup on the CPU.
+ */
+ sd->max_newidle_lb_cost = cost;
+- sd->last_decay_max_lb_cost = jiffies;
+- } else if (time_after(jiffies, sd->last_decay_max_lb_cost + HZ)) {
++ sd->last_decay_max_lb_cost = now;
++
++ } else if (time_after(now, next_decay)) {
+ /*
+ * Decay the newidle max times by ~1% per second to ensure that
+ * it is not outdated and the current max cost is actually
+ * shorter.
+ */
+ sd->max_newidle_lb_cost = (sd->max_newidle_lb_cost * 253) / 256;
+- sd->last_decay_max_lb_cost = jiffies;
+-
++ sd->last_decay_max_lb_cost = now;
+ return true;
+ }
+
media-amphion-make-some-vpu_v4l2-functions-static.patch
media-amphion-remove-vpu_vb_is_codecconfig.patch
media-mediatek-vcodec-use-spinlock-for-context-list-protection-lock.patch
+kvm-svm-introduce-svm_recalc_lbr_msr_intercepts.patch
+kvm-nsvm-always-recalculate-lbr-msr-intercepts-in-svm_update_lbrv.patch
+kvm-nsvm-fix-and-simplify-lbr-virtualization-handling-with-nested.patch
+kvm-svm-fix-redundant-updates-of-lbr-msr-intercepts.patch
+mm-damon-tests-vaddr-kunit-handle-alloc-failures-in-damon_test_split_evenly_fail.patch
+mm-damon-tests-vaddr-kunit-handle-alloc-failures-on-damon_do_test_apply_three_regions.patch
+rdma-core-fix-kasan-slab-use-after-free-read-in-ib_register_device-problem.patch
+sched-fair-small-cleanup-to-sched_balance_newidle.patch
+sched-fair-small-cleanup-to-update_newidle_cost.patch
+sched-fair-proportional-newidle-balance.patch
+net-remove-conditional-threaded-napi-wakeup-based-on-task-state.patch
+net-allow-to-use-smp-threads-for-backlog-napi.patch
+rdma-rxe-remove-the-direct-link-to-net_device.patch
+rdma-rxe-fix-the-failure-of-ibv_query_device-and-ibv_query_device_ex-tests.patch
+mm-damon-tests-vaddr-kunit-handle-alloc-failures-on-damon_test_split_evenly_succ.patch
+mm-damon-tests-core-kunit-handle-alloc-failres-in-damon_test_new_filter.patch
+mm-damon-tests-core-kunit-handle-allocation-failures-in-damon_test_regions.patch
+mm-damon-tests-core-kunit-handle-alloc-failures-on-damon_test_split_at.patch
+mm-damon-tests-core-kunit-handle-alloc-failures-on-dasmon_test_merge_regions_of.patch
+mm-damon-tests-core-kunit-handle-alloc-failures-on-damon_test_merge_two.patch