git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
6.6-stable patches
author Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Thu, 8 Jan 2026 12:38:40 +0000 (13:38 +0100)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Thu, 8 Jan 2026 12:38:40 +0000 (13:38 +0100)
added patches:
kvm-nsvm-always-recalculate-lbr-msr-intercepts-in-svm_update_lbrv.patch
kvm-nsvm-fix-and-simplify-lbr-virtualization-handling-with-nested.patch
kvm-svm-fix-redundant-updates-of-lbr-msr-intercepts.patch
kvm-svm-introduce-svm_recalc_lbr_msr_intercepts.patch
mm-damon-tests-core-kunit-handle-alloc-failres-in-damon_test_new_filter.patch
mm-damon-tests-core-kunit-handle-alloc-failures-on-damon_test_merge_two.patch
mm-damon-tests-core-kunit-handle-alloc-failures-on-damon_test_split_at.patch
mm-damon-tests-core-kunit-handle-alloc-failures-on-dasmon_test_merge_regions_of.patch
mm-damon-tests-core-kunit-handle-allocation-failures-in-damon_test_regions.patch
mm-damon-tests-vaddr-kunit-handle-alloc-failures-in-damon_test_split_evenly_fail.patch
mm-damon-tests-vaddr-kunit-handle-alloc-failures-on-damon_do_test_apply_three_regions.patch
mm-damon-tests-vaddr-kunit-handle-alloc-failures-on-damon_test_split_evenly_succ.patch
net-allow-to-use-smp-threads-for-backlog-napi.patch
net-remove-conditional-threaded-napi-wakeup-based-on-task-state.patch
rdma-core-fix-kasan-slab-use-after-free-read-in-ib_register_device-problem.patch
rdma-rxe-fix-the-failure-of-ibv_query_device-and-ibv_query_device_ex-tests.patch
rdma-rxe-remove-the-direct-link-to-net_device.patch
sched-fair-proportional-newidle-balance.patch
sched-fair-small-cleanup-to-sched_balance_newidle.patch
sched-fair-small-cleanup-to-update_newidle_cost.patch

21 files changed:
queue-6.6/kvm-nsvm-always-recalculate-lbr-msr-intercepts-in-svm_update_lbrv.patch [new file with mode: 0644]
queue-6.6/kvm-nsvm-fix-and-simplify-lbr-virtualization-handling-with-nested.patch [new file with mode: 0644]
queue-6.6/kvm-svm-fix-redundant-updates-of-lbr-msr-intercepts.patch [new file with mode: 0644]
queue-6.6/kvm-svm-introduce-svm_recalc_lbr_msr_intercepts.patch [new file with mode: 0644]
queue-6.6/mm-damon-tests-core-kunit-handle-alloc-failres-in-damon_test_new_filter.patch [new file with mode: 0644]
queue-6.6/mm-damon-tests-core-kunit-handle-alloc-failures-on-damon_test_merge_two.patch [new file with mode: 0644]
queue-6.6/mm-damon-tests-core-kunit-handle-alloc-failures-on-damon_test_split_at.patch [new file with mode: 0644]
queue-6.6/mm-damon-tests-core-kunit-handle-alloc-failures-on-dasmon_test_merge_regions_of.patch [new file with mode: 0644]
queue-6.6/mm-damon-tests-core-kunit-handle-allocation-failures-in-damon_test_regions.patch [new file with mode: 0644]
queue-6.6/mm-damon-tests-vaddr-kunit-handle-alloc-failures-in-damon_test_split_evenly_fail.patch [new file with mode: 0644]
queue-6.6/mm-damon-tests-vaddr-kunit-handle-alloc-failures-on-damon_do_test_apply_three_regions.patch [new file with mode: 0644]
queue-6.6/mm-damon-tests-vaddr-kunit-handle-alloc-failures-on-damon_test_split_evenly_succ.patch [new file with mode: 0644]
queue-6.6/net-allow-to-use-smp-threads-for-backlog-napi.patch [new file with mode: 0644]
queue-6.6/net-remove-conditional-threaded-napi-wakeup-based-on-task-state.patch [new file with mode: 0644]
queue-6.6/rdma-core-fix-kasan-slab-use-after-free-read-in-ib_register_device-problem.patch [new file with mode: 0644]
queue-6.6/rdma-rxe-fix-the-failure-of-ibv_query_device-and-ibv_query_device_ex-tests.patch [new file with mode: 0644]
queue-6.6/rdma-rxe-remove-the-direct-link-to-net_device.patch [new file with mode: 0644]
queue-6.6/sched-fair-proportional-newidle-balance.patch [new file with mode: 0644]
queue-6.6/sched-fair-small-cleanup-to-sched_balance_newidle.patch [new file with mode: 0644]
queue-6.6/sched-fair-small-cleanup-to-update_newidle_cost.patch [new file with mode: 0644]
queue-6.6/series

diff --git a/queue-6.6/kvm-nsvm-always-recalculate-lbr-msr-intercepts-in-svm_update_lbrv.patch b/queue-6.6/kvm-nsvm-always-recalculate-lbr-msr-intercepts-in-svm_update_lbrv.patch
new file mode 100644 (file)
index 0000000..27366f8
--- /dev/null
@@ -0,0 +1,100 @@
+From fbe5e5f030c22ae717ee422aaab0e00ea84fab5e Mon Sep 17 00:00:00 2001
+From: Yosry Ahmed <yosry.ahmed@linux.dev>
+Date: Sat, 8 Nov 2025 00:45:20 +0000
+Subject: KVM: nSVM: Always recalculate LBR MSR intercepts in svm_update_lbrv()
+
+From: Yosry Ahmed <yosry.ahmed@linux.dev>
+
+commit fbe5e5f030c22ae717ee422aaab0e00ea84fab5e upstream.
+
+svm_update_lbrv() is called when MSR_IA32_DEBUGCTLMSR is updated, and on
+nested transitions where LBRV is used. It checks whether LBRV enablement
+needs to be changed in the current VMCB, and if it does, it also
+recalculates intercepts to LBR MSRs.
+
+However, there are cases where intercepts need to be updated even when
+LBRV enablement doesn't need to change. Example scenario:
+- L1 has MSR_IA32_DEBUGCTLMSR cleared.
+- L1 runs L2 without LBR_CTL_ENABLE (no LBRV).
+- L2 sets DEBUGCTLMSR_LBR in MSR_IA32_DEBUGCTLMSR, svm_update_lbrv()
+  sets LBR_CTL_ENABLE in VMCB02 and disables intercepts to LBR MSRs.
+- L2 exits to L1, svm_update_lbrv() is not called on this transition.
+- L1 clears MSR_IA32_DEBUGCTLMSR, svm_update_lbrv() finds that
+  LBR_CTL_ENABLE is already cleared in VMCB01 and does nothing.
+- Intercepts remain disabled, so L1 reads of the LBR MSRs read the host MSRs.
+
+Fix it by always recalculating intercepts in svm_update_lbrv().
+
+Fixes: 1d5a1b5860ed ("KVM: x86: nSVM: correctly virtualize LBR msrs when L2 is running")
+Cc: stable@vger.kernel.org
+Signed-off-by: Yosry Ahmed <yosry.ahmed@linux.dev>
+Link: https://patch.msgid.link/20251108004524.1600006-3-yosry.ahmed@linux.dev
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Yosry Ahmed <yosry.ahmed@linux.dev>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/svm/svm.c |   29 +++++++++++++++++++----------
+ 1 file changed, 19 insertions(+), 10 deletions(-)
+
+--- a/arch/x86/kvm/svm/svm.c
++++ b/arch/x86/kvm/svm/svm.c
+@@ -1031,26 +1031,30 @@ static void svm_recalc_lbr_msr_intercept
+                                    !intercept, !intercept);
+ }
+-void svm_enable_lbrv(struct kvm_vcpu *vcpu)
++static void __svm_enable_lbrv(struct kvm_vcpu *vcpu)
+ {
+       struct vcpu_svm *svm = to_svm(vcpu);
+       svm->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK;
+-      svm_recalc_lbr_msr_intercepts(vcpu);
+       /* Move the LBR msrs to the vmcb02 so that the guest can see them. */
+       if (is_guest_mode(vcpu))
+               svm_copy_lbrs(svm->vmcb, svm->vmcb01.ptr);
+ }
+-static void svm_disable_lbrv(struct kvm_vcpu *vcpu)
++void svm_enable_lbrv(struct kvm_vcpu *vcpu)
++{
++      __svm_enable_lbrv(vcpu);
++      svm_recalc_lbr_msr_intercepts(vcpu);
++}
++
++static void __svm_disable_lbrv(struct kvm_vcpu *vcpu)
+ {
+       struct vcpu_svm *svm = to_svm(vcpu);
+       KVM_BUG_ON(sev_es_guest(vcpu->kvm), vcpu->kvm);
+       svm->vmcb->control.virt_ext &= ~LBR_CTL_ENABLE_MASK;
+-      svm_recalc_lbr_msr_intercepts(vcpu);
+       /*
+        * Move the LBR msrs back to the vmcb01 to avoid copying them
+@@ -1079,13 +1083,18 @@ void svm_update_lbrv(struct kvm_vcpu *vc
+                           (is_guest_mode(vcpu) && guest_can_use(vcpu, X86_FEATURE_LBRV) &&
+                           (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK));
+-      if (enable_lbrv == current_enable_lbrv)
+-              return;
++      if (enable_lbrv && !current_enable_lbrv)
++              __svm_enable_lbrv(vcpu);
++      else if (!enable_lbrv && current_enable_lbrv)
++              __svm_disable_lbrv(vcpu);
+-      if (enable_lbrv)
+-              svm_enable_lbrv(vcpu);
+-      else
+-              svm_disable_lbrv(vcpu);
++      /*
++       * During nested transitions, it is possible that the current VMCB has
++       * LBR_CTL set, but the previous LBR_CTL had it cleared (or vice versa).
++       * In this case, even though LBR_CTL does not need an update, intercepts
++       * do, so always recalculate the intercepts here.
++       */
++      svm_recalc_lbr_msr_intercepts(vcpu);
+ }
+ void disable_nmi_singlestep(struct vcpu_svm *svm)
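
The scenario above reduces to a small state machine: the intercept state can go
stale whenever the LBR_CTL enable bit does not change across an update.  Below
is a stand-alone C model of the corrected logic, illustrative only; every name
in it is invented, and the real kernel change is the diff above.

#include <stdbool.h>
#include <stdio.h>

struct vmcb_model { bool lbr_ctl_enabled; };

struct vcpu_model {
        struct vmcb_model *current_vmcb;        /* stands in for vmcb01/vmcb02 */
        bool lbr_msrs_intercepted;
};

/* Intercept the LBR MSRs exactly when LBR virtualization is off. */
static void recalc_lbr_intercepts(struct vcpu_model *v)
{
        v->lbr_msrs_intercepted = !v->current_vmcb->lbr_ctl_enabled;
}

static void update_lbrv(struct vcpu_model *v, bool want_lbrv)
{
        if (want_lbrv != v->current_vmcb->lbr_ctl_enabled)
                v->current_vmcb->lbr_ctl_enabled = want_lbrv;
        /* The fix: recalculate even when the enable bit did not change. */
        recalc_lbr_intercepts(v);
}

int main(void)
{
        /* Stale state after the nested exit in the scenario above: LBR_CTL is
         * already clear in vmcb01, yet the intercepts are still disabled. */
        struct vmcb_model vmcb01 = { .lbr_ctl_enabled = false };
        struct vcpu_model v = { .current_vmcb = &vmcb01,
                                .lbr_msrs_intercepted = false };

        update_lbrv(&v, false); /* L1 clears DEBUGCTL: enable bit unchanged... */
        printf("intercepted=%d\n", v.lbr_msrs_intercepted); /* ...prints 1 now */
        return 0;
}
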
diff --git a/queue-6.6/kvm-nsvm-fix-and-simplify-lbr-virtualization-handling-with-nested.patch b/queue-6.6/kvm-nsvm-fix-and-simplify-lbr-virtualization-handling-with-nested.patch
new file mode 100644 (file)
index 0000000..c050705
--- /dev/null
@@ -0,0 +1,193 @@
+From 8a4821412cf2c1429fffa07c012dd150f2edf78c Mon Sep 17 00:00:00 2001
+From: Yosry Ahmed <yosry.ahmed@linux.dev>
+Date: Sat, 8 Nov 2025 00:45:21 +0000
+Subject: KVM: nSVM: Fix and simplify LBR virtualization handling with nested
+
+From: Yosry Ahmed <yosry.ahmed@linux.dev>
+
+commit 8a4821412cf2c1429fffa07c012dd150f2edf78c upstream.
+
+The current scheme for handling LBRV when nested is used is very
+complicated, especially when L1 does not enable LBRV (i.e. does not set
+LBR_CTL_ENABLE_MASK).
+
+To avoid copying LBRs between VMCB01 and VMCB02 on every nested
+transition, the current implementation switches between using VMCB01 or
+VMCB02 as the source of truth for the LBRs while L2 is running. If L2
+enables LBR, VMCB02 is used as the source of truth. When L2 disables
+LBR, the LBRs are copied to VMCB01 and VMCB01 is used as the source of
+truth. This introduces significant complexity, and incorrect behavior in
+some cases.
+
+For example, on a nested #VMEXIT, the LBRs are only copied from VMCB02
+to VMCB01 if LBRV is enabled in VMCB01. This is because L2's writes to
+MSR_IA32_DEBUGCTLMSR to enable LBR are intercepted and propagated to
+VMCB01 instead of VMCB02. However, LBRV is only enabled in VMCB02 when
+L2 is running.
+
+This means that if L2 enables LBR and exits to L1, the LBRs will not be
+propagated from VMCB02 to VMCB01, because LBRV is disabled in VMCB01.
+
+There is no meaningful difference in CPUID rate in L2 when copying LBRs
+on every nested transition vs. the current approach, so do the simple
+and correct thing and always copy LBRs between VMCB01 and VMCB02 on
+nested transitions (when LBRV is disabled by L1). Drop the conditional
+LBRs copying in __svm_{enable/disable}_lbrv() as it is now unnecessary.
+
+VMCB02 becomes the only source of truth for LBRs when L2 is running,
+regardless of LBRV being enabled by L1, so drop svm_get_lbr_vmcb() and use
+svm->vmcb directly in its place.
+
+Fixes: 1d5a1b5860ed ("KVM: x86: nSVM: correctly virtualize LBR msrs when L2 is running")
+Cc: stable@vger.kernel.org
+Signed-off-by: Yosry Ahmed <yosry.ahmed@linux.dev>
+Link: https://patch.msgid.link/20251108004524.1600006-4-yosry.ahmed@linux.dev
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Yosry Ahmed <yosry.ahmed@linux.dev>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/svm/nested.c |   20 ++++++-------------
+ arch/x86/kvm/svm/svm.c    |   47 +++++++++-------------------------------------
+ 2 files changed, 17 insertions(+), 50 deletions(-)
+
+--- a/arch/x86/kvm/svm/nested.c
++++ b/arch/x86/kvm/svm/nested.c
+@@ -601,11 +601,10 @@ static void nested_vmcb02_prepare_save(s
+                */
+               svm_copy_lbrs(vmcb02, vmcb12);
+               vmcb02->save.dbgctl &= ~DEBUGCTL_RESERVED_BITS;
+-              svm_update_lbrv(&svm->vcpu);
+-
+-      } else if (unlikely(vmcb01->control.virt_ext & LBR_CTL_ENABLE_MASK)) {
++      } else {
+               svm_copy_lbrs(vmcb02, vmcb01);
+       }
++      svm_update_lbrv(&svm->vcpu);
+ }
+ static inline bool is_evtinj_soft(u32 evtinj)
+@@ -731,11 +730,7 @@ static void nested_vmcb02_prepare_contro
+                       svm->soft_int_next_rip = vmcb12_rip;
+       }
+-      vmcb02->control.virt_ext            = vmcb01->control.virt_ext &
+-                                            LBR_CTL_ENABLE_MASK;
+-      if (guest_can_use(vcpu, X86_FEATURE_LBRV))
+-              vmcb02->control.virt_ext  |=
+-                      (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK);
++      /* LBR_CTL_ENABLE_MASK is controlled by svm_update_lbrv() */
+       if (!nested_vmcb_needs_vls_intercept(svm))
+               vmcb02->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
+@@ -1066,13 +1061,12 @@ int nested_svm_vmexit(struct vcpu_svm *s
+               kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
+       if (unlikely(guest_can_use(vcpu, X86_FEATURE_LBRV) &&
+-                   (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))) {
++                   (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK)))
+               svm_copy_lbrs(vmcb12, vmcb02);
+-              svm_update_lbrv(vcpu);
+-      } else if (unlikely(vmcb01->control.virt_ext & LBR_CTL_ENABLE_MASK)) {
++      else
+               svm_copy_lbrs(vmcb01, vmcb02);
+-              svm_update_lbrv(vcpu);
+-      }
++
++      svm_update_lbrv(vcpu);
+       if (vnmi) {
+               if (vmcb02->control.int_ctl & V_NMI_BLOCKING_MASK)
+--- a/arch/x86/kvm/svm/svm.c
++++ b/arch/x86/kvm/svm/svm.c
+@@ -1033,13 +1033,7 @@ static void svm_recalc_lbr_msr_intercept
+ static void __svm_enable_lbrv(struct kvm_vcpu *vcpu)
+ {
+-      struct vcpu_svm *svm = to_svm(vcpu);
+-
+-      svm->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK;
+-
+-      /* Move the LBR msrs to the vmcb02 so that the guest can see them. */
+-      if (is_guest_mode(vcpu))
+-              svm_copy_lbrs(svm->vmcb, svm->vmcb01.ptr);
++      to_svm(vcpu)->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK;
+ }
+ void svm_enable_lbrv(struct kvm_vcpu *vcpu)
+@@ -1050,36 +1044,15 @@ void svm_enable_lbrv(struct kvm_vcpu *vc
+ static void __svm_disable_lbrv(struct kvm_vcpu *vcpu)
+ {
+-      struct vcpu_svm *svm = to_svm(vcpu);
+-
+       KVM_BUG_ON(sev_es_guest(vcpu->kvm), vcpu->kvm);
+-
+-      svm->vmcb->control.virt_ext &= ~LBR_CTL_ENABLE_MASK;
+-
+-      /*
+-       * Move the LBR msrs back to the vmcb01 to avoid copying them
+-       * on nested guest entries.
+-       */
+-      if (is_guest_mode(vcpu))
+-              svm_copy_lbrs(svm->vmcb01.ptr, svm->vmcb);
+-}
+-
+-static struct vmcb *svm_get_lbr_vmcb(struct vcpu_svm *svm)
+-{
+-      /*
+-       * If LBR virtualization is disabled, the LBR MSRs are always kept in
+-       * vmcb01.  If LBR virtualization is enabled and L1 is running VMs of
+-       * its own, the MSRs are moved between vmcb01 and vmcb02 as needed.
+-       */
+-      return svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK ? svm->vmcb :
+-                                                                 svm->vmcb01.ptr;
++      to_svm(vcpu)->vmcb->control.virt_ext &= ~LBR_CTL_ENABLE_MASK;
+ }
+ void svm_update_lbrv(struct kvm_vcpu *vcpu)
+ {
+       struct vcpu_svm *svm = to_svm(vcpu);
+       bool current_enable_lbrv = svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK;
+-      bool enable_lbrv = (svm_get_lbr_vmcb(svm)->save.dbgctl & DEBUGCTLMSR_LBR) ||
++      bool enable_lbrv = (svm->vmcb->save.dbgctl & DEBUGCTLMSR_LBR) ||
+                           (is_guest_mode(vcpu) && guest_can_use(vcpu, X86_FEATURE_LBRV) &&
+                           (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK));
+@@ -2925,19 +2898,19 @@ static int svm_get_msr(struct kvm_vcpu *
+               msr_info->data = svm->tsc_aux;
+               break;
+       case MSR_IA32_DEBUGCTLMSR:
+-              msr_info->data = svm_get_lbr_vmcb(svm)->save.dbgctl;
++              msr_info->data = svm->vmcb->save.dbgctl;
+               break;
+       case MSR_IA32_LASTBRANCHFROMIP:
+-              msr_info->data = svm_get_lbr_vmcb(svm)->save.br_from;
++              msr_info->data = svm->vmcb->save.br_from;
+               break;
+       case MSR_IA32_LASTBRANCHTOIP:
+-              msr_info->data = svm_get_lbr_vmcb(svm)->save.br_to;
++              msr_info->data = svm->vmcb->save.br_to;
+               break;
+       case MSR_IA32_LASTINTFROMIP:
+-              msr_info->data = svm_get_lbr_vmcb(svm)->save.last_excp_from;
++              msr_info->data = svm->vmcb->save.last_excp_from;
+               break;
+       case MSR_IA32_LASTINTTOIP:
+-              msr_info->data = svm_get_lbr_vmcb(svm)->save.last_excp_to;
++              msr_info->data = svm->vmcb->save.last_excp_to;
+               break;
+       case MSR_VM_HSAVE_PA:
+               msr_info->data = svm->nested.hsave_msr;
+@@ -3206,10 +3179,10 @@ static int svm_set_msr(struct kvm_vcpu *
+               if (data & DEBUGCTL_RESERVED_BITS)
+                       return 1;
+-              if (svm_get_lbr_vmcb(svm)->save.dbgctl == data)
++              if (svm->vmcb->save.dbgctl == data)
+                       break;
+-              svm_get_lbr_vmcb(svm)->save.dbgctl = data;
++              svm->vmcb->save.dbgctl = data;
+               vmcb_mark_dirty(svm->vmcb, VMCB_LBR);
+               svm_update_lbrv(vcpu);
+               break;
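
A compact sketch of the simplified rule this patch adopts, with hypothetical
structures standing in for the VMCBs: while L2 runs, vmcb02 is the sole source
of truth for the LBRs, and they are copied on every nested entry and exit, no
longer only when LBR_CTL happens to be set in vmcb01.  The real logic is in
nested_vmcb02_prepare_save() and nested_svm_vmexit() in the diff above.

struct lbrs { unsigned long from, to, excp_from, excp_to; };

void copy_lbrs(struct lbrs *dst, const struct lbrs *src)
{
        *dst = *src;
}

/* Nested VMRUN: seed vmcb02 from vmcb12 if L1 enables LBRV, else from vmcb01. */
void nested_entry(struct lbrs *vmcb02, const struct lbrs *vmcb01,
                  const struct lbrs *vmcb12, bool l1_lbrv)
{
        copy_lbrs(vmcb02, l1_lbrv ? vmcb12 : vmcb01);
}

/* Nested #VMEXIT: always propagate back, to vmcb12 or vmcb01 respectively. */
void nested_exit(const struct lbrs *vmcb02, struct lbrs *vmcb01,
                 struct lbrs *vmcb12, bool l1_lbrv)
{
        copy_lbrs(l1_lbrv ? vmcb12 : vmcb01, vmcb02);
}
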
diff --git a/queue-6.6/kvm-svm-fix-redundant-updates-of-lbr-msr-intercepts.patch b/queue-6.6/kvm-svm-fix-redundant-updates-of-lbr-msr-intercepts.patch
new file mode 100644 (file)
index 0000000..4d4a7cb
--- /dev/null
@@ -0,0 +1,95 @@
+From 3fa05f96fc08dff5e846c2cc283a249c1bf029a1 Mon Sep 17 00:00:00 2001
+From: Yosry Ahmed <yosry.ahmed@linux.dev>
+Date: Wed, 12 Nov 2025 01:30:17 +0000
+Subject: KVM: SVM: Fix redundant updates of LBR MSR intercepts
+
+From: Yosry Ahmed <yosry.ahmed@linux.dev>
+
+commit 3fa05f96fc08dff5e846c2cc283a249c1bf029a1 upstream.
+
+Don't update the LBR MSR intercept bitmaps if they're already up-to-date,
+as unconditionally updating the intercepts forces KVM to recalculate the
+MSR bitmaps for vmcb02 on every nested VMRUN.  The redundant updates are
+functionally okay; however, they neuter an optimization in Hyper-V
+nested virtualization enlightenments and this manifests as a self-test
+failure.
+
+In particular, Hyper-V lets L1 mark "nested enlightenments" as clean, i.e.
+tell KVM that no changes were made to the MSR bitmap since the last VMRUN.
+The hyperv_svm_test KVM selftest intentionally changes the MSR bitmap
+"without telling KVM about it" to verify that KVM honors the clean hint,
+and correctly fails because KVM notices the changed bitmap anyway:
+
+  ==== Test Assertion Failure ====
+  x86/hyperv_svm_test.c:120: vmcb->control.exit_code == 0x081
+  pid=193558 tid=193558 errno=4 - Interrupted system call
+     1 0x0000000000411361: assert_on_unhandled_exception at processor.c:659
+     2 0x0000000000406186: _vcpu_run at kvm_util.c:1699
+     3  (inlined by) vcpu_run at kvm_util.c:1710
+     4 0x0000000000401f2a: main at hyperv_svm_test.c:175
+     5 0x000000000041d0d3: __libc_start_call_main at libc-start.o:?
+     6 0x000000000041f27c: __libc_start_main_impl at ??:?
+     7 0x00000000004021a0: _start at ??:?
+  vmcb->control.exit_code == SVM_EXIT_VMMCALL
+
+Do *not* fix this by skipping svm_hv_vmcb_dirty_nested_enlightenments()
+when svm_set_intercept_for_msr() performs a no-op change.  Changes to
+the L0 MSR interception bitmap are only triggered by full CPUID updates
+and MSR filter updates, both of which should be rare.  Changing
+svm_set_intercept_for_msr() risks hiding unintended pessimizations
+like this one, and is actually more complex than this change.
+
+Fixes: fbe5e5f030c2 ("KVM: nSVM: Always recalculate LBR MSR intercepts in svm_update_lbrv()")
+Cc: stable@vger.kernel.org
+Signed-off-by: Yosry Ahmed <yosry.ahmed@linux.dev>
+Link: https://patch.msgid.link/20251112013017.1836863-1-yosry.ahmed@linux.dev
+[Rewritten commit message based on mailing list discussion. - Paolo]
+Reviewed-by: Sean Christopherson <seanjc@google.com>
+Tested-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Yosry Ahmed <yosry.ahmed@linux.dev>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/svm/svm.c |    6 ++++++
+ arch/x86/kvm/svm/svm.h |    1 +
+ 2 files changed, 7 insertions(+)
+
+--- a/arch/x86/kvm/svm/svm.c
++++ b/arch/x86/kvm/svm/svm.c
+@@ -1017,6 +1017,9 @@ static void svm_recalc_lbr_msr_intercept
+       struct vcpu_svm *svm = to_svm(vcpu);
+       bool intercept = !(svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK);
++      if (intercept == svm->lbr_msrs_intercepted)
++              return;
++
+       set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP,
+                            !intercept, !intercept);
+       set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP,
+@@ -1029,6 +1032,8 @@ static void svm_recalc_lbr_msr_intercept
+       if (sev_es_guest(vcpu->kvm))
+               set_msr_interception(vcpu, svm->msrpm, MSR_IA32_DEBUGCTLMSR,
+                                    !intercept, !intercept);
++
++      svm->lbr_msrs_intercepted = intercept;
+ }
+ static void __svm_enable_lbrv(struct kvm_vcpu *vcpu)
+@@ -1473,6 +1478,7 @@ static int svm_vcpu_create(struct kvm_vc
+       }
+       svm->x2avic_msrs_intercepted = true;
++      svm->lbr_msrs_intercepted = true;
+       svm->vmcb01.ptr = page_address(vmcb01_page);
+       svm->vmcb01.pa = __sme_set(page_to_pfn(vmcb01_page) << PAGE_SHIFT);
+--- a/arch/x86/kvm/svm/svm.h
++++ b/arch/x86/kvm/svm/svm.h
+@@ -288,6 +288,7 @@ struct vcpu_svm {
+       bool guest_state_loaded;
+       bool x2avic_msrs_intercepted;
++      bool lbr_msrs_intercepted;
+       /* Guest GIF value, used when vGIF is not enabled */
+       bool guest_gif;
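
The core of the fix is a one-bit cache of the last programmed interception
state, so a repeated recalculation becomes a no-op and the vmcb02 MSR bitmap is
not rebuilt on every nested VMRUN.  A minimal sketch with hypothetical names;
the real fields and helpers are in the diff above.

#include <stdbool.h>

struct vcpu_state {
        bool lbr_ctl_enabled;           /* LBR_CTL_ENABLE_MASK in the current VMCB */
        bool lbr_msrs_intercepted;      /* cached last-programmed interception state */
};

/* Stand-in for the set_msr_interception() calls plus MSR-bitmap dirtying. */
void program_lbr_msr_bitmap(struct vcpu_state *v, bool intercept)
{
        (void)v;
        (void)intercept;
}

void recalc_lbr_msr_intercepts(struct vcpu_state *v)
{
        bool intercept = !v->lbr_ctl_enabled;

        if (intercept == v->lbr_msrs_intercepted)
                return;         /* no-op: the Hyper-V "clean" hint stays intact */

        program_lbr_msr_bitmap(v, intercept);
        v->lbr_msrs_intercepted = intercept;
}
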
diff --git a/queue-6.6/kvm-svm-introduce-svm_recalc_lbr_msr_intercepts.patch b/queue-6.6/kvm-svm-introduce-svm_recalc_lbr_msr_intercepts.patch
new file mode 100644 (file)
index 0000000..e2005bd
--- /dev/null
@@ -0,0 +1,78 @@
+From yosry.ahmed@linux.dev  Thu Jan  8 13:20:08 2026
+From: Yosry Ahmed <yosry.ahmed@linux.dev>
+Date: Wed,  3 Dec 2025 18:42:17 +0000
+Subject: KVM: SVM: Introduce svm_recalc_lbr_msr_intercepts()
+To: stable@vger.kernel.org
+Cc: Paolo Bonzini <pbonzini@redhat.com>, Sean Christopherson <seanjc@google.com>, Yosry Ahmed <yosry.ahmed@linux.dev>
+Message-ID: <20251203184220.2693264-1-yosry.ahmed@linux.dev>
+
+From: Yosry Ahmed <yosry.ahmed@linux.dev>
+
+Introduce a helper updating the intercepts for LBR MSRs, similar to the
+one introduced upstream by commit 160f143cc131 ("KVM: SVM: Manually
+recalc all MSR intercepts on userspace MSR filter change"). The main
+difference is that this version uses set_msr_interception(), which has
+inverted polarity compared to svm_set_intercept_for_msr().
+
+This is intended to simplify incoming backports. No functional changes
+intended.
+
+Signed-off-by: Yosry Ahmed <yosry.ahmed@linux.dev>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/svm/svm.c |   32 +++++++++++++++++++++-----------
+ 1 file changed, 21 insertions(+), 11 deletions(-)
+
+--- a/arch/x86/kvm/svm/svm.c
++++ b/arch/x86/kvm/svm/svm.c
+@@ -1012,18 +1012,31 @@ void svm_copy_lbrs(struct vmcb *to_vmcb,
+       vmcb_mark_dirty(to_vmcb, VMCB_LBR);
+ }
+-void svm_enable_lbrv(struct kvm_vcpu *vcpu)
++static void svm_recalc_lbr_msr_intercepts(struct kvm_vcpu *vcpu)
+ {
+       struct vcpu_svm *svm = to_svm(vcpu);
++      bool intercept = !(svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK);
+-      svm->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK;
+-      set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1);
+-      set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1);
+-      set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 1, 1);
+-      set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 1, 1);
++      set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP,
++                           !intercept, !intercept);
++      set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP,
++                           !intercept, !intercept);
++      set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP,
++                           !intercept, !intercept);
++      set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP,
++                           !intercept, !intercept);
+       if (sev_es_guest(vcpu->kvm))
+-              set_msr_interception(vcpu, svm->msrpm, MSR_IA32_DEBUGCTLMSR, 1, 1);
++              set_msr_interception(vcpu, svm->msrpm, MSR_IA32_DEBUGCTLMSR,
++                                   !intercept, !intercept);
++}
++
++void svm_enable_lbrv(struct kvm_vcpu *vcpu)
++{
++      struct vcpu_svm *svm = to_svm(vcpu);
++
++      svm->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK;
++      svm_recalc_lbr_msr_intercepts(vcpu);
+       /* Move the LBR msrs to the vmcb02 so that the guest can see them. */
+       if (is_guest_mode(vcpu))
+@@ -1037,10 +1050,7 @@ static void svm_disable_lbrv(struct kvm_
+       KVM_BUG_ON(sev_es_guest(vcpu->kvm), vcpu->kvm);
+       svm->vmcb->control.virt_ext &= ~LBR_CTL_ENABLE_MASK;
+-      set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, 0, 0);
+-      set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0);
+-      set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 0, 0);
+-      set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 0, 0);
++      svm_recalc_lbr_msr_intercepts(vcpu);
+       /*
+        * Move the LBR msrs back to the vmcb01 to avoid copying them
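
The polarity note in the message above is the main thing to keep in mind when
reading this backport: set_msr_interception() takes "allow access" flags, so
the helper passes !intercept, while the upstream svm_set_intercept_for_msr()
takes the intercept flag directly.  A tiny stand-alone illustration with a
printf() stand-in and hypothetical names:

#include <stdbool.h>
#include <stdio.h>

/* Stand-in for set_msr_interception(): the flags mean "allow access". */
void set_allow(const char *msr, bool allow_rd, bool allow_wr)
{
        printf("%s: read=%s write=%s\n", msr,
               allow_rd ? "passthrough" : "intercept",
               allow_wr ? "passthrough" : "intercept");
}

int main(void)
{
        bool lbrv_enabled = false;              /* LBR_CTL_ENABLE_MASK clear */
        bool intercept = !lbrv_enabled;

        /* Inverted polarity: the "allow" arguments are the negation of "intercept". */
        set_allow("MSR_IA32_LASTBRANCHFROMIP", !intercept, !intercept);
        return 0;
}
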
diff --git a/queue-6.6/mm-damon-tests-core-kunit-handle-alloc-failres-in-damon_test_new_filter.patch b/queue-6.6/mm-damon-tests-core-kunit-handle-alloc-failres-in-damon_test_new_filter.patch
new file mode 100644 (file)
index 0000000..6f6bc40
--- /dev/null
@@ -0,0 +1,40 @@
+From 28ab2265e9422ccd81e4beafc0ace90f78de04c4 Mon Sep 17 00:00:00 2001
+From: SeongJae Park <sj@kernel.org>
+Date: Sat, 1 Nov 2025 11:20:07 -0700
+Subject: mm/damon/tests/core-kunit: handle alloc failres in damon_test_new_filter()
+
+From: SeongJae Park <sj@kernel.org>
+
+commit 28ab2265e9422ccd81e4beafc0ace90f78de04c4 upstream.
+
+damon_test_new_filter() is assuming all dynamic memory allocation in it
+will succeed.  Those are indeed likely in the real use cases since those
+allocations are too small to fail, but theoretically those could fail.  In
+that case, inappropriate memory access can happen.  Fix it by appropriately
+cleaning up pre-allocated memory and skipping the execution of the remaining
+tests in the failure cases.
+
+Link: https://lkml.kernel.org/r/20251101182021.74868-14-sj@kernel.org
+Fixes: 2a158e956b98 ("mm/damon/core-test: add a test for damos_new_filter()")
+Signed-off-by: SeongJae Park <sj@kernel.org>
+Cc: Brendan Higgins <brendan.higgins@linux.dev>
+Cc: David Gow <davidgow@google.com>
+Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
+Cc: <stable@vger.kernel.org>   [6.6+]
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/damon/core-test.h |    2 ++
+ 1 file changed, 2 insertions(+)
+
+--- a/mm/damon/core-test.h
++++ b/mm/damon/core-test.h
+@@ -346,6 +346,8 @@ static void damos_test_new_filter(struct
+       struct damos_filter *filter;
+       filter = damos_new_filter(DAMOS_FILTER_TYPE_ANON, true);
++      if (!filter)
++              kunit_skip(test, "filter alloc fail");
+       KUNIT_EXPECT_EQ(test, filter->type, DAMOS_FILTER_TYPE_ANON);
+       KUNIT_EXPECT_EQ(test, filter->matching, true);
+       KUNIT_EXPECT_PTR_EQ(test, filter->list.prev, &filter->list);
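
This and the remaining DAMON core/vaddr KUnit patches in this queue all apply
the same pattern: check every allocation, free whatever was already allocated,
and skip (rather than fail) the test when an allocation cannot be satisfied.
A generalized, kernel-side sketch of that pattern as a hypothetical
two-allocation test; kunit_skip() does not return, so cleanup has to happen
before it is called.

static void example_two_alloc_test(struct kunit *test)
{
        struct damon_target *t;
        struct damon_region *r;

        t = damon_new_target();
        if (!t)
                kunit_skip(test, "target alloc fail");

        r = damon_new_region(0, 100);
        if (!r) {
                /* release what is already held before aborting the test */
                damon_free_target(t);
                kunit_skip(test, "region alloc fail");
        }

        damon_add_region(r, t);
        /* ... KUNIT_EXPECT_*() assertions on t and r ... */
        damon_destroy_target(t);        /* frees the target and its regions */
}
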
diff --git a/queue-6.6/mm-damon-tests-core-kunit-handle-alloc-failures-on-damon_test_merge_two.patch b/queue-6.6/mm-damon-tests-core-kunit-handle-alloc-failures-on-damon_test_merge_two.patch
new file mode 100644 (file)
index 0000000..b99c1e9
--- /dev/null
@@ -0,0 +1,52 @@
+From 3d443dd29a1db7efa587a4bb0c06a497e13ca9e4 Mon Sep 17 00:00:00 2001
+From: SeongJae Park <sj@kernel.org>
+Date: Sat, 1 Nov 2025 11:20:00 -0700
+Subject: mm/damon/tests/core-kunit: handle alloc failures on damon_test_merge_two()
+
+From: SeongJae Park <sj@kernel.org>
+
+commit 3d443dd29a1db7efa587a4bb0c06a497e13ca9e4 upstream.
+
+damon_test_merge_two() is assuming all dynamic memory allocation in it
+will succeed.  Those are indeed likely in the real use cases since those
+allocations are too small to fail, but theoretically those could fail.  In
+that case, inappropriate memory access can happen.  Fix it by appropriately
+cleaning up pre-allocated memory and skipping the execution of the remaining
+tests in the failure cases.
+
+Link: https://lkml.kernel.org/r/20251101182021.74868-7-sj@kernel.org
+Fixes: 17ccae8bb5c9 ("mm/damon: add kunit tests")
+Signed-off-by: SeongJae Park <sj@kernel.org>
+Cc: Brendan Higgins <brendan.higgins@linux.dev>
+Cc: David Gow <davidgow@google.com>
+Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
+Cc: <stable@vger.kernel.org>   [5.15+]
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/damon/core-test.h |   10 ++++++++++
+ 1 file changed, 10 insertions(+)
+
+--- a/mm/damon/core-test.h
++++ b/mm/damon/core-test.h
+@@ -162,10 +162,20 @@ static void damon_test_merge_two(struct
+       int i;
+       t = damon_new_target();
++      if (!t)
++              kunit_skip(test, "target alloc fail");
+       r = damon_new_region(0, 100);
++      if (!r) {
++              damon_free_target(t);
++              kunit_skip(test, "region alloc fail");
++      }
+       r->nr_accesses = 10;
+       damon_add_region(r, t);
+       r2 = damon_new_region(100, 300);
++      if (!r2) {
++              damon_free_target(t);
++              kunit_skip(test, "second region alloc fail");
++      }
+       r2->nr_accesses = 20;
+       damon_add_region(r2, t);
diff --git a/queue-6.6/mm-damon-tests-core-kunit-handle-alloc-failures-on-damon_test_split_at.patch b/queue-6.6/mm-damon-tests-core-kunit-handle-alloc-failures-on-damon_test_split_at.patch
new file mode 100644 (file)
index 0000000..f9d8719
--- /dev/null
@@ -0,0 +1,51 @@
+From 5e80d73f22043c59c8ad36452a3253937ed77955 Mon Sep 17 00:00:00 2001
+From: SeongJae Park <sj@kernel.org>
+Date: Sat, 1 Nov 2025 11:19:59 -0700
+Subject: mm/damon/tests/core-kunit: handle alloc failures on damon_test_split_at()
+
+From: SeongJae Park <sj@kernel.org>
+
+commit 5e80d73f22043c59c8ad36452a3253937ed77955 upstream.
+
+damon_test_split_at() is assuming all dynamic memory allocation in it will
+succeed.  Those are indeed likely in the real use cases since those
+allocations are too small to fail, but theoretically those could fail.  In
+that case, inappropriate memory access can happen.  Fix it by appropriately
+cleaning up pre-allocated memory and skipping the execution of the remaining
+tests in the failure cases.
+
+Link: https://lkml.kernel.org/r/20251101182021.74868-6-sj@kernel.org
+Fixes: 17ccae8bb5c9 ("mm/damon: add kunit tests")
+Signed-off-by: SeongJae Park <sj@kernel.org>
+Cc: Brendan Higgins <brendan.higgins@linux.dev>
+Cc: David Gow <davidgow@google.com>
+Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
+Cc: <stable@vger.kernel.org>   [5.15+]
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/damon/core-test.h |   11 +++++++++++
+ 1 file changed, 11 insertions(+)
+
+--- a/mm/damon/core-test.h
++++ b/mm/damon/core-test.h
+@@ -129,8 +129,19 @@ static void damon_test_split_at(struct k
+       struct damon_target *t;
+       struct damon_region *r;
++      if (!c)
++              kunit_skip(test, "ctx alloc fail");
+       t = damon_new_target();
++      if (!t) {
++              damon_destroy_ctx(c);
++              kunit_skip(test, "target alloc fail");
++      }
+       r = damon_new_region(0, 100);
++      if (!r) {
++              damon_destroy_ctx(c);
++              damon_free_target(t);
++              kunit_skip(test, "region alloc fail");
++      }
+       damon_add_region(r, t);
+       damon_split_region_at(t, r, 25);
+       KUNIT_EXPECT_EQ(test, r->ar.start, 0ul);
diff --git a/queue-6.6/mm-damon-tests-core-kunit-handle-alloc-failures-on-dasmon_test_merge_regions_of.patch b/queue-6.6/mm-damon-tests-core-kunit-handle-alloc-failures-on-dasmon_test_merge_regions_of.patch
new file mode 100644 (file)
index 0000000..65733cf
--- /dev/null
@@ -0,0 +1,46 @@
+From 0998d2757218771c59d5ca59ccf13d1542a38f17 Mon Sep 17 00:00:00 2001
+From: SeongJae Park <sj@kernel.org>
+Date: Sat, 1 Nov 2025 11:20:01 -0700
+Subject: mm/damon/tests/core-kunit: handle alloc failures on dasmon_test_merge_regions_of()
+
+From: SeongJae Park <sj@kernel.org>
+
+commit 0998d2757218771c59d5ca59ccf13d1542a38f17 upstream.
+
+damon_test_merge_regions_of() is assuming all dynamic memory allocation in
+it will succeed.  Those are indeed likely in the real use cases since
+those allocations are too small to fail, but theoretically those could
+fail.  In that case, inappropriate memory access can happen.  Fix it by
+appropriately cleaning up pre-allocated memory and skipping the execution of
+the remaining tests in the failure cases.
+
+Link: https://lkml.kernel.org/r/20251101182021.74868-8-sj@kernel.org
+Fixes: 17ccae8bb5c9 ("mm/damon: add kunit tests")
+Signed-off-by: SeongJae Park <sj@kernel.org>
+Cc: Brendan Higgins <brendan.higgins@linux.dev>
+Cc: David Gow <davidgow@google.com>
+Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
+Cc: <stable@vger.kernel.org>   [5.15+]
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/damon/core-test.h |    6 ++++++
+ 1 file changed, 6 insertions(+)
+
+--- a/mm/damon/core-test.h
++++ b/mm/damon/core-test.h
+@@ -210,8 +210,14 @@ static void damon_test_merge_regions_of(
+       int i;
+       t = damon_new_target();
++      if (!t)
++              kunit_skip(test, "target alloc fail");
+       for (i = 0; i < ARRAY_SIZE(sa); i++) {
+               r = damon_new_region(sa[i], ea[i]);
++              if (!r) {
++                      damon_free_target(t);
++                      kunit_skip(test, "region alloc fail");
++              }
+               r->nr_accesses = nrs[i];
+               damon_add_region(r, t);
+       }
diff --git a/queue-6.6/mm-damon-tests-core-kunit-handle-allocation-failures-in-damon_test_regions.patch b/queue-6.6/mm-damon-tests-core-kunit-handle-allocation-failures-in-damon_test_regions.patch
new file mode 100644 (file)
index 0000000..f9bf68b
--- /dev/null
@@ -0,0 +1,49 @@
+From e16fdd4f754048d6e23c56bd8d920b71e41e3777 Mon Sep 17 00:00:00 2001
+From: SeongJae Park <sj@kernel.org>
+Date: Sat, 1 Nov 2025 11:19:56 -0700
+Subject: mm/damon/tests/core-kunit: handle allocation failures in damon_test_regions()
+
+From: SeongJae Park <sj@kernel.org>
+
+commit e16fdd4f754048d6e23c56bd8d920b71e41e3777 upstream.
+
+damon_test_regions() is assuming all dynamic memory allocation in it will
+succeed.  Those are indeed likely in the real use cases since those
+allocations are too small to fail, but theoretically those could fail.  In
+that case, inappropriate memory access can happen.  Fix it by appropriately
+cleaning up pre-allocated memory and skipping the execution of the remaining
+tests in the failure cases.
+
+Link: https://lkml.kernel.org/r/20251101182021.74868-3-sj@kernel.org
+Fixes: 17ccae8bb5c9 ("mm/damon: add kunit tests")
+Signed-off-by: SeongJae Park <sj@kernel.org>
+Cc: Brendan Higgins <brendan.higgins@linux.dev>
+Cc: David Gow <davidgow@google.com>
+Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
+Cc: <stable@vger.kernel.org>   [5.15+]
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/damon/core-test.h |    6 ++++++
+ 1 file changed, 6 insertions(+)
+
+--- a/mm/damon/core-test.h
++++ b/mm/damon/core-test.h
+@@ -20,11 +20,17 @@ static void damon_test_regions(struct ku
+       struct damon_target *t;
+       r = damon_new_region(1, 2);
++      if (!r)
++              kunit_skip(test, "region alloc fail");
+       KUNIT_EXPECT_EQ(test, 1ul, r->ar.start);
+       KUNIT_EXPECT_EQ(test, 2ul, r->ar.end);
+       KUNIT_EXPECT_EQ(test, 0u, r->nr_accesses);
+       t = damon_new_target();
++      if (!t) {
++              damon_free_region(r);
++              kunit_skip(test, "target alloc fail");
++      }
+       KUNIT_EXPECT_EQ(test, 0u, damon_nr_regions(t));
+       damon_add_region(r, t);
diff --git a/queue-6.6/mm-damon-tests-vaddr-kunit-handle-alloc-failures-in-damon_test_split_evenly_fail.patch b/queue-6.6/mm-damon-tests-vaddr-kunit-handle-alloc-failures-in-damon_test_split_evenly_fail.patch
new file mode 100644 (file)
index 0000000..856ebc3
--- /dev/null
@@ -0,0 +1,49 @@
+From 7890e5b5bb6e386155c6e755fe70e0cdcc77f18e Mon Sep 17 00:00:00 2001
+From: SeongJae Park <sj@kernel.org>
+Date: Sat, 1 Nov 2025 11:20:12 -0700
+Subject: mm/damon/tests/vaddr-kunit: handle alloc failures in damon_test_split_evenly_fail()
+
+From: SeongJae Park <sj@kernel.org>
+
+commit 7890e5b5bb6e386155c6e755fe70e0cdcc77f18e upstream.
+
+damon_test_split_evenly_fail() is assuming all dynamic memory allocation
+in it will succeed.  Those are indeed likely in the real use cases since
+those allocations are too small to fail, but theoretically those could
+fail.  In that case, inappropriate memory access can happen.  Fix it by
+appropriately cleaning up pre-allocated memory and skipping the execution of
+the remaining tests in the failure cases.
+
+Link: https://lkml.kernel.org/r/20251101182021.74868-19-sj@kernel.org
+Fixes: 17ccae8bb5c9 ("mm/damon: add kunit tests")
+Signed-off-by: SeongJae Park <sj@kernel.org>
+Cc: Brendan Higgins <brendan.higgins@linux.dev>
+Cc: David Gow <davidgow@google.com>
+Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
+Cc: <stable@vger.kernel.org>   [5.15+]
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/damon/vaddr-test.h |   11 ++++++++++-
+ 1 file changed, 10 insertions(+), 1 deletion(-)
+
+--- a/mm/damon/vaddr-test.h
++++ b/mm/damon/vaddr-test.h
+@@ -250,7 +250,16 @@ static void damon_test_split_evenly_fail
+               unsigned long start, unsigned long end, unsigned int nr_pieces)
+ {
+       struct damon_target *t = damon_new_target();
+-      struct damon_region *r = damon_new_region(start, end);
++      struct damon_region *r;
++
++      if (!t)
++              kunit_skip(test, "target alloc fail");
++
++      r = damon_new_region(start, end);
++      if (!r) {
++              damon_free_target(t);
++              kunit_skip(test, "region alloc fail");
++      }
+       damon_add_region(r, t);
+       KUNIT_EXPECT_EQ(test,
diff --git a/queue-6.6/mm-damon-tests-vaddr-kunit-handle-alloc-failures-on-damon_do_test_apply_three_regions.patch b/queue-6.6/mm-damon-tests-vaddr-kunit-handle-alloc-failures-on-damon_do_test_apply_three_regions.patch
new file mode 100644 (file)
index 0000000..a989848
--- /dev/null
@@ -0,0 +1,47 @@
+From 2b22d0fcc6320ba29b2122434c1d2f0785fb0a25 Mon Sep 17 00:00:00 2001
+From: SeongJae Park <sj@kernel.org>
+Date: Sat, 1 Nov 2025 11:20:11 -0700
+Subject: mm/damon/tests/vaddr-kunit: handle alloc failures on damon_do_test_apply_three_regions()
+
+From: SeongJae Park <sj@kernel.org>
+
+commit 2b22d0fcc6320ba29b2122434c1d2f0785fb0a25 upstream.
+
+damon_do_test_apply_three_regions() is assuming all dynamic memory
+allocation in it will succeed.  Those are indeed likely in the real use
+cases since those allocations are too small to fail, but theoretically
+those could fail.  In that case, inappropriate memory access can happen.
+Fix it by appropriately cleaning up pre-allocated memory and skipping the
+execution of the remaining tests in the failure cases.
+
+Link: https://lkml.kernel.org/r/20251101182021.74868-18-sj@kernel.org
+Fixes: 17ccae8bb5c9 ("mm/damon: add kunit tests")
+Signed-off-by: SeongJae Park <sj@kernel.org>
+Cc: Brendan Higgins <brendan.higgins@linux.dev>
+Cc: David Gow <davidgow@google.com>
+Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
+Cc: <stable@vger.kernel.org>   [5.15+]
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: SeongJae Park <sj@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/damon/vaddr-test.h |    6 ++++++
+ 1 file changed, 6 insertions(+)
+
+--- a/mm/damon/vaddr-test.h
++++ b/mm/damon/vaddr-test.h
+@@ -136,8 +136,14 @@ static void damon_do_test_apply_three_re
+       int i;
+       t = damon_new_target();
++      if (!t)
++              kunit_skip(test, "target alloc fail");
+       for (i = 0; i < nr_regions / 2; i++) {
+               r = damon_new_region(regions[i * 2], regions[i * 2 + 1]);
++              if (!r) {
++                      damon_destroy_target(t);
++                      kunit_skip(test, "region alloc fail");
++              }
+               damon_add_region(r, t);
+       }
diff --git a/queue-6.6/mm-damon-tests-vaddr-kunit-handle-alloc-failures-on-damon_test_split_evenly_succ.patch b/queue-6.6/mm-damon-tests-vaddr-kunit-handle-alloc-failures-on-damon_test_split_evenly_succ.patch
new file mode 100644 (file)
index 0000000..c3e32d0
--- /dev/null
@@ -0,0 +1,50 @@
+From 0a63a0e7570b9b2631dfb8d836dc572709dce39e Mon Sep 17 00:00:00 2001
+From: SeongJae Park <sj@kernel.org>
+Date: Sat, 1 Nov 2025 11:20:13 -0700
+Subject: mm/damon/tests/vaddr-kunit: handle alloc failures on damon_test_split_evenly_succ()
+
+From: SeongJae Park <sj@kernel.org>
+
+commit 0a63a0e7570b9b2631dfb8d836dc572709dce39e upstream.
+
+damon_test_split_evenly_succ() is assuming all dynamic memory allocation
+in it will succeed.  Those are indeed likely in the real use cases since
+those allocations are too small to fail, but theoretically those could
+fail.  In that case, inappropriate memory access can happen.  Fix it by
+appropriately cleaning up pre-allocated memory and skipping the execution of
+the remaining tests in the failure cases.
+
+Link: https://lkml.kernel.org/r/20251101182021.74868-20-sj@kernel.org
+Fixes: 17ccae8bb5c9 ("mm/damon: add kunit tests")
+Signed-off-by: SeongJae Park <sj@kernel.org>
+Cc: Brendan Higgins <brendan.higgins@linux.dev>
+Cc: David Gow <davidgow@google.com>
+Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
+Cc: <stable@vger.kernel.org>   [5.15+]
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/damon/vaddr-test.h |    9 ++++++++-
+ 1 file changed, 8 insertions(+), 1 deletion(-)
+
+--- a/mm/damon/vaddr-test.h
++++ b/mm/damon/vaddr-test.h
+@@ -284,10 +284,17 @@ static void damon_test_split_evenly_succ
+       unsigned long start, unsigned long end, unsigned int nr_pieces)
+ {
+       struct damon_target *t = damon_new_target();
+-      struct damon_region *r = damon_new_region(start, end);
++      struct damon_region *r;
+       unsigned long expected_width = (end - start) / nr_pieces;
+       unsigned long i = 0;
++      if (!t)
++              kunit_skip(test, "target alloc fail");
++      r = damon_new_region(start, end);
++      if (!r) {
++              damon_free_target(t);
++              kunit_skip(test, "region alloc fail");
++      }
+       damon_add_region(r, t);
+       KUNIT_EXPECT_EQ(test,
+                       damon_va_evenly_split_region(t, r, nr_pieces), 0);
diff --git a/queue-6.6/net-allow-to-use-smp-threads-for-backlog-napi.patch b/queue-6.6/net-allow-to-use-smp-threads-for-backlog-napi.patch
new file mode 100644 (file)
index 0000000..4893514
--- /dev/null
@@ -0,0 +1,339 @@
+From wen.yang@linux.dev Mon Dec 29 08:53:54 2025
+From: wen.yang@linux.dev
+Date: Mon, 29 Dec 2025 15:53:17 +0800
+Subject: net: Allow to use SMP threads for backlog NAPI.
+To: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+Cc: stable@vger.kernel.org, linux-kernel@vger.kernel.org, Sebastian Andrzej Siewior <bigeasy@linutronix.de>, Jakub Kicinski <kuba@kernel.org>, Paolo Abeni <pabeni@redhat.com>, Wen Yang <wen.yang@linux.dev>
+Message-ID: <013481655ddb09ae214bc510502efe6cf32b3445.1766987153.git.wen.yang@linux.dev>
+
+From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+
+commit dad6b97702639fba27a2bd3e986982ad6f0db3a7 upstream.
+
+Backlog NAPI is a per-CPU NAPI struct only (with no device behind it)
+used by drivers which don't do NAPI themselves, RPS and parts of the
+stack which need to avoid recursive deadlocks while processing a packet.
+
+The non-NAPI drivers use the CPU-local backlog NAPI. If RPS is enabled
+then a flow for the skb is computed and based on the flow the skb can be
+enqueued on a remote CPU. Scheduling/ raising the softirq (for backlog's
+NAPI) on the remote CPU isn't trivial because the softirq is only
+scheduled on the local CPU and performed after the hardirq is done.
+In order to schedule a softirq on the remote CPU, an IPI is sent to the
+remote CPU which schedules the backlog-NAPI on the then local CPU.
+
+On PREEMPT_RT interrupts are force-threaded. The soft interrupts are
+raised within the interrupt thread and processed after the interrupt
+handler completed still within the context of the interrupt thread. The
+softirq is handled in the context where it originated.
+
+With force-threaded interrupts enabled, ksoftirqd is woken up if a
+softirq is raised from hardirq context. This is the case if it is raised
+from an IPI. Additionally there is a warning on PREEMPT_RT if the
+softirq is raised from the idle thread.
+This was done for two reasons:
+- With threaded interrupts the processing should happen in thread
+  context (where it originated) and ksoftirqd is the only thread for
+  this context if raised from hardirq. Using the currently running task
+  instead would "punish" a random task.
+- Once ksoftirqd is active it consumes all further softirqs until it
+  stops running. This changed recently and is no longer the case.
+
+Instead of keeping the backlog NAPI in ksoftirqd (in force-threaded/
+PREEMPT_RT setups) I am proposing NAPI-threads for backlog.
+The "proper" setup with threaded-NAPI is not doable because the threads
+are not pinned to an individual CPU and can be modified by the user.
+Additionally a dummy network device would have to be assigned. Also
+CPU-hotplug has to be considered if additional CPUs show up.
+All this can be probably done/ solved but the smpboot-threads already
+provide this infrastructure.
+
+Sending UDP packets over loopback expects that the packet is processed
+within the call. Delaying it by handing it over to the thread hurts
+performance. It is not beneficial to the outcome if the context switch
+happens immediately after enqueue or after a while to process a few
+packets in a batch.
+There is no need to always use the thread if the backlog NAPI is
+requested on the local CPU. This restores the loopback throughput. The
+performance drops mostly to the same value after enabling RPS on the
+loopback, comparing the IPI and the thread results.
+
+Create NAPI-threads for backlog if requested during boot. The thread runs
+the inner loop from napi_threaded_poll(), the wait part is different. It
+checks for NAPI_STATE_SCHED (the backlog NAPI can not be disabled).
+
+The NAPI threads for backlog are optional; they have to be enabled via the boot
+argument "thread_backlog_napi". It is mandatory for PREEMPT_RT to avoid the
+wakeup of ksoftirqd from the IPI.
+
+Acked-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Signed-off-by: Wen Yang <wen.yang@linux.dev>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/core/dev.c |  152 +++++++++++++++++++++++++++++++++++++++++++--------------
+ 1 file changed, 115 insertions(+), 37 deletions(-)
+
+--- a/net/core/dev.c
++++ b/net/core/dev.c
+@@ -78,6 +78,7 @@
+ #include <linux/slab.h>
+ #include <linux/sched.h>
+ #include <linux/sched/mm.h>
++#include <linux/smpboot.h>
+ #include <linux/mutex.h>
+ #include <linux/rwsem.h>
+ #include <linux/string.h>
+@@ -217,6 +218,31 @@ static inline struct hlist_head *dev_ind
+       return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
+ }
++#ifndef CONFIG_PREEMPT_RT
++
++static DEFINE_STATIC_KEY_FALSE(use_backlog_threads_key);
++
++static int __init setup_backlog_napi_threads(char *arg)
++{
++      static_branch_enable(&use_backlog_threads_key);
++      return 0;
++}
++early_param("thread_backlog_napi", setup_backlog_napi_threads);
++
++static bool use_backlog_threads(void)
++{
++      return static_branch_unlikely(&use_backlog_threads_key);
++}
++
++#else
++
++static bool use_backlog_threads(void)
++{
++      return true;
++}
++
++#endif
++
+ static inline void rps_lock_irqsave(struct softnet_data *sd,
+                                   unsigned long *flags)
+ {
+@@ -4494,6 +4520,7 @@ EXPORT_SYMBOL(__dev_direct_xmit);
+ /*************************************************************************
+  *                    Receiver routines
+  *************************************************************************/
++static DEFINE_PER_CPU(struct task_struct *, backlog_napi);
+ int netdev_max_backlog __read_mostly = 1000;
+ EXPORT_SYMBOL(netdev_max_backlog);
+@@ -4526,12 +4553,16 @@ static inline void ____napi_schedule(str
+                */
+               thread = READ_ONCE(napi->thread);
+               if (thread) {
++                      if (use_backlog_threads() && thread == raw_cpu_read(backlog_napi))
++                              goto use_local_napi;
++
+                       set_bit(NAPI_STATE_SCHED_THREADED, &napi->state);
+                       wake_up_process(thread);
+                       return;
+               }
+       }
++use_local_napi:
+       list_add_tail(&napi->poll_list, &sd->poll_list);
+       WRITE_ONCE(napi->list_owner, smp_processor_id());
+       /* If not called from net_rx_action()
+@@ -4777,6 +4808,11 @@ static void napi_schedule_rps(struct sof
+ #ifdef CONFIG_RPS
+       if (sd != mysd) {
++              if (use_backlog_threads()) {
++                      __napi_schedule_irqoff(&sd->backlog);
++                      return;
++              }
++
+               sd->rps_ipi_next = mysd->rps_ipi_list;
+               mysd->rps_ipi_list = sd;
+@@ -6000,7 +6036,7 @@ static void net_rps_action_and_irq_enabl
+ #ifdef CONFIG_RPS
+       struct softnet_data *remsd = sd->rps_ipi_list;
+-      if (remsd) {
++      if (!use_backlog_threads() && remsd) {
+               sd->rps_ipi_list = NULL;
+               local_irq_enable();
+@@ -6015,7 +6051,7 @@ static void net_rps_action_and_irq_enabl
+ static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
+ {
+ #ifdef CONFIG_RPS
+-      return sd->rps_ipi_list != NULL;
++      return !use_backlog_threads() && sd->rps_ipi_list;
+ #else
+       return false;
+ #endif
+@@ -6059,7 +6095,7 @@ static int process_backlog(struct napi_s
+                        * We can use a plain write instead of clear_bit(),
+                        * and we dont need an smp_mb() memory barrier.
+                        */
+-                      napi->state = 0;
++                      napi->state &= NAPIF_STATE_THREADED;
+                       again = false;
+               } else {
+                       skb_queue_splice_tail_init(&sd->input_pkt_queue,
+@@ -6725,43 +6761,48 @@ static void skb_defer_free_flush(struct
+       }
+ }
+-static int napi_threaded_poll(void *data)
++static void napi_threaded_poll_loop(struct napi_struct *napi)
+ {
+-      struct napi_struct *napi = data;
+       struct softnet_data *sd;
+-      void *have;
++      unsigned long last_qs = jiffies;
+-      while (!napi_thread_wait(napi)) {
+-              unsigned long last_qs = jiffies;
++      for (;;) {
++              bool repoll = false;
++              void *have;
+-              for (;;) {
+-                      bool repoll = false;
++              local_bh_disable();
++              sd = this_cpu_ptr(&softnet_data);
++              sd->in_napi_threaded_poll = true;
+-                      local_bh_disable();
+-                      sd = this_cpu_ptr(&softnet_data);
+-                      sd->in_napi_threaded_poll = true;
+-
+-                      have = netpoll_poll_lock(napi);
+-                      __napi_poll(napi, &repoll);
+-                      netpoll_poll_unlock(have);
+-
+-                      sd->in_napi_threaded_poll = false;
+-                      barrier();
+-
+-                      if (sd_has_rps_ipi_waiting(sd)) {
+-                              local_irq_disable();
+-                              net_rps_action_and_irq_enable(sd);
+-                      }
+-                      skb_defer_free_flush(sd);
+-                      local_bh_enable();
++              have = netpoll_poll_lock(napi);
++              __napi_poll(napi, &repoll);
++              netpoll_poll_unlock(have);
++
++              sd->in_napi_threaded_poll = false;
++              barrier();
++
++              if (sd_has_rps_ipi_waiting(sd)) {
++                      local_irq_disable();
++                      net_rps_action_and_irq_enable(sd);
++              }
++              skb_defer_free_flush(sd);
++              local_bh_enable();
+-                      if (!repoll)
+-                              break;
++              if (!repoll)
++                      break;
+-                      rcu_softirq_qs_periodic(last_qs);
+-                      cond_resched();
+-              }
++              rcu_softirq_qs_periodic(last_qs);
++              cond_resched();
+       }
++}
++
++static int napi_threaded_poll(void *data)
++{
++      struct napi_struct *napi = data;
++
++      while (!napi_thread_wait(napi))
++              napi_threaded_poll_loop(napi);
++
+       return 0;
+ }
+@@ -11346,7 +11387,7 @@ static int dev_cpu_dead(unsigned int old
+               list_del_init(&napi->poll_list);
+               if (napi->poll == process_backlog)
+-                      napi->state = 0;
++                      napi->state &= NAPIF_STATE_THREADED;
+               else
+                       ____napi_schedule(sd, napi);
+       }
+@@ -11354,12 +11395,14 @@ static int dev_cpu_dead(unsigned int old
+       raise_softirq_irqoff(NET_TX_SOFTIRQ);
+       local_irq_enable();
++      if (!use_backlog_threads()) {
+ #ifdef CONFIG_RPS
+-      remsd = oldsd->rps_ipi_list;
+-      oldsd->rps_ipi_list = NULL;
++              remsd = oldsd->rps_ipi_list;
++              oldsd->rps_ipi_list = NULL;
+ #endif
+-      /* send out pending IPI's on offline CPU */
+-      net_rps_send_ipi(remsd);
++              /* send out pending IPI's on offline CPU */
++              net_rps_send_ipi(remsd);
++      }
+       /* Process offline CPU's input_pkt_queue */
+       while ((skb = __skb_dequeue(&oldsd->process_queue))) {
+@@ -11622,6 +11665,38 @@ static struct pernet_operations __net_in
+  *
+  */
++static int backlog_napi_should_run(unsigned int cpu)
++{
++      struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu);
++      struct napi_struct *napi = &sd->backlog;
++
++      return test_bit(NAPI_STATE_SCHED_THREADED, &napi->state);
++}
++
++static void run_backlog_napi(unsigned int cpu)
++{
++      struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu);
++
++      napi_threaded_poll_loop(&sd->backlog);
++}
++
++static void backlog_napi_setup(unsigned int cpu)
++{
++      struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu);
++      struct napi_struct *napi = &sd->backlog;
++
++      napi->thread = this_cpu_read(backlog_napi);
++      set_bit(NAPI_STATE_THREADED, &napi->state);
++}
++
++static struct smp_hotplug_thread backlog_threads = {
++      .store                  = &backlog_napi,
++      .thread_should_run      = backlog_napi_should_run,
++      .thread_fn              = run_backlog_napi,
++      .thread_comm            = "backlog_napi/%u",
++      .setup                  = backlog_napi_setup,
++};
++
+ /*
+  *       This is called single threaded during boot, so no need
+  *       to take the rtnl semaphore.
+@@ -11672,7 +11747,10 @@ static int __init net_dev_init(void)
+               init_gro_hash(&sd->backlog);
+               sd->backlog.poll = process_backlog;
+               sd->backlog.weight = weight_p;
++              INIT_LIST_HEAD(&sd->backlog.poll_list);
+       }
++      if (use_backlog_threads())
++              smpboot_register_percpu_thread(&backlog_threads);
+       dev_boot_phase = 0;
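
The dispatch decision described above can be modeled in a few lines of ordinary
C; all names below are hypothetical, and the real logic lives in
____napi_schedule() and napi_schedule_rps() in the diff.  With
"thread_backlog_napi" enabled, the local CPU's backlog is still processed
inline through the softirq path (keeping loopback fast), while a remote CPU's
backlog is handed to that CPU's per-CPU thread instead of being signaled by an
IPI.

#include <stdbool.h>
#include <stdio.h>

enum dispatch { RUN_INLINE_SOFTIRQ, WAKE_BACKLOG_THREAD, SEND_IPI };

enum dispatch schedule_backlog(bool threads_enabled, int target_cpu, int local_cpu)
{
        if (target_cpu == local_cpu)
                return RUN_INLINE_SOFTIRQ;              /* keep loopback fast */
        return threads_enabled ? WAKE_BACKLOG_THREAD    /* no hardirq-raised softirq */
                               : SEND_IPI;              /* legacy rps_ipi path */
}

int main(void)
{
        printf("%d\n", schedule_backlog(true, 2, 0));   /* WAKE_BACKLOG_THREAD */
        printf("%d\n", schedule_backlog(true, 0, 0));   /* RUN_INLINE_SOFTIRQ */
        return 0;
}
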
diff --git a/queue-6.6/net-remove-conditional-threaded-napi-wakeup-based-on-task-state.patch b/queue-6.6/net-remove-conditional-threaded-napi-wakeup-based-on-task-state.patch
new file mode 100644 (file)
index 0000000..d51b779
--- /dev/null
@@ -0,0 +1,83 @@
+From stable+bounces-203472-greg=kroah.com@vger.kernel.org Mon Dec 29 08:54:02 2025
+From: wen.yang@linux.dev
+Date: Mon, 29 Dec 2025 15:53:16 +0800
+Subject: net: Remove conditional threaded-NAPI wakeup based on task state.
+To: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+Cc: stable@vger.kernel.org, linux-kernel@vger.kernel.org, Sebastian Andrzej Siewior <bigeasy@linutronix.de>, Jakub Kicinski <kuba@kernel.org>, Paolo Abeni <pabeni@redhat.com>, Wen Yang <wen.yang@linux.dev>
+Message-ID: <b530eb6ed51ef4ca7940dddd981de2878834fcef.1766987153.git.wen.yang@linux.dev>
+
+From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+
+commit 56364c910691f6d10ba88c964c9041b9ab777bd6 upstream.
+
+A NAPI thread is scheduled by first setting the NAPI_STATE_SCHED bit. If
+that succeeds (the bit was not yet set), NAPI_STATE_SCHED_THREADED is
+also set, but only if the thread's state is not TASK_INTERRUPTIBLE (i.e.
+it is TASK_RUNNING), followed by a task wakeup.
+
+If the task is idle (TASK_INTERRUPTIBLE) then the
+NAPI_STATE_SCHED_THREADED bit is not set. The thread does not rely on
+the bit but always leaves the wait-loop after returning from schedule(),
+because there must have been a wakeup.
+
+The smpboot-threads implementation for per-CPU threads requires an
+explicit condition and does not support "if we get out of schedule()
+then there must be something to do".
+
+Removing this optimisation simplifies the following integration.
+
+Set NAPI_STATE_SCHED_THREADED unconditionally on wakeup and rely on it
+in the wait path by removing the `woken' condition.
+
+Acked-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Signed-off-by: Wen Yang <wen.yang@linux.dev>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/core/dev.c |   14 ++------------
+ 1 file changed, 2 insertions(+), 12 deletions(-)
+
+--- a/net/core/dev.c
++++ b/net/core/dev.c
+@@ -4526,13 +4526,7 @@ static inline void ____napi_schedule(str
+                */
+               thread = READ_ONCE(napi->thread);
+               if (thread) {
+-                      /* Avoid doing set_bit() if the thread is in
+-                       * INTERRUPTIBLE state, cause napi_thread_wait()
+-                       * makes sure to proceed with napi polling
+-                       * if the thread is explicitly woken from here.
+-                       */
+-                      if (READ_ONCE(thread->__state) != TASK_INTERRUPTIBLE)
+-                              set_bit(NAPI_STATE_SCHED_THREADED, &napi->state);
++                      set_bit(NAPI_STATE_SCHED_THREADED, &napi->state);
+                       wake_up_process(thread);
+                       return;
+               }
+@@ -6688,8 +6682,6 @@ static int napi_poll(struct napi_struct
+ static int napi_thread_wait(struct napi_struct *napi)
+ {
+-      bool woken = false;
+-
+       set_current_state(TASK_INTERRUPTIBLE);
+       while (!kthread_should_stop()) {
+@@ -6698,15 +6690,13 @@ static int napi_thread_wait(struct napi_
+                * Testing SCHED bit is not enough because SCHED bit might be
+                * set by some other busy poll thread or by napi_disable().
+                */
+-              if (test_bit(NAPI_STATE_SCHED_THREADED, &napi->state) || woken) {
++              if (test_bit(NAPI_STATE_SCHED_THREADED, &napi->state)) {
+                       WARN_ON(!list_empty(&napi->poll_list));
+                       __set_current_state(TASK_RUNNING);
+                       return 0;
+               }
+               schedule();
+-              /* woken being true indicates this thread owns this napi. */
+-              woken = true;
+               set_current_state(TASK_INTERRUPTIBLE);
+       }
+       __set_current_state(TASK_RUNNING);
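
As an aside to the change above: the explicit NAPI_STATE_SCHED_THREADED
condition matters because the smpboot per-CPU thread infrastructure (used
by the companion backlog-NAPI patch in this series) only runs its
thread_fn when an explicit should-run predicate is true. A minimal
userspace sketch of that contract, using pthreads; all names here
(schedule_work(), worker(), the 'scheduled' flag) are invented for this
illustration and are not kernel code:

/*
 * Illustrative only: set an explicit "scheduled" flag on every wakeup
 * and let the worker rely on that flag, instead of assuming that
 * leaving the wait means there is work to do.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static bool scheduled;  /* plays the role of NAPI_STATE_SCHED_THREADED */
static bool stop;

static void schedule_work(void)
{
        pthread_mutex_lock(&lock);
        scheduled = true;               /* set unconditionally */
        pthread_cond_signal(&cond);
        pthread_mutex_unlock(&lock);
}

static void *worker(void *arg)
{
        (void)arg;
        pthread_mutex_lock(&lock);
        while (!stop) {
                if (scheduled) {
                        scheduled = false;
                        pthread_mutex_unlock(&lock);
                        printf("polling\n");    /* stand-in for a poll run */
                        pthread_mutex_lock(&lock);
                        continue;
                }
                /* A wakeup with 'scheduled' clear simply waits again. */
                pthread_cond_wait(&cond, &lock);
        }
        pthread_mutex_unlock(&lock);
        return NULL;
}

int main(void)
{
        pthread_t thr;

        pthread_create(&thr, NULL, worker, NULL);
        schedule_work();
        sleep(1);

        pthread_mutex_lock(&lock);
        stop = true;
        pthread_cond_signal(&cond);
        pthread_mutex_unlock(&lock);
        pthread_join(thr, NULL);
        return 0;
}

The point is that a wakeup with the flag clear (spurious, or the stop
request above) does not cause a poll, which is exactly the property the
smpboot conversion relies on.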
diff --git a/queue-6.6/rdma-core-fix-kasan-slab-use-after-free-read-in-ib_register_device-problem.patch b/queue-6.6/rdma-core-fix-kasan-slab-use-after-free-read-in-ib_register_device-problem.patch
new file mode 100644
index 0000000..e268a1c
--- /dev/null
@@ -0,0 +1,85 @@
+From d0706bfd3ee40923c001c6827b786a309e2a8713 Mon Sep 17 00:00:00 2001
+From: Zhu Yanjun <yanjun.zhu@linux.dev>
+Date: Tue, 6 May 2025 17:10:08 +0200
+Subject: RDMA/core: Fix "KASAN: slab-use-after-free Read in ib_register_device" problem
+
+From: Zhu Yanjun <yanjun.zhu@linux.dev>
+
+commit d0706bfd3ee40923c001c6827b786a309e2a8713 upstream.
+
+Call Trace:
+
+ __dump_stack lib/dump_stack.c:94 [inline]
+ dump_stack_lvl+0x116/0x1f0 lib/dump_stack.c:120
+ print_address_description mm/kasan/report.c:408 [inline]
+ print_report+0xc3/0x670 mm/kasan/report.c:521
+ kasan_report+0xe0/0x110 mm/kasan/report.c:634
+ strlen+0x93/0xa0 lib/string.c:420
+ __fortify_strlen include/linux/fortify-string.h:268 [inline]
+ get_kobj_path_length lib/kobject.c:118 [inline]
+ kobject_get_path+0x3f/0x2a0 lib/kobject.c:158
+ kobject_uevent_env+0x289/0x1870 lib/kobject_uevent.c:545
+ ib_register_device drivers/infiniband/core/device.c:1472 [inline]
+ ib_register_device+0x8cf/0xe00 drivers/infiniband/core/device.c:1393
+ rxe_register_device+0x275/0x320 drivers/infiniband/sw/rxe/rxe_verbs.c:1552
+ rxe_net_add+0x8e/0xe0 drivers/infiniband/sw/rxe/rxe_net.c:550
+ rxe_newlink+0x70/0x190 drivers/infiniband/sw/rxe/rxe.c:225
+ nldev_newlink+0x3a3/0x680 drivers/infiniband/core/nldev.c:1796
+ rdma_nl_rcv_msg+0x387/0x6e0 drivers/infiniband/core/netlink.c:195
+ rdma_nl_rcv_skb.constprop.0.isra.0+0x2e5/0x450
+ netlink_unicast_kernel net/netlink/af_netlink.c:1313 [inline]
+ netlink_unicast+0x53a/0x7f0 net/netlink/af_netlink.c:1339
+ netlink_sendmsg+0x8d1/0xdd0 net/netlink/af_netlink.c:1883
+ sock_sendmsg_nosec net/socket.c:712 [inline]
+ __sock_sendmsg net/socket.c:727 [inline]
+ ____sys_sendmsg+0xa95/0xc70 net/socket.c:2566
+ ___sys_sendmsg+0x134/0x1d0 net/socket.c:2620
+ __sys_sendmsg+0x16d/0x220 net/socket.c:2652
+ do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline]
+ do_syscall_64+0xcd/0x260 arch/x86/entry/syscall_64.c:94
+ entry_SYSCALL_64_after_hwframe+0x77/0x7f
+
+This problem is similar to the one fixed by
+commit 1d6a9e7449e2 ("RDMA/core: Fix use-after-free when rename device name").
+
+The root cause is that ib_device_rename() renames the device name under
+a lock, while kobject_uevent() accesses the same name concurrently
+without any lock protection.
+
+The solution is to take that lock around the access to the name in
+kobject_uevent().
+
+Fixes: 779e0bf47632 ("RDMA/core: Do not indicate device ready when device enablement fails")
+Link: https://patch.msgid.link/r/20250506151008.75701-1-yanjun.zhu@linux.dev
+Reported-by: syzbot+e2ce9e275ecc70a30b72@syzkaller.appspotmail.com
+Closes: https://syzkaller.appspot.com/bug?extid=e2ce9e275ecc70a30b72
+Signed-off-by: Zhu Yanjun <yanjun.zhu@linux.dev>
+Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+[ Ajay: Modified to apply on v5.10.y-v6.6.y
+        ib_device_notify_register() not present in v5.10.y-v6.6.y,
+        so directly added lock for kobject_uevent() ]
+Signed-off-by: Ajay Kaher <ajay.kaher@broadcom.com>
+Signed-off-by: Shivani Agarwal <shivani.agarwal@broadcom.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/infiniband/core/device.c |    5 +++++
+ 1 file changed, 5 insertions(+)
+
+--- a/drivers/infiniband/core/device.c
++++ b/drivers/infiniband/core/device.c
+@@ -1450,8 +1450,13 @@ int ib_register_device(struct ib_device
+               return ret;
+       }
+       dev_set_uevent_suppress(&device->dev, false);
++
++      down_read(&devices_rwsem);
++
+       /* Mark for userspace that device is ready */
+       kobject_uevent(&device->dev.kobj, KOBJ_ADD);
++
++      up_read(&devices_rwsem);
+       ib_device_put(device);
+       return 0;
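
As an aside: the fix above follows a generic pattern, namely taking a
shared lock around the uevent emission so it cannot race with a rename
that takes the same lock exclusively. A small userspace sketch of that
pattern with POSIX rwlocks; device_rename(), emit_uevent() and the
fixed-size name buffer are inventions of this sketch, not the kernel API:

#include <pthread.h>
#include <stdio.h>
#include <string.h>

static pthread_rwlock_t devices_lock = PTHREAD_RWLOCK_INITIALIZER;
static char dev_name[64] = "rxe0";

/* Writer side: a rename holds the lock exclusively. */
static void device_rename(const char *new_name)
{
        pthread_rwlock_wrlock(&devices_lock);
        snprintf(dev_name, sizeof(dev_name), "%s", new_name);
        pthread_rwlock_unlock(&devices_lock);
}

/* Reader side: the uevent path reads the name under the same lock
 * (shared), so it can never observe the name mid-rename. */
static void emit_uevent(void)
{
        pthread_rwlock_rdlock(&devices_lock);
        printf("KOBJ_ADD for %s (len %zu)\n", dev_name, strlen(dev_name));
        pthread_rwlock_unlock(&devices_lock);
}

int main(void)
{
        emit_uevent();
        device_rename("rxe_renamed");
        emit_uevent();
        return 0;
}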
diff --git a/queue-6.6/rdma-rxe-fix-the-failure-of-ibv_query_device-and-ibv_query_device_ex-tests.patch b/queue-6.6/rdma-rxe-fix-the-failure-of-ibv_query_device-and-ibv_query_device_ex-tests.patch
new file mode 100644
index 0000000..2360938
--- /dev/null
@@ -0,0 +1,138 @@
+From shivani.agarwal@broadcom.com Thu Jan  8 11:26:51 2026
+From: Shivani Agarwal <shivani.agarwal@broadcom.com>
+Date: Thu,  8 Jan 2026 02:05:40 -0800
+Subject: RDMA/rxe: Fix the failure of ibv_query_device() and ibv_query_device_ex() tests
+To: stable@vger.kernel.org, gregkh@linuxfoundation.org
+Cc: jgg@ziepe.ca, leon@kernel.org, zyjzyj2000@gmail.com, mbloch@nvidia.com, parav@nvidia.com, mrgolin@amazon.com, roman.gushchin@linux.dev, wangliang74@huawei.com, marco.crivellari@suse.com, zhao.xichao@vivo.com, haggaie@mellanox.com, monis@mellanox.com, dledford@redhat.com, amirv@mellanox.com, kamalh@mellanox.com, linux-rdma@vger.kernel.org, linux-kernel@vger.kernel.org, ajay.kaher@broadcom.com, alexey.makhalov@broadcom.com, vamsi-krishna.brahmajosyula@broadcom.com, yin.ding@broadcom.com, tapas.kundu@broadcom.com, Zhu Yanjun <yanjun.zhu@linux.dev>, Daisuke Matsuda <matsuda-daisuke@fujitsu.com>, Sasha Levin <sashal@kernel.org>, Shivani Agarwal <shivani.agarwal@broadcom.com>
+Message-ID: <20260108100540.672666-3-shivani.agarwal@broadcom.com>
+
+From: Zhu Yanjun <yanjun.zhu@linux.dev>
+
+[ Upstream commit 8ce2eb9dfac8743d1c423b86339336a5b6a6069e ]
+
+In rdma-core, the following failures appear.
+
+"
+$ ./build/bin/run_tests.py -k device
+ssssssss....FF........s
+======================================================================
+FAIL: test_query_device (tests.test_device.DeviceTest.test_query_device)
+Test ibv_query_device()
+----------------------------------------------------------------------
+Traceback (most recent call last):
+   File "/home/ubuntu/rdma-core/tests/test_device.py", line 63, in
+   test_query_device
+     self.verify_device_attr(attr, dev)
+   File "/home/ubuntu/rdma-core/tests/test_device.py", line 200, in
+   verify_device_attr
+     assert attr.sys_image_guid != 0
+            ^^^^^^^^^^^^^^^^^^^^^^^^
+AssertionError
+
+======================================================================
+FAIL: test_query_device_ex (tests.test_device.DeviceTest.test_query_device_ex)
+Test ibv_query_device_ex()
+----------------------------------------------------------------------
+Traceback (most recent call last):
+   File "/home/ubuntu/rdma-core/tests/test_device.py", line 222, in
+   test_query_device_ex
+     self.verify_device_attr(attr_ex.orig_attr, dev)
+   File "/home/ubuntu/rdma-core/tests/test_device.py", line 200, in
+   verify_device_attr
+     assert attr.sys_image_guid != 0
+            ^^^^^^^^^^^^^^^^^^^^^^^^
+AssertionError
+"
+
+The root cause is that the net device is used to generate the
+sys_image_guid before it has been associated with the rxe device.
+
+Fixes: 2ac5415022d1 ("RDMA/rxe: Remove the direct link to net_device")
+Signed-off-by: Zhu Yanjun <yanjun.zhu@linux.dev>
+Link: https://patch.msgid.link/20250302215444.3742072-1-yanjun.zhu@linux.dev
+Reviewed-by: Daisuke Matsuda <matsuda-daisuke@fujitsu.com>
+Tested-by: Daisuke Matsuda <matsuda-daisuke@fujitsu.com>
+Signed-off-by: Leon Romanovsky <leon@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+[Shivani: Modified to apply on 6.6.y]
+Signed-off-by: Shivani Agarwal <shivani.agarwal@broadcom.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/infiniband/sw/rxe/rxe.c |   25 ++++++-------------------
+ 1 file changed, 6 insertions(+), 19 deletions(-)
+
+--- a/drivers/infiniband/sw/rxe/rxe.c
++++ b/drivers/infiniband/sw/rxe/rxe.c
+@@ -38,10 +38,8 @@ void rxe_dealloc(struct ib_device *ib_de
+ }
+ /* initialize rxe device parameters */
+-static void rxe_init_device_param(struct rxe_dev *rxe)
++static void rxe_init_device_param(struct rxe_dev *rxe, struct net_device *ndev)
+ {
+-      struct net_device *ndev;
+-
+       rxe->max_inline_data                    = RXE_MAX_INLINE_DATA;
+       rxe->attr.vendor_id                     = RXE_VENDOR_ID;
+@@ -74,15 +72,9 @@ static void rxe_init_device_param(struct
+       rxe->attr.max_pkeys                     = RXE_MAX_PKEYS;
+       rxe->attr.local_ca_ack_delay            = RXE_LOCAL_CA_ACK_DELAY;
+-      ndev = rxe_ib_device_get_netdev(&rxe->ib_dev);
+-      if (!ndev)
+-              return;
+-
+       addrconf_addr_eui48((unsigned char *)&rxe->attr.sys_image_guid,
+                       ndev->dev_addr);
+-      dev_put(ndev);
+-
+       rxe->max_ucontext                       = RXE_MAX_UCONTEXT;
+ }
+@@ -115,18 +107,13 @@ static void rxe_init_port_param(struct r
+ /* initialize port state, note IB convention that HCA ports are always
+  * numbered from 1
+  */
+-static void rxe_init_ports(struct rxe_dev *rxe)
++static void rxe_init_ports(struct rxe_dev *rxe, struct net_device *ndev)
+ {
+       struct rxe_port *port = &rxe->port;
+-      struct net_device *ndev;
+       rxe_init_port_param(port);
+-      ndev = rxe_ib_device_get_netdev(&rxe->ib_dev);
+-      if (!ndev)
+-              return;
+       addrconf_addr_eui48((unsigned char *)&port->port_guid,
+                           ndev->dev_addr);
+-      dev_put(ndev);
+       spin_lock_init(&port->port_lock);
+ }
+@@ -144,12 +131,12 @@ static void rxe_init_pools(struct rxe_de
+ }
+ /* initialize rxe device state */
+-static void rxe_init(struct rxe_dev *rxe)
++static void rxe_init(struct rxe_dev *rxe, struct net_device *ndev)
+ {
+       /* init default device parameters */
+-      rxe_init_device_param(rxe);
++      rxe_init_device_param(rxe, ndev);
+-      rxe_init_ports(rxe);
++      rxe_init_ports(rxe, ndev);
+       rxe_init_pools(rxe);
+       /* init pending mmap list */
+@@ -186,7 +173,7 @@ void rxe_set_mtu(struct rxe_dev *rxe, un
+ int rxe_add(struct rxe_dev *rxe, unsigned int mtu, const char *ibdev_name,
+                       struct net_device *ndev)
+ {
+-      rxe_init(rxe);
++      rxe_init(rxe, ndev);
+       rxe_set_mtu(rxe, mtu);
+       return rxe_register_device(rxe, ibdev_name, ndev);
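
As an aside: the reason the net_device must be passed in explicitly is
that the GUIDs are derived from its MAC address at init time; with no
device available at that point, sys_image_guid stays zero and the
rdma-core tests above fail. A self-contained sketch of the usual
modified EUI-64 construction for such GUIDs (insert ff:fe in the middle,
flip the universal/local bit); this illustrates the idea and is not a
copy of the kernel's addrconf_addr_eui48() helper:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static void mac_to_eui64(uint8_t eui[8], const uint8_t mac[6])
{
        memcpy(eui, mac, 3);
        eui[3] = 0xff;
        eui[4] = 0xfe;
        memcpy(eui + 5, mac + 3, 3);
        eui[0] ^= 0x02;         /* toggle the universal/local bit */
}

int main(void)
{
        const uint8_t mac[6] = { 0x02, 0x42, 0xac, 0x11, 0x00, 0x02 };
        uint8_t guid[8];
        int i;

        mac_to_eui64(guid, mac);
        for (i = 0; i < 8; i++)
                printf("%02x%s", guid[i], i == 7 ? "\n" : ":");
        return 0;
}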
diff --git a/queue-6.6/rdma-rxe-remove-the-direct-link-to-net_device.patch b/queue-6.6/rdma-rxe-remove-the-direct-link-to-net_device.patch
new file mode 100644
index 0000000..5f43a9c
--- /dev/null
@@ -0,0 +1,416 @@
+From stable+bounces-206303-greg=kroah.com@vger.kernel.org Thu Jan  8 11:31:56 2026
+From: Shivani Agarwal <shivani.agarwal@broadcom.com>
+Date: Thu,  8 Jan 2026 02:05:39 -0800
+Subject: RDMA/rxe: Remove the direct link to net_device
+To: stable@vger.kernel.org, gregkh@linuxfoundation.org
+Cc: jgg@ziepe.ca, leon@kernel.org, zyjzyj2000@gmail.com, mbloch@nvidia.com, parav@nvidia.com, mrgolin@amazon.com, roman.gushchin@linux.dev, wangliang74@huawei.com, marco.crivellari@suse.com, zhao.xichao@vivo.com, haggaie@mellanox.com, monis@mellanox.com, dledford@redhat.com, amirv@mellanox.com, kamalh@mellanox.com, linux-rdma@vger.kernel.org, linux-kernel@vger.kernel.org, ajay.kaher@broadcom.com, alexey.makhalov@broadcom.com, vamsi-krishna.brahmajosyula@broadcom.com, yin.ding@broadcom.com, tapas.kundu@broadcom.com, Zhu Yanjun <yanjun.zhu@linux.dev>, syzbot+4b87489410b4efd181bf@syzkaller.appspotmail.com, Sasha Levin <sashal@kernel.org>, Shivani Agarwal <shivani.agarwal@broadcom.com>
+Message-ID: <20260108100540.672666-2-shivani.agarwal@broadcom.com>
+
+From: Zhu Yanjun <yanjun.zhu@linux.dev>
+
+[ Upstream commit 2ac5415022d16d63d912a39a06f32f1f51140261 ]
+
+A similar patch for siw is available here:
+https://git.kernel.org/rdma/rdma/c/16b87037b48889
+
+The same problem also occurs in RXE. The following analyzes it, starting
+from this call trace:
+"
+BUG: KASAN: slab-use-after-free in dev_get_flags+0x188/0x1d0 net/core/dev.c:8782
+Read of size 4 at addr ffff8880554640b0 by task kworker/1:4/5295
+
+CPU: 1 UID: 0 PID: 5295 Comm: kworker/1:4 Not tainted
+6.12.0-rc3-syzkaller-00399-g9197b73fd7bb #0
+Hardware name: Google Compute Engine/Google Compute Engine,
+BIOS Google 09/13/2024
+Workqueue: infiniband ib_cache_event_task
+Call Trace:
+ <TASK>
+ __dump_stack lib/dump_stack.c:94 [inline]
+ dump_stack_lvl+0x241/0x360 lib/dump_stack.c:120
+ print_address_description mm/kasan/report.c:377 [inline]
+ print_report+0x169/0x550 mm/kasan/report.c:488
+ kasan_report+0x143/0x180 mm/kasan/report.c:601
+ dev_get_flags+0x188/0x1d0 net/core/dev.c:8782
+ rxe_query_port+0x12d/0x260 drivers/infiniband/sw/rxe/rxe_verbs.c:60
+ __ib_query_port drivers/infiniband/core/device.c:2111 [inline]
+ ib_query_port+0x168/0x7d0 drivers/infiniband/core/device.c:2143
+ ib_cache_update+0x1a9/0xb80 drivers/infiniband/core/cache.c:1494
+ ib_cache_event_task+0xf3/0x1e0 drivers/infiniband/core/cache.c:1568
+ process_one_work kernel/workqueue.c:3229 [inline]
+ process_scheduled_works+0xa65/0x1850 kernel/workqueue.c:3310
+ worker_thread+0x870/0xd30 kernel/workqueue.c:3391
+ kthread+0x2f2/0x390 kernel/kthread.c:389
+ ret_from_fork+0x4d/0x80 arch/x86/kernel/process.c:147
+ ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:244
+ </TASK>
+"
+
+1). In the link [1],
+
+"
+ infiniband syz2: set down
+"
+
+This means that at 839.350575, the event ib_cache_event_task was sent
+and queued in ib_wq.
+
+2). In the link [1],
+
+"
+ team0 (unregistering): Port device team_slave_0 removed
+"
+
+It indicates that before 843.251853, the net device had already been freed.
+
+3). In the link [1],
+
+"
+ BUG: KASAN: slab-use-after-free in dev_get_flags+0x188/0x1d0
+"
+
+This means that on 850.559070, this slab-use-after-free problem occurred.
+
+In summary: at 839.350575 the event ib_cache_event_task was sent and
+queued in ib_wq; before 843.251853 the net device veth was freed; at
+850.559070 the event was finally executed and dereferenced the
+already-freed net device. Thus, the above call trace occurred.
+
+[1] https://syzkaller.appspot.com/x/log.txt?x=12e7025f980000
+
+Reported-by: syzbot+4b87489410b4efd181bf@syzkaller.appspotmail.com
+Closes: https://syzkaller.appspot.com/bug?extid=4b87489410b4efd181bf
+Fixes: 8700e3e7c485 ("Soft RoCE driver")
+Signed-off-by: Zhu Yanjun <yanjun.zhu@linux.dev>
+Link: https://patch.msgid.link/20241220222325.2487767-1-yanjun.zhu@linux.dev
+Signed-off-by: Leon Romanovsky <leon@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+[Shivani: - exported ib_device_get_netdev() function.
+          - added ib_device_get_netdev() to ib_verbs.h.]
+Signed-off-by: Shivani Agarwal <shivani.agarwal@broadcom.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/infiniband/core/device.c      |    1 +
+ drivers/infiniband/sw/rxe/rxe.c       |   23 +++++++++++++++++++----
+ drivers/infiniband/sw/rxe/rxe.h       |    3 ++-
+ drivers/infiniband/sw/rxe/rxe_mcast.c |   22 ++++++++++++++++++++--
+ drivers/infiniband/sw/rxe/rxe_net.c   |   25 ++++++++++++++++++++-----
+ drivers/infiniband/sw/rxe/rxe_verbs.c |   26 +++++++++++++++++++++-----
+ drivers/infiniband/sw/rxe/rxe_verbs.h |   11 ++++++++---
+ include/rdma/ib_verbs.h               |    2 ++
+ 8 files changed, 93 insertions(+), 20 deletions(-)
+
+--- a/drivers/infiniband/core/device.c
++++ b/drivers/infiniband/core/device.c
+@@ -2259,6 +2259,7 @@ struct net_device *ib_device_get_netdev(
+       return res;
+ }
++EXPORT_SYMBOL(ib_device_get_netdev);
+ /**
+  * ib_device_get_by_netdev - Find an IB device associated with a netdev
+--- a/drivers/infiniband/sw/rxe/rxe.c
++++ b/drivers/infiniband/sw/rxe/rxe.c
+@@ -40,6 +40,8 @@ void rxe_dealloc(struct ib_device *ib_de
+ /* initialize rxe device parameters */
+ static void rxe_init_device_param(struct rxe_dev *rxe)
+ {
++      struct net_device *ndev;
++
+       rxe->max_inline_data                    = RXE_MAX_INLINE_DATA;
+       rxe->attr.vendor_id                     = RXE_VENDOR_ID;
+@@ -71,8 +73,15 @@ static void rxe_init_device_param(struct
+       rxe->attr.max_fast_reg_page_list_len    = RXE_MAX_FMR_PAGE_LIST_LEN;
+       rxe->attr.max_pkeys                     = RXE_MAX_PKEYS;
+       rxe->attr.local_ca_ack_delay            = RXE_LOCAL_CA_ACK_DELAY;
++
++      ndev = rxe_ib_device_get_netdev(&rxe->ib_dev);
++      if (!ndev)
++              return;
++
+       addrconf_addr_eui48((unsigned char *)&rxe->attr.sys_image_guid,
+-                      rxe->ndev->dev_addr);
++                      ndev->dev_addr);
++
++      dev_put(ndev);
+       rxe->max_ucontext                       = RXE_MAX_UCONTEXT;
+ }
+@@ -109,10 +118,15 @@ static void rxe_init_port_param(struct r
+ static void rxe_init_ports(struct rxe_dev *rxe)
+ {
+       struct rxe_port *port = &rxe->port;
++      struct net_device *ndev;
+       rxe_init_port_param(port);
++      ndev = rxe_ib_device_get_netdev(&rxe->ib_dev);
++      if (!ndev)
++              return;
+       addrconf_addr_eui48((unsigned char *)&port->port_guid,
+-                          rxe->ndev->dev_addr);
++                          ndev->dev_addr);
++      dev_put(ndev);
+       spin_lock_init(&port->port_lock);
+ }
+@@ -169,12 +183,13 @@ void rxe_set_mtu(struct rxe_dev *rxe, un
+ /* called by ifc layer to create new rxe device.
+  * The caller should allocate memory for rxe by calling ib_alloc_device.
+  */
+-int rxe_add(struct rxe_dev *rxe, unsigned int mtu, const char *ibdev_name)
++int rxe_add(struct rxe_dev *rxe, unsigned int mtu, const char *ibdev_name,
++                      struct net_device *ndev)
+ {
+       rxe_init(rxe);
+       rxe_set_mtu(rxe, mtu);
+-      return rxe_register_device(rxe, ibdev_name);
++      return rxe_register_device(rxe, ibdev_name, ndev);
+ }
+ static int rxe_newlink(const char *ibdev_name, struct net_device *ndev)
+--- a/drivers/infiniband/sw/rxe/rxe.h
++++ b/drivers/infiniband/sw/rxe/rxe.h
+@@ -139,7 +139,8 @@ enum resp_states {
+ void rxe_set_mtu(struct rxe_dev *rxe, unsigned int dev_mtu);
+-int rxe_add(struct rxe_dev *rxe, unsigned int mtu, const char *ibdev_name);
++int rxe_add(struct rxe_dev *rxe, unsigned int mtu, const char *ibdev_name,
++                      struct net_device *ndev);
+ void rxe_rcv(struct sk_buff *skb);
+--- a/drivers/infiniband/sw/rxe/rxe_mcast.c
++++ b/drivers/infiniband/sw/rxe/rxe_mcast.c
+@@ -31,10 +31,19 @@
+ static int rxe_mcast_add(struct rxe_dev *rxe, union ib_gid *mgid)
+ {
+       unsigned char ll_addr[ETH_ALEN];
++      struct net_device *ndev;
++      int ret;
++
++      ndev = rxe_ib_device_get_netdev(&rxe->ib_dev);
++      if (!ndev)
++              return -ENODEV;
+       ipv6_eth_mc_map((struct in6_addr *)mgid->raw, ll_addr);
+-      return dev_mc_add(rxe->ndev, ll_addr);
++      ret = dev_mc_add(ndev, ll_addr);
++      dev_put(ndev);
++
++      return ret;
+ }
+ /**
+@@ -47,10 +56,19 @@ static int rxe_mcast_add(struct rxe_dev
+ static int rxe_mcast_del(struct rxe_dev *rxe, union ib_gid *mgid)
+ {
+       unsigned char ll_addr[ETH_ALEN];
++      struct net_device *ndev;
++      int ret;
++
++      ndev = rxe_ib_device_get_netdev(&rxe->ib_dev);
++      if (!ndev)
++              return -ENODEV;
+       ipv6_eth_mc_map((struct in6_addr *)mgid->raw, ll_addr);
+-      return dev_mc_del(rxe->ndev, ll_addr);
++      ret = dev_mc_del(ndev, ll_addr);
++      dev_put(ndev);
++
++      return ret;
+ }
+ /**
+--- a/drivers/infiniband/sw/rxe/rxe_net.c
++++ b/drivers/infiniband/sw/rxe/rxe_net.c
+@@ -509,7 +509,16 @@ out:
+  */
+ const char *rxe_parent_name(struct rxe_dev *rxe, unsigned int port_num)
+ {
+-      return rxe->ndev->name;
++      struct net_device *ndev;
++      char *ndev_name;
++
++      ndev = rxe_ib_device_get_netdev(&rxe->ib_dev);
++      if (!ndev)
++              return NULL;
++      ndev_name = ndev->name;
++      dev_put(ndev);
++
++      return ndev_name;
+ }
+ int rxe_net_add(const char *ibdev_name, struct net_device *ndev)
+@@ -521,9 +530,7 @@ int rxe_net_add(const char *ibdev_name,
+       if (!rxe)
+               return -ENOMEM;
+-      rxe->ndev = ndev;
+-
+-      err = rxe_add(rxe, ndev->mtu, ibdev_name);
++      err = rxe_add(rxe, ndev->mtu, ibdev_name, ndev);
+       if (err) {
+               ib_dealloc_device(&rxe->ib_dev);
+               return err;
+@@ -571,10 +578,18 @@ void rxe_port_down(struct rxe_dev *rxe)
+ void rxe_set_port_state(struct rxe_dev *rxe)
+ {
+-      if (netif_running(rxe->ndev) && netif_carrier_ok(rxe->ndev))
++      struct net_device *ndev;
++
++      ndev = rxe_ib_device_get_netdev(&rxe->ib_dev);
++      if (!ndev)
++              return;
++
++      if (netif_running(ndev) && netif_carrier_ok(ndev))
+               rxe_port_up(rxe);
+       else
+               rxe_port_down(rxe);
++
++      dev_put(ndev);
+ }
+ static int rxe_notify(struct notifier_block *not_blk,
+--- a/drivers/infiniband/sw/rxe/rxe_verbs.c
++++ b/drivers/infiniband/sw/rxe/rxe_verbs.c
+@@ -41,6 +41,7 @@ static int rxe_query_port(struct ib_devi
+                         u32 port_num, struct ib_port_attr *attr)
+ {
+       struct rxe_dev *rxe = to_rdev(ibdev);
++      struct net_device *ndev;
+       int err, ret;
+       if (port_num != 1) {
+@@ -51,19 +52,26 @@ static int rxe_query_port(struct ib_devi
+       memcpy(attr, &rxe->port.attr, sizeof(*attr));
++      ndev = rxe_ib_device_get_netdev(ibdev);
++      if (!ndev) {
++              err = -ENODEV;
++              goto err_out;
++      }
++
+       mutex_lock(&rxe->usdev_lock);
+       ret = ib_get_eth_speed(ibdev, port_num, &attr->active_speed,
+                              &attr->active_width);
+       if (attr->state == IB_PORT_ACTIVE)
+               attr->phys_state = IB_PORT_PHYS_STATE_LINK_UP;
+-      else if (dev_get_flags(rxe->ndev) & IFF_UP)
++      else if (dev_get_flags(ndev) & IFF_UP)
+               attr->phys_state = IB_PORT_PHYS_STATE_POLLING;
+       else
+               attr->phys_state = IB_PORT_PHYS_STATE_DISABLED;
+       mutex_unlock(&rxe->usdev_lock);
++      dev_put(ndev);
+       return ret;
+ err_out:
+@@ -1428,9 +1436,16 @@ static const struct attribute_group rxe_
+ static int rxe_enable_driver(struct ib_device *ib_dev)
+ {
+       struct rxe_dev *rxe = container_of(ib_dev, struct rxe_dev, ib_dev);
++      struct net_device *ndev;
++
++      ndev = rxe_ib_device_get_netdev(ib_dev);
++      if (!ndev)
++              return -ENODEV;
+       rxe_set_port_state(rxe);
+-      dev_info(&rxe->ib_dev.dev, "added %s\n", netdev_name(rxe->ndev));
++      dev_info(&rxe->ib_dev.dev, "added %s\n", netdev_name(ndev));
++
++      dev_put(ndev);
+       return 0;
+ }
+@@ -1498,7 +1513,8 @@ static const struct ib_device_ops rxe_de
+       INIT_RDMA_OBJ_SIZE(ib_mw, rxe_mw, ibmw),
+ };
+-int rxe_register_device(struct rxe_dev *rxe, const char *ibdev_name)
++int rxe_register_device(struct rxe_dev *rxe, const char *ibdev_name,
++                                              struct net_device *ndev)
+ {
+       int err;
+       struct ib_device *dev = &rxe->ib_dev;
+@@ -1510,13 +1526,13 @@ int rxe_register_device(struct rxe_dev *
+       dev->num_comp_vectors = num_possible_cpus();
+       dev->local_dma_lkey = 0;
+       addrconf_addr_eui48((unsigned char *)&dev->node_guid,
+-                          rxe->ndev->dev_addr);
++                          ndev->dev_addr);
+       dev->uverbs_cmd_mask |= BIT_ULL(IB_USER_VERBS_CMD_POST_SEND) |
+                               BIT_ULL(IB_USER_VERBS_CMD_REQ_NOTIFY_CQ);
+       ib_set_device_ops(dev, &rxe_dev_ops);
+-      err = ib_device_set_netdev(&rxe->ib_dev, rxe->ndev, 1);
++      err = ib_device_set_netdev(&rxe->ib_dev, ndev, 1);
+       if (err)
+               return err;
+--- a/drivers/infiniband/sw/rxe/rxe_verbs.h
++++ b/drivers/infiniband/sw/rxe/rxe_verbs.h
+@@ -369,6 +369,7 @@ struct rxe_port {
+       u32                     qp_gsi_index;
+ };
++#define       RXE_PORT        1
+ struct rxe_dev {
+       struct ib_device        ib_dev;
+       struct ib_device_attr   attr;
+@@ -376,8 +377,6 @@ struct rxe_dev {
+       int                     max_inline_data;
+       struct mutex    usdev_lock;
+-      struct net_device       *ndev;
+-
+       struct rxe_pool         uc_pool;
+       struct rxe_pool         pd_pool;
+       struct rxe_pool         ah_pool;
+@@ -405,6 +404,11 @@ struct rxe_dev {
+       struct crypto_shash     *tfm;
+ };
++static inline struct net_device *rxe_ib_device_get_netdev(struct ib_device *dev)
++{
++      return ib_device_get_netdev(dev, RXE_PORT);
++}
++
+ static inline void rxe_counter_inc(struct rxe_dev *rxe, enum rxe_counters index)
+ {
+       atomic64_inc(&rxe->stats_counters[index]);
+@@ -470,6 +474,7 @@ static inline struct rxe_pd *rxe_mw_pd(s
+       return to_rpd(mw->ibmw.pd);
+ }
+-int rxe_register_device(struct rxe_dev *rxe, const char *ibdev_name);
++int rxe_register_device(struct rxe_dev *rxe, const char *ibdev_name,
++                                              struct net_device *ndev);
+ #endif /* RXE_VERBS_H */
+--- a/include/rdma/ib_verbs.h
++++ b/include/rdma/ib_verbs.h
+@@ -4444,6 +4444,8 @@ struct net_device *ib_get_net_dev_by_par
+                                           const struct sockaddr *addr);
+ int ib_device_set_netdev(struct ib_device *ib_dev, struct net_device *ndev,
+                        unsigned int port);
++struct net_device *ib_device_get_netdev(struct ib_device *ib_dev,
++                                      u32 port);
+ struct ib_wq *ib_create_wq(struct ib_pd *pd,
+                          struct ib_wq_init_attr *init_attr);
+ int ib_destroy_wq_user(struct ib_wq *wq, struct ib_udata *udata);
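
As an aside: the structural change in this patch is to stop caching a
raw net_device pointer in struct rxe_dev and instead look the device up
per use while holding a reference (rxe_ib_device_get_netdev() followed
by dev_put()), so a concurrently unregistered netdev yields NULL instead
of a dangling pointer. A minimal userspace sketch of that ownership
pattern; struct ndev, get_netdev(), put_netdev() and query_port() are
invented for illustration, and the real code performs the lookup under
proper locking, which this sketch omits:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* A refcounted stand-in for net_device. */
struct ndev {
        int refcnt;
        char name[16];
};

static struct ndev *registered;         /* may be unregistered at any time */

static struct ndev *get_netdev(void)
{
        if (!registered)
                return NULL;            /* device already gone */
        registered->refcnt++;
        return registered;
}

static void put_netdev(struct ndev *n)
{
        if (n && --n->refcnt == 0) {
                printf("freeing %s\n", n->name);
                free(n);
        }
}

/* Look the device up for each use instead of caching a raw pointer. */
static void query_port(void)
{
        struct ndev *n = get_netdev();

        if (!n) {
                printf("query_port: -ENODEV\n");
                return;
        }
        printf("query_port: %s is present\n", n->name);
        put_netdev(n);
}

int main(void)
{
        registered = calloc(1, sizeof(*registered));
        strcpy(registered->name, "veth0");
        registered->refcnt = 1;

        query_port();

        /* Unregister: drop the registry's reference; later callers of
         * query_port() now get NULL instead of a dangling pointer. */
        put_netdev(registered);
        registered = NULL;
        query_port();
        return 0;
}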
diff --git a/queue-6.6/sched-fair-proportional-newidle-balance.patch b/queue-6.6/sched-fair-proportional-newidle-balance.patch
new file mode 100644
index 0000000..943c80b
--- /dev/null
@@ -0,0 +1,206 @@
+From stable+bounces-198201-greg=kroah.com@vger.kernel.org Wed Dec  3 12:41:03 2025
+From: Ajay Kaher <ajay.kaher@broadcom.com>
+Date: Wed,  3 Dec 2025 11:22:55 +0000
+Subject: sched/fair: Proportional newidle balance
+To: stable@vger.kernel.org, gregkh@linuxfoundation.org
+Cc: mingo@redhat.com, peterz@infradead.org, juri.lelli@redhat.com, vincent.guittot@linaro.org, dietmar.eggemann@arm.com, rostedt@goodmis.org, bsegall@google.com, mgorman@suse.de, vschneid@redhat.com, linux-kernel@vger.kernel.org, ajay.kaher@broadcom.com, alexey.makhalov@broadcom.com, yin.ding@broadcom.com, tapas.kundu@broadcom.com, Chris Mason <clm@meta.com>
+Message-ID: <20251203112255.1738272-5-ajay.kaher@broadcom.com>
+
+From: Peter Zijlstra (Intel) <peterz@infradead.org>
+
+commit 33cf66d88306663d16e4759e9d24766b0aaa2e17 upstream.
+
+Add a randomized algorithm that runs newidle balancing proportional to
+its success rate.
+
+This improves schbench significantly:
+
+ 6.18-rc4:                     2.22 Mrps/s
+ 6.18-rc4+revert:              2.04 Mrps/s
+ 6.18-rc4+revert+random:       2.18 Mrps/s
+
+Conversely, per Adam Li this affects SpecJBB slightly, reducing it by 1%:
+
+ 6.17:                 -6%
+ 6.17+revert:           0%
+ 6.17+revert+random:   -1%
+
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Reviewed-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
+Tested-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
+Tested-by: Chris Mason <clm@meta.com>
+Link: https://lkml.kernel.org/r/6825c50d-7fa7-45d8-9b81-c6e7e25738e2@meta.com
+Link: https://patch.msgid.link/20251107161739.770122091@infradead.org
+[ Ajay: Modified to apply on v6.6 ]
+Signed-off-by: Ajay Kaher <ajay.kaher@broadcom.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/sched/topology.h |    3 ++
+ kernel/sched/core.c            |    3 ++
+ kernel/sched/fair.c            |   44 +++++++++++++++++++++++++++++++++++++----
+ kernel/sched/features.h        |    5 ++++
+ kernel/sched/sched.h           |    7 ++++++
+ kernel/sched/topology.c        |    6 +++++
+ 6 files changed, 64 insertions(+), 4 deletions(-)
+
+--- a/include/linux/sched/topology.h
++++ b/include/linux/sched/topology.h
+@@ -106,6 +106,9 @@ struct sched_domain {
+       unsigned int nr_balance_failed; /* initialise to 0 */
+       /* idle_balance() stats */
++      unsigned int newidle_call;
++      unsigned int newidle_success;
++      unsigned int newidle_ratio;
+       u64 max_newidle_lb_cost;
+       unsigned long last_decay_max_lb_cost;
+--- a/kernel/sched/core.c
++++ b/kernel/sched/core.c
+@@ -116,6 +116,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_
+ EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp);
+ DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
++DEFINE_PER_CPU(struct rnd_state, sched_rnd_state);
+ #ifdef CONFIG_SCHED_DEBUG
+ /*
+@@ -9872,6 +9873,8 @@ void __init sched_init_smp(void)
+ {
+       sched_init_numa(NUMA_NO_NODE);
++      prandom_init_once(&sched_rnd_state);
++
+       /*
+        * There's no userspace yet to cause hotplug operations; hence all the
+        * CPU masks are stable and all blatant races in the below code cannot
+--- a/kernel/sched/fair.c
++++ b/kernel/sched/fair.c
+@@ -11716,11 +11716,27 @@ void update_max_interval(void)
+       max_load_balance_interval = HZ*num_online_cpus()/10;
+ }
+-static inline bool update_newidle_cost(struct sched_domain *sd, u64 cost)
++static inline void update_newidle_stats(struct sched_domain *sd, unsigned int success)
++{
++      sd->newidle_call++;
++      sd->newidle_success += success;
++
++      if (sd->newidle_call >= 1024) {
++              sd->newidle_ratio = sd->newidle_success;
++              sd->newidle_call /= 2;
++              sd->newidle_success /= 2;
++      }
++}
++
++static inline bool
++update_newidle_cost(struct sched_domain *sd, u64 cost, unsigned int success)
+ {
+       unsigned long next_decay = sd->last_decay_max_lb_cost + HZ;
+       unsigned long now = jiffies;
++      if (cost)
++              update_newidle_stats(sd, success);
++
+       if (cost > sd->max_newidle_lb_cost) {
+               /*
+                * Track max cost of a domain to make sure to not delay the
+@@ -11768,7 +11784,7 @@ static void rebalance_domains(struct rq
+                * Decay the newidle max times here because this is a regular
+                * visit to all the domains.
+                */
+-              need_decay = update_newidle_cost(sd, 0);
++              need_decay = update_newidle_cost(sd, 0, 0);
+               max_cost += sd->max_newidle_lb_cost;
+               /*
+@@ -12406,6 +12422,22 @@ static int sched_balance_newidle(struct
+                       break;
+               if (sd->flags & SD_BALANCE_NEWIDLE) {
++                      unsigned int weight = 1;
++
++                      if (sched_feat(NI_RANDOM)) {
++                              /*
++                               * Throw a 1k sided dice; and only run
++                               * newidle_balance according to the success
++                               * rate.
++                               */
++                              u32 d1k = sched_rng() % 1024;
++                              weight = 1 + sd->newidle_ratio;
++                              if (d1k > weight) {
++                                      update_newidle_stats(sd, 0);
++                                      continue;
++                              }
++                              weight = (1024 + weight/2) / weight;
++                      }
+                       pulled_task = load_balance(this_cpu, this_rq,
+                                                  sd, CPU_NEWLY_IDLE,
+@@ -12413,10 +12445,14 @@ static int sched_balance_newidle(struct
+                       t1 = sched_clock_cpu(this_cpu);
+                       domain_cost = t1 - t0;
+-                      update_newidle_cost(sd, domain_cost);
+-
+                       curr_cost += domain_cost;
+                       t0 = t1;
++
++                      /*
++                       * Track max cost of a domain to make sure to not delay the
++                       * next wakeup on the CPU.
++                       */
++                      update_newidle_cost(sd, domain_cost, weight * !!pulled_task);
+               }
+               /*
+--- a/kernel/sched/features.h
++++ b/kernel/sched/features.h
+@@ -88,4 +88,9 @@ SCHED_FEAT(UTIL_EST_FASTUP, true)
+ SCHED_FEAT(LATENCY_WARN, false)
++/*
++ * Do newidle balancing proportional to its success rate using randomization.
++ */
++SCHED_FEAT(NI_RANDOM, true)
++
+ SCHED_FEAT(HZ_BW, true)
+--- a/kernel/sched/sched.h
++++ b/kernel/sched/sched.h
+@@ -5,6 +5,7 @@
+ #ifndef _KERNEL_SCHED_SCHED_H
+ #define _KERNEL_SCHED_SCHED_H
++#include <linux/prandom.h>
+ #include <linux/sched/affinity.h>
+ #include <linux/sched/autogroup.h>
+ #include <linux/sched/cpufreq.h>
+@@ -1205,6 +1206,12 @@ static inline bool is_migration_disabled
+ }
+ DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
++DECLARE_PER_CPU(struct rnd_state, sched_rnd_state);
++
++static inline u32 sched_rng(void)
++{
++      return prandom_u32_state(this_cpu_ptr(&sched_rnd_state));
++}
+ #define cpu_rq(cpu)           (&per_cpu(runqueues, (cpu)))
+ #define this_rq()             this_cpu_ptr(&runqueues)
+--- a/kernel/sched/topology.c
++++ b/kernel/sched/topology.c
+@@ -1600,6 +1600,12 @@ sd_init(struct sched_domain_topology_lev
+               .last_balance           = jiffies,
+               .balance_interval       = sd_weight,
++
++              /* 50% success rate */
++              .newidle_call           = 512,
++              .newidle_success        = 256,
++              .newidle_ratio          = 512,
++
+               .max_newidle_lb_cost    = 0,
+               .last_decay_max_lb_cost = jiffies,
+               .child                  = child,
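
As an aside: the core of the patch is the randomized gate added to
sched_balance_newidle(): keep a success ratio out of roughly 1024 recent
calls, attempt the (possibly expensive) balance only with that
probability, and weight recorded successes by the inverse of that
probability so the ratio keeps estimating successes per call. A
standalone sketch of that gating logic in plain C; struct stats,
maybe_balance() and the 20% toy success rate are inventions of this
sketch, and the hunks above are the authoritative kernel version:

#include <stdio.h>
#include <stdlib.h>

/* Rough analogue of newidle_call/newidle_success/newidle_ratio. */
struct stats {
        unsigned int calls;
        unsigned int success;
        unsigned int ratio;             /* ~successes per 1024 calls */
};

static void update_stats(struct stats *s, unsigned int success)
{
        s->calls++;
        s->success += success;

        if (s->calls >= 1024) {         /* fold the sliding window */
                s->ratio = s->success;
                s->calls /= 2;
                s->success /= 2;
        }
}

/* Returns 1 if the balance attempt was actually made. */
static int maybe_balance(struct stats *s, int (*balance_fn)(void))
{
        unsigned int weight = 1 + s->ratio;
        unsigned int d1k = rand() % 1024;       /* the "1k sided dice" */

        if (d1k > weight) {
                update_stats(s, 0);             /* skipped: record a miss */
                return 0;
        }

        /* Ran it: scale the recorded success by ~1/probability so the
         * ratio still estimates successes per call. */
        weight = (1024 + weight / 2) / weight;
        update_stats(s, weight * !!balance_fn());
        return 1;
}

static int toy_balance(void)
{
        return rand() % 100 < 20;       /* pretend 20% of attempts pull a task */
}

int main(void)
{
        /* Start from an assumed 50% success rate, as sd_init() does above. */
        struct stats s = { .calls = 512, .success = 256, .ratio = 512 };
        int i, attempted = 0;

        for (i = 0; i < 100000; i++)
                attempted += maybe_balance(&s, toy_balance);

        printf("attempted %d of 100000, ratio now %u/1024\n",
               attempted, s.ratio);
        return 0;
}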
diff --git a/queue-6.6/sched-fair-small-cleanup-to-sched_balance_newidle.patch b/queue-6.6/sched-fair-small-cleanup-to-sched_balance_newidle.patch
new file mode 100644
index 0000000..50ca624
--- /dev/null
@@ -0,0 +1,49 @@
+From stable+bounces-198199-greg=kroah.com@vger.kernel.org Wed Dec  3 12:40:53 2025
+From: Ajay Kaher <ajay.kaher@broadcom.com>
+Date: Wed,  3 Dec 2025 11:22:53 +0000
+Subject: sched/fair: Small cleanup to sched_balance_newidle()
+To: stable@vger.kernel.org, gregkh@linuxfoundation.org
+Cc: mingo@redhat.com, peterz@infradead.org, juri.lelli@redhat.com, vincent.guittot@linaro.org, dietmar.eggemann@arm.com, rostedt@goodmis.org, bsegall@google.com, mgorman@suse.de, vschneid@redhat.com, linux-kernel@vger.kernel.org, ajay.kaher@broadcom.com, alexey.makhalov@broadcom.com, yin.ding@broadcom.com, tapas.kundu@broadcom.com, Chris Mason <clm@meta.com>
+Message-ID: <20251203112255.1738272-3-ajay.kaher@broadcom.com>
+
+From: Peter Zijlstra <peterz@infradead.org>
+
+commit e78e70dbf603c1425f15f32b455ca148c932f6c1 upstream.
+
+Pull out the !sd check to simplify code.
+
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Reviewed-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
+Tested-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
+Tested-by: Chris Mason <clm@meta.com>
+Link: https://patch.msgid.link/20251107161739.525916173@infradead.org
+[ Ajay: Modified to apply on v6.6 ]
+Signed-off-by: Ajay Kaher <ajay.kaher@broadcom.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/sched/fair.c |   11 ++++++-----
+ 1 file changed, 6 insertions(+), 5 deletions(-)
+
+--- a/kernel/sched/fair.c
++++ b/kernel/sched/fair.c
+@@ -12374,14 +12374,15 @@ static int sched_balance_newidle(struct
+       rcu_read_lock();
+       sd = rcu_dereference_check_sched_domain(this_rq->sd);
++      if (!sd) {
++              rcu_read_unlock();
++              goto out;
++      }
+       if (!READ_ONCE(this_rq->rd->overload) ||
+-          (sd && this_rq->avg_idle < sd->max_newidle_lb_cost)) {
+-
+-              if (sd)
+-                      update_next_balance(sd, &next_balance);
++          this_rq->avg_idle < sd->max_newidle_lb_cost) {
++              update_next_balance(sd, &next_balance);
+               rcu_read_unlock();
+-
+               goto out;
+       }
+       rcu_read_unlock();
diff --git a/queue-6.6/sched-fair-small-cleanup-to-update_newidle_cost.patch b/queue-6.6/sched-fair-small-cleanup-to-update_newidle_cost.patch
new file mode 100644
index 0000000..e82a34c
--- /dev/null
@@ -0,0 +1,58 @@
+From stable+bounces-198200-greg=kroah.com@vger.kernel.org Wed Dec  3 12:40:49 2025
+From: Ajay Kaher <ajay.kaher@broadcom.com>
+Date: Wed,  3 Dec 2025 11:22:54 +0000
+Subject: sched/fair: Small cleanup to update_newidle_cost()
+To: stable@vger.kernel.org, gregkh@linuxfoundation.org
+Cc: mingo@redhat.com, peterz@infradead.org, juri.lelli@redhat.com, vincent.guittot@linaro.org, dietmar.eggemann@arm.com, rostedt@goodmis.org, bsegall@google.com, mgorman@suse.de, vschneid@redhat.com, linux-kernel@vger.kernel.org, ajay.kaher@broadcom.com, alexey.makhalov@broadcom.com, yin.ding@broadcom.com, tapas.kundu@broadcom.com, Chris Mason <clm@meta.com>
+Message-ID: <20251203112255.1738272-4-ajay.kaher@broadcom.com>
+
+From: Peter Zijlstra <peterz@infradead.org>
+
+commit 08d473dd8718e4a4d698b1113a14a40ad64a909b upstream.
+
+Simplify code by adding a few variables.
+
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Reviewed-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
+Tested-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
+Tested-by: Chris Mason <clm@meta.com>
+Link: https://patch.msgid.link/20251107161739.655208666@infradead.org
+[ Ajay: Modified to apply on v6.6 ]
+Signed-off-by: Ajay Kaher <ajay.kaher@broadcom.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/sched/fair.c |   11 +++++++----
+ 1 file changed, 7 insertions(+), 4 deletions(-)
+
+--- a/kernel/sched/fair.c
++++ b/kernel/sched/fair.c
+@@ -11718,22 +11718,25 @@ void update_max_interval(void)
+ static inline bool update_newidle_cost(struct sched_domain *sd, u64 cost)
+ {
++      unsigned long next_decay = sd->last_decay_max_lb_cost + HZ;
++      unsigned long now = jiffies;
++
+       if (cost > sd->max_newidle_lb_cost) {
+               /*
+                * Track max cost of a domain to make sure to not delay the
+                * next wakeup on the CPU.
+                */
+               sd->max_newidle_lb_cost = cost;
+-              sd->last_decay_max_lb_cost = jiffies;
+-      } else if (time_after(jiffies, sd->last_decay_max_lb_cost + HZ)) {
++              sd->last_decay_max_lb_cost = now;
++
++      } else if (time_after(now, next_decay)) {
+               /*
+                * Decay the newidle max times by ~1% per second to ensure that
+                * it is not outdated and the current max cost is actually
+                * shorter.
+                */
+               sd->max_newidle_lb_cost = (sd->max_newidle_lb_cost * 253) / 256;
+-              sd->last_decay_max_lb_cost = jiffies;
+-
++              sd->last_decay_max_lb_cost = now;
+               return true;
+       }
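
For reference, the decay in the hunk above multiplies max_newidle_lb_cost
by 253/256 = 0.988..., at most once per HZ interval, so each step shaves
roughly 1.2% off the stored maximum; that is the "~1% per second" the
comment refers to. After about a minute of such decays the value has
dropped to (253/256)^60, approximately 0.49 of its peak, assuming no
larger cost is observed in the meantime.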
index 5443c67f8935c9b191dc06109bfa7776511e96d5..2b6e83b3e4746002543c999c98204e59e3cd4b05 100644
@@ -699,3 +699,23 @@ media-amphion-add-a-frame-flush-mode-for-decoder.patch
 media-amphion-make-some-vpu_v4l2-functions-static.patch
 media-amphion-remove-vpu_vb_is_codecconfig.patch
 media-mediatek-vcodec-use-spinlock-for-context-list-protection-lock.patch
+kvm-svm-introduce-svm_recalc_lbr_msr_intercepts.patch
+kvm-nsvm-always-recalculate-lbr-msr-intercepts-in-svm_update_lbrv.patch
+kvm-nsvm-fix-and-simplify-lbr-virtualization-handling-with-nested.patch
+kvm-svm-fix-redundant-updates-of-lbr-msr-intercepts.patch
+mm-damon-tests-vaddr-kunit-handle-alloc-failures-in-damon_test_split_evenly_fail.patch
+mm-damon-tests-vaddr-kunit-handle-alloc-failures-on-damon_do_test_apply_three_regions.patch
+rdma-core-fix-kasan-slab-use-after-free-read-in-ib_register_device-problem.patch
+sched-fair-small-cleanup-to-sched_balance_newidle.patch
+sched-fair-small-cleanup-to-update_newidle_cost.patch
+sched-fair-proportional-newidle-balance.patch
+net-remove-conditional-threaded-napi-wakeup-based-on-task-state.patch
+net-allow-to-use-smp-threads-for-backlog-napi.patch
+rdma-rxe-remove-the-direct-link-to-net_device.patch
+rdma-rxe-fix-the-failure-of-ibv_query_device-and-ibv_query_device_ex-tests.patch
+mm-damon-tests-vaddr-kunit-handle-alloc-failures-on-damon_test_split_evenly_succ.patch
+mm-damon-tests-core-kunit-handle-alloc-failres-in-damon_test_new_filter.patch
+mm-damon-tests-core-kunit-handle-allocation-failures-in-damon_test_regions.patch
+mm-damon-tests-core-kunit-handle-alloc-failures-on-damon_test_split_at.patch
+mm-damon-tests-core-kunit-handle-alloc-failures-on-dasmon_test_merge_regions_of.patch
+mm-damon-tests-core-kunit-handle-alloc-failures-on-damon_test_merge_two.patch