git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
6.1-stable patches
author Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Fri, 21 Nov 2025 10:15:12 +0000 (11:15 +0100)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Fri, 21 Nov 2025 10:15:12 +0000 (11:15 +0100)
added patches:
eventpoll-replace-rwlock-with-spinlock.patch
iommufd-don-t-overflow-during-division-for-dirty-tracking.patch
kvm-svm-mark-vmcb_lbr-dirty-when-msr_ia32_debugctlmsr-is-updated.patch
mm-percpu-do-not-consider-sleepable-allocations-atomic.patch
net-netpoll-fix-incorrect-refcount-handling-causing-incorrect-cleanup.patch
scsi-ufs-core-add-a-quirk-to-suppress-link_startup_again.patch
scsi-ufs-ufs-pci-set-ufshcd_quirk_perform_link_startup_once-for-intel-adl.patch
virtio-net-fix-received-length-check-in-big-packets.patch

queue-6.1/eventpoll-replace-rwlock-with-spinlock.patch [new file with mode: 0644]
queue-6.1/iommufd-don-t-overflow-during-division-for-dirty-tracking.patch [new file with mode: 0644]
queue-6.1/kvm-svm-mark-vmcb_lbr-dirty-when-msr_ia32_debugctlmsr-is-updated.patch [new file with mode: 0644]
queue-6.1/mm-percpu-do-not-consider-sleepable-allocations-atomic.patch [new file with mode: 0644]
queue-6.1/net-netpoll-fix-incorrect-refcount-handling-causing-incorrect-cleanup.patch [new file with mode: 0644]
queue-6.1/scsi-ufs-core-add-a-quirk-to-suppress-link_startup_again.patch [new file with mode: 0644]
queue-6.1/scsi-ufs-ufs-pci-set-ufshcd_quirk_perform_link_startup_once-for-intel-adl.patch [new file with mode: 0644]
queue-6.1/series
queue-6.1/virtio-net-fix-received-length-check-in-big-packets.patch [new file with mode: 0644]

diff --git a/queue-6.1/eventpoll-replace-rwlock-with-spinlock.patch b/queue-6.1/eventpoll-replace-rwlock-with-spinlock.patch
new file mode 100644 (file)
index 0000000..e7b6a87
--- /dev/null
@@ -0,0 +1,393 @@
+From stable+bounces-195475-greg=kroah.com@vger.kernel.org Fri Nov 21 09:15:53 2025
+From: Florian Bezdeka <florian.bezdeka@siemens.com>
+Date: Fri, 21 Nov 2025 09:12:42 +0100
+Subject: eventpoll: Replace rwlock with spinlock
+To: stable@vger.kernel.org
+Cc: Nam Cao <namcao@linutronix.de>, K Prateek Nayak <kprateek.nayak@amd.com>, Frederic Weisbecker <frederic@kernel.org>, Valentin Schneider <vschneid@redhat.com>, Christian Brauner <brauner@kernel.org>, Florian Bezdeka <florian.bezdeka@siemens.com>
+Message-ID: <20251121081242.3296022-1-florian.bezdeka@siemens.com>
+
+From: Nam Cao <namcao@linutronix.de>
+
+[ Upstream commit 0c43094f8cc9d3d99d835c0ac9c4fe1ccc62babd ]
+
+The ready event list of an epoll object is protected by a read-write
+semaphore:
+
+  - The consumer (waiter) acquires the write lock and takes items.
+  - The producer (waker) takes the read lock and adds items.
+
+The point of this design is to let epoll scale well with a large number
+of producers, as multiple producers can hold the read lock at the same
+time.
+
+Unfortunately, this implementation may cause a scheduling priority
+inversion problem. Suppose the consumer has a higher scheduling priority
+than the producer. The consumer needs to acquire the write lock, but may be
+blocked by the producer holding the read lock. Since the read-write
+semaphore does not support priority boosting for the readers (even with
+CONFIG_PREEMPT_RT=y), we have a case of priority inversion: a
+higher-priority consumer is blocked by a lower-priority producer. This
+problem was reported in [1].
+
+Furthermore, this could also cause a stall problem, as described in [2].
+
+Fix this problem by replacing rwlock with spinlock.
+
+This reduces the event bandwidth, as the producers now have to contend with
+each other for the spinlock. According to the benchmark from
+https://github.com/rouming/test-tools/blob/master/stress-epoll.c:
+
+    On 12 x86 CPUs:
+                  Before     After        Diff
+        threads  events/ms  events/ms
+              8       7162       4956     -31%
+             16       8733       5383     -38%
+             32       7968       5572     -30%
+             64      10652       5739     -46%
+            128      11236       5931     -47%
+
+    On 4 riscv CPUs:
+                  Before     After        Diff
+        threads  events/ms  events/ms
+              8       2958       2833      -4%
+             16       3323       3097      -7%
+             32       3451       3240      -6%
+             64       3554       3178     -11%
+            128       3601       3235     -10%
+
+Although the numbers look bad, it should be noted that this benchmark
+creates multiple threads that do nothing except constantly generate new
+epoll events, so contention on the spinlock is high. For real workloads,
+the event rate is likely much lower, and the performance drop is not as
+bad.
+
+Using another benchmark (perf bench epoll wait) where spinlock contention
+is lower, an improvement is even observed on x86:
+
+    On 12 x86 CPUs:
+        Before: Averaged 110279 operations/sec (+- 1.09%), total secs = 8
+        After:  Averaged 114577 operations/sec (+- 2.25%), total secs = 8
+
+    On 4 riscv CPUs:
+        Before: Averaged 175767 operations/sec (+- 0.62%), total secs = 8
+        After:  Averaged 167396 operations/sec (+- 0.23%), total secs = 8
+
+In conclusion, no one is likely to be upset over this change. After all, a
+spinlock was used originally for years, and the commit that converted to
+rwlock didn't mention a real workload, just that the benchmark numbers are
+nice.
+
+This patch is not exactly a revert of commit a218cc491420 ("epoll: use
+rwlock in order to reduce ep_poll_callback() contention"), because a git
+revert conflicts in some places where the resolution is not obvious.
+This patch is intended to be backported, therefore go with the obvious
+approach:
+
+  - Replace rwlock_t with spinlock_t one to one
+
+  - Delete list_add_tail_lockless() and chain_epi_lockless(). These were
+    introduced to allow producers to concurrently add items to the list.
+    But now that spinlock no longer allows producers to touch the event
+    list concurrently, these two functions are not necessary anymore.
+
+Fixes: a218cc491420 ("epoll: use rwlock in order to reduce ep_poll_callback() contention")
+Signed-off-by: Nam Cao <namcao@linutronix.de>
+Link: https://lore.kernel.org/ec92458ea357ec503c737ead0f10b2c6e4c37d47.1752581388.git.namcao@linutronix.de
+Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
+Cc: stable@vger.kernel.org
+Reported-by: Frederic Weisbecker <frederic@kernel.org>
+Closes: https://lore.kernel.org/linux-rt-users/20210825132754.GA895675@lothringen/ [1]
+Reported-by: Valentin Schneider <vschneid@redhat.com>
+Closes: https://lore.kernel.org/linux-rt-users/xhsmhttqvnall.mognet@vschneid.remote.csb/ [2]
+Signed-off-by: Christian Brauner <brauner@kernel.org>
+Signed-off-by: Florian Bezdeka <florian.bezdeka@siemens.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/eventpoll.c |  139 ++++++++++-----------------------------------------------
+ 1 file changed, 26 insertions(+), 113 deletions(-)
+
+--- a/fs/eventpoll.c
++++ b/fs/eventpoll.c
+@@ -45,10 +45,10 @@
+  *
+  * 1) epmutex (mutex)
+  * 2) ep->mtx (mutex)
+- * 3) ep->lock (rwlock)
++ * 3) ep->lock (spinlock)
+  *
+  * The acquire order is the one listed above, from 1 to 3.
+- * We need a rwlock (ep->lock) because we manipulate objects
++ * We need a spinlock (ep->lock) because we manipulate objects
+  * from inside the poll callback, that might be triggered from
+  * a wake_up() that in turn might be called from IRQ context.
+  * So we can't sleep inside the poll callback and hence we need
+@@ -193,7 +193,7 @@ struct eventpoll {
+       struct list_head rdllist;
+       /* Lock which protects rdllist and ovflist */
+-      rwlock_t lock;
++      spinlock_t lock;
+       /* RB tree root used to store monitored fd structs */
+       struct rb_root_cached rbr;
+@@ -621,10 +621,10 @@ static void ep_start_scan(struct eventpo
+        * in a lockless way.
+        */
+       lockdep_assert_irqs_enabled();
+-      write_lock_irq(&ep->lock);
++      spin_lock_irq(&ep->lock);
+       list_splice_init(&ep->rdllist, txlist);
+       WRITE_ONCE(ep->ovflist, NULL);
+-      write_unlock_irq(&ep->lock);
++      spin_unlock_irq(&ep->lock);
+ }
+ static void ep_done_scan(struct eventpoll *ep,
+@@ -632,7 +632,7 @@ static void ep_done_scan(struct eventpol
+ {
+       struct epitem *epi, *nepi;
+-      write_lock_irq(&ep->lock);
++      spin_lock_irq(&ep->lock);
+       /*
+        * During the time we spent inside the "sproc" callback, some
+        * other events might have been queued by the poll callback.
+@@ -673,7 +673,7 @@ static void ep_done_scan(struct eventpol
+                       wake_up(&ep->wq);
+       }
+-      write_unlock_irq(&ep->lock);
++      spin_unlock_irq(&ep->lock);
+ }
+ static void epi_rcu_free(struct rcu_head *head)
+@@ -719,10 +719,10 @@ static int ep_remove(struct eventpoll *e
+       rb_erase_cached(&epi->rbn, &ep->rbr);
+-      write_lock_irq(&ep->lock);
++      spin_lock_irq(&ep->lock);
+       if (ep_is_linked(epi))
+               list_del_init(&epi->rdllink);
+-      write_unlock_irq(&ep->lock);
++      spin_unlock_irq(&ep->lock);
+       wakeup_source_unregister(ep_wakeup_source(epi));
+       /*
+@@ -986,7 +986,7 @@ static int ep_alloc(struct eventpoll **p
+               goto free_uid;
+       mutex_init(&ep->mtx);
+-      rwlock_init(&ep->lock);
++      spin_lock_init(&ep->lock);
+       init_waitqueue_head(&ep->wq);
+       init_waitqueue_head(&ep->poll_wait);
+       INIT_LIST_HEAD(&ep->rdllist);
+@@ -1077,99 +1077,9 @@ struct file *get_epoll_tfile_raw_ptr(str
+ #endif /* CONFIG_KCMP */
+ /*
+- * Adds a new entry to the tail of the list in a lockless way, i.e.
+- * multiple CPUs are allowed to call this function concurrently.
+- *
+- * Beware: it is necessary to prevent any other modifications of the
+- *         existing list until all changes are completed, in other words
+- *         concurrent list_add_tail_lockless() calls should be protected
+- *         with a read lock, where write lock acts as a barrier which
+- *         makes sure all list_add_tail_lockless() calls are fully
+- *         completed.
+- *
+- *        Also an element can be locklessly added to the list only in one
+- *        direction i.e. either to the tail or to the head, otherwise
+- *        concurrent access will corrupt the list.
+- *
+- * Return: %false if element has been already added to the list, %true
+- * otherwise.
+- */
+-static inline bool list_add_tail_lockless(struct list_head *new,
+-                                        struct list_head *head)
+-{
+-      struct list_head *prev;
+-
+-      /*
+-       * This is simple 'new->next = head' operation, but cmpxchg()
+-       * is used in order to detect that same element has been just
+-       * added to the list from another CPU: the winner observes
+-       * new->next == new.
+-       */
+-      if (!try_cmpxchg(&new->next, &new, head))
+-              return false;
+-
+-      /*
+-       * Initially ->next of a new element must be updated with the head
+-       * (we are inserting to the tail) and only then pointers are atomically
+-       * exchanged.  XCHG guarantees memory ordering, thus ->next should be
+-       * updated before pointers are actually swapped and pointers are
+-       * swapped before prev->next is updated.
+-       */
+-
+-      prev = xchg(&head->prev, new);
+-
+-      /*
+-       * It is safe to modify prev->next and new->prev, because a new element
+-       * is added only to the tail and new->next is updated before XCHG.
+-       */
+-
+-      prev->next = new;
+-      new->prev = prev;
+-
+-      return true;
+-}
+-
+-/*
+- * Chains a new epi entry to the tail of the ep->ovflist in a lockless way,
+- * i.e. multiple CPUs are allowed to call this function concurrently.
+- *
+- * Return: %false if epi element has been already chained, %true otherwise.
+- */
+-static inline bool chain_epi_lockless(struct epitem *epi)
+-{
+-      struct eventpoll *ep = epi->ep;
+-
+-      /* Fast preliminary check */
+-      if (epi->next != EP_UNACTIVE_PTR)
+-              return false;
+-
+-      /* Check that the same epi has not been just chained from another CPU */
+-      if (cmpxchg(&epi->next, EP_UNACTIVE_PTR, NULL) != EP_UNACTIVE_PTR)
+-              return false;
+-
+-      /* Atomically exchange tail */
+-      epi->next = xchg(&ep->ovflist, epi);
+-
+-      return true;
+-}
+-
+-/*
+  * This is the callback that is passed to the wait queue wakeup
+  * mechanism. It is called by the stored file descriptors when they
+  * have events to report.
+- *
+- * This callback takes a read lock in order not to contend with concurrent
+- * events from another file descriptor, thus all modifications to ->rdllist
+- * or ->ovflist are lockless.  Read lock is paired with the write lock from
+- * ep_scan_ready_list(), which stops all list modifications and guarantees
+- * that lists state is seen correctly.
+- *
+- * Another thing worth to mention is that ep_poll_callback() can be called
+- * concurrently for the same @epi from different CPUs if poll table was inited
+- * with several wait queues entries.  Plural wakeup from different CPUs of a
+- * single wait queue is serialized by wq.lock, but the case when multiple wait
+- * queues are used should be detected accordingly.  This is detected using
+- * cmpxchg() operation.
+  */
+ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
+ {
+@@ -1180,7 +1090,7 @@ static int ep_poll_callback(wait_queue_e
+       unsigned long flags;
+       int ewake = 0;
+-      read_lock_irqsave(&ep->lock, flags);
++      spin_lock_irqsave(&ep->lock, flags);
+       ep_set_busy_poll_napi_id(epi);
+@@ -1209,12 +1119,15 @@ static int ep_poll_callback(wait_queue_e
+        * chained in ep->ovflist and requeued later on.
+        */
+       if (READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR) {
+-              if (chain_epi_lockless(epi))
++              if (epi->next == EP_UNACTIVE_PTR) {
++                      epi->next = READ_ONCE(ep->ovflist);
++                      WRITE_ONCE(ep->ovflist, epi);
+                       ep_pm_stay_awake_rcu(epi);
++              }
+       } else if (!ep_is_linked(epi)) {
+               /* In the usual case, add event to ready list. */
+-              if (list_add_tail_lockless(&epi->rdllink, &ep->rdllist))
+-                      ep_pm_stay_awake_rcu(epi);
++              list_add_tail(&epi->rdllink, &ep->rdllist);
++              ep_pm_stay_awake_rcu(epi);
+       }
+       /*
+@@ -1247,7 +1160,7 @@ static int ep_poll_callback(wait_queue_e
+               pwake++;
+ out_unlock:
+-      read_unlock_irqrestore(&ep->lock, flags);
++      spin_unlock_irqrestore(&ep->lock, flags);
+       /* We have to call this outside the lock */
+       if (pwake)
+@@ -1576,7 +1489,7 @@ static int ep_insert(struct eventpoll *e
+       }
+       /* We have to drop the new item inside our item list to keep track of it */
+-      write_lock_irq(&ep->lock);
++      spin_lock_irq(&ep->lock);
+       /* record NAPI ID of new item if present */
+       ep_set_busy_poll_napi_id(epi);
+@@ -1593,7 +1506,7 @@ static int ep_insert(struct eventpoll *e
+                       pwake++;
+       }
+-      write_unlock_irq(&ep->lock);
++      spin_unlock_irq(&ep->lock);
+       /* We have to call this outside the lock */
+       if (pwake)
+@@ -1657,7 +1570,7 @@ static int ep_modify(struct eventpoll *e
+        * list, push it inside.
+        */
+       if (ep_item_poll(epi, &pt, 1)) {
+-              write_lock_irq(&ep->lock);
++              spin_lock_irq(&ep->lock);
+               if (!ep_is_linked(epi)) {
+                       list_add_tail(&epi->rdllink, &ep->rdllist);
+                       ep_pm_stay_awake(epi);
+@@ -1668,7 +1581,7 @@ static int ep_modify(struct eventpoll *e
+                       if (waitqueue_active(&ep->poll_wait))
+                               pwake++;
+               }
+-              write_unlock_irq(&ep->lock);
++              spin_unlock_irq(&ep->lock);
+       }
+       /* We have to call this outside the lock */
+@@ -1901,7 +1814,7 @@ static int ep_poll(struct eventpoll *ep,
+               init_wait(&wait);
+               wait.func = ep_autoremove_wake_function;
+-              write_lock_irq(&ep->lock);
++              spin_lock_irq(&ep->lock);
+               /*
+                * Barrierless variant, waitqueue_active() is called under
+                * the same lock on wakeup ep_poll_callback() side, so it
+@@ -1920,7 +1833,7 @@ static int ep_poll(struct eventpoll *ep,
+               if (!eavail)
+                       __add_wait_queue_exclusive(&ep->wq, &wait);
+-              write_unlock_irq(&ep->lock);
++              spin_unlock_irq(&ep->lock);
+               if (!eavail)
+                       timed_out = !schedule_hrtimeout_range(to, slack,
+@@ -1935,7 +1848,7 @@ static int ep_poll(struct eventpoll *ep,
+               eavail = 1;
+               if (!list_empty_careful(&wait.entry)) {
+-                      write_lock_irq(&ep->lock);
++                      spin_lock_irq(&ep->lock);
+                       /*
+                        * If the thread timed out and is not on the wait queue,
+                        * it means that the thread was woken up after its
+@@ -1946,7 +1859,7 @@ static int ep_poll(struct eventpoll *ep,
+                       if (timed_out)
+                               eavail = list_empty(&wait.entry);
+                       __remove_wait_queue(&ep->wq, &wait);
+-                      write_unlock_irq(&ep->lock);
++                      spin_unlock_irq(&ep->lock);
+               }
+       }
+ }
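
For reference, below is a minimal userspace sketch (not part of the patch) of the
kind of producer/consumer workload the benchmark in the commit message above
exercises: several threads write to eventfds (each write ends up in the kernel's
ep_poll_callback(), the producer side) while one thread drains them with
epoll_wait() (the consumer side that now takes ep->lock as a spinlock). The
thread count and iteration totals are arbitrary; build with -pthread.

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/epoll.h>
#include <sys/eventfd.h>
#include <unistd.h>

#define NPROD       8        /* number of producer threads (arbitrary) */
#define EVENTS_EACH 100000L  /* events generated per producer (arbitrary) */

static int epfd;
static int efds[NPROD];

/* Producer: every write to the eventfd wakes the epoll waiter. */
static void *producer(void *arg)
{
    int fd = *(int *)arg;
    uint64_t one = 1;

    for (long i = 0; i < EVENTS_EACH; i++)
        if (write(fd, &one, sizeof(one)) < 0)
            break;
    return NULL;
}

int main(void)
{
    pthread_t tids[NPROD];
    struct epoll_event ev = { .events = EPOLLIN };
    long handled = 0;

    epfd = epoll_create1(0);
    for (int i = 0; i < NPROD; i++) {
        efds[i] = eventfd(0, EFD_NONBLOCK);
        ev.data.fd = efds[i];
        epoll_ctl(epfd, EPOLL_CTL_ADD, efds[i], &ev);
        pthread_create(&tids[i], NULL, producer, &efds[i]);
    }

    /* Consumer: epoll_wait() plus the eventfd read drain the ready list. */
    while (handled < NPROD * EVENTS_EACH) {
        struct epoll_event out[64];
        int n = epoll_wait(epfd, out, 64, 100);

        if (n <= 0)
            break;                      /* timeout or error: stop the demo */
        for (int i = 0; i < n; i++) {
            uint64_t cnt;

            if (read(out[i].data.fd, &cnt, sizeof(cnt)) > 0)
                handled += (long)cnt;
        }
    }
    for (int i = 0; i < NPROD; i++)
        pthread_join(tids[i], NULL);
    printf("handled %ld events\n", handled);
    return 0;
}
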
diff --git a/queue-6.1/iommufd-don-t-overflow-during-division-for-dirty-tracking.patch b/queue-6.1/iommufd-don-t-overflow-during-division-for-dirty-tracking.patch
new file mode 100644 (file)
index 0000000..ddcccf1
--- /dev/null
@@ -0,0 +1,48 @@
+From stable+bounces-192870-greg=kroah.com@vger.kernel.org Mon Nov 10 00:27:54 2025
+From: Sasha Levin <sashal@kernel.org>
+Date: Sun,  9 Nov 2025 18:27:45 -0500
+Subject: iommufd: Don't overflow during division for dirty tracking
+To: stable@vger.kernel.org
+Cc: Jason Gunthorpe <jgg@ziepe.ca>, Joao Martins <joao.m.martins@oracle.com>, Nicolin Chen <nicolinc@nvidia.com>, Kevin Tian <kevin.tian@intel.com>, syzbot+093a8a8b859472e6c257@syzkaller.appspotmail.com, Jason Gunthorpe <jgg@nvidia.com>, Sasha Levin <sashal@kernel.org>
+Message-ID: <20251109232745.533736-1-sashal@kernel.org>
+
+From: Jason Gunthorpe <jgg@ziepe.ca>
+
+[ Upstream commit cb30dfa75d55eced379a42fd67bd5fb7ec38555e ]
+
+If pgshift is 63, then BITS_PER_TYPE(*bitmap->bitmap) * pgsize will
+overflow to 0, and this triggers a divide by 0.
+
+In this case the index should just be 0, so reorganize things to shift
+first and then divide, avoiding the overflow entirely.
+
+Link: https://patch.msgid.link/r/0-v1-663679b57226+172-iommufd_dirty_div0_jgg@nvidia.com
+Cc: stable@vger.kernel.org
+Fixes: 58ccf0190d19 ("vfio: Add an IOVA bitmap support")
+Reviewed-by: Joao Martins <joao.m.martins@oracle.com>
+Reviewed-by: Nicolin Chen <nicolinc@nvidia.com>
+Reviewed-by: Kevin Tian <kevin.tian@intel.com>
+Reported-by: syzbot+093a8a8b859472e6c257@syzkaller.appspotmail.com
+Closes: https://syzkaller.appspot.com/bug?extid=093a8a8b859472e6c257
+Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
+[ drivers/iommu/iommufd/iova_bitmap.c => drivers/vfio/iova_bitmap.c ]
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/vfio/iova_bitmap.c |    5 ++---
+ 1 file changed, 2 insertions(+), 3 deletions(-)
+
+--- a/drivers/vfio/iova_bitmap.c
++++ b/drivers/vfio/iova_bitmap.c
+@@ -126,9 +126,8 @@ struct iova_bitmap {
+ static unsigned long iova_bitmap_offset_to_index(struct iova_bitmap *bitmap,
+                                                unsigned long iova)
+ {
+-      unsigned long pgsize = 1UL << bitmap->mapped.pgshift;
+-
+-      return iova / (BITS_PER_TYPE(*bitmap->bitmap) * pgsize);
++      return (iova >> bitmap->mapped.pgshift) /
++             BITS_PER_TYPE(*bitmap->bitmap);
+ }
+ /*
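
A standalone sketch (not the kernel code) of the arithmetic the fix above relies
on: with pgshift == 63 the page size is 1UL << 63, and multiplying it by the 64
bits per bitmap word wraps to 0 in 64-bit arithmetic, so the old divisor becomes
0; shifting the IOVA down first keeps the divisor a constant. The names below are
illustrative only.

#include <stdint.h>
#include <stdio.h>

#define BITS_PER_WORD 64ULL   /* stands in for BITS_PER_TYPE(u64) */

/* Post-fix form: shift the IOVA down first, so the divisor never becomes 0. */
static uint64_t index_fixed(uint64_t iova, unsigned int pgshift)
{
    return (iova >> pgshift) / BITS_PER_WORD;
}

int main(void)
{
    unsigned int pgshift = 63;            /* the problematic value */
    uint64_t pgsize = 1ULL << pgshift;

    /* Pre-fix divisor: 64 * 2^63 wraps to 0 in 64-bit arithmetic. */
    uint64_t old_divisor = BITS_PER_WORD * pgsize;

    printf("old divisor = %llu (dividing by this traps)\n",
           (unsigned long long)old_divisor);
    printf("fixed index = %llu\n",
           (unsigned long long)index_fixed(UINT64_MAX, pgshift));
    return 0;
}
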
diff --git a/queue-6.1/kvm-svm-mark-vmcb_lbr-dirty-when-msr_ia32_debugctlmsr-is-updated.patch b/queue-6.1/kvm-svm-mark-vmcb_lbr-dirty-when-msr_ia32_debugctlmsr-is-updated.patch
new file mode 100644 (file)
index 0000000..11d3412
--- /dev/null
@@ -0,0 +1,60 @@
+From stable+bounces-195377-greg=kroah.com@vger.kernel.org Thu Nov 20 19:07:04 2025
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 20 Nov 2025 13:06:55 -0500
+Subject: KVM: SVM: Mark VMCB_LBR dirty when MSR_IA32_DEBUGCTLMSR is updated
+To: stable@vger.kernel.org
+Cc: Yosry Ahmed <yosry.ahmed@linux.dev>, Matteo Rizzo <matteorizzo@google.com>, evn@google.com, Jim Mattson <jmattson@google.com>, Paolo Bonzini <pbonzini@redhat.com>, Sasha Levin <sashal@kernel.org>
+Message-ID: <20251120180655.1918545-1-sashal@kernel.org>
+
+From: Yosry Ahmed <yosry.ahmed@linux.dev>
+
+[ Upstream commit dc55b3c3f61246e483e50c85d8d5366f9567e188 ]
+
+The APM lists the DbgCtlMsr field as being tracked by the VMCB_LBR clean
+bit.  Always clear the bit when MSR_IA32_DEBUGCTLMSR is updated.
+
+The history is complicated: the bit was correctly cleared for L1 before
+commit 1d5a1b5860ed ("KVM: x86: nSVM: correctly virtualize LBR msrs when
+L2 is running").  At that point svm_set_msr() started to rely on
+svm_update_lbrv() to clear the bit, but when nested virtualization
+is enabled the latter does not always clear it even if MSR_IA32_DEBUGCTLMSR
+changed. Go back to clearing it directly in svm_set_msr().
+
+Fixes: 1d5a1b5860ed ("KVM: x86: nSVM: correctly virtualize LBR msrs when L2 is running")
+Reported-by: Matteo Rizzo <matteorizzo@google.com>
+Reported-by: evn@google.com
+Co-developed-by: Jim Mattson <jmattson@google.com>
+Signed-off-by: Jim Mattson <jmattson@google.com>
+Signed-off-by: Yosry Ahmed <yosry.ahmed@linux.dev>
+Link: https://patch.msgid.link/20251108004524.1600006-2-yosry.ahmed@linux.dev
+Cc: stable@vger.kernel.org
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+[ Open coded svm_get_lbr_vmcb() call ]
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/svm/svm.c |   10 ++++++++--
+ 1 file changed, 8 insertions(+), 2 deletions(-)
+
+--- a/arch/x86/kvm/svm/svm.c
++++ b/arch/x86/kvm/svm/svm.c
+@@ -3053,11 +3053,17 @@ static int svm_set_msr(struct kvm_vcpu *
+               if (data & DEBUGCTL_RESERVED_BITS)
+                       return 1;
+-              if (svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK)
++              if (svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK) {
++                      if (svm->vmcb->save.dbgctl == data)
++                              break;
+                       svm->vmcb->save.dbgctl = data;
+-              else
++              } else {
++                      if (svm->vmcb01.ptr->save.dbgctl == data)
++                              break;
+                       svm->vmcb01.ptr->save.dbgctl = data;
++              }
++              vmcb_mark_dirty(svm->vmcb, VMCB_LBR);
+               svm_update_lbrv(vcpu);
+               break;
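
An illustrative sketch (not KVM code) of the clean-bit contract the patch above
restores: state covered by a clean bit is only re-read by hardware when that bit
is cleared, so any write that changes the tracked field must also mark the bit
dirty. The struct layout and bit position below are hypothetical.

#include <stdint.h>
#include <stdio.h>

#define BIT_LBR (1u << 0)           /* hypothetical clean-bit position */

struct vmcb_sketch {
    uint64_t dbgctl;                /* field tracked by BIT_LBR */
    uint32_t clean;                 /* set bit => hardware may use its cache */
};

static void set_dbgctl(struct vmcb_sketch *v, uint64_t data)
{
    if (v->dbgctl == data)
        return;                     /* unchanged: may stay clean */
    v->dbgctl = data;
    v->clean &= ~BIT_LBR;           /* changed: force a reload */
}

static void hw_run(struct vmcb_sketch *v, uint64_t *shadow)
{
    if (!(v->clean & BIT_LBR))
        *shadow = v->dbgctl;        /* reload only when marked dirty */
    v->clean |= BIT_LBR;            /* consumed state is clean again */
}

int main(void)
{
    struct vmcb_sketch v = { 0 };
    uint64_t shadow = 0;

    hw_run(&v, &shadow);
    set_dbgctl(&v, 0x1);            /* without the dirty marking, this ... */
    hw_run(&v, &shadow);            /* ... update would never reach shadow */
    printf("shadow dbgctl = %#llx\n", (unsigned long long)shadow);
    return 0;
}
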
diff --git a/queue-6.1/mm-percpu-do-not-consider-sleepable-allocations-atomic.patch b/queue-6.1/mm-percpu-do-not-consider-sleepable-allocations-atomic.patch
new file mode 100644 (file)
index 0000000..cf66568
--- /dev/null
@@ -0,0 +1,88 @@
+From stable+bounces-194937-greg=kroah.com@vger.kernel.org Mon Nov 17 10:02:24 2025
+From: mambaxin@163.com
+Date: Mon, 17 Nov 2025 16:59:22 +0800
+Subject: mm, percpu: do not consider sleepable allocations atomic
+To: dennis@kernel.org, tj@kernel.org, cl@linux.com, akpm@linux-foundation.org
+Cc: linux-mm@kvack.org, linux-kernel@vger.kernel.org, stable@vger.kernel.org, mhocko@suse.com, Vlastimil Babka <vbabka@suse.cz>, Filipe David Manana <fdmanana@suse.com>, chenxin <chenxinxin@xiaomi.com>
+Message-ID: <20251117085922.508060-1-mambaxin@163.com>
+
+From: Michal Hocko <mhocko@suse.com>
+
+[ Upstream commit 9a5b183941b52f84c0f9e5f27ce44e99318c9e0f ]
+
+28307d938fb2 ("percpu: make pcpu_alloc() aware of current gfp context")
+has fixed a reclaim recursion for scoped GFP_NOFS context.  It has done
+that by avoiding taking pcpu_alloc_mutex.  This is a correct solution as
+the worker context with full GFP_KERNEL allocation/reclaim power and which
+is using the same lock cannot block the NOFS pcpu_alloc caller.
+
+On the other hand this is a very conservative approach that could lead to
+failures because pcpu_alloc lockless implementation is quite limited.
+
+We have a bug report about premature failures when a SCSI array of 193
+devices is scanned.  Sometimes (not consistently) the scanning aborts
+because the iscsid daemon fails to create the queue for a random SCSI
+device during the scan.  iscsid itself is running with PR_SET_IO_FLUSHER
+set, so all allocations from this process context are GFP_NOIO.  This in
+turn makes any pcpu_alloc lockless (without pcpu_alloc_mutex), which leads
+to premature failures.
+
+It has turned out that iscsid has worked around this by dropping
+PR_SET_IO_FLUSHER (https://github.com/open-iscsi/open-iscsi/pull/382) when
+scanning the host.  But we can do better in this case on the kernel side
+and use pcpu_alloc_mutex for NOIO and NOFS constrained allocation scopes
+too.  We just need the WQ worker to never trigger IO/FS reclaim.  Achieve
+that by enforcing scoped GFP_NOIO for the whole execution of
+pcpu_balance_workfn (this will imply the NOFS constraint as well).  This
+will remove the dependency chain and preserve the full allocation power of
+the pcpu_alloc call.
+
+While at it, make is_atomic really test for blockable allocations.
+
+Link: https://lkml.kernel.org/r/20250206122633.167896-1-mhocko@kernel.org
+Fixes: 28307d938fb2 ("percpu: make pcpu_alloc() aware of current gfp context")
+Signed-off-by: Michal Hocko <mhocko@suse.com>
+Acked-by: Vlastimil Babka <vbabka@suse.cz>
+Cc: Dennis Zhou <dennis@kernel.org>
+Cc: Filipe David Manana <fdmanana@suse.com>
+Cc: Tejun Heo <tj@kernel.org>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: chenxin <chenxinxin@xiaomi.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/percpu.c |    8 +++++++-
+ 1 file changed, 7 insertions(+), 1 deletion(-)
+
+--- a/mm/percpu.c
++++ b/mm/percpu.c
+@@ -1737,7 +1737,7 @@ static void __percpu *pcpu_alloc(size_t
+       gfp = current_gfp_context(gfp);
+       /* whitelisted flags that can be passed to the backing allocators */
+       pcpu_gfp = gfp & (GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN);
+-      is_atomic = (gfp & GFP_KERNEL) != GFP_KERNEL;
++      is_atomic = !gfpflags_allow_blocking(gfp);
+       do_warn = !(gfp & __GFP_NOWARN);
+       /*
+@@ -2237,7 +2237,12 @@ static void pcpu_balance_workfn(struct w
+        * to grow other chunks.  This then gives pcpu_reclaim_populated() time
+        * to move fully free chunks to the active list to be freed if
+        * appropriate.
++       *
++       * Enforce GFP_NOIO allocations because we have pcpu_alloc users
++       * constrained to GFP_NOIO/NOFS contexts and they could form lock
++       * dependency through pcpu_alloc_mutex
+        */
++      unsigned int flags = memalloc_noio_save();
+       mutex_lock(&pcpu_alloc_mutex);
+       spin_lock_irq(&pcpu_lock);
+@@ -2248,6 +2253,7 @@ static void pcpu_balance_workfn(struct w
+       spin_unlock_irq(&pcpu_lock);
+       mutex_unlock(&pcpu_alloc_mutex);
++      memalloc_noio_restore(flags);
+ }
+ /**
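
A userspace analogue (assumption: plain malloc stands in for the page allocator)
of the scoped allocation-context pattern the patch above applies with
memalloc_noio_save()/memalloc_noio_restore(): a thread-local flag constrains
every allocation made inside the scope, so the worker can hold pcpu_alloc_mutex
without ever forming an IO/FS reclaim dependency.

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

static _Thread_local bool no_io_scope;    /* analogue of the NOIO task flag */

static bool noio_save(void)
{
    bool old = no_io_scope;

    no_io_scope = true;                   /* everything below must avoid IO */
    return old;
}

static void noio_restore(bool old)
{
    no_io_scope = old;
}

static void *alloc_in_scope(size_t sz)
{
    if (no_io_scope) {
        /* Constrained path: must not start IO to reclaim memory. */
        return malloc(sz);
    }
    /* Full-power path: in the kernel this could block on IO-backed reclaim. */
    return malloc(sz);
}

/* Stand-in for pcpu_balance_workfn(): it may take the shared mutex, but only
 * after narrowing its own allocations so it can never wait on IO/FS reclaim. */
static void balance_work(void)
{
    bool old = noio_save();               /* mirrors memalloc_noio_save() */
    void *p = alloc_in_scope(4096);

    free(p);
    noio_restore(old);                    /* mirrors memalloc_noio_restore() */
}

int main(void)
{
    balance_work();
    puts("balance work ran entirely inside a NOIO scope");
    return 0;
}
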
diff --git a/queue-6.1/net-netpoll-fix-incorrect-refcount-handling-causing-incorrect-cleanup.patch b/queue-6.1/net-netpoll-fix-incorrect-refcount-handling-causing-incorrect-cleanup.patch
new file mode 100644 (file)
index 0000000..1e0bc6a
--- /dev/null
@@ -0,0 +1,85 @@
+From stable+bounces-195438-greg=kroah.com@vger.kernel.org Fri Nov 21 02:45:43 2025
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 20 Nov 2025 20:45:33 -0500
+Subject: net: netpoll: fix incorrect refcount handling causing incorrect cleanup
+To: stable@vger.kernel.org
+Cc: Breno Leitao <leitao@debian.org>, Jay Vosburgh <jv@jvosburgh.net>, Simon Horman <horms@kernel.org>, Jakub Kicinski <kuba@kernel.org>, Sasha Levin <sashal@kernel.org>
+Message-ID: <20251121014533.2332732-1-sashal@kernel.org>
+
+From: Breno Leitao <leitao@debian.org>
+
+[ Upstream commit 49c8d2c1f94cc2f4d1a108530d7ba52614b874c2 ]
+
+commit efa95b01da18 ("netpoll: fix use after free") incorrectly
+ignored the refcount and prematurely set dev->npinfo to NULL during
+netpoll cleanup, leading to improper behavior and memory leaks.
+
+Scenario causing lack of proper cleanup:
+
+1) A netpoll is associated with a NIC (e.g., eth0) and netdev->npinfo is
+   allocated, and refcnt = 1
+   - Keep in mind that npinfo is shared among all netpoll instances. In
+     this case, there is just one.
+
+2) Another netpoll is also associated with the same NIC and
+   npinfo->refcnt += 1.
+   - Now dev->npinfo->refcnt = 2;
+   - There is just one npinfo associated to the netdev.
+
+3) When the first netpoll goes to clean up:
+   - The first cleanup succeeds and clears np->dev->npinfo, ignoring
+     refcnt.
+     - It basically calls `RCU_INIT_POINTER(np->dev->npinfo, NULL);`
+   - Set dev->npinfo = NULL, without proper cleanup
+   - No ->ndo_netpoll_cleanup() is either called
+
+4) Now the second target tries to clean up
+   - The second cleanup fails because np->dev->npinfo is already NULL.
+     * In this case, ops->ndo_netpoll_cleanup() was never called, and
+       the skb pool is not cleaned as well (for the second netpoll
+       instance)
+  - This leaks npinfo and skbpool skbs, which is clearly reported by
+    kmemleak.
+
+Revert commit efa95b01da18 ("netpoll: fix use after free") and add
+clarifying comments emphasizing that npinfo cleanup should only happen
+once the refcount reaches zero, ensuring stable and correct netpoll
+behavior.
+
+Cc: <stable@vger.kernel.org> # 3.17.x
+Cc: Jay Vosburgh <jv@jvosburgh.net>
+Fixes: efa95b01da18 ("netpoll: fix use after free")
+Signed-off-by: Breno Leitao <leitao@debian.org>
+Reviewed-by: Simon Horman <horms@kernel.org>
+Link: https://patch.msgid.link/20251107-netconsole_torture-v10-1-749227b55f63@debian.org
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+[ Adjust context ]
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/core/netpoll.c |    7 +++++--
+ 1 file changed, 5 insertions(+), 2 deletions(-)
+
+--- a/net/core/netpoll.c
++++ b/net/core/netpoll.c
+@@ -851,6 +851,10 @@ void __netpoll_cleanup(struct netpoll *n
+       synchronize_srcu(&netpoll_srcu);
++      /* At this point, there is a single npinfo instance per netdevice, and
++       * its refcnt tracks how many netpoll structures are linked to it. We
++       * only perform npinfo cleanup when the refcnt decrements to zero.
++       */
+       if (refcount_dec_and_test(&npinfo->refcnt)) {
+               const struct net_device_ops *ops;
+@@ -860,8 +864,7 @@ void __netpoll_cleanup(struct netpoll *n
+               RCU_INIT_POINTER(np->dev->npinfo, NULL);
+               call_rcu(&npinfo->rcu, rcu_cleanup_netpoll_info);
+-      } else
+-              RCU_INIT_POINTER(np->dev->npinfo, NULL);
++      }
+ }
+ EXPORT_SYMBOL_GPL(__netpoll_cleanup);
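
A standalone sketch (not kernel code) of the refcount rule the revert above
restores: shared per-device state may only be detached and freed by whichever
caller drops the refcount to zero; earlier droppers must leave the pointer
alone. The types and counts below are illustrative.

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct shared_info {
    atomic_int refcnt;
};

struct dev {
    struct shared_info *info;      /* analogue of dev->npinfo */
};

static void cleanup_one(struct dev *d)
{
    struct shared_info *info = d->info;

    if (atomic_fetch_sub(&info->refcnt, 1) == 1) {
        d->info = NULL;            /* only the last user detaches ... */
        free(info);                /* ... and frees the shared state  */
    }
    /* else: another user still holds it; clearing d->info here was the bug */
}

int main(void)
{
    struct dev d;

    d.info = malloc(sizeof(*d.info));
    atomic_init(&d.info->refcnt, 2);   /* two netpoll instances attached */

    cleanup_one(&d);                   /* first instance: info must survive */
    printf("after first cleanup:  info %s\n", d.info ? "kept" : "lost");
    cleanup_one(&d);                   /* second instance: real teardown */
    printf("after second cleanup: info %s\n", d.info ? "kept" : "freed");
    return 0;
}
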
diff --git a/queue-6.1/scsi-ufs-core-add-a-quirk-to-suppress-link_startup_again.patch b/queue-6.1/scsi-ufs-core-add-a-quirk-to-suppress-link_startup_again.patch
new file mode 100644 (file)
index 0000000..004de4b
--- /dev/null
@@ -0,0 +1,61 @@
+From stable+bounces-192941-greg=kroah.com@vger.kernel.org Mon Nov 10 13:05:03 2025
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 10 Nov 2025 06:58:57 -0500
+Subject: scsi: ufs: core: Add a quirk to suppress link_startup_again
+To: stable@vger.kernel.org
+Cc: Adrian Hunter <adrian.hunter@intel.com>, Bart Van Assche <bvanassche@acm.org>, "Martin K. Petersen" <martin.petersen@oracle.com>
+Message-ID: <20251110115859.651217-1-sashal@kernel.org>
+
+From: Adrian Hunter <adrian.hunter@intel.com>
+
+ufshcd_link_startup() has a facility (link_startup_again) to issue
+DME_LINKSTARTUP a 2nd time even though the 1st time was successful.
+
+Some older hardware benefits from that; however, the behaviour is
+non-standard, and has been found to cause link startup to be unreliable
+for some Intel Alder Lake based host controllers.
+
+Add UFSHCD_QUIRK_PERFORM_LINK_STARTUP_ONCE to suppress
+link_startup_again, in preparation for setting the quirk for affected
+controllers.
+
+Fixes: 7dc9fb47bc9a ("scsi: ufs: ufs-pci: Add support for Intel ADL")
+Cc: stable@vger.kernel.org
+Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
+Reviewed-by: Bart Van Assche <bvanassche@acm.org>
+Link: https://patch.msgid.link/20251024085918.31825-3-adrian.hunter@intel.com
+Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/ufs/core/ufshcd.c |    3 ++-
+ include/ufs/ufshcd.h      |    7 +++++++
+ 2 files changed, 9 insertions(+), 1 deletion(-)
+
+--- a/drivers/ufs/core/ufshcd.c
++++ b/drivers/ufs/core/ufshcd.c
+@@ -4778,7 +4778,8 @@ static int ufshcd_link_startup(struct uf
+        * If UFS device isn't active then we will have to issue link startup
+        * 2 times to make sure the device state move to active.
+        */
+-      if (!ufshcd_is_ufs_dev_active(hba))
++      if (!(hba->quirks & UFSHCD_QUIRK_PERFORM_LINK_STARTUP_ONCE) &&
++          !ufshcd_is_ufs_dev_active(hba))
+               link_startup_again = true;
+ link_startup:
+--- a/include/ufs/ufshcd.h
++++ b/include/ufs/ufshcd.h
+@@ -592,6 +592,13 @@ enum ufshcd_quirks {
+        * auto-hibernate capability but it's FASTAUTO only.
+        */
+       UFSHCD_QUIRK_HIBERN_FASTAUTO                    = 1 << 18,
++
++      /*
++       * This quirk indicates that DME_LINKSTARTUP should not be issued a 2nd
++       * time (refer link_startup_again) after the 1st time was successful,
++       * because it causes link startup to become unreliable.
++       */
++      UFSHCD_QUIRK_PERFORM_LINK_STARTUP_ONCE          = 1 << 19,
+ };
+ enum ufshcd_caps {
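
A minimal sketch (not driver code) of how the new quirk is consumed by the core
and set for Intel ADL by the follow-up ufs-pci patch below: per-controller
behaviour toggles live in a bitmask and are tested exactly where the workaround
would otherwise run, so the legacy double DME_LINKSTARTUP stays the default and
affected controllers opt out. The enum values mirror the patch; the rest of the
structure is illustrative.

#include <stdbool.h>
#include <stdio.h>

enum quirks {
    QUIRK_BROKEN_AUTO_HIBERN8       = 1 << 18,
    QUIRK_PERFORM_LINK_STARTUP_ONCE = 1 << 19,  /* added by the patch above */
};

struct hba_sketch {
    unsigned int quirks;
    bool dev_active;
};

static bool need_second_link_startup(const struct hba_sketch *hba)
{
    /* The "issue DME_LINKSTARTUP a 2nd time" path is now opt-out per quirk. */
    return !(hba->quirks & QUIRK_PERFORM_LINK_STARTUP_ONCE) &&
           !hba->dev_active;
}

int main(void)
{
    struct hba_sketch legacy = { .quirks = 0, .dev_active = false };
    struct hba_sketch adl = {
        .quirks = QUIRK_BROKEN_AUTO_HIBERN8 | QUIRK_PERFORM_LINK_STARTUP_ONCE,
        .dev_active = false,
    };

    printf("legacy controller: second LINKSTARTUP = %d\n",
           need_second_link_startup(&legacy));
    printf("ADL controller:    second LINKSTARTUP = %d\n",
           need_second_link_startup(&adl));
    return 0;
}
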
diff --git a/queue-6.1/scsi-ufs-ufs-pci-set-ufshcd_quirk_perform_link_startup_once-for-intel-adl.patch b/queue-6.1/scsi-ufs-ufs-pci-set-ufshcd_quirk_perform_link_startup_once-for-intel-adl.patch
new file mode 100644 (file)
index 0000000..7dbbd89
--- /dev/null
@@ -0,0 +1,41 @@
+From stable+bounces-192942-greg=kroah.com@vger.kernel.org Mon Nov 10 13:04:00 2025
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 10 Nov 2025 06:58:58 -0500
+Subject: scsi: ufs: ufs-pci: Set UFSHCD_QUIRK_PERFORM_LINK_STARTUP_ONCE for Intel ADL
+To: stable@vger.kernel.org
+Cc: Adrian Hunter <adrian.hunter@intel.com>, Bart Van Assche <bvanassche@acm.org>, "Martin K. Petersen" <martin.petersen@oracle.com>, Sasha Levin <sashal@kernel.org>
+Message-ID: <20251110115859.651217-2-sashal@kernel.org>
+
+From: Adrian Hunter <adrian.hunter@intel.com>
+
+[ Upstream commit d968e99488c4b08259a324a89e4ed17bf36561a4 ]
+
+Link startup becomes unreliable for Intel Alder Lake based host
+controllers when a 2nd DME_LINKSTARTUP is issued unnecessarily.  Employ
+UFSHCD_QUIRK_PERFORM_LINK_STARTUP_ONCE to suppress that from happening.
+
+Fixes: 7dc9fb47bc9a ("scsi: ufs: ufs-pci: Add support for Intel ADL")
+Cc: stable@vger.kernel.org
+Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
+Reviewed-by: Bart Van Assche <bvanassche@acm.org>
+Link: https://patch.msgid.link/20251024085918.31825-4-adrian.hunter@intel.com
+Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
+[ adjusted patch context line numbers from 428 to 460 due to prerequisite backport ]
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/ufs/host/ufshcd-pci.c |    3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+--- a/drivers/ufs/host/ufshcd-pci.c
++++ b/drivers/ufs/host/ufshcd-pci.c
+@@ -463,7 +463,8 @@ static int ufs_intel_lkf_init(struct ufs
+ static int ufs_intel_adl_init(struct ufs_hba *hba)
+ {
+       hba->nop_out_timeout = 200;
+-      hba->quirks |= UFSHCD_QUIRK_BROKEN_AUTO_HIBERN8;
++      hba->quirks |= UFSHCD_QUIRK_BROKEN_AUTO_HIBERN8 |
++                     UFSHCD_QUIRK_PERFORM_LINK_STARTUP_ONCE;
+       hba->caps |= UFSHCD_CAP_WB_EN;
+       return ufs_intel_common_init(hba);
+ }
diff --git a/queue-6.1/series b/queue-6.1/series
index 8d314786101b93d5ee4aedc18f4d594547e7984a..7f78b154d729ef387c8441d6fc1852f441bf6a04 100644 (file)
--- a/queue-6.1/series
@@ -413,3 +413,11 @@ edac-altera-handle-ocram-ecc-enable-after-warm-reset.patch
 edac-altera-use-inttest-register-for-ethernet-and-usb-sbe-injection.patch
 btrfs-do-not-update-last_log_commit-when-logging-inode-due-to-a-new-name.patch
 selftests-mptcp-connect-trunc-read-all-recv-data.patch
+virtio-net-fix-received-length-check-in-big-packets.patch
+scsi-ufs-core-add-a-quirk-to-suppress-link_startup_again.patch
+scsi-ufs-ufs-pci-set-ufshcd_quirk_perform_link_startup_once-for-intel-adl.patch
+iommufd-don-t-overflow-during-division-for-dirty-tracking.patch
+kvm-svm-mark-vmcb_lbr-dirty-when-msr_ia32_debugctlmsr-is-updated.patch
+net-netpoll-fix-incorrect-refcount-handling-causing-incorrect-cleanup.patch
+eventpoll-replace-rwlock-with-spinlock.patch
+mm-percpu-do-not-consider-sleepable-allocations-atomic.patch
diff --git a/queue-6.1/virtio-net-fix-received-length-check-in-big-packets.patch b/queue-6.1/virtio-net-fix-received-length-check-in-big-packets.patch
new file mode 100644 (file)
index 0000000..c06cd8e
--- /dev/null
@@ -0,0 +1,84 @@
+From stable+bounces-192944-greg=kroah.com@vger.kernel.org Mon Nov 10 13:09:30 2025
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 10 Nov 2025 07:06:04 -0500
+Subject: virtio-net: fix received length check in big packets
+To: stable@vger.kernel.org
+Cc: Bui Quang Minh <minhquangbui99@gmail.com>, Xuan Zhuo <xuanzhuo@linux.alibaba.com>, Lei Yang <leiyang@redhat.com>, Jakub Kicinski <kuba@kernel.org>, Sasha Levin <sashal@kernel.org>
+Message-ID: <20251110120604.655678-1-sashal@kernel.org>
+
+From: Bui Quang Minh <minhquangbui99@gmail.com>
+
+[ Upstream commit 0c716703965ffc5ef4311b65cb5d84a703784717 ]
+
+Since commit 4959aebba8c0 ("virtio-net: use mtu size as buffer length
+for big packets"), when guest gso is off, the allocated size for big
+packets is not MAX_SKB_FRAGS * PAGE_SIZE anymore but depends on
+negotiated MTU. The number of allocated frags for big packets is stored
+in vi->big_packets_num_skbfrags.
+
+Because the host-announced buffer length can be malicious (e.g. the host
+vhost_net driver's get_rx_bufs is modified to announce an incorrect
+length), we need a check in the virtio_net receive path. Currently, the
+check is not adapted to the new change, which can lead to a NULL page
+pointer dereference in the while loop below when receiving a length that
+is larger than the allocated one.
+
+This commit fixes the received length check corresponding to the new
+change.
+
+Fixes: 4959aebba8c0 ("virtio-net: use mtu size as buffer length for big packets")
+Cc: stable@vger.kernel.org
+Signed-off-by: Bui Quang Minh <minhquangbui99@gmail.com>
+Reviewed-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
+Tested-by: Lei Yang <leiyang@redhat.com>
+Link: https://patch.msgid.link/20251030144438.7582-1-minhquangbui99@gmail.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+[ adapted page_to_skb() call ]
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/virtio_net.c |   26 +++++++++++++-------------
+ 1 file changed, 13 insertions(+), 13 deletions(-)
+
+--- a/drivers/net/virtio_net.c
++++ b/drivers/net/virtio_net.c
+@@ -542,17 +542,6 @@ static struct sk_buff *page_to_skb(struc
+               goto ok;
+       }
+-      /*
+-       * Verify that we can indeed put this data into a skb.
+-       * This is here to handle cases when the device erroneously
+-       * tries to receive more than is possible. This is usually
+-       * the case of a broken device.
+-       */
+-      if (unlikely(len > MAX_SKB_FRAGS * PAGE_SIZE)) {
+-              net_dbg_ratelimited("%s: too much data\n", skb->dev->name);
+-              dev_kfree_skb(skb);
+-              return NULL;
+-      }
+       BUG_ON(offset >= PAGE_SIZE);
+       while (len) {
+               unsigned int frag_size = min((unsigned)PAGE_SIZE - offset, len);
+@@ -955,8 +944,19 @@ static struct sk_buff *receive_big(struc
+                                  struct virtnet_rq_stats *stats)
+ {
+       struct page *page = buf;
+-      struct sk_buff *skb =
+-              page_to_skb(vi, rq, page, 0, len, PAGE_SIZE, true, 0, 0);
++      struct sk_buff *skb;
++
++      /* Make sure that len does not exceed the size allocated in
++       * add_recvbuf_big.
++       */
++      if (unlikely(len > (vi->big_packets_num_skbfrags + 1) * PAGE_SIZE)) {
++              pr_debug("%s: rx error: len %u exceeds allocated size %lu\n",
++                       dev->name, len,
++                       (vi->big_packets_num_skbfrags + 1) * PAGE_SIZE);
++              goto err;
++      }
++
++      skb = page_to_skb(vi, rq, page, 0, len, PAGE_SIZE, true, 0, 0);
+       stats->bytes += len - vi->hdr_len;
+       if (unlikely(!skb))
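
A standalone sketch (not driver code) of the length validation the patch above
introduces: a device-announced receive length is only trusted after being
checked against the number of pages actually allocated for a big-packet buffer.
The structure and values below are illustrative.

#include <stdbool.h>
#include <stdio.h>

#define PAGE_SZ 4096u

struct vnet_sketch {
    unsigned int big_packets_num_skbfrags;   /* frags allocated per buffer */
};

static bool big_rx_len_ok(const struct vnet_sketch *vi, unsigned int len)
{
    /* add_recvbuf_big allocates (num_skbfrags + 1) pages per buffer. */
    return len <= (vi->big_packets_num_skbfrags + 1) * PAGE_SZ;
}

int main(void)
{
    /* With guest GSO off and a 1500-byte MTU only a couple of frags exist. */
    struct vnet_sketch vi = { .big_packets_num_skbfrags = 1 };

    printf("len 1514  -> %s\n", big_rx_len_ok(&vi, 1514) ? "accept" : "drop");
    printf("len 65535 -> %s\n", big_rx_len_ok(&vi, 65535) ? "accept" : "drop");
    return 0;
}
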