--- /dev/null
+From db676ab0b7af0b99a143284da45072f45c0ce74b Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 9 Jul 2024 18:54:39 +0000
+Subject: bpf: Defer work in bpf_timer_cancel_and_free
+
+From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
+
+[ Upstream commit a6fcd19d7eac1335eb76bc16b6a66b7f574d1d69 ]
+
+Currently, the same case as in the previous patch (two timer callbacks
+trying to cancel each other) can be triggered through bpf_map_update_elem
+as well, or more precisely, by freeing map elements containing timers.
+Since this also relies on hrtimer_cancel, it is prone to the same deadlock
+situation as the previous patch.
+
+It would be sufficient to use hrtimer_try_to_cancel to fix this problem,
+as the timer cannot be enqueued after async_cancel_and_free. Once
+async_cancel_and_free has been done, the timer must be reinitialized
+before it can be armed again. The callback running in parallel trying to
+arm the timer will fail, and freeing bpf_hrtimer without waiting is
+sufficient (given kfree_rcu), and bpf_timer_cb will return
+HRTIMER_NORESTART, preventing the timer from being rearmed again.
+
+However, there exists a UAF scenario where the callback arms the timer
+before entering this function and cancellation then fails (because a timer
+callback is invoking this routine, or because the target timer's callback
+is running concurrently). In such a case, if the timer's expiration is far
+enough in the future, the RCU grace period will expire before it, freeing
+the bpf_hrtimer state and, along with it, the struct hrtimer that is still
+enqueued.
+
+Hence, it is clear cancellation needs to occur after
+async_cancel_and_free, and yet it cannot be done inline due to deadlock
+issues. We thus modify bpf_timer_cancel_and_free to defer work to the
+global workqueue, adding a work_struct alongside rcu_head (both used at
+_different_ points of time, so can share space).
+
+Update existing code comments to reflect the new state of affairs.
+
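+As an illustration of that space sharing, here is a minimal userspace
+sketch (stub types, not the kernel definitions): the rcu_head and the
+work_struct are needed at different points in time, so overlapping them in
+a union adds no size to struct bpf_async_cb.
+
+#include <stdio.h>
+
+struct rcu_head_stub    { void *next; void (*func)(void *); };
+struct work_struct_stub { void *entry; void (*fn)(void *); };
+
+struct async_cb_stub {
+        void *value;
+        union {                 /* used at different points in time */
+                struct rcu_head_stub    rcu;          /* for kfree_rcu() */
+                struct work_struct_stub delete_work;  /* for the deferred cancel */
+        };
+        unsigned long long flags;
+};
+
+int main(void)
+{
+        /* the union costs no more than its larger member */
+        printf("rcu: %zu, work: %zu, cb: %zu\n",
+               sizeof(struct rcu_head_stub), sizeof(struct work_struct_stub),
+               sizeof(struct async_cb_stub));
+        return 0;
+}
+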
+Fixes: b00628b1c7d5 ("bpf: Introduce bpf timers.")
+Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
+Link: https://lore.kernel.org/r/20240709185440.1104957-3-memxor@gmail.com
+Signed-off-by: Alexei Starovoitov <ast@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ kernel/bpf/helpers.c | 61 ++++++++++++++++++++++++++++++++++----------
+ 1 file changed, 47 insertions(+), 14 deletions(-)
+
+diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
+index 79cb5681cf136..6ad7a61c7617f 100644
+--- a/kernel/bpf/helpers.c
++++ b/kernel/bpf/helpers.c
+@@ -1084,7 +1084,10 @@ struct bpf_async_cb {
+ struct bpf_prog *prog;
+ void __rcu *callback_fn;
+ void *value;
+- struct rcu_head rcu;
++ union {
++ struct rcu_head rcu;
++ struct work_struct delete_work;
++ };
+ u64 flags;
+ };
+
+@@ -1168,6 +1171,21 @@ static enum hrtimer_restart bpf_timer_cb(struct hrtimer *hrtimer)
+ return HRTIMER_NORESTART;
+ }
+
++static void bpf_timer_delete_work(struct work_struct *work)
++{
++ struct bpf_hrtimer *t = container_of(work, struct bpf_hrtimer, cb.delete_work);
++
++ /* Cancel the timer and wait for callback to complete if it was running.
++ * If hrtimer_cancel() can be safely called it's safe to call
++ * kfree_rcu(t) right after for both preallocated and non-preallocated
++ * maps. The async->cb = NULL was already done and no code path can see
++ * address 't' anymore. Timer if armed for existing bpf_hrtimer before
++ * bpf_timer_cancel_and_free will have been cancelled.
++ */
++ hrtimer_cancel(&t->timer);
++ kfree_rcu(t, cb.rcu);
++}
++
+ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u64 flags,
+ enum bpf_async_type type)
+ {
+@@ -1207,6 +1225,7 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u
+ t = (struct bpf_hrtimer *)cb;
+
+ atomic_set(&t->cancelling, 0);
++ INIT_WORK(&t->cb.delete_work, bpf_timer_delete_work);
+ hrtimer_init(&t->timer, clockid, HRTIMER_MODE_REL_SOFT);
+ t->timer.function = bpf_timer_cb;
+ cb->value = (void *)async - map->record->timer_off;
+@@ -1464,25 +1483,39 @@ void bpf_timer_cancel_and_free(void *val)
+ __bpf_spin_unlock_irqrestore(&timer->lock);
+ if (!t)
+ return;
+- /* Cancel the timer and wait for callback to complete if it was running.
+- * If hrtimer_cancel() can be safely called it's safe to call kfree(t)
+- * right after for both preallocated and non-preallocated maps.
+- * The timer->timer = NULL was already done and no code path can
+- * see address 't' anymore.
+- *
+- * Check that bpf_map_delete/update_elem() wasn't called from timer
+- * callback_fn. In such case don't call hrtimer_cancel() (since it will
+- * deadlock) and don't call hrtimer_try_to_cancel() (since it will just
+- * return -1). Though callback_fn is still running on this cpu it's
++ /* We check that bpf_map_delete/update_elem() was called from timer
++ * callback_fn. In such case we don't call hrtimer_cancel() (since it
++ * will deadlock) and don't call hrtimer_try_to_cancel() (since it will
++ * just return -1). Though callback_fn is still running on this cpu it's
+ * safe to do kfree(t) because bpf_timer_cb() read everything it needed
+ * from 't'. The bpf subprog callback_fn won't be able to access 't',
+ * since timer->timer = NULL was already done. The timer will be
+ * effectively cancelled because bpf_timer_cb() will return
+ * HRTIMER_NORESTART.
++ *
++ * However, it is possible the timer callback_fn calling us armed the
++ * timer _before_ calling us, such that failing to cancel it here will
++ * cause it to possibly use struct hrtimer after freeing bpf_hrtimer.
++ * Therefore, we _need_ to cancel any outstanding timers before we do
++ * kfree_rcu, even though no more timers can be armed.
++ *
++ * Moreover, we need to schedule work even if timer does not belong to
++ * the calling callback_fn, as on two different CPUs, we can end up in a
++ * situation where both sides run in parallel, try to cancel one
++ * another, and we end up waiting on both sides in hrtimer_cancel
++ * without making forward progress, since timer1 depends on timer2's
++ * callback to finish, and vice versa.
++ *
++ * CPU 1 (timer1_cb) CPU 2 (timer2_cb)
++ * bpf_timer_cancel_and_free(timer2) bpf_timer_cancel_and_free(timer1)
++ *
++ * To avoid these issues, punt to workqueue context when we are in a
++ * timer callback.
+ */
+- if (this_cpu_read(hrtimer_running) != t)
+- hrtimer_cancel(&t->timer);
+- kfree_rcu(t, cb.rcu);
++ if (this_cpu_read(hrtimer_running))
++ queue_work(system_unbound_wq, &t->cb.delete_work);
++ else
++ bpf_timer_delete_work(&t->cb.delete_work);
+ }
+
+ BPF_CALL_2(bpf_kptr_xchg, void *, map_value, void *, ptr)
+--
+2.43.0
+
--- /dev/null
+From 3369ecf092fba3ce53a3b32a97c72e795a8b92e5 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 9 Jul 2024 18:54:38 +0000
+Subject: bpf: Fail bpf_timer_cancel when callback is being cancelled
+
+From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
+
+[ Upstream commit d4523831f07a267a943f0dde844bf8ead7495f13 ]
+
+Given a schedule:
+
+timer1 cb timer2 cb
+
+bpf_timer_cancel(timer2); bpf_timer_cancel(timer1);
+
+Both bpf_timer_cancel calls would wait for the other callback to finish
+executing, introducing a lockup.
+
+Add an atomic_t count named 'cancelling' in bpf_hrtimer. This keeps
+track of all in-flight cancellation requests for a given BPF timer.
+Whenever cancelling a BPF timer, we must check if we have outstanding
+cancellation requests, and if so, we must fail the operation with an
+error (-EDEADLK) since cancellation is synchronous and waits for the
+callback to finish executing. This implies that we can enter a deadlock
+situation involving two or more timer callbacks executing in parallel
+and attempting to cancel one another.
+
+Note that we avoid incrementing the cancelling counter for the target
+timer (the one being cancelled) if bpf_timer_cancel is not invoked from
+a callback, to avoid spurious errors. The whole point of detecting
+cur->cancelling and returning -EDEADLK is to not enter a busy wait loop
+(which may or may not lead to a lockup). This does not apply when the
+caller is in a non-callback context; the other side can continue to
+cancel as it sees fit without running into errors.
+
+Background on prior attempts:
+
+Earlier versions of this patch used a bool 'cancelling' bit and used the
+following pattern under timer->lock to publish cancellation status.
+
+lock(t->lock);
+t->cancelling = true;
+mb();
+if (cur->cancelling)
+ return -EDEADLK;
+unlock(t->lock);
+hrtimer_cancel(t->timer);
+t->cancelling = false;
+
+The store outside the critical section could overwrite a parallel
+request's assignment of t->cancelling to true, which is meant to ensure
+that the callback executing in parallel observes its cancellation status.
+
+It would be necessary to clear this cancelling bit once hrtimer_cancel
+is done, but lack of serialization introduced races. Another option was
+explored where bpf_timer_start would clear the bit when (re)starting the
+timer under timer->lock. This would ensure serialized access to the
+cancelling bit, but may allow it to be cleared before in-flight
+hrtimer_cancel has finished executing, such that lockups can occur
+again.
+
+Thus, we choose an atomic counter to keep track of all outstanding
+cancellation requests and use it to prevent lockups in case callbacks
+attempt to cancel each other while executing in parallel.
+
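+The scheme can be sketched in userspace C with toy types and C11 atomics
+(illustrative only; the kernel does this under timer->lock and uses a
+relaxed atomic_inc() followed by an explicit smp_mb__after_atomic(), and
+the names below are made up):
+
+#include <errno.h>
+#include <stdatomic.h>
+#include <stdio.h>
+
+struct toy_timer {
+        atomic_int cancelling;  /* in-flight cancels targeting this timer */
+};
+
+/* @cur is the timer whose callback we are currently running, or NULL. */
+static int toy_timer_cancel(struct toy_timer *t, struct toy_timer *cur)
+{
+        int ret = 0;
+
+        if (cur == t)
+                return -EDEADLK;        /* cancelling our own timer */
+        if (!cur)
+                goto cancel;            /* not in a callback: waiting is fine */
+
+        atomic_fetch_add(&t->cancelling, 1);    /* publish our intent first */
+        if (atomic_load(&cur->cancelling)) {
+                /* someone is (transitively) waiting on us: do not wait back */
+                ret = -EDEADLK;
+                goto out;
+        }
+cancel:
+        /* the real code would hrtimer_cancel(&t->timer) here and wait */
+out:
+        if (cur)
+                atomic_fetch_sub(&t->cancelling, 1);
+        return ret;
+}
+
+int main(void)
+{
+        struct toy_timer t1 = { 0 }, t2 = { 0 };
+
+        /* pretend timer2's callback is already trying to cancel timer1 */
+        atomic_store(&t1.cancelling, 1);
+        /* timer1's callback now tries to cancel timer2 and must not block */
+        printf("cancel from callback: %d (-EDEADLK is %d)\n",
+               toy_timer_cancel(&t2, &t1), -EDEADLK);
+        return 0;
+}
+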
+Reported-by: Dohyun Kim <dohyunkim@google.com>
+Reported-by: Neel Natu <neelnatu@google.com>
+Fixes: b00628b1c7d5 ("bpf: Introduce bpf timers.")
+Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
+Link: https://lore.kernel.org/r/20240709185440.1104957-2-memxor@gmail.com
+Signed-off-by: Alexei Starovoitov <ast@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ kernel/bpf/helpers.c | 38 +++++++++++++++++++++++++++++++++++---
+ 1 file changed, 35 insertions(+), 3 deletions(-)
+
+diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
+index ff18b467d7d75..79cb5681cf136 100644
+--- a/kernel/bpf/helpers.c
++++ b/kernel/bpf/helpers.c
+@@ -1107,6 +1107,7 @@ struct bpf_async_cb {
+ struct bpf_hrtimer {
+ struct bpf_async_cb cb;
+ struct hrtimer timer;
++ atomic_t cancelling;
+ };
+
+ /* the actual struct hidden inside uapi struct bpf_timer */
+@@ -1205,6 +1206,7 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u
+ clockid = flags & (MAX_CLOCKS - 1);
+ t = (struct bpf_hrtimer *)cb;
+
++ atomic_set(&t->cancelling, 0);
+ hrtimer_init(&t->timer, clockid, HRTIMER_MODE_REL_SOFT);
+ t->timer.function = bpf_timer_cb;
+ cb->value = (void *)async - map->record->timer_off;
+@@ -1368,7 +1370,8 @@ static void drop_prog_refcnt(struct bpf_async_cb *async)
+
+ BPF_CALL_1(bpf_timer_cancel, struct bpf_async_kern *, timer)
+ {
+- struct bpf_hrtimer *t;
++ struct bpf_hrtimer *t, *cur_t;
++ bool inc = false;
+ int ret = 0;
+
+ if (in_nmi())
+@@ -1380,14 +1383,41 @@ BPF_CALL_1(bpf_timer_cancel, struct bpf_async_kern *, timer)
+ ret = -EINVAL;
+ goto out;
+ }
+- if (this_cpu_read(hrtimer_running) == t) {
++
++ cur_t = this_cpu_read(hrtimer_running);
++ if (cur_t == t) {
+ /* If bpf callback_fn is trying to bpf_timer_cancel()
+ * its own timer the hrtimer_cancel() will deadlock
+- * since it waits for callback_fn to finish
++ * since it waits for callback_fn to finish.
++ */
++ ret = -EDEADLK;
++ goto out;
++ }
++
++ /* Only account in-flight cancellations when invoked from a timer
++ * callback, since we want to avoid waiting only if other _callbacks_
++ * are waiting on us, to avoid introducing lockups. Non-callback paths
++ * are ok, since nobody would synchronously wait for their completion.
++ */
++ if (!cur_t)
++ goto drop;
++ atomic_inc(&t->cancelling);
++ /* Need full barrier after relaxed atomic_inc */
++ smp_mb__after_atomic();
++ inc = true;
++ if (atomic_read(&cur_t->cancelling)) {
++ /* We're cancelling timer t, while some other timer callback is
++ * attempting to cancel us. In such a case, it might be possible
++ * that timer t belongs to the other callback, or some other
++ * callback waiting upon it (creating transitive dependencies
++ * upon us), and we will enter a deadlock if we continue
++ * cancelling and waiting for it synchronously, since it might
++ * do the same. Bail!
+ */
+ ret = -EDEADLK;
+ goto out;
+ }
++drop:
+ drop_prog_refcnt(&t->cb);
+ out:
+ __bpf_spin_unlock_irqrestore(&timer->lock);
+@@ -1395,6 +1425,8 @@ BPF_CALL_1(bpf_timer_cancel, struct bpf_async_kern *, timer)
+ * if it was running.
+ */
+ ret = ret ?: hrtimer_cancel(&t->timer);
++ if (inc)
++ atomic_dec(&t->cancelling);
+ rcu_read_unlock();
+ return ret;
+ }
+--
+2.43.0
+
--- /dev/null
+From ef52e72bd5b015852a4640b3d6f20921e7f2a716 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 10 Jul 2024 12:05:22 +0200
+Subject: bpf: fix order of args in call to bpf_map_kvcalloc
+
+From: Mohammad Shehar Yaar Tausif <sheharyaar48@gmail.com>
+
+[ Upstream commit af253aef183a31ce62d2e39fc520b0ebfb562bb9 ]
+
+The original function call passed the size of *smap->buckets before the
+number of buckets, which raises the 'calloc-transposed-args' error at
+compile time.
+
+Vlastimil Babka added:
+
+The order of parameters can be traced back all the way to 6ac99e8f23d4
+("bpf: Introduce bpf sk local storage") accross several refactorings,
+and that's why the commit is used as a Fixes: tag.
+
+In v6.10-rc1, a different commit 2c321f3f70bc ("mm: change inlined
+allocation helpers to account at the call site") however exposed the
+order of args in a way that gcc-14 has enough visibility to start
+warning about it, because (in !CONFIG_MEMCG case) bpf_map_kvcalloc is
+then a macro alias for kvcalloc instead of a static inline wrapper.
+
+To sum up, the warning happens when the following conditions are all met:
+
+- gcc-14 is used (didn't see it with gcc-13)
+- commit 2c321f3f70bc is present
+- CONFIG_MEMCG is not enabled in .config
+- CONFIG_WERROR turns this from a compiler warning to error
+
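+For illustration, the calloc()-style convention the fix restores looks like
+this in plain userspace C (illustrative struct and names, not the BPF local
+storage code); gcc-14's -Wcalloc-transposed-args fires when a sizeof
+expression is passed as the first argument:
+
+#include <stdio.h>
+#include <stdlib.h>
+
+struct bucket { void *head; unsigned long lock; };
+
+int main(void)
+{
+        size_t nbuckets = 16;
+        /* count first, size second; calloc(sizeof(*b), nbuckets) would
+         * allocate the same number of bytes here but reads backwards and
+         * trips the new gcc-14 warning. */
+        struct bucket *b = calloc(nbuckets, sizeof(*b));
+
+        if (!b)
+                return 1;
+        printf("allocated %zu buckets\n", nbuckets);
+        free(b);
+        return 0;
+}
+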
+Fixes: 6ac99e8f23d4 ("bpf: Introduce bpf sk local storage")
+Reviewed-by: Andrii Nakryiko <andrii@kernel.org>
+Tested-by: Christian Kujau <lists@nerdbynature.de>
+Signed-off-by: Mohammad Shehar Yaar Tausif <sheharyaar48@gmail.com>
+Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
+Link: https://lore.kernel.org/r/20240710100521.15061-2-vbabka@suse.cz
+Signed-off-by: Alexei Starovoitov <ast@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ kernel/bpf/bpf_local_storage.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
+index bdea1a459153c..bea5873d96d15 100644
+--- a/kernel/bpf/bpf_local_storage.c
++++ b/kernel/bpf/bpf_local_storage.c
+@@ -782,8 +782,8 @@ bpf_local_storage_map_alloc(union bpf_attr *attr,
+ nbuckets = max_t(u32, 2, nbuckets);
+ smap->bucket_log = ilog2(nbuckets);
+
+- smap->buckets = bpf_map_kvcalloc(&smap->map, sizeof(*smap->buckets),
+- nbuckets, GFP_USER | __GFP_NOWARN);
++ smap->buckets = bpf_map_kvcalloc(&smap->map, nbuckets,
++ sizeof(*smap->buckets), GFP_USER | __GFP_NOWARN);
+ if (!smap->buckets) {
+ err = -ENOMEM;
+ goto free_smap;
+--
+2.43.0
+
--- /dev/null
+From ea30764220557e917e8dce7a0cfe7b0c850484ce Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 8 Jul 2024 15:31:29 +0200
+Subject: bpf: Fix too early release of tcx_entry
+
+From: Daniel Borkmann <daniel@iogearbox.net>
+
+[ Upstream commit 1cb6f0bae50441f4b4b32a28315853b279c7404e ]
+
+Pedro Pinto and later independently also Hyunwoo Kim and Wongi Lee reported
+an issue that the tcx_entry can be released too early leading to a use
+after free (UAF) when an active old-style ingress or clsact qdisc with a
+shared tc block is later replaced by another ingress or clsact instance.
+
+Essentially, the sequence to trigger the UAF (one example) can be as follows:
+
+ 1. A network namespace is created
+ 2. An ingress qdisc is created. This allocates a tcx_entry, and
+ &tcx_entry->miniq is stored in the qdisc's miniqp->p_miniq. At the
+ same time, a tcf block with index 1 is created.
+ 3. chain0 is attached to the tcf block. chain0 must be connected to
+ the block linked to the ingress qdisc to later reach the function
+ tcf_chain0_head_change_cb_del() which triggers the UAF.
+ 4. Create and graft a clsact qdisc. This causes the ingress qdisc
+ created in step 1 to be removed, thus freeing the previously linked
+ tcx_entry:
+
+ rtnetlink_rcv_msg()
+ => tc_modify_qdisc()
+ => qdisc_create()
+ => clsact_init() [a]
+ => qdisc_graft()
+ => qdisc_destroy()
+ => __qdisc_destroy()
+ => ingress_destroy() [b]
+ => tcx_entry_free()
+ => kfree_rcu() // tcx_entry freed
+
+ 5. Finally, the network namespace is closed. This registers the
+ cleanup_net worker, and during the process of releasing the
+ remaining clsact qdisc, it accesses the tcx_entry that was
+ already freed in step 4, causing the UAF to occur:
+
+ cleanup_net()
+ => ops_exit_list()
+ => default_device_exit_batch()
+ => unregister_netdevice_many()
+ => unregister_netdevice_many_notify()
+ => dev_shutdown()
+ => qdisc_put()
+ => clsact_destroy() [c]
+ => tcf_block_put_ext()
+ => tcf_chain0_head_change_cb_del()
+ => tcf_chain_head_change_item()
+ => clsact_chain_head_change()
+ => mini_qdisc_pair_swap() // UAF
+
+There are also other variants; the gist is to add an ingress (or clsact)
+qdisc with a specific shared block, then to replace that qdisc, waiting
+for the tcx_entry kfree_rcu() to be executed and subsequently accessing
+the current active qdisc's miniq one way or another.
+
+The correct fix is to turn the miniq_active boolean into a counter. As can
+be observed, at step 2 above the counter transitions from 0->1, at step [a]
+from 1->2 (so that the miniq object remains active during the replacement),
+then in [b] from 2->1, and finally in [c] from 1->0 with the eventual
+release.
+it does not need to be atomic since all access to the counter is protected
+by the rtnl mutex. With this in place, there is no longer a UAF happening
+and the tcx_entry is freed at the correct time.
+
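+The counting scheme can be sketched in a few lines of userspace C (toy
+types and names; the real counter lives in tcx_entry and relies on the
+rtnl mutex instead of being atomic):
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+struct entry {
+        unsigned int miniq_active;      /* ranges over [0, 2] */
+};
+
+static void miniq_inc(struct entry *e)        { e->miniq_active++; }
+static void miniq_dec(struct entry *e)        { e->miniq_active--; }
+static int  entry_is_active(struct entry *e)  { return e->miniq_active; }
+
+int main(void)
+{
+        struct entry *e = calloc(1, sizeof(*e));
+
+        if (!e)
+                return 1;
+        miniq_inc(e);                   /* step 2:   ingress created, 0->1 */
+        miniq_inc(e);                   /* step [a]: clsact grafted,  1->2 */
+        miniq_dec(e);                   /* step [b]: ingress freed,   2->1 */
+        assert(entry_is_active(e));     /* must survive the replacement    */
+        miniq_dec(e);                   /* step [c]: clsact freed,    1->0 */
+        if (!entry_is_active(e))
+                free(e);                /* only now is freeing safe        */
+        printf("entry released at the correct time\n");
+        return 0;
+}
+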
+Fixes: e420bed02507 ("bpf: Add fd-based tcx multi-prog infra with link support")
+Reported-by: Pedro Pinto <xten@osec.io>
+Co-developed-by: Pedro Pinto <xten@osec.io>
+Signed-off-by: Pedro Pinto <xten@osec.io>
+Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
+Cc: Hyunwoo Kim <v4bel@theori.io>
+Cc: Wongi Lee <qwerty@theori.io>
+Cc: Martin KaFai Lau <martin.lau@kernel.org>
+Link: https://lore.kernel.org/r/20240708133130.11609-1-daniel@iogearbox.net
+Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ include/net/tcx.h | 13 +++++++++----
+ net/sched/sch_ingress.c | 12 ++++++------
+ 2 files changed, 15 insertions(+), 10 deletions(-)
+
+diff --git a/include/net/tcx.h b/include/net/tcx.h
+index 04be9377785d7..0a5f40a91c42f 100644
+--- a/include/net/tcx.h
++++ b/include/net/tcx.h
+@@ -13,7 +13,7 @@ struct mini_Qdisc;
+ struct tcx_entry {
+ struct mini_Qdisc __rcu *miniq;
+ struct bpf_mprog_bundle bundle;
+- bool miniq_active;
++ u32 miniq_active;
+ struct rcu_head rcu;
+ };
+
+@@ -124,11 +124,16 @@ static inline void tcx_skeys_dec(bool ingress)
+ tcx_dec();
+ }
+
+-static inline void tcx_miniq_set_active(struct bpf_mprog_entry *entry,
+- const bool active)
++static inline void tcx_miniq_inc(struct bpf_mprog_entry *entry)
+ {
+ ASSERT_RTNL();
+- tcx_entry(entry)->miniq_active = active;
++ tcx_entry(entry)->miniq_active++;
++}
++
++static inline void tcx_miniq_dec(struct bpf_mprog_entry *entry)
++{
++ ASSERT_RTNL();
++ tcx_entry(entry)->miniq_active--;
+ }
+
+ static inline bool tcx_entry_is_active(struct bpf_mprog_entry *entry)
+diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c
+index c2ef9dcf91d2d..cc6051d4f2ef8 100644
+--- a/net/sched/sch_ingress.c
++++ b/net/sched/sch_ingress.c
+@@ -91,7 +91,7 @@ static int ingress_init(struct Qdisc *sch, struct nlattr *opt,
+ entry = tcx_entry_fetch_or_create(dev, true, &created);
+ if (!entry)
+ return -ENOMEM;
+- tcx_miniq_set_active(entry, true);
++ tcx_miniq_inc(entry);
+ mini_qdisc_pair_init(&q->miniqp, sch, &tcx_entry(entry)->miniq);
+ if (created)
+ tcx_entry_update(dev, entry, true);
+@@ -121,7 +121,7 @@ static void ingress_destroy(struct Qdisc *sch)
+ tcf_block_put_ext(q->block, sch, &q->block_info);
+
+ if (entry) {
+- tcx_miniq_set_active(entry, false);
++ tcx_miniq_dec(entry);
+ if (!tcx_entry_is_active(entry)) {
+ tcx_entry_update(dev, NULL, true);
+ tcx_entry_free(entry);
+@@ -257,7 +257,7 @@ static int clsact_init(struct Qdisc *sch, struct nlattr *opt,
+ entry = tcx_entry_fetch_or_create(dev, true, &created);
+ if (!entry)
+ return -ENOMEM;
+- tcx_miniq_set_active(entry, true);
++ tcx_miniq_inc(entry);
+ mini_qdisc_pair_init(&q->miniqp_ingress, sch, &tcx_entry(entry)->miniq);
+ if (created)
+ tcx_entry_update(dev, entry, true);
+@@ -276,7 +276,7 @@ static int clsact_init(struct Qdisc *sch, struct nlattr *opt,
+ entry = tcx_entry_fetch_or_create(dev, false, &created);
+ if (!entry)
+ return -ENOMEM;
+- tcx_miniq_set_active(entry, true);
++ tcx_miniq_inc(entry);
+ mini_qdisc_pair_init(&q->miniqp_egress, sch, &tcx_entry(entry)->miniq);
+ if (created)
+ tcx_entry_update(dev, entry, false);
+@@ -302,7 +302,7 @@ static void clsact_destroy(struct Qdisc *sch)
+ tcf_block_put_ext(q->egress_block, sch, &q->egress_block_info);
+
+ if (ingress_entry) {
+- tcx_miniq_set_active(ingress_entry, false);
++ tcx_miniq_dec(ingress_entry);
+ if (!tcx_entry_is_active(ingress_entry)) {
+ tcx_entry_update(dev, NULL, true);
+ tcx_entry_free(ingress_entry);
+@@ -310,7 +310,7 @@ static void clsact_destroy(struct Qdisc *sch)
+ }
+
+ if (egress_entry) {
+- tcx_miniq_set_active(egress_entry, false);
++ tcx_miniq_dec(egress_entry);
+ if (!tcx_entry_is_active(egress_entry)) {
+ tcx_entry_update(dev, NULL, false);
+ tcx_entry_free(egress_entry);
+--
+2.43.0
+
--- /dev/null
+From ecdf1c4e966274f8edf5390de034c245a7575d8e Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Sat, 20 Apr 2024 11:09:01 +0200
+Subject: bpf: make timer data struct more generic
+
+From: Benjamin Tissoires <bentiss@kernel.org>
+
+[ Upstream commit be2749beff62e0d63cf97fe63cabc79a68443139 ]
+
+To be able to add workqueues and reuse most of the timer code, we need
+to make bpf_hrtimer more generic.
+
+There is no code change except that the new struct gets a new u64 flags
+attribute. We are still below 2 cache lines, so this shouldn't impact
+the currently running code.
+
+The ordering is also changed. Everything related to async callback
+is now on top of bpf_hrtimer.
+
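+The resulting layout can be sketched in userspace C with stub types (not
+the kernel definitions): generic code passes around the embedded cb, and
+timer-specific code recovers the containing struct, which is what
+container_of() is used for in the kernel.
+
+#include <stddef.h>
+#include <stdio.h>
+
+struct async_cb     { void *map; void *value; unsigned long long flags; };
+struct hrtimer_stub { long expires; };
+
+struct bpf_hrtimer_stub {
+        struct async_cb cb;             /* shared part, kept on top */
+        struct hrtimer_stub timer;      /* timer-specific part */
+};
+
+#define container_of_stub(ptr, type, member) \
+        ((type *)((char *)(ptr) - offsetof(type, member)))
+
+int main(void)
+{
+        struct bpf_hrtimer_stub t = { .timer.expires = 42 };
+        struct async_cb *cb = &t.cb;    /* what generic code passes around */
+        struct bpf_hrtimer_stub *back =
+                container_of_stub(cb, struct bpf_hrtimer_stub, cb);
+
+        printf("recovered expires = %ld\n", back->timer.expires);
+        return 0;
+}
+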
+Signed-off-by: Benjamin Tissoires <bentiss@kernel.org>
+Link: https://lore.kernel.org/r/20240420-bpf_wq-v2-1-6c986a5a741f@kernel.org
+Signed-off-by: Alexei Starovoitov <ast@kernel.org>
+Stable-dep-of: d4523831f07a ("bpf: Fail bpf_timer_cancel when callback is being cancelled")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ kernel/bpf/helpers.c | 71 ++++++++++++++++++++++++--------------------
+ 1 file changed, 38 insertions(+), 33 deletions(-)
+
+diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
+index 449b9a5d3fe3f..2544220c23338 100644
+--- a/kernel/bpf/helpers.c
++++ b/kernel/bpf/helpers.c
+@@ -1079,11 +1079,20 @@ const struct bpf_func_proto bpf_snprintf_proto = {
+ .arg5_type = ARG_CONST_SIZE_OR_ZERO,
+ };
+
++struct bpf_async_cb {
++ struct bpf_map *map;
++ struct bpf_prog *prog;
++ void __rcu *callback_fn;
++ void *value;
++ struct rcu_head rcu;
++ u64 flags;
++};
++
+ /* BPF map elements can contain 'struct bpf_timer'.
+ * Such map owns all of its BPF timers.
+ * 'struct bpf_timer' is allocated as part of map element allocation
+ * and it's zero initialized.
+- * That space is used to keep 'struct bpf_timer_kern'.
++ * That space is used to keep 'struct bpf_async_kern'.
+ * bpf_timer_init() allocates 'struct bpf_hrtimer', inits hrtimer, and
+ * remembers 'struct bpf_map *' pointer it's part of.
+ * bpf_timer_set_callback() increments prog refcnt and assign bpf callback_fn.
+@@ -1096,16 +1105,12 @@ const struct bpf_func_proto bpf_snprintf_proto = {
+ * freeing the timers when inner map is replaced or deleted by user space.
+ */
+ struct bpf_hrtimer {
++ struct bpf_async_cb cb;
+ struct hrtimer timer;
+- struct bpf_map *map;
+- struct bpf_prog *prog;
+- void __rcu *callback_fn;
+- void *value;
+- struct rcu_head rcu;
+ };
+
+ /* the actual struct hidden inside uapi struct bpf_timer */
+-struct bpf_timer_kern {
++struct bpf_async_kern {
+ struct bpf_hrtimer *timer;
+ /* bpf_spin_lock is used here instead of spinlock_t to make
+ * sure that it always fits into space reserved by struct bpf_timer
+@@ -1119,14 +1124,14 @@ static DEFINE_PER_CPU(struct bpf_hrtimer *, hrtimer_running);
+ static enum hrtimer_restart bpf_timer_cb(struct hrtimer *hrtimer)
+ {
+ struct bpf_hrtimer *t = container_of(hrtimer, struct bpf_hrtimer, timer);
+- struct bpf_map *map = t->map;
+- void *value = t->value;
++ struct bpf_map *map = t->cb.map;
++ void *value = t->cb.value;
+ bpf_callback_t callback_fn;
+ void *key;
+ u32 idx;
+
+ BTF_TYPE_EMIT(struct bpf_timer);
+- callback_fn = rcu_dereference_check(t->callback_fn, rcu_read_lock_bh_held());
++ callback_fn = rcu_dereference_check(t->cb.callback_fn, rcu_read_lock_bh_held());
+ if (!callback_fn)
+ goto out;
+
+@@ -1155,7 +1160,7 @@ static enum hrtimer_restart bpf_timer_cb(struct hrtimer *hrtimer)
+ return HRTIMER_NORESTART;
+ }
+
+-BPF_CALL_3(bpf_timer_init, struct bpf_timer_kern *, timer, struct bpf_map *, map,
++BPF_CALL_3(bpf_timer_init, struct bpf_async_kern *, timer, struct bpf_map *, map,
+ u64, flags)
+ {
+ clockid_t clockid = flags & (MAX_CLOCKS - 1);
+@@ -1163,8 +1168,8 @@ BPF_CALL_3(bpf_timer_init, struct bpf_timer_kern *, timer, struct bpf_map *, map
+ int ret = 0;
+
+ BUILD_BUG_ON(MAX_CLOCKS != 16);
+- BUILD_BUG_ON(sizeof(struct bpf_timer_kern) > sizeof(struct bpf_timer));
+- BUILD_BUG_ON(__alignof__(struct bpf_timer_kern) != __alignof__(struct bpf_timer));
++ BUILD_BUG_ON(sizeof(struct bpf_async_kern) > sizeof(struct bpf_timer));
++ BUILD_BUG_ON(__alignof__(struct bpf_async_kern) != __alignof__(struct bpf_timer));
+
+ if (in_nmi())
+ return -EOPNOTSUPP;
+@@ -1187,10 +1192,10 @@ BPF_CALL_3(bpf_timer_init, struct bpf_timer_kern *, timer, struct bpf_map *, map
+ ret = -ENOMEM;
+ goto out;
+ }
+- t->value = (void *)timer - map->record->timer_off;
+- t->map = map;
+- t->prog = NULL;
+- rcu_assign_pointer(t->callback_fn, NULL);
++ t->cb.value = (void *)timer - map->record->timer_off;
++ t->cb.map = map;
++ t->cb.prog = NULL;
++ rcu_assign_pointer(t->cb.callback_fn, NULL);
+ hrtimer_init(&t->timer, clockid, HRTIMER_MODE_REL_SOFT);
+ t->timer.function = bpf_timer_cb;
+ WRITE_ONCE(timer->timer, t);
+@@ -1222,7 +1227,7 @@ static const struct bpf_func_proto bpf_timer_init_proto = {
+ .arg3_type = ARG_ANYTHING,
+ };
+
+-BPF_CALL_3(bpf_timer_set_callback, struct bpf_timer_kern *, timer, void *, callback_fn,
++BPF_CALL_3(bpf_timer_set_callback, struct bpf_async_kern *, timer, void *, callback_fn,
+ struct bpf_prog_aux *, aux)
+ {
+ struct bpf_prog *prev, *prog = aux->prog;
+@@ -1237,7 +1242,7 @@ BPF_CALL_3(bpf_timer_set_callback, struct bpf_timer_kern *, timer, void *, callb
+ ret = -EINVAL;
+ goto out;
+ }
+- if (!atomic64_read(&t->map->usercnt)) {
++ if (!atomic64_read(&t->cb.map->usercnt)) {
+ /* maps with timers must be either held by user space
+ * or pinned in bpffs. Otherwise timer might still be
+ * running even when bpf prog is detached and user space
+@@ -1246,7 +1251,7 @@ BPF_CALL_3(bpf_timer_set_callback, struct bpf_timer_kern *, timer, void *, callb
+ ret = -EPERM;
+ goto out;
+ }
+- prev = t->prog;
++ prev = t->cb.prog;
+ if (prev != prog) {
+ /* Bump prog refcnt once. Every bpf_timer_set_callback()
+ * can pick different callback_fn-s within the same prog.
+@@ -1259,9 +1264,9 @@ BPF_CALL_3(bpf_timer_set_callback, struct bpf_timer_kern *, timer, void *, callb
+ if (prev)
+ /* Drop prev prog refcnt when swapping with new prog */
+ bpf_prog_put(prev);
+- t->prog = prog;
++ t->cb.prog = prog;
+ }
+- rcu_assign_pointer(t->callback_fn, callback_fn);
++ rcu_assign_pointer(t->cb.callback_fn, callback_fn);
+ out:
+ __bpf_spin_unlock_irqrestore(&timer->lock);
+ return ret;
+@@ -1275,7 +1280,7 @@ static const struct bpf_func_proto bpf_timer_set_callback_proto = {
+ .arg2_type = ARG_PTR_TO_FUNC,
+ };
+
+-BPF_CALL_3(bpf_timer_start, struct bpf_timer_kern *, timer, u64, nsecs, u64, flags)
++BPF_CALL_3(bpf_timer_start, struct bpf_async_kern *, timer, u64, nsecs, u64, flags)
+ {
+ struct bpf_hrtimer *t;
+ int ret = 0;
+@@ -1287,7 +1292,7 @@ BPF_CALL_3(bpf_timer_start, struct bpf_timer_kern *, timer, u64, nsecs, u64, fla
+ return -EINVAL;
+ __bpf_spin_lock_irqsave(&timer->lock);
+ t = timer->timer;
+- if (!t || !t->prog) {
++ if (!t || !t->cb.prog) {
+ ret = -EINVAL;
+ goto out;
+ }
+@@ -1315,18 +1320,18 @@ static const struct bpf_func_proto bpf_timer_start_proto = {
+ .arg3_type = ARG_ANYTHING,
+ };
+
+-static void drop_prog_refcnt(struct bpf_hrtimer *t)
++static void drop_prog_refcnt(struct bpf_async_cb *async)
+ {
+- struct bpf_prog *prog = t->prog;
++ struct bpf_prog *prog = async->prog;
+
+ if (prog) {
+ bpf_prog_put(prog);
+- t->prog = NULL;
+- rcu_assign_pointer(t->callback_fn, NULL);
++ async->prog = NULL;
++ rcu_assign_pointer(async->callback_fn, NULL);
+ }
+ }
+
+-BPF_CALL_1(bpf_timer_cancel, struct bpf_timer_kern *, timer)
++BPF_CALL_1(bpf_timer_cancel, struct bpf_async_kern *, timer)
+ {
+ struct bpf_hrtimer *t;
+ int ret = 0;
+@@ -1348,7 +1353,7 @@ BPF_CALL_1(bpf_timer_cancel, struct bpf_timer_kern *, timer)
+ ret = -EDEADLK;
+ goto out;
+ }
+- drop_prog_refcnt(t);
++ drop_prog_refcnt(&t->cb);
+ out:
+ __bpf_spin_unlock_irqrestore(&timer->lock);
+ /* Cancel the timer and wait for associated callback to finish
+@@ -1371,7 +1376,7 @@ static const struct bpf_func_proto bpf_timer_cancel_proto = {
+ */
+ void bpf_timer_cancel_and_free(void *val)
+ {
+- struct bpf_timer_kern *timer = val;
++ struct bpf_async_kern *timer = val;
+ struct bpf_hrtimer *t;
+
+ /* Performance optimization: read timer->timer without lock first. */
+@@ -1383,7 +1388,7 @@ void bpf_timer_cancel_and_free(void *val)
+ t = timer->timer;
+ if (!t)
+ goto out;
+- drop_prog_refcnt(t);
++ drop_prog_refcnt(&t->cb);
+ /* The subsequent bpf_timer_start/cancel() helpers won't be able to use
+ * this timer, since it won't be initialized.
+ */
+@@ -1410,7 +1415,7 @@ void bpf_timer_cancel_and_free(void *val)
+ */
+ if (this_cpu_read(hrtimer_running) != t)
+ hrtimer_cancel(&t->timer);
+- kfree_rcu(t, rcu);
++ kfree_rcu(t, cb.rcu);
+ }
+
+ BPF_CALL_2(bpf_kptr_xchg, void *, map_value, void *, ptr)
+--
+2.43.0
+
--- /dev/null
+From 43b291fab248d2e209f38a39dd65cb0c1dbeced9 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Sat, 20 Apr 2024 11:09:02 +0200
+Subject: bpf: replace bpf_timer_init with a generic helper
+
+From: Benjamin Tissoires <bentiss@kernel.org>
+
+[ Upstream commit 56b4a177ae6322173360a93ea828ad18570a5a14 ]
+
+No code change except for the new flags argument being stored in the
+local data struct.
+
+Signed-off-by: Benjamin Tissoires <bentiss@kernel.org>
+Link: https://lore.kernel.org/r/20240420-bpf_wq-v2-2-6c986a5a741f@kernel.org
+Signed-off-by: Alexei Starovoitov <ast@kernel.org>
+Stable-dep-of: d4523831f07a ("bpf: Fail bpf_timer_cancel when callback is being cancelled")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ kernel/bpf/helpers.c | 91 ++++++++++++++++++++++++++++++--------------
+ 1 file changed, 63 insertions(+), 28 deletions(-)
+
+diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
+index 2544220c23338..ff18b467d7d75 100644
+--- a/kernel/bpf/helpers.c
++++ b/kernel/bpf/helpers.c
+@@ -1111,7 +1111,10 @@ struct bpf_hrtimer {
+
+ /* the actual struct hidden inside uapi struct bpf_timer */
+ struct bpf_async_kern {
+- struct bpf_hrtimer *timer;
++ union {
++ struct bpf_async_cb *cb;
++ struct bpf_hrtimer *timer;
++ };
+ /* bpf_spin_lock is used here instead of spinlock_t to make
+ * sure that it always fits into space reserved by struct bpf_timer
+ * regardless of LOCKDEP and spinlock debug flags.
+@@ -1119,6 +1122,10 @@ struct bpf_async_kern {
+ struct bpf_spin_lock lock;
+ } __attribute__((aligned(8)));
+
++enum bpf_async_type {
++ BPF_ASYNC_TYPE_TIMER = 0,
++};
++
+ static DEFINE_PER_CPU(struct bpf_hrtimer *, hrtimer_running);
+
+ static enum hrtimer_restart bpf_timer_cb(struct hrtimer *hrtimer)
+@@ -1160,46 +1167,55 @@ static enum hrtimer_restart bpf_timer_cb(struct hrtimer *hrtimer)
+ return HRTIMER_NORESTART;
+ }
+
+-BPF_CALL_3(bpf_timer_init, struct bpf_async_kern *, timer, struct bpf_map *, map,
+- u64, flags)
++static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u64 flags,
++ enum bpf_async_type type)
+ {
+- clockid_t clockid = flags & (MAX_CLOCKS - 1);
++ struct bpf_async_cb *cb;
+ struct bpf_hrtimer *t;
++ clockid_t clockid;
++ size_t size;
+ int ret = 0;
+
+- BUILD_BUG_ON(MAX_CLOCKS != 16);
+- BUILD_BUG_ON(sizeof(struct bpf_async_kern) > sizeof(struct bpf_timer));
+- BUILD_BUG_ON(__alignof__(struct bpf_async_kern) != __alignof__(struct bpf_timer));
+-
+ if (in_nmi())
+ return -EOPNOTSUPP;
+
+- if (flags >= MAX_CLOCKS ||
+- /* similar to timerfd except _ALARM variants are not supported */
+- (clockid != CLOCK_MONOTONIC &&
+- clockid != CLOCK_REALTIME &&
+- clockid != CLOCK_BOOTTIME))
++ switch (type) {
++ case BPF_ASYNC_TYPE_TIMER:
++ size = sizeof(struct bpf_hrtimer);
++ break;
++ default:
+ return -EINVAL;
+- __bpf_spin_lock_irqsave(&timer->lock);
+- t = timer->timer;
++ }
++
++ __bpf_spin_lock_irqsave(&async->lock);
++ t = async->timer;
+ if (t) {
+ ret = -EBUSY;
+ goto out;
+ }
++
+ /* allocate hrtimer via map_kmalloc to use memcg accounting */
+- t = bpf_map_kmalloc_node(map, sizeof(*t), GFP_ATOMIC, map->numa_node);
+- if (!t) {
++ cb = bpf_map_kmalloc_node(map, size, GFP_ATOMIC, map->numa_node);
++ if (!cb) {
+ ret = -ENOMEM;
+ goto out;
+ }
+- t->cb.value = (void *)timer - map->record->timer_off;
+- t->cb.map = map;
+- t->cb.prog = NULL;
+- rcu_assign_pointer(t->cb.callback_fn, NULL);
+- hrtimer_init(&t->timer, clockid, HRTIMER_MODE_REL_SOFT);
+- t->timer.function = bpf_timer_cb;
+- WRITE_ONCE(timer->timer, t);
+- /* Guarantee the order between timer->timer and map->usercnt. So
++
++ if (type == BPF_ASYNC_TYPE_TIMER) {
++ clockid = flags & (MAX_CLOCKS - 1);
++ t = (struct bpf_hrtimer *)cb;
++
++ hrtimer_init(&t->timer, clockid, HRTIMER_MODE_REL_SOFT);
++ t->timer.function = bpf_timer_cb;
++ cb->value = (void *)async - map->record->timer_off;
++ }
++ cb->map = map;
++ cb->prog = NULL;
++ cb->flags = flags;
++ rcu_assign_pointer(cb->callback_fn, NULL);
++
++ WRITE_ONCE(async->cb, cb);
++ /* Guarantee the order between async->cb and map->usercnt. So
+ * when there are concurrent uref release and bpf timer init, either
+ * bpf_timer_cancel_and_free() called by uref release reads a no-NULL
+ * timer or atomic64_read() below returns a zero usercnt.
+@@ -1209,15 +1225,34 @@ BPF_CALL_3(bpf_timer_init, struct bpf_async_kern *, timer, struct bpf_map *, map
+ /* maps with timers must be either held by user space
+ * or pinned in bpffs.
+ */
+- WRITE_ONCE(timer->timer, NULL);
+- kfree(t);
++ WRITE_ONCE(async->cb, NULL);
++ kfree(cb);
+ ret = -EPERM;
+ }
+ out:
+- __bpf_spin_unlock_irqrestore(&timer->lock);
++ __bpf_spin_unlock_irqrestore(&async->lock);
+ return ret;
+ }
+
++BPF_CALL_3(bpf_timer_init, struct bpf_async_kern *, timer, struct bpf_map *, map,
++ u64, flags)
++{
++ clockid_t clockid = flags & (MAX_CLOCKS - 1);
++
++ BUILD_BUG_ON(MAX_CLOCKS != 16);
++ BUILD_BUG_ON(sizeof(struct bpf_async_kern) > sizeof(struct bpf_timer));
++ BUILD_BUG_ON(__alignof__(struct bpf_async_kern) != __alignof__(struct bpf_timer));
++
++ if (flags >= MAX_CLOCKS ||
++ /* similar to timerfd except _ALARM variants are not supported */
++ (clockid != CLOCK_MONOTONIC &&
++ clockid != CLOCK_REALTIME &&
++ clockid != CLOCK_BOOTTIME))
++ return -EINVAL;
++
++ return __bpf_async_init(timer, map, flags, BPF_ASYNC_TYPE_TIMER);
++}
++
+ static const struct bpf_func_proto bpf_timer_init_proto = {
+ .func = bpf_timer_init,
+ .gpl_only = true,
+--
+2.43.0
+
--- /dev/null
+From 2b2309ba4490d255735d2bda122e91c4cb6b3942 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 28 Jun 2024 14:29:30 +0800
+Subject: cachefiles: add missing lock protection when polling
+
+From: Jingbo Xu <jefflexu@linux.alibaba.com>
+
+[ Upstream commit cf5bb09e742a9cf6349127e868329a8f69b7a014 ]
+
+Add missing lock protection in the poll routine when iterating the
+xarray; otherwise:
+
+Even with RCU read lock held, only the slot of the radix tree is
+ensured to be pinned there, while the data structure (e.g. struct
+cachefiles_req) stored in the slot has no such guarantee. The poll
+routine will iterate the radix tree and dereference cachefiles_req
+accordingly. Thus the RCU read lock is not adequate in this case and a
+spinlock is needed here.
+
+Fixes: b817e22b2e91 ("cachefiles: narrow the scope of triggering EPOLLIN events in ondemand mode")
+Signed-off-by: Jingbo Xu <jefflexu@linux.alibaba.com>
+Signed-off-by: Baokun Li <libaokun1@huawei.com>
+Link: https://lore.kernel.org/r/20240628062930.2467993-10-libaokun@huaweicloud.com
+Acked-by: Jeff Layton <jlayton@kernel.org>
+Reviewed-by: Jia Zhu <zhujia.zj@bytedance.com>
+Reviewed-by: Gao Xiang <hsiangkao@linux.alibaba.com>
+Signed-off-by: Christian Brauner <brauner@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/cachefiles/daemon.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+diff --git a/fs/cachefiles/daemon.c b/fs/cachefiles/daemon.c
+index 06cdf1a8a16f6..89b11336a8369 100644
+--- a/fs/cachefiles/daemon.c
++++ b/fs/cachefiles/daemon.c
+@@ -366,14 +366,14 @@ static __poll_t cachefiles_daemon_poll(struct file *file,
+
+ if (cachefiles_in_ondemand_mode(cache)) {
+ if (!xa_empty(&cache->reqs)) {
+- rcu_read_lock();
++ xas_lock(&xas);
+ xas_for_each_marked(&xas, req, ULONG_MAX, CACHEFILES_REQ_NEW) {
+ if (!cachefiles_ondemand_is_reopening_read(req)) {
+ mask |= EPOLLIN;
+ break;
+ }
+ }
+- rcu_read_unlock();
++ xas_unlock(&xas);
+ }
+ } else {
+ if (test_bit(CACHEFILES_STATE_CHANGED, &cache->flags))
+--
+2.43.0
+
--- /dev/null
+From 24e9b672fc1556e6c98bf83cba2b8f76f00686ac Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 28 Jun 2024 14:29:27 +0800
+Subject: cachefiles: cancel all requests for the object that is being dropped
+
+From: Baokun Li <libaokun1@huawei.com>
+
+[ Upstream commit 751f524635a4f076117d714705eeddadaf6748ee ]
+
+Because requests for an object are useless after that object has been
+dropped, cancel them to avoid causing other problems.
+
+This prepares for the later addition of cancel_work_sync(). After a
+reopen request is generated, cancel it to keep cancel_work_sync() from
+blocking while it waits for the daemon to complete the reopen request.
+
+Signed-off-by: Baokun Li <libaokun1@huawei.com>
+Link: https://lore.kernel.org/r/20240628062930.2467993-7-libaokun@huaweicloud.com
+Acked-by: Jeff Layton <jlayton@kernel.org>
+Reviewed-by: Gao Xiang <hsiangkao@linux.alibaba.com>
+Reviewed-by: Jia Zhu <zhujia.zj@bytedance.com>
+Signed-off-by: Christian Brauner <brauner@kernel.org>
+Stable-dep-of: 12e009d60852 ("cachefiles: wait for ondemand_object_worker to finish when dropping object")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/cachefiles/ondemand.c | 19 +++++++++++++++++++
+ 1 file changed, 19 insertions(+)
+
+diff --git a/fs/cachefiles/ondemand.c b/fs/cachefiles/ondemand.c
+index 14f91a9fbe447..1c0fa7412a6fa 100644
+--- a/fs/cachefiles/ondemand.c
++++ b/fs/cachefiles/ondemand.c
+@@ -636,12 +636,31 @@ int cachefiles_ondemand_init_object(struct cachefiles_object *object)
+
+ void cachefiles_ondemand_clean_object(struct cachefiles_object *object)
+ {
++ unsigned long index;
++ struct cachefiles_req *req;
++ struct cachefiles_cache *cache;
++
+ if (!object->ondemand)
+ return;
+
+ cachefiles_ondemand_send_req(object, CACHEFILES_OP_CLOSE, 0,
+ cachefiles_ondemand_init_close_req, NULL);
++
++ if (!object->ondemand->ondemand_id)
++ return;
++
++ /* Cancel all requests for the object that is being dropped. */
++ cache = object->volume->cache;
++ xa_lock(&cache->reqs);
+ cachefiles_ondemand_set_object_dropping(object);
++ xa_for_each(&cache->reqs, index, req) {
++ if (req->object == object) {
++ req->error = -EIO;
++ complete(&req->done);
++ __xa_erase(&cache->reqs, index);
++ }
++ }
++ xa_unlock(&cache->reqs);
+ }
+
+ int cachefiles_ondemand_init_obj_info(struct cachefiles_object *object,
+--
+2.43.0
+
--- /dev/null
+From 73716c7a4b9aed45c388e0b99c5a90c9b14cb6fb Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 28 Jun 2024 14:29:29 +0800
+Subject: cachefiles: cyclic allocation of msg_id to avoid reuse
+
+From: Baokun Li <libaokun1@huawei.com>
+
+[ Upstream commit 19f4f399091478c95947f6bd7ad61622300c30d9 ]
+
+Reusing the msg_id after a maliciously completed reopen request may cause
+a read request to remain unprocessed and result in a hang, as shown below:
+
+ t1 | t2 | t3
+-------------------------------------------------
+cachefiles_ondemand_select_req
+ cachefiles_ondemand_object_is_close(A)
+ cachefiles_ondemand_set_object_reopening(A)
+ queue_work(fscache_object_wq, &info->work)
+ ondemand_object_worker
+ cachefiles_ondemand_init_object(A)
+ cachefiles_ondemand_send_req(OPEN)
+ // get msg_id 6
+ wait_for_completion(&req_A->done)
+cachefiles_ondemand_daemon_read
+ // read msg_id 6 req_A
+ cachefiles_ondemand_get_fd
+ copy_to_user
+ // Malicious completion msg_id 6
+ copen 6,-1
+ cachefiles_ondemand_copen
+ complete(&req_A->done)
+ // will not set the object to close
+ // because ondemand_id && fd is valid.
+
+ // ondemand_object_worker() is done
+ // but the object is still reopening.
+
+ // new open req_B
+ cachefiles_ondemand_init_object(B)
+ cachefiles_ondemand_send_req(OPEN)
+ // reuse msg_id 6
+process_open_req
+ copen 6,A.size
+ // The expected failed copen was executed successfully
+
+Expect copen to fail, and when it does, it closes the fd, which sets the
+object to close, and the close then triggers a reopen again. However,
+because the msg_id reuse results in a successful copen, the anonymous fd
+is not closed until the daemon exits. Therefore, read requests waiting
+for the reopen to complete may trigger a hung task warning.
+
+To avoid this issue, allocate the msg_id cyclically to avoid reusing the
+msg_id for a very short duration of time.
+
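+The idea can be sketched in userspace C with an array standing in for the
+xarray (names are made up): allocation starts searching right after the
+most recently handed out id and wraps around, so an id that has just been
+released is not reused immediately.
+
+#include <stdio.h>
+
+#define NSLOTS 8
+
+static int slot_used[NSLOTS];
+static unsigned int msg_id_next;
+
+static int alloc_msg_id(void)
+{
+        unsigned int i;
+
+        for (i = 0; i < NSLOTS; i++) {
+                unsigned int id = (msg_id_next + i) % NSLOTS;
+
+                if (!slot_used[id]) {
+                        slot_used[id] = 1;
+                        msg_id_next = id + 1;   /* next search starts after us */
+                        return id;
+                }
+        }
+        return -1;                              /* all slots busy */
+}
+
+int main(void)
+{
+        int a = alloc_msg_id();         /* 0 */
+        int b = alloc_msg_id();         /* 1 */
+
+        slot_used[a] = 0;               /* request 'a' completes */
+        /* a fresh allocation does not hand out id 0 again right away */
+        printf("a=%d b=%d next=%d\n", a, b, alloc_msg_id());   /* next = 2 */
+        return 0;
+}
+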
+Fixes: c8383054506c ("cachefiles: notify the user daemon when looking up cookie")
+Signed-off-by: Baokun Li <libaokun1@huawei.com>
+Link: https://lore.kernel.org/r/20240628062930.2467993-9-libaokun@huaweicloud.com
+Acked-by: Jeff Layton <jlayton@kernel.org>
+Reviewed-by: Gao Xiang <hsiangkao@linux.alibaba.com>
+Reviewed-by: Jia Zhu <zhujia.zj@bytedance.com>
+Signed-off-by: Christian Brauner <brauner@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/cachefiles/internal.h | 1 +
+ fs/cachefiles/ondemand.c | 20 ++++++++++++++++----
+ 2 files changed, 17 insertions(+), 4 deletions(-)
+
+diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h
+index a1a1d25e95147..7b99bd98de75b 100644
+--- a/fs/cachefiles/internal.h
++++ b/fs/cachefiles/internal.h
+@@ -129,6 +129,7 @@ struct cachefiles_cache {
+ unsigned long req_id_next;
+ struct xarray ondemand_ids; /* xarray for ondemand_id allocation */
+ u32 ondemand_id_next;
++ u32 msg_id_next;
+ };
+
+ static inline bool cachefiles_in_ondemand_mode(struct cachefiles_cache *cache)
+diff --git a/fs/cachefiles/ondemand.c b/fs/cachefiles/ondemand.c
+index 6b94f616e6579..7e4874f60de10 100644
+--- a/fs/cachefiles/ondemand.c
++++ b/fs/cachefiles/ondemand.c
+@@ -505,20 +505,32 @@ static int cachefiles_ondemand_send_req(struct cachefiles_object *object,
+ smp_mb();
+
+ if (opcode == CACHEFILES_OP_CLOSE &&
+- !cachefiles_ondemand_object_is_open(object)) {
++ !cachefiles_ondemand_object_is_open(object)) {
+ WARN_ON_ONCE(object->ondemand->ondemand_id == 0);
+ xas_unlock(&xas);
+ ret = -EIO;
+ goto out;
+ }
+
+- xas.xa_index = 0;
++ /*
++ * Cyclically find a free xas to avoid msg_id reuse that would
++ * cause the daemon to successfully copen a stale msg_id.
++ */
++ xas.xa_index = cache->msg_id_next;
+ xas_find_marked(&xas, UINT_MAX, XA_FREE_MARK);
++ if (xas.xa_node == XAS_RESTART) {
++ xas.xa_index = 0;
++ xas_find_marked(&xas, cache->msg_id_next - 1, XA_FREE_MARK);
++ }
+ if (xas.xa_node == XAS_RESTART)
+ xas_set_err(&xas, -EBUSY);
++
+ xas_store(&xas, req);
+- xas_clear_mark(&xas, XA_FREE_MARK);
+- xas_set_mark(&xas, CACHEFILES_REQ_NEW);
++ if (xas_valid(&xas)) {
++ cache->msg_id_next = xas.xa_index + 1;
++ xas_clear_mark(&xas, XA_FREE_MARK);
++ xas_set_mark(&xas, CACHEFILES_REQ_NEW);
++ }
+ xas_unlock(&xas);
+ } while (xas_nomem(&xas, GFP_KERNEL));
+
+--
+2.43.0
+
--- /dev/null
+From d47a352fe8600270cf3da4df70547c44fd45364a Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 28 Jun 2024 14:29:25 +0800
+Subject: cachefiles: propagate errors from vfs_getxattr() to avoid infinite
+ loop
+
+From: Baokun Li <libaokun1@huawei.com>
+
+[ Upstream commit 0ece614a52bc9d219b839a6a29282b30d10e0c48 ]
+
+In cachefiles_check_volume_xattr(), the error returned by vfs_getxattr()
+is not passed to ret, so it ends up returning -ESTALE, which leads to an
+endless loop as follows:
+
+cachefiles_acquire_volume
+retry:
+ ret = cachefiles_check_volume_xattr
+ ret = -ESTALE
+ xlen = vfs_getxattr // return -EIO
+ // The ret is not updated when xlen < 0, so -ESTALE is returned.
+ return ret
+ // Supposed to jump out of the loop at this judgement.
+ if (ret != -ESTALE)
+ goto error_dir;
+ cachefiles_bury_object
+ // EIO causes rename failure
+ goto retry;
+
+Hence propagate the error returned by vfs_getxattr() to avoid the above
+issue. Do the same in cachefiles_check_auxdata().
+
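+A minimal userspace sketch of the control flow described above (made-up
+names, not the cachefiles functions): once the real error is propagated,
+the caller stops treating every failure as -ESTALE and breaks out of the
+retry loop.
+
+#include <errno.h>
+#include <stdio.h>
+
+static int fake_getxattr(void)
+{
+        return -EIO;            /* the lower layer keeps failing with -EIO */
+}
+
+static int check_xattr(void)
+{
+        int ret = -ESTALE;      /* assume "stale, rebuild it" by default */
+        int xlen = fake_getxattr();
+
+        if (xlen < 0)
+                ret = xlen;     /* the fix: pass the real error up */
+        return ret;
+}
+
+int main(void)
+{
+        int retries, ret = 0;
+
+        for (retries = 0; retries < 5; retries++) {
+                ret = check_xattr();
+                if (ret != -ESTALE)     /* only -ESTALE justifies a retry */
+                        break;
+        }
+        printf("stopped with %d after %d attempt(s)\n", ret, retries + 1);
+        return 0;
+}
+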
+Fixes: 32e150037dce ("fscache, cachefiles: Store the volume coherency data")
+Fixes: 72b957856b0c ("cachefiles: Implement metadata/coherency data storage in xattrs")
+Signed-off-by: Baokun Li <libaokun1@huawei.com>
+Link: https://lore.kernel.org/r/20240628062930.2467993-5-libaokun@huaweicloud.com
+Reviewed-by: Gao Xiang <hsiangkao@linux.alibaba.com>
+Signed-off-by: Christian Brauner <brauner@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/cachefiles/xattr.c | 5 ++++-
+ 1 file changed, 4 insertions(+), 1 deletion(-)
+
+diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c
+index bcb6173943ee4..4dd8a993c60a8 100644
+--- a/fs/cachefiles/xattr.c
++++ b/fs/cachefiles/xattr.c
+@@ -110,9 +110,11 @@ int cachefiles_check_auxdata(struct cachefiles_object *object, struct file *file
+ if (xlen == 0)
+ xlen = vfs_getxattr(&nop_mnt_idmap, dentry, cachefiles_xattr_cache, buf, tlen);
+ if (xlen != tlen) {
+- if (xlen < 0)
++ if (xlen < 0) {
++ ret = xlen;
+ trace_cachefiles_vfs_error(object, file_inode(file), xlen,
+ cachefiles_trace_getxattr_error);
++ }
+ if (xlen == -EIO)
+ cachefiles_io_error_obj(
+ object,
+@@ -252,6 +254,7 @@ int cachefiles_check_volume_xattr(struct cachefiles_volume *volume)
+ xlen = vfs_getxattr(&nop_mnt_idmap, dentry, cachefiles_xattr_cache, buf, len);
+ if (xlen != len) {
+ if (xlen < 0) {
++ ret = xlen;
+ trace_cachefiles_vfs_error(NULL, d_inode(dentry), xlen,
+ cachefiles_trace_getxattr_error);
+ if (xlen == -EIO)
+--
+2.43.0
+
--- /dev/null
+From 6cdfa341f0035832d957ef91bd0737302d9e79fb Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 28 Jun 2024 14:29:26 +0800
+Subject: cachefiles: stop sending new request when dropping object
+
+From: Baokun Li <libaokun1@huawei.com>
+
+[ Upstream commit b2415d1f4566b6939acacc69637eaa57815829c1 ]
+
+Add CACHEFILES_ONDEMAND_OBJSTATE_DROPPING, which indicates that the
+cachefiles object is being dropped. It is set after the close request for
+the dropped object completes, and no new requests are allowed to be sent
+once this state is reached.
+
+This prepares for the later addition of cancel_work_sync(). It prevents
+leftover reopen requests from being sent, to avoid processing unnecessary
+requests and to avoid cancel_work_sync() blocking while waiting for the
+daemon to complete the reopen requests.
+
+Signed-off-by: Baokun Li <libaokun1@huawei.com>
+Link: https://lore.kernel.org/r/20240628062930.2467993-6-libaokun@huaweicloud.com
+Acked-by: Jeff Layton <jlayton@kernel.org>
+Reviewed-by: Gao Xiang <hsiangkao@linux.alibaba.com>
+Reviewed-by: Jia Zhu <zhujia.zj@bytedance.com>
+Signed-off-by: Christian Brauner <brauner@kernel.org>
+Stable-dep-of: 12e009d60852 ("cachefiles: wait for ondemand_object_worker to finish when dropping object")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/cachefiles/internal.h | 2 ++
+ fs/cachefiles/ondemand.c | 10 ++++++++--
+ 2 files changed, 10 insertions(+), 2 deletions(-)
+
+diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h
+index 6845a90cdfcce..a1a1d25e95147 100644
+--- a/fs/cachefiles/internal.h
++++ b/fs/cachefiles/internal.h
+@@ -48,6 +48,7 @@ enum cachefiles_object_state {
+ CACHEFILES_ONDEMAND_OBJSTATE_CLOSE, /* Anonymous fd closed by daemon or initial state */
+ CACHEFILES_ONDEMAND_OBJSTATE_OPEN, /* Anonymous fd associated with object is available */
+ CACHEFILES_ONDEMAND_OBJSTATE_REOPENING, /* Object that was closed and is being reopened. */
++ CACHEFILES_ONDEMAND_OBJSTATE_DROPPING, /* Object is being dropped. */
+ };
+
+ struct cachefiles_ondemand_info {
+@@ -335,6 +336,7 @@ cachefiles_ondemand_set_object_##_state(struct cachefiles_object *object) \
+ CACHEFILES_OBJECT_STATE_FUNCS(open, OPEN);
+ CACHEFILES_OBJECT_STATE_FUNCS(close, CLOSE);
+ CACHEFILES_OBJECT_STATE_FUNCS(reopening, REOPENING);
++CACHEFILES_OBJECT_STATE_FUNCS(dropping, DROPPING);
+
+ static inline bool cachefiles_ondemand_is_reopening_read(struct cachefiles_req *req)
+ {
+diff --git a/fs/cachefiles/ondemand.c b/fs/cachefiles/ondemand.c
+index 89f118d68d125..14f91a9fbe447 100644
+--- a/fs/cachefiles/ondemand.c
++++ b/fs/cachefiles/ondemand.c
+@@ -494,7 +494,8 @@ static int cachefiles_ondemand_send_req(struct cachefiles_object *object,
+ */
+ xas_lock(&xas);
+
+- if (test_bit(CACHEFILES_DEAD, &cache->flags)) {
++ if (test_bit(CACHEFILES_DEAD, &cache->flags) ||
++ cachefiles_ondemand_object_is_dropping(object)) {
+ xas_unlock(&xas);
+ ret = -EIO;
+ goto out;
+@@ -535,7 +536,8 @@ static int cachefiles_ondemand_send_req(struct cachefiles_object *object,
+ * If error occurs after creating the anonymous fd,
+ * cachefiles_ondemand_fd_release() will set object to close.
+ */
+- if (opcode == CACHEFILES_OP_OPEN)
++ if (opcode == CACHEFILES_OP_OPEN &&
++ !cachefiles_ondemand_object_is_dropping(object))
+ cachefiles_ondemand_set_object_close(object);
+ kfree(req);
+ return ret;
+@@ -634,8 +636,12 @@ int cachefiles_ondemand_init_object(struct cachefiles_object *object)
+
+ void cachefiles_ondemand_clean_object(struct cachefiles_object *object)
+ {
++ if (!object->ondemand)
++ return;
++
+ cachefiles_ondemand_send_req(object, CACHEFILES_OP_CLOSE, 0,
+ cachefiles_ondemand_init_close_req, NULL);
++ cachefiles_ondemand_set_object_dropping(object);
+ }
+
+ int cachefiles_ondemand_init_obj_info(struct cachefiles_object *object,
+--
+2.43.0
+
--- /dev/null
+From 76085f8ae3157d42f87bb0b085a926a61b19d8b7 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 28 Jun 2024 14:29:28 +0800
+Subject: cachefiles: wait for ondemand_object_worker to finish when dropping
+ object
+
+From: Hou Tao <houtao1@huawei.com>
+
+[ Upstream commit 12e009d60852f7bce0afc373ca0b320f14150418 ]
+
+When queuing ondemand_object_worker() to re-open the object,
+cachefiles_object is not pinned. The cachefiles_object may be freed when
+the pending read request is completed intentionally and the related
+erofs is umounted. If ondemand_object_worker() runs after the object is
+freed, it will incur a use-after-free problem, as shown below.
+
+process A process B process C process D
+
+cachefiles_ondemand_send_req()
+// send a read req X
+// wait for its completion
+
+ // close ondemand fd
+ cachefiles_ondemand_fd_release()
+ // set object as CLOSE
+
+ cachefiles_ondemand_daemon_read()
+ // set object as REOPENING
+ queue_work(fscache_wq, &info->ondemand_work)
+
+ // close /dev/cachefiles
+ cachefiles_daemon_release
+ cachefiles_flush_reqs
+ complete(&req->done)
+
+// read req X is completed
+// umount the erofs fs
+cachefiles_put_object()
+// object will be freed
+cachefiles_ondemand_deinit_obj_info()
+kmem_cache_free(object)
+ // both info and object are freed
+ ondemand_object_worker()
+
+When dropping an object, it is no longer necessary to reopen the object,
+so use cancel_work_sync() to cancel or wait for ondemand_object_worker()
+to finish.
+
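+The shape of the fix can be sketched with pthreads as a userspace stand-in
+for the kernel workqueue (made-up names): the object may only be freed once
+the asynchronous worker that still uses it is known to have finished, which
+is the role cancel_work_sync() plays here.
+
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+struct object {
+        int id;
+};
+
+static void *object_worker(void *arg)
+{
+        struct object *obj = arg;
+
+        usleep(1000);           /* stand-in for the deferred reopen */
+        printf("worker finished for object %d\n", obj->id);
+        return NULL;
+}
+
+int main(void)
+{
+        struct object *obj = calloc(1, sizeof(*obj));
+        pthread_t worker;
+
+        if (!obj)
+                return 1;
+        obj->id = 42;
+        if (pthread_create(&worker, NULL, object_worker, obj))
+                return 1;
+
+        /* Freeing obj right here would race with object_worker() and give
+         * the same use-after-free as in the report above. Waiting for the
+         * worker first makes the free safe. */
+        pthread_join(worker, NULL);
+        free(obj);
+        printf("object freed only after the worker is done\n");
+        return 0;
+}
+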
+Fixes: 0a7e54c1959c ("cachefiles: resend an open request if the read request's object is closed")
+Signed-off-by: Hou Tao <houtao1@huawei.com>
+Signed-off-by: Baokun Li <libaokun1@huawei.com>
+Link: https://lore.kernel.org/r/20240628062930.2467993-8-libaokun@huaweicloud.com
+Acked-by: Jeff Layton <jlayton@kernel.org>
+Reviewed-by: Jia Zhu <zhujia.zj@bytedance.com>
+Reviewed-by: Gao Xiang <hsiangkao@linux.alibaba.com>
+Signed-off-by: Christian Brauner <brauner@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/cachefiles/ondemand.c | 3 +++
+ 1 file changed, 3 insertions(+)
+
+diff --git a/fs/cachefiles/ondemand.c b/fs/cachefiles/ondemand.c
+index 1c0fa7412a6fa..6b94f616e6579 100644
+--- a/fs/cachefiles/ondemand.c
++++ b/fs/cachefiles/ondemand.c
+@@ -661,6 +661,9 @@ void cachefiles_ondemand_clean_object(struct cachefiles_object *object)
+ }
+ }
+ xa_unlock(&cache->reqs);
++
++ /* Wait for ondemand_object_worker() to finish to avoid UAF. */
++ cancel_work_sync(&object->ondemand->ondemand_work);
+ }
+
+ int cachefiles_ondemand_init_obj_info(struct cachefiles_object *object,
+--
+2.43.0
+
--- /dev/null
+From 0307ec007a29a2655254cbfaf6dc31b938d22992 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 3 Jul 2024 16:57:17 +0200
+Subject: dsa: lan9303: Fix mapping between DSA port number and PHY address
+
+From: Christian Eggers <ceggers@arri.de>
+
+[ Upstream commit 0005b2dc43f96b93fc5b0850d7ca3f7aeac9129c ]
+
+The 'phy' parameter supplied to lan9303_phy_read/_write was sometimes a
+DSA port number and sometimes a PHY address. This isn't a problem as
+long as they are equal. But if the external phy_addr_sel_strap pin is
+wired to 'high', the PHY addresses change from 0-1-2 to 1-2-3 (CPU,
+slave0, slave1). In this case, lan9303_phy_read/_write must translate
+between DSA port numbers and the corresponding PHY address.
+
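+The mapping can be shown in isolation as a small userspace sketch (made-up
+names, not the driver code): the DSA port number is always 0..2, and the
+PHY address is that port number plus the base selected by the strap pin.
+
+#include <stdio.h>
+
+#define LAN9303_NUM_PORTS 3
+
+static int port_to_phy_addr(int port, int phy_addr_base)
+{
+        if (port < 0 || port >= LAN9303_NUM_PORTS)
+                return -1;                      /* no such port */
+        return phy_addr_base + port;
+}
+
+int main(void)
+{
+        int base;
+
+        for (base = 0; base <= 1; base++) {     /* strap low = 0, high = 1 */
+                int port;
+
+                printf("phy_addr_base = %d:", base);
+                for (port = 0; port < LAN9303_NUM_PORTS; port++)
+                        printf("  port %d -> PHY %d",
+                               port, port_to_phy_addr(port, base));
+                printf("\n");
+        }
+        return 0;
+}
+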
+Fixes: a1292595e006 ("net: dsa: add new DSA switch driver for the SMSC-LAN9303")
+Signed-off-by: Christian Eggers <ceggers@arri.de>
+Reviewed-by: Michal Kubiak <michal.kubiak@intel.com>
+Reviewed-by: Florian Fainelli <florian.fainelli@broadcom.com>
+Reviewed-by: Vladimir Oltean <olteanv@gmail.com>
+Link: https://patch.msgid.link/20240703145718.19951-1-ceggers@arri.de
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/dsa/lan9303-core.c | 23 ++++++++++-------------
+ 1 file changed, 10 insertions(+), 13 deletions(-)
+
+diff --git a/drivers/net/dsa/lan9303-core.c b/drivers/net/dsa/lan9303-core.c
+index 666b4d766c005..1f7000f90bb78 100644
+--- a/drivers/net/dsa/lan9303-core.c
++++ b/drivers/net/dsa/lan9303-core.c
+@@ -1048,31 +1048,31 @@ static int lan9303_get_sset_count(struct dsa_switch *ds, int port, int sset)
+ return ARRAY_SIZE(lan9303_mib);
+ }
+
+-static int lan9303_phy_read(struct dsa_switch *ds, int phy, int regnum)
++static int lan9303_phy_read(struct dsa_switch *ds, int port, int regnum)
+ {
+ struct lan9303 *chip = ds->priv;
+ int phy_base = chip->phy_addr_base;
+
+- if (phy == phy_base)
++ if (port == 0)
+ return lan9303_virt_phy_reg_read(chip, regnum);
+- if (phy > phy_base + 2)
++ if (port > 2)
+ return -ENODEV;
+
+- return chip->ops->phy_read(chip, phy, regnum);
++ return chip->ops->phy_read(chip, phy_base + port, regnum);
+ }
+
+-static int lan9303_phy_write(struct dsa_switch *ds, int phy, int regnum,
++static int lan9303_phy_write(struct dsa_switch *ds, int port, int regnum,
+ u16 val)
+ {
+ struct lan9303 *chip = ds->priv;
+ int phy_base = chip->phy_addr_base;
+
+- if (phy == phy_base)
++ if (port == 0)
+ return lan9303_virt_phy_reg_write(chip, regnum, val);
+- if (phy > phy_base + 2)
++ if (port > 2)
+ return -ENODEV;
+
+- return chip->ops->phy_write(chip, phy, regnum, val);
++ return chip->ops->phy_write(chip, phy_base + port, regnum, val);
+ }
+
+ static int lan9303_port_enable(struct dsa_switch *ds, int port,
+@@ -1100,7 +1100,7 @@ static void lan9303_port_disable(struct dsa_switch *ds, int port)
+ vlan_vid_del(dsa_port_to_conduit(dp), htons(ETH_P_8021Q), port);
+
+ lan9303_disable_processing_port(chip, port);
+- lan9303_phy_write(ds, chip->phy_addr_base + port, MII_BMCR, BMCR_PDOWN);
++ lan9303_phy_write(ds, port, MII_BMCR, BMCR_PDOWN);
+ }
+
+ static int lan9303_port_bridge_join(struct dsa_switch *ds, int port,
+@@ -1375,8 +1375,6 @@ static const struct dsa_switch_ops lan9303_switch_ops = {
+
+ static int lan9303_register_switch(struct lan9303 *chip)
+ {
+- int base;
+-
+ chip->ds = devm_kzalloc(chip->dev, sizeof(*chip->ds), GFP_KERNEL);
+ if (!chip->ds)
+ return -ENOMEM;
+@@ -1386,8 +1384,7 @@ static int lan9303_register_switch(struct lan9303 *chip)
+ chip->ds->priv = chip;
+ chip->ds->ops = &lan9303_switch_ops;
+ chip->ds->phylink_mac_ops = &lan9303_phylink_mac_ops;
+- base = chip->phy_addr_base;
+- chip->ds->phys_mii_mask = GENMASK(LAN9303_NUM_PORTS - 1 + base, base);
++ chip->ds->phys_mii_mask = GENMASK(LAN9303_NUM_PORTS - 1, 0);
+
+ return dsa_register_switch(chip->ds);
+ }
+--
+2.43.0
+
--- /dev/null
+From 1a0f643dd1a825ee9ec254cd5a83ff3f927f4bec Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 9 Jul 2024 08:19:43 +0200
+Subject: ethtool: netlink: do not return SQI value if link is down
+
+From: Oleksij Rempel <o.rempel@pengutronix.de>
+
+[ Upstream commit c184cf94e73b04ff7048d045f5413899bc664788 ]
+
+Do not attach the SQI value if the link is down. "SQI values are only
+valid if link-up condition is present" per the OpenAlliance 100Base-T1
+Interoperability Test suite specification [1]. The same rule would apply
+to other link types.
+
+[1] https://opensig.org/automotive-ethernet-specifications/#
+
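+A minimal sketch of the error handling introduced below (helper names as
+in the diff; -ENETDOWN is treated like -EOPNOTSUPP, i.e. "nothing to
+report", rather than as a fatal error):
+
+    ret = linkstate_get_sqi(dev);        /* -ENETDOWN when !phydev->link */
+    if (linkstate_sqi_critical_error(ret))
+        goto out;                        /* only real errors abort the reply */
+    data->sqi = ret;
+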
+Fixes: 806602191592 ("ethtool: provide UAPI for PHY Signal Quality Index (SQI)")
+Signed-off-by: Oleksij Rempel <o.rempel@pengutronix.de>
+Reviewed-by: Andrew Lunn <andrew@lunn.ch>
+Reviewed-by: Woojung Huh <woojung.huh@microchip.com>
+Link: https://patch.msgid.link/20240709061943.729381-1-o.rempel@pengutronix.de
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/ethtool/linkstate.c | 41 ++++++++++++++++++++++++++++-------------
+ 1 file changed, 28 insertions(+), 13 deletions(-)
+
+diff --git a/net/ethtool/linkstate.c b/net/ethtool/linkstate.c
+index b2de2108b356a..34d76e87847d0 100644
+--- a/net/ethtool/linkstate.c
++++ b/net/ethtool/linkstate.c
+@@ -37,6 +37,8 @@ static int linkstate_get_sqi(struct net_device *dev)
+ mutex_lock(&phydev->lock);
+ if (!phydev->drv || !phydev->drv->get_sqi)
+ ret = -EOPNOTSUPP;
++ else if (!phydev->link)
++ ret = -ENETDOWN;
+ else
+ ret = phydev->drv->get_sqi(phydev);
+ mutex_unlock(&phydev->lock);
+@@ -55,6 +57,8 @@ static int linkstate_get_sqi_max(struct net_device *dev)
+ mutex_lock(&phydev->lock);
+ if (!phydev->drv || !phydev->drv->get_sqi_max)
+ ret = -EOPNOTSUPP;
++ else if (!phydev->link)
++ ret = -ENETDOWN;
+ else
+ ret = phydev->drv->get_sqi_max(phydev);
+ mutex_unlock(&phydev->lock);
+@@ -62,6 +66,17 @@ static int linkstate_get_sqi_max(struct net_device *dev)
+ return ret;
+ };
+
++static bool linkstate_sqi_critical_error(int sqi)
++{
++ return sqi < 0 && sqi != -EOPNOTSUPP && sqi != -ENETDOWN;
++}
++
++static bool linkstate_sqi_valid(struct linkstate_reply_data *data)
++{
++ return data->sqi >= 0 && data->sqi_max >= 0 &&
++ data->sqi <= data->sqi_max;
++}
++
+ static int linkstate_get_link_ext_state(struct net_device *dev,
+ struct linkstate_reply_data *data)
+ {
+@@ -93,12 +108,12 @@ static int linkstate_prepare_data(const struct ethnl_req_info *req_base,
+ data->link = __ethtool_get_link(dev);
+
+ ret = linkstate_get_sqi(dev);
+- if (ret < 0 && ret != -EOPNOTSUPP)
++ if (linkstate_sqi_critical_error(ret))
+ goto out;
+ data->sqi = ret;
+
+ ret = linkstate_get_sqi_max(dev);
+- if (ret < 0 && ret != -EOPNOTSUPP)
++ if (linkstate_sqi_critical_error(ret))
+ goto out;
+ data->sqi_max = ret;
+
+@@ -136,11 +151,10 @@ static int linkstate_reply_size(const struct ethnl_req_info *req_base,
+ len = nla_total_size(sizeof(u8)) /* LINKSTATE_LINK */
+ + 0;
+
+- if (data->sqi != -EOPNOTSUPP)
+- len += nla_total_size(sizeof(u32));
+-
+- if (data->sqi_max != -EOPNOTSUPP)
+- len += nla_total_size(sizeof(u32));
++ if (linkstate_sqi_valid(data)) {
++ len += nla_total_size(sizeof(u32)); /* LINKSTATE_SQI */
++ len += nla_total_size(sizeof(u32)); /* LINKSTATE_SQI_MAX */
++ }
+
+ if (data->link_ext_state_provided)
+ len += nla_total_size(sizeof(u8)); /* LINKSTATE_EXT_STATE */
+@@ -164,13 +178,14 @@ static int linkstate_fill_reply(struct sk_buff *skb,
+ nla_put_u8(skb, ETHTOOL_A_LINKSTATE_LINK, !!data->link))
+ return -EMSGSIZE;
+
+- if (data->sqi != -EOPNOTSUPP &&
+- nla_put_u32(skb, ETHTOOL_A_LINKSTATE_SQI, data->sqi))
+- return -EMSGSIZE;
++ if (linkstate_sqi_valid(data)) {
++ if (nla_put_u32(skb, ETHTOOL_A_LINKSTATE_SQI, data->sqi))
++ return -EMSGSIZE;
+
+- if (data->sqi_max != -EOPNOTSUPP &&
+- nla_put_u32(skb, ETHTOOL_A_LINKSTATE_SQI_MAX, data->sqi_max))
+- return -EMSGSIZE;
++ if (nla_put_u32(skb, ETHTOOL_A_LINKSTATE_SQI_MAX,
++ data->sqi_max))
++ return -EMSGSIZE;
++ }
+
+ if (data->link_ext_state_provided) {
+ if (nla_put_u8(skb, ETHTOOL_A_LINKSTATE_EXT_STATE,
+--
+2.43.0
+
--- /dev/null
+From c22aead6a1ca317701bf7dcf412a10957c2313f9 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 2 Jul 2024 18:44:48 -0400
+Subject: filelock: fix potential use-after-free in posix_lock_inode
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Jeff Layton <jlayton@kernel.org>
+
+[ Upstream commit 1b3ec4f7c03d4b07bad70697d7e2f4088d2cfe92 ]
+
+Light Hsieh reported a KASAN UAF warning in trace_posix_lock_inode().
+The request pointer had been changed earlier to point to a lock entry
+that was added to the inode's list. However, before the tracepoint could
+fire, another task raced in and freed that lock.
+
+Fix this by moving the tracepoint inside the spinlock, which should
+ensure that this doesn't happen.
+
+Fixes: 74f6f5912693 ("locks: fix KASAN: use-after-free in trace_event_raw_event_filelock_lock")
+Link: https://lore.kernel.org/linux-fsdevel/724ffb0a2962e912ea62bb0515deadf39c325112.camel@kernel.org/
+Reported-by: Light Hsieh (謝明燈) <Light.Hsieh@mediatek.com>
+Signed-off-by: Jeff Layton <jlayton@kernel.org>
+Link: https://lore.kernel.org/r/20240702-filelock-6-10-v1-1-96e766aadc98@kernel.org
+Reviewed-by: Alexander Aring <aahringo@redhat.com>
+Signed-off-by: Christian Brauner <brauner@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/locks.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/fs/locks.c b/fs/locks.c
+index c360d1992d21f..bdd94c32256f5 100644
+--- a/fs/locks.c
++++ b/fs/locks.c
+@@ -1367,9 +1367,9 @@ static int posix_lock_inode(struct inode *inode, struct file_lock *request,
+ locks_wake_up_blocks(&left->c);
+ }
+ out:
++ trace_posix_lock_inode(inode, request, error);
+ spin_unlock(&ctx->flc_lock);
+ percpu_up_read(&file_rwsem);
+- trace_posix_lock_inode(inode, request, error);
+ /*
+ * Free any unused locks.
+ */
+--
+2.43.0
+
--- /dev/null
+From 72c979fe95179e528834031562234e7a9ee690c1 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 3 Apr 2024 10:10:08 +0800
+Subject: fs/dcache: Re-use value stored to dentry->d_flags instead of
+ re-reading
+
+From: linke li <lilinke99@qq.com>
+
+[ Upstream commit 8bfb40be31ddea0cb4664b352e1797cfe6c91976 ]
+
+Currently, __d_clear_type_and_inode() writes the value flags to
+dentry->d_flags, then immediately re-reads it in order to use it in an if
+statement. This re-read is useless because no other update to
+dentry->d_flags can occur at this point.
+
+This commit therefore re-uses flags in the if statement instead of
+re-reading dentry->d_flags.
+
+Signed-off-by: linke li <lilinke99@qq.com>
+Link: https://lore.kernel.org/r/tencent_5E187BD0A61BA28605E85405F15228254D0A@qq.com
+Reviewed-by: Jan Kara <jack@suse.cz>
+Signed-off-by: Christian Brauner <brauner@kernel.org>
+Stable-dep-of: aabfe57ebaa7 ("vfs: don't mod negative dentry count when on shrinker list")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/dcache.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/fs/dcache.c b/fs/dcache.c
+index 71a8e943a0fa5..407095188f83a 100644
+--- a/fs/dcache.c
++++ b/fs/dcache.c
+@@ -355,7 +355,7 @@ static inline void __d_clear_type_and_inode(struct dentry *dentry)
+ flags &= ~DCACHE_ENTRY_TYPE;
+ WRITE_ONCE(dentry->d_flags, flags);
+ dentry->d_inode = NULL;
+- if (dentry->d_flags & DCACHE_LRU_LIST)
++ if (flags & DCACHE_LRU_LIST)
+ this_cpu_inc(nr_dentry_negative);
+ }
+
+--
+2.43.0
+
--- /dev/null
+From 5be5fb99ff9e7849d580df6b54c2c7e44b4eaa67 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 8 Jul 2024 16:07:49 -0700
+Subject: i40e: Fix XDP program unloading while removing the driver
+
+From: Michal Kubiak <michal.kubiak@intel.com>
+
+[ Upstream commit 01fc5142ae6b06b61ed51a624f2732d6525d8ea3 ]
+
+Commit 6533e558c650 ("i40e: Fix reset path while removing
+the driver") introduced a new PF state "__I40E_IN_REMOVE" to block
+modifying the XDP program while the driver is being removed.
+Unfortunately, such a change is useful only if the ".ndo_bpf()"
+callback was called outside of the rmmod context, because unloading the
+existing XDP program is also part of the driver removal procedure.
+In other words, from the rmmod context the driver is expected to
+unload the XDP program without reporting any errors. Otherwise, a
+kernel warning with a callstack is printed out to dmesg.
+
+Example failing scenario:
+ 1. Load the i40e driver.
+ 2. Load the XDP program.
+ 3. Unload the i40e driver (using "rmmod" command).
+
+The example kernel warning log:
+
+[ +0.004646] WARNING: CPU: 94 PID: 10395 at net/core/dev.c:9290 unregister_netdevice_many_notify+0x7a9/0x870
+[...]
+[ +0.010959] RIP: 0010:unregister_netdevice_many_notify+0x7a9/0x870
+[...]
+[ +0.002726] Call Trace:
+[ +0.002457] <TASK>
+[ +0.002119] ? __warn+0x80/0x120
+[ +0.003245] ? unregister_netdevice_many_notify+0x7a9/0x870
+[ +0.005586] ? report_bug+0x164/0x190
+[ +0.003678] ? handle_bug+0x3c/0x80
+[ +0.003503] ? exc_invalid_op+0x17/0x70
+[ +0.003846] ? asm_exc_invalid_op+0x1a/0x20
+[ +0.004200] ? unregister_netdevice_many_notify+0x7a9/0x870
+[ +0.005579] ? unregister_netdevice_many_notify+0x3cc/0x870
+[ +0.005586] unregister_netdevice_queue+0xf7/0x140
+[ +0.004806] unregister_netdev+0x1c/0x30
+[ +0.003933] i40e_vsi_release+0x87/0x2f0 [i40e]
+[ +0.004604] i40e_remove+0x1a1/0x420 [i40e]
+[ +0.004220] pci_device_remove+0x3f/0xb0
+[ +0.003943] device_release_driver_internal+0x19f/0x200
+[ +0.005243] driver_detach+0x48/0x90
+[ +0.003586] bus_remove_driver+0x6d/0xf0
+[ +0.003939] pci_unregister_driver+0x2e/0xb0
+[ +0.004278] i40e_exit_module+0x10/0x5f0 [i40e]
+[ +0.004570] __do_sys_delete_module.isra.0+0x197/0x310
+[ +0.005153] do_syscall_64+0x85/0x170
+[ +0.003684] ? syscall_exit_to_user_mode+0x69/0x220
+[ +0.004886] ? do_syscall_64+0x95/0x170
+[ +0.003851] ? exc_page_fault+0x7e/0x180
+[ +0.003932] entry_SYSCALL_64_after_hwframe+0x71/0x79
+[ +0.005064] RIP: 0033:0x7f59dc9347cb
+[ +0.003648] Code: 73 01 c3 48 8b 0d 65 16 0c 00 f7 d8 64 89 01 48 83
+c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa b8 b0 00 00 00 0f
+05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 35 16 0c 00 f7 d8 64 89 01 48
+[ +0.018753] RSP: 002b:00007ffffac99048 EFLAGS: 00000206 ORIG_RAX: 00000000000000b0
+[ +0.007577] RAX: ffffffffffffffda RBX: 0000559b9bb2f6e0 RCX: 00007f59dc9347cb
+[ +0.007140] RDX: 0000000000000000 RSI: 0000000000000800 RDI: 0000559b9bb2f748
+[ +0.007146] RBP: 00007ffffac99070 R08: 1999999999999999 R09: 0000000000000000
+[ +0.007133] R10: 00007f59dc9a5ac0 R11: 0000000000000206 R12: 0000000000000000
+[ +0.007141] R13: 00007ffffac992d8 R14: 0000559b9bb2f6e0 R15: 0000000000000000
+[ +0.007151] </TASK>
+[ +0.002204] ---[ end trace 0000000000000000 ]---
+
+Fix this by checking whether an XDP program is being loaded or unloaded.
+Then, block only loading a new program while "__I40E_IN_REMOVE" is set.
+Also, move the "__I40E_IN_REMOVE" test to the beginning of the XDP_SETUP
+callback to avoid unnecessary operations and checks.
+
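+The distinction relied upon here, as a short sketch (mirroring the hunk
+below): in the XDP_SETUP path a non-NULL bpf_prog means "install", NULL
+means "remove", so the removal check only has to reject installs:
+
+    /* prog != NULL: install new program, prog == NULL: remove current one */
+    if (prog && test_bit(__I40E_IN_REMOVE, pf->state))
+        return -EINVAL;    /* unloads must still succeed during rmmod */
+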
+Fixes: 6533e558c650 ("i40e: Fix reset path while removing the driver")
+Signed-off-by: Michal Kubiak <michal.kubiak@intel.com>
+Reviewed-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
+Tested-by: Chandan Kumar Rout <chandanx.rout@intel.com> (A Contingent Worker at Intel)
+Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
+Link: https://patch.msgid.link/20240708230750.625986-1-anthony.l.nguyen@intel.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/intel/i40e/i40e_main.c | 9 ++++-----
+ 1 file changed, 4 insertions(+), 5 deletions(-)
+
+diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
+index ffb9f9f15c523..3a2d4d0697955 100644
+--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
++++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
+@@ -13264,6 +13264,10 @@ static int i40e_xdp_setup(struct i40e_vsi *vsi, struct bpf_prog *prog,
+ bool need_reset;
+ int i;
+
++ /* VSI shall be deleted in a moment, block loading new programs */
++ if (prog && test_bit(__I40E_IN_REMOVE, pf->state))
++ return -EINVAL;
++
+ /* Don't allow frames that span over multiple buffers */
+ if (vsi->netdev->mtu > frame_size - I40E_PACKET_HDR_PAD) {
+ NL_SET_ERR_MSG_MOD(extack, "MTU too large for linear frames and XDP prog does not support frags");
+@@ -13272,14 +13276,9 @@ static int i40e_xdp_setup(struct i40e_vsi *vsi, struct bpf_prog *prog,
+
+ /* When turning XDP on->off/off->on we reset and rebuild the rings. */
+ need_reset = (i40e_enabled_xdp_vsi(vsi) != !!prog);
+-
+ if (need_reset)
+ i40e_prep_for_reset(pf);
+
+- /* VSI shall be deleted in a moment, just return EINVAL */
+- if (test_bit(__I40E_IN_REMOVE, pf->state))
+- return -EINVAL;
+-
+ old_prog = xchg(&vsi->xdp_prog, prog);
+
+ if (need_reset) {
+--
+2.43.0
+
--- /dev/null
+From 705c527354b81489c775410fe58211af5b374223 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 9 Jul 2024 20:58:39 +0100
+Subject: minixfs: Fix minixfs_rename with HIGHMEM
+
+From: Matthew Wilcox (Oracle) <willy@infradead.org>
+
+[ Upstream commit 3d1bec293378700dddc087d4d862306702276c23 ]
+
+minixfs now uses kmap_local_page(), so we can't call kunmap() to
+undo it. This one call was missed as part of the commit this fixes.
+
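+The pairing this depends on, as a minimal sketch (unmap_and_put_page() is
+the existing helper used by the other minixfs call sites after the
+kmap_local conversion):
+
+    addr = kmap_local_page(page);    /* undo with kunmap_local(), not kunmap() */
+    ...
+    unmap_and_put_page(page, addr);  /* kunmap_local(addr) + put_page(page) */
+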
+Fixes: 6628f69ee66a (minixfs: Use dir_put_page() in minix_unlink() and minix_rename())
+Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
+Link: https://lore.kernel.org/r/20240709195841.1986374-1-willy@infradead.org
+Signed-off-by: Christian Brauner <brauner@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/minix/namei.c | 3 +--
+ 1 file changed, 1 insertion(+), 2 deletions(-)
+
+diff --git a/fs/minix/namei.c b/fs/minix/namei.c
+index d6031acc34f0c..a944a0f17b537 100644
+--- a/fs/minix/namei.c
++++ b/fs/minix/namei.c
+@@ -213,8 +213,7 @@ static int minix_rename(struct mnt_idmap *idmap,
+ if (!new_de)
+ goto out_dir;
+ err = minix_set_link(new_de, new_page, old_inode);
+- kunmap(new_page);
+- put_page(new_page);
++ unmap_and_put_page(new_page, new_de);
+ if (err)
+ goto out_dir;
+ inode_set_ctime_current(new_inode);
+--
+2.43.0
+
--- /dev/null
+From 889b6d9cba2f1874b9975769fcb01aebb1450832 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 25 Jun 2024 20:16:39 -0400
+Subject: mm: prevent derefencing NULL ptr in pfn_section_valid()
+
+From: Waiman Long <longman@redhat.com>
+
+[ Upstream commit 82f0b6f041fad768c28b4ad05a683065412c226e ]
+
+Commit 5ec8e8ea8b77 ("mm/sparsemem: fix race in accessing
+memory_section->usage") changed pfn_section_valid() to add a READ_ONCE()
+call around "ms->usage" to fix a race with section_deactivate() where
+ms->usage can be cleared. The READ_ONCE() call, by itself, is not enough
+to prevent a NULL pointer dereference. We need to check its value before
+dereferencing it.
+
+Link: https://lkml.kernel.org/r/20240626001639.1350646-1-longman@redhat.com
+Fixes: 5ec8e8ea8b77 ("mm/sparsemem: fix race in accessing memory_section->usage")
+Signed-off-by: Waiman Long <longman@redhat.com>
+Cc: Charan Teja Kalla <quic_charante@quicinc.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ include/linux/mmzone.h | 3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
+index a4f6f1fecc6f3..f8d89a021abc9 100644
+--- a/include/linux/mmzone.h
++++ b/include/linux/mmzone.h
+@@ -1976,8 +1976,9 @@ static inline int subsection_map_index(unsigned long pfn)
+ static inline int pfn_section_valid(struct mem_section *ms, unsigned long pfn)
+ {
+ int idx = subsection_map_index(pfn);
++ struct mem_section_usage *usage = READ_ONCE(ms->usage);
+
+- return test_bit(idx, READ_ONCE(ms->usage)->subsection_map);
++ return usage ? test_bit(idx, usage->subsection_map) : 0;
+ }
+ #else
+ static inline int pfn_section_valid(struct mem_section *ms, unsigned long pfn)
+--
+2.43.0
+
--- /dev/null
+From 9415ef3b60315bdbc54cd8429ce495769d547ac9 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 4 Jul 2024 10:19:44 -0500
+Subject: net: bcmasp: Fix error code in probe()
+
+From: Dan Carpenter <dan.carpenter@linaro.org>
+
+[ Upstream commit 0c754d9d86ffdf2f86b4272b25d759843fb62fd8 ]
+
+Return an error code if bcmasp_interface_create() fails. Don't return
+success.
+
+Fixes: 490cb412007d ("net: bcmasp: Add support for ASP2.0 Ethernet controller")
+Signed-off-by: Dan Carpenter <dan.carpenter@linaro.org>
+Reviewed-by: Michal Kubiak <michal.kubiak@intel.com>
+Reviewed-by: Justin Chen <justin.chen@broadcom.com>
+Link: https://patch.msgid.link/ZoWKBkHH9D1fqV4r@stanley.mountain
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/broadcom/asp2/bcmasp.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+diff --git a/drivers/net/ethernet/broadcom/asp2/bcmasp.c b/drivers/net/ethernet/broadcom/asp2/bcmasp.c
+index a806dadc41965..20c6529ec1350 100644
+--- a/drivers/net/ethernet/broadcom/asp2/bcmasp.c
++++ b/drivers/net/ethernet/broadcom/asp2/bcmasp.c
+@@ -1380,6 +1380,7 @@ static int bcmasp_probe(struct platform_device *pdev)
+ dev_err(dev, "Cannot create eth interface %d\n", i);
+ bcmasp_remove_intfs(priv);
+ of_node_put(intf_node);
++ ret = -ENOMEM;
+ goto of_put_exit;
+ }
+ list_add_tail(&intf->list, &priv->intfs);
+--
+2.43.0
+
--- /dev/null
+From 2cf66bca77e312bf1604c5451dfc2fe43ce59ed8 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 10 Apr 2024 20:42:43 +0100
+Subject: net: dsa: allow DSA switch drivers to provide their own phylink mac
+ ops
+
+From: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
+
+[ Upstream commit cae425cb43feddd9fd62fc1b25567f9463da4915 ]
+
+Rather than having a shim for each and every phylink MAC operation,
+allow DSA switch drivers to provide their own ops structure. When a
+DSA driver provides the phylink MAC operations, the shimmed ops must
+not be provided, so fail an attempt to register a switch that has both
+the phylink_mac_ops in struct dsa_switch and the phylink_mac_*
+operations in dsa_switch_ops populated.
+
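+How a driver is expected to opt in, as an illustrative sketch (the foo_*
+names are made up, not taken from this patch):
+
+    static const struct phylink_mac_ops foo_phylink_mac_ops = {
+        .mac_config    = foo_mac_config,
+        .mac_link_down = foo_mac_link_down,
+        .mac_link_up   = foo_mac_link_up,
+    };
+
+    /* in the probe/registration path, instead of ds->ops->phylink_mac_* */
+    ds->phylink_mac_ops = &foo_phylink_mac_ops;
+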
+Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
+Reviewed-by: Vladimir Oltean <olteanv@gmail.com>
+Reviewed-by: Florian Fainelli <florian.fainelli@broadcom.com>
+Link: https://lore.kernel.org/r/E1rudqF-006K9H-Cc@rmk-PC.armlinux.org.uk
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Stable-dep-of: 0005b2dc43f9 ("dsa: lan9303: Fix mapping between DSA port number and PHY address")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ include/net/dsa.h | 5 +++++
+ net/dsa/dsa.c | 11 +++++++++++
+ net/dsa/port.c | 26 ++++++++++++++++++++------
+ 3 files changed, 36 insertions(+), 6 deletions(-)
+
+diff --git a/include/net/dsa.h b/include/net/dsa.h
+index f228b479a5fd2..7edfd8de8882f 100644
+--- a/include/net/dsa.h
++++ b/include/net/dsa.h
+@@ -457,6 +457,11 @@ struct dsa_switch {
+ */
+ const struct dsa_switch_ops *ops;
+
++ /*
++ * Allow a DSA switch driver to override the phylink MAC ops
++ */
++ const struct phylink_mac_ops *phylink_mac_ops;
++
+ /*
+ * User mii_bus and devices for the individual ports.
+ */
+diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c
+index 09d2f5d4b3dd4..2f347cd373162 100644
+--- a/net/dsa/dsa.c
++++ b/net/dsa/dsa.c
+@@ -1505,6 +1505,17 @@ static int dsa_switch_probe(struct dsa_switch *ds)
+ if (!ds->num_ports)
+ return -EINVAL;
+
++ if (ds->phylink_mac_ops) {
++ if (ds->ops->phylink_mac_select_pcs ||
++ ds->ops->phylink_mac_prepare ||
++ ds->ops->phylink_mac_config ||
++ ds->ops->phylink_mac_finish ||
++ ds->ops->phylink_mac_link_down ||
++ ds->ops->phylink_mac_link_up ||
++ ds->ops->adjust_link)
++ return -EINVAL;
++ }
++
+ if (np) {
+ err = dsa_switch_parse_of(ds, np);
+ if (err)
+diff --git a/net/dsa/port.c b/net/dsa/port.c
+index 02bf1c306bdca..c6febc3d96d9b 100644
+--- a/net/dsa/port.c
++++ b/net/dsa/port.c
+@@ -1662,6 +1662,7 @@ static const struct phylink_mac_ops dsa_port_phylink_mac_ops = {
+
+ int dsa_port_phylink_create(struct dsa_port *dp)
+ {
++ const struct phylink_mac_ops *mac_ops;
+ struct dsa_switch *ds = dp->ds;
+ phy_interface_t mode;
+ struct phylink *pl;
+@@ -1685,8 +1686,12 @@ int dsa_port_phylink_create(struct dsa_port *dp)
+ }
+ }
+
+- pl = phylink_create(&dp->pl_config, of_fwnode_handle(dp->dn),
+- mode, &dsa_port_phylink_mac_ops);
++ mac_ops = &dsa_port_phylink_mac_ops;
++ if (ds->phylink_mac_ops)
++ mac_ops = ds->phylink_mac_ops;
++
++ pl = phylink_create(&dp->pl_config, of_fwnode_handle(dp->dn), mode,
++ mac_ops);
+ if (IS_ERR(pl)) {
+ pr_err("error creating PHYLINK: %ld\n", PTR_ERR(pl));
+ return PTR_ERR(pl);
+@@ -1952,12 +1957,23 @@ static void dsa_shared_port_validate_of(struct dsa_port *dp,
+ dn, dsa_port_is_cpu(dp) ? "CPU" : "DSA", dp->index);
+ }
+
++static void dsa_shared_port_link_down(struct dsa_port *dp)
++{
++ struct dsa_switch *ds = dp->ds;
++
++ if (ds->phylink_mac_ops && ds->phylink_mac_ops->mac_link_down)
++ ds->phylink_mac_ops->mac_link_down(&dp->pl_config, MLO_AN_FIXED,
++ PHY_INTERFACE_MODE_NA);
++ else if (ds->ops->phylink_mac_link_down)
++ ds->ops->phylink_mac_link_down(ds, dp->index, MLO_AN_FIXED,
++ PHY_INTERFACE_MODE_NA);
++}
++
+ int dsa_shared_port_link_register_of(struct dsa_port *dp)
+ {
+ struct dsa_switch *ds = dp->ds;
+ bool missing_link_description;
+ bool missing_phy_mode;
+- int port = dp->index;
+
+ dsa_shared_port_validate_of(dp, &missing_phy_mode,
+ &missing_link_description);
+@@ -1973,9 +1989,7 @@ int dsa_shared_port_link_register_of(struct dsa_port *dp)
+ "Skipping phylink registration for %s port %d\n",
+ dsa_port_is_cpu(dp) ? "CPU" : "DSA", dp->index);
+ } else {
+- if (ds->ops->phylink_mac_link_down)
+- ds->ops->phylink_mac_link_down(ds, port,
+- MLO_AN_FIXED, PHY_INTERFACE_MODE_NA);
++ dsa_shared_port_link_down(dp);
+
+ return dsa_shared_port_phylink_register(dp);
+ }
+--
+2.43.0
+
--- /dev/null
+From a578bfc5658fe19e87f8b0d4304bfba5e0bcf5c4 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 10 Apr 2024 20:42:38 +0100
+Subject: net: dsa: introduce dsa_phylink_to_port()
+
+From: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
+
+[ Upstream commit dd0c9855b41310470086500c9963bbb64bb90dd0 ]
+
+We convert from a phylink_config struct to a dsa_port struct in many
+places; let's provide a helper for this.
+
+Reviewed-by: Andrew Lunn <andrew@lunn.ch>
+Reviewed-by: Florian Fainelli <florian.fainelli@broadcom.com>
+Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
+Reviewed-by: Vladimir Oltean <olteanv@gmail.com>
+Link: https://lore.kernel.org/r/E1rudqA-006K9B-85@rmk-PC.armlinux.org.uk
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Stable-dep-of: 0005b2dc43f9 ("dsa: lan9303: Fix mapping between DSA port number and PHY address")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ include/net/dsa.h | 6 ++++++
+ net/dsa/port.c | 12 ++++++------
+ 2 files changed, 12 insertions(+), 6 deletions(-)
+
+diff --git a/include/net/dsa.h b/include/net/dsa.h
+index 7c0da9effe4e9..f228b479a5fd2 100644
+--- a/include/net/dsa.h
++++ b/include/net/dsa.h
+@@ -327,6 +327,12 @@ struct dsa_port {
+ };
+ };
+
++static inline struct dsa_port *
++dsa_phylink_to_port(struct phylink_config *config)
++{
++ return container_of(config, struct dsa_port, pl_config);
++}
++
+ /* TODO: ideally DSA ports would have a single dp->link_dp member,
+ * and no dst->rtable nor this struct dsa_link would be needed,
+ * but this would require some more complex tree walking,
+diff --git a/net/dsa/port.c b/net/dsa/port.c
+index c42dac87671b1..02bf1c306bdca 100644
+--- a/net/dsa/port.c
++++ b/net/dsa/port.c
+@@ -1558,7 +1558,7 @@ static struct phylink_pcs *
+ dsa_port_phylink_mac_select_pcs(struct phylink_config *config,
+ phy_interface_t interface)
+ {
+- struct dsa_port *dp = container_of(config, struct dsa_port, pl_config);
++ struct dsa_port *dp = dsa_phylink_to_port(config);
+ struct phylink_pcs *pcs = ERR_PTR(-EOPNOTSUPP);
+ struct dsa_switch *ds = dp->ds;
+
+@@ -1572,7 +1572,7 @@ static int dsa_port_phylink_mac_prepare(struct phylink_config *config,
+ unsigned int mode,
+ phy_interface_t interface)
+ {
+- struct dsa_port *dp = container_of(config, struct dsa_port, pl_config);
++ struct dsa_port *dp = dsa_phylink_to_port(config);
+ struct dsa_switch *ds = dp->ds;
+ int err = 0;
+
+@@ -1587,7 +1587,7 @@ static void dsa_port_phylink_mac_config(struct phylink_config *config,
+ unsigned int mode,
+ const struct phylink_link_state *state)
+ {
+- struct dsa_port *dp = container_of(config, struct dsa_port, pl_config);
++ struct dsa_port *dp = dsa_phylink_to_port(config);
+ struct dsa_switch *ds = dp->ds;
+
+ if (!ds->ops->phylink_mac_config)
+@@ -1600,7 +1600,7 @@ static int dsa_port_phylink_mac_finish(struct phylink_config *config,
+ unsigned int mode,
+ phy_interface_t interface)
+ {
+- struct dsa_port *dp = container_of(config, struct dsa_port, pl_config);
++ struct dsa_port *dp = dsa_phylink_to_port(config);
+ struct dsa_switch *ds = dp->ds;
+ int err = 0;
+
+@@ -1615,7 +1615,7 @@ static void dsa_port_phylink_mac_link_down(struct phylink_config *config,
+ unsigned int mode,
+ phy_interface_t interface)
+ {
+- struct dsa_port *dp = container_of(config, struct dsa_port, pl_config);
++ struct dsa_port *dp = dsa_phylink_to_port(config);
+ struct phy_device *phydev = NULL;
+ struct dsa_switch *ds = dp->ds;
+
+@@ -1638,7 +1638,7 @@ static void dsa_port_phylink_mac_link_up(struct phylink_config *config,
+ int speed, int duplex,
+ bool tx_pause, bool rx_pause)
+ {
+- struct dsa_port *dp = container_of(config, struct dsa_port, pl_config);
++ struct dsa_port *dp = dsa_phylink_to_port(config);
+ struct dsa_switch *ds = dp->ds;
+
+ if (!ds->ops->phylink_mac_link_up) {
+--
+2.43.0
+
--- /dev/null
+From a4a7537a9af06d98efcd54ac125675ffcaf4b2c2 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 16 Apr 2024 11:19:14 +0100
+Subject: net: dsa: lan9303: provide own phylink MAC operations
+
+From: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
+
+[ Upstream commit 855b4ac06e46eaaf0f28484863e55d23fee89a0c ]
+
+Convert lan9303 to provide its own phylink MAC operations, thus
+avoiding the shim layer in DSA's port.c. We need to provide stubs for
+the mac_link_down() and mac_config() methods, which are mandatory.
+
+Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
+Link: https://lore.kernel.org/r/E1rwfuE-007537-1u@rmk-PC.armlinux.org.uk
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Stable-dep-of: 0005b2dc43f9 ("dsa: lan9303: Fix mapping between DSA port number and PHY address")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/dsa/lan9303-core.c | 31 ++++++++++++++++++++++++++-----
+ 1 file changed, 26 insertions(+), 5 deletions(-)
+
+diff --git a/drivers/net/dsa/lan9303-core.c b/drivers/net/dsa/lan9303-core.c
+index fcb20eac332a6..666b4d766c005 100644
+--- a/drivers/net/dsa/lan9303-core.c
++++ b/drivers/net/dsa/lan9303-core.c
+@@ -1293,14 +1293,29 @@ static void lan9303_phylink_get_caps(struct dsa_switch *ds, int port,
+ }
+ }
+
+-static void lan9303_phylink_mac_link_up(struct dsa_switch *ds, int port,
++static void lan9303_phylink_mac_config(struct phylink_config *config,
++ unsigned int mode,
++ const struct phylink_link_state *state)
++{
++}
++
++static void lan9303_phylink_mac_link_down(struct phylink_config *config,
++ unsigned int mode,
++ phy_interface_t interface)
++{
++}
++
++static void lan9303_phylink_mac_link_up(struct phylink_config *config,
++ struct phy_device *phydev,
+ unsigned int mode,
+ phy_interface_t interface,
+- struct phy_device *phydev, int speed,
+- int duplex, bool tx_pause,
++ int speed, int duplex, bool tx_pause,
+ bool rx_pause)
+ {
+- struct lan9303 *chip = ds->priv;
++ struct dsa_port *dp = dsa_phylink_to_port(config);
++ struct lan9303 *chip = dp->ds->priv;
++ struct dsa_switch *ds = dp->ds;
++ int port = dp->index;
+ u32 ctl;
+ u32 reg;
+
+@@ -1330,6 +1345,12 @@ static void lan9303_phylink_mac_link_up(struct dsa_switch *ds, int port,
+ regmap_write(chip->regmap, flow_ctl_reg[port], reg);
+ }
+
++static const struct phylink_mac_ops lan9303_phylink_mac_ops = {
++ .mac_config = lan9303_phylink_mac_config,
++ .mac_link_down = lan9303_phylink_mac_link_down,
++ .mac_link_up = lan9303_phylink_mac_link_up,
++};
++
+ static const struct dsa_switch_ops lan9303_switch_ops = {
+ .get_tag_protocol = lan9303_get_tag_protocol,
+ .setup = lan9303_setup,
+@@ -1337,7 +1358,6 @@ static const struct dsa_switch_ops lan9303_switch_ops = {
+ .phy_read = lan9303_phy_read,
+ .phy_write = lan9303_phy_write,
+ .phylink_get_caps = lan9303_phylink_get_caps,
+- .phylink_mac_link_up = lan9303_phylink_mac_link_up,
+ .get_ethtool_stats = lan9303_get_ethtool_stats,
+ .get_sset_count = lan9303_get_sset_count,
+ .port_enable = lan9303_port_enable,
+@@ -1365,6 +1385,7 @@ static int lan9303_register_switch(struct lan9303 *chip)
+ chip->ds->num_ports = LAN9303_NUM_PORTS;
+ chip->ds->priv = chip;
+ chip->ds->ops = &lan9303_switch_ops;
++ chip->ds->phylink_mac_ops = &lan9303_phylink_mac_ops;
+ base = chip->phy_addr_base;
+ chip->ds->phys_mii_mask = GENMASK(LAN9303_NUM_PORTS - 1 + base, base);
+
+--
+2.43.0
+
--- /dev/null
+From e1416fbd44785728a1a20c9d95b2b47b3b6e0c3e Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 8 Jul 2024 22:58:26 +0200
+Subject: net: ethernet: lantiq_etop: fix double free in detach
+
+From: Aleksander Jan Bajkowski <olek2@wp.pl>
+
+[ Upstream commit e1533b6319ab9c3a97dad314dd88b3783bc41b69 ]
+
+The number of the currently released descriptor is never incremented,
+which results in the same skb being released multiple times.
+
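+As a minimal sketch of the difference (old vs. new indexing of the ring):
+
+    for (desc = 0; desc < LTQ_DESC_NUM; desc++)           /* old: index unused */
+        dev_kfree_skb_any(ch->skb[ch->dma.desc]);         /* same skb each time */
+
+    for (dma->desc = 0; dma->desc < LTQ_DESC_NUM; dma->desc++)
+        dev_kfree_skb_any(ch->skb[ch->dma.desc]);         /* each skb once */
+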
+Fixes: 504d4721ee8e ("MIPS: Lantiq: Add ethernet driver")
+Reported-by: Joe Perches <joe@perches.com>
+Closes: https://lore.kernel.org/all/fc1bf93d92bb5b2f99c6c62745507cc22f3a7b2d.camel@perches.com/
+Signed-off-by: Aleksander Jan Bajkowski <olek2@wp.pl>
+Reviewed-by: Andrew Lunn <andrew@lunn.ch>
+Link: https://patch.msgid.link/20240708205826.5176-1-olek2@wp.pl
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/lantiq_etop.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+diff --git a/drivers/net/ethernet/lantiq_etop.c b/drivers/net/ethernet/lantiq_etop.c
+index 1d5b7bb6380f9..8a810e69cb338 100644
+--- a/drivers/net/ethernet/lantiq_etop.c
++++ b/drivers/net/ethernet/lantiq_etop.c
+@@ -217,9 +217,9 @@ ltq_etop_free_channel(struct net_device *dev, struct ltq_etop_chan *ch)
+ if (ch->dma.irq)
+ free_irq(ch->dma.irq, priv);
+ if (IS_RX(ch->idx)) {
+- int desc;
++ struct ltq_dma_channel *dma = &ch->dma;
+
+- for (desc = 0; desc < LTQ_DESC_NUM; desc++)
++ for (dma->desc = 0; dma->desc < LTQ_DESC_NUM; dma->desc++)
+ dev_kfree_skb_any(ch->skb[ch->dma.desc]);
+ }
+ }
+--
+2.43.0
+
--- /dev/null
+From 23ec424f94336d95df0c9dab3926244e2ef9fb83 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 8 Jul 2024 14:52:09 +0800
+Subject: net: ethernet: mtk-star-emac: set mac_managed_pm when probing
+
+From: Jian Hui Lee <jianhui.lee@canonical.com>
+
+[ Upstream commit 8c6790b5c25dfac11b589cc37346bcf9e23ad468 ]
+
+The commit below introduced a warning message when the PHY state is not
+one of PHY_HALTED, PHY_READY, or PHY_UP:
+commit 744d23c71af3 ("net: phy: Warn about incorrect mdio_bus_phy_resume() state")
+
+mtk-star-emac doesn't need mdiobus suspend/resume. To fix the warning
+message during resume, indicate at probe time that PHY suspend/resume is
+managed by the MAC.
+
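+The recipe used at probe time, as a short sketch (taken from the hunk
+below; the reference taken by of_phy_find_device() must be dropped again):
+
+    phydev = of_phy_find_device(priv->phy_node);
+    if (phydev) {
+        phydev->mac_managed_pm = true;      /* MAC handles suspend/resume */
+        put_device(&phydev->mdio.dev);
+    }
+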
+Fixes: 744d23c71af3 ("net: phy: Warn about incorrect mdio_bus_phy_resume() state")
+Signed-off-by: Jian Hui Lee <jianhui.lee@canonical.com>
+Reviewed-by: Jacob Keller <jacob.e.keller@intel.com>
+Link: https://patch.msgid.link/20240708065210.4178980-1-jianhui.lee@canonical.com
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/mediatek/mtk_star_emac.c | 7 +++++++
+ 1 file changed, 7 insertions(+)
+
+diff --git a/drivers/net/ethernet/mediatek/mtk_star_emac.c b/drivers/net/ethernet/mediatek/mtk_star_emac.c
+index 31aebeb2e2858..25989c79c92e6 100644
+--- a/drivers/net/ethernet/mediatek/mtk_star_emac.c
++++ b/drivers/net/ethernet/mediatek/mtk_star_emac.c
+@@ -1524,6 +1524,7 @@ static int mtk_star_probe(struct platform_device *pdev)
+ {
+ struct device_node *of_node;
+ struct mtk_star_priv *priv;
++ struct phy_device *phydev;
+ struct net_device *ndev;
+ struct device *dev;
+ void __iomem *base;
+@@ -1649,6 +1650,12 @@ static int mtk_star_probe(struct platform_device *pdev)
+ netif_napi_add(ndev, &priv->rx_napi, mtk_star_rx_poll);
+ netif_napi_add_tx(ndev, &priv->tx_napi, mtk_star_tx_poll);
+
++ phydev = of_phy_find_device(priv->phy_node);
++ if (phydev) {
++ phydev->mac_managed_pm = true;
++ put_device(&phydev->mdio.dev);
++ }
++
+ return devm_register_netdev(dev, ndev);
+ }
+
+--
+2.43.0
+
--- /dev/null
+From 733f62e2b14867199a08dbda10e9c1efa3e56a47 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 8 Jul 2024 07:46:00 -0700
+Subject: net: fix rc7's __skb_datagram_iter()
+
+From: Hugh Dickins <hughd@google.com>
+
+[ Upstream commit f153831097b4435f963e385304cc0f1acba1c657 ]
+
+X would not start in my old 32-bit partition (and the "n"-handling looks
+just as wrong on 64-bit, but for whatever reason did not show up there):
+"n" must be accumulated over all pages before it's added to "offset" and
+compared with "copy", immediately after the skb_frag_foreach_page() loop.
+
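+A minimal sketch of the intended accumulation (copy_chunk() stands in for
+the indirect simple_copy_to_iter() call; one frag may span several pages):
+
+    n = 0;
+    skb_frag_foreach_page(frag, ..., p, p_off, p_len, copied) {
+        n += copy_chunk(p, p_off, p_len, ...);   /* was "n = ...", dropping
+                                                    earlier pages' counts */
+    }
+    offset += n;   /* only now compare n against "copy" */
+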
+Fixes: d2d30a376d9c ("net: allow skb_datagram_iter to be called from any context")
+Signed-off-by: Hugh Dickins <hughd@google.com>
+Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
+Link: https://patch.msgid.link/fef352e8-b89a-da51-f8ce-04bc39ee6481@google.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/core/datagram.c | 3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+diff --git a/net/core/datagram.c b/net/core/datagram.c
+index cb72923acc21c..99abfafb0b439 100644
+--- a/net/core/datagram.c
++++ b/net/core/datagram.c
+@@ -442,11 +442,12 @@ static int __skb_datagram_iter(const struct sk_buff *skb, int offset,
+ if (copy > len)
+ copy = len;
+
++ n = 0;
+ skb_frag_foreach_page(frag,
+ skb_frag_off(frag) + offset - start,
+ copy, p, p_off, p_len, copied) {
+ vaddr = kmap_local_page(p);
+- n = INDIRECT_CALL_1(cb, simple_copy_to_iter,
++ n += INDIRECT_CALL_1(cb, simple_copy_to_iter,
+ vaddr + p_off, p_len, data, to);
+ kunmap_local(vaddr);
+ }
+--
+2.43.0
+
--- /dev/null
+From b6c62237d1c7ba11c8e853024cf4ab15e305509c Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 5 Jul 2024 10:49:54 +0200
+Subject: net: phy: microchip: lan87xx: reinit PHY after cable test
+
+From: Oleksij Rempel <o.rempel@pengutronix.de>
+
+[ Upstream commit 30f747b8d53bc73555f268d0f48f56174fa5bf10 ]
+
+Reinit the PHY after a cable test, otherwise a link can't be established
+on the tested port. This issue is reproducible on LAN9372 switches with
+integrated 100BaseT1 PHYs.
+
+Fixes: 788050256c411 ("net: phy: microchip_t1: add cable test support for lan87xx phy")
+Signed-off-by: Oleksij Rempel <o.rempel@pengutronix.de>
+Reviewed-by: Andrew Lunn <andrew@lunn.ch>
+Reviewed-by: Michal Kubiak <michal.kubiak@intel.com>
+Reviewed-by: Florian Fainelli <florian.fainelli@broadcom.com>
+Link: https://patch.msgid.link/20240705084954.83048-1-o.rempel@pengutronix.de
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/phy/microchip_t1.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/drivers/net/phy/microchip_t1.c b/drivers/net/phy/microchip_t1.c
+index a838b61cd844b..a35528497a576 100644
+--- a/drivers/net/phy/microchip_t1.c
++++ b/drivers/net/phy/microchip_t1.c
+@@ -748,7 +748,7 @@ static int lan87xx_cable_test_report(struct phy_device *phydev)
+ ethnl_cable_test_result(phydev, ETHTOOL_A_CABLE_PAIR_A,
+ lan87xx_cable_test_report_trans(detect));
+
+- return 0;
++ return phy_init_hw(phydev);
+ }
+
+ static int lan87xx_cable_test_get_status(struct phy_device *phydev,
+--
+2.43.0
+
--- /dev/null
+From 18235ac702f016b3ca838758a186901b11f04598 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 10 Jul 2024 13:37:47 +0800
+Subject: net/sched: Fix UAF when resolving a clash
+
+From: Chengen Du <chengen.du@canonical.com>
+
+[ Upstream commit 26488172b0292bed837b95a006a3f3431d1898c3 ]
+
+KASAN reports the following UAF:
+
+ BUG: KASAN: slab-use-after-free in tcf_ct_flow_table_process_conn+0x12b/0x380 [act_ct]
+ Read of size 1 at addr ffff888c07603600 by task handler130/6469
+
+ Call Trace:
+ <IRQ>
+ dump_stack_lvl+0x48/0x70
+ print_address_description.constprop.0+0x33/0x3d0
+ print_report+0xc0/0x2b0
+ kasan_report+0xd0/0x120
+ __asan_load1+0x6c/0x80
+ tcf_ct_flow_table_process_conn+0x12b/0x380 [act_ct]
+ tcf_ct_act+0x886/0x1350 [act_ct]
+ tcf_action_exec+0xf8/0x1f0
+ fl_classify+0x355/0x360 [cls_flower]
+ __tcf_classify+0x1fd/0x330
+ tcf_classify+0x21c/0x3c0
+ sch_handle_ingress.constprop.0+0x2c5/0x500
+ __netif_receive_skb_core.constprop.0+0xb25/0x1510
+ __netif_receive_skb_list_core+0x220/0x4c0
+ netif_receive_skb_list_internal+0x446/0x620
+ napi_complete_done+0x157/0x3d0
+ gro_cell_poll+0xcf/0x100
+ __napi_poll+0x65/0x310
+ net_rx_action+0x30c/0x5c0
+ __do_softirq+0x14f/0x491
+ __irq_exit_rcu+0x82/0xc0
+ irq_exit_rcu+0xe/0x20
+ common_interrupt+0xa1/0xb0
+ </IRQ>
+ <TASK>
+ asm_common_interrupt+0x27/0x40
+
+ Allocated by task 6469:
+ kasan_save_stack+0x38/0x70
+ kasan_set_track+0x25/0x40
+ kasan_save_alloc_info+0x1e/0x40
+ __kasan_krealloc+0x133/0x190
+ krealloc+0xaa/0x130
+ nf_ct_ext_add+0xed/0x230 [nf_conntrack]
+ tcf_ct_act+0x1095/0x1350 [act_ct]
+ tcf_action_exec+0xf8/0x1f0
+ fl_classify+0x355/0x360 [cls_flower]
+ __tcf_classify+0x1fd/0x330
+ tcf_classify+0x21c/0x3c0
+ sch_handle_ingress.constprop.0+0x2c5/0x500
+ __netif_receive_skb_core.constprop.0+0xb25/0x1510
+ __netif_receive_skb_list_core+0x220/0x4c0
+ netif_receive_skb_list_internal+0x446/0x620
+ napi_complete_done+0x157/0x3d0
+ gro_cell_poll+0xcf/0x100
+ __napi_poll+0x65/0x310
+ net_rx_action+0x30c/0x5c0
+ __do_softirq+0x14f/0x491
+
+ Freed by task 6469:
+ kasan_save_stack+0x38/0x70
+ kasan_set_track+0x25/0x40
+ kasan_save_free_info+0x2b/0x60
+ ____kasan_slab_free+0x180/0x1f0
+ __kasan_slab_free+0x12/0x30
+ slab_free_freelist_hook+0xd2/0x1a0
+ __kmem_cache_free+0x1a2/0x2f0
+ kfree+0x78/0x120
+ nf_conntrack_free+0x74/0x130 [nf_conntrack]
+ nf_ct_destroy+0xb2/0x140 [nf_conntrack]
+ __nf_ct_resolve_clash+0x529/0x5d0 [nf_conntrack]
+ nf_ct_resolve_clash+0xf6/0x490 [nf_conntrack]
+ __nf_conntrack_confirm+0x2c6/0x770 [nf_conntrack]
+ tcf_ct_act+0x12ad/0x1350 [act_ct]
+ tcf_action_exec+0xf8/0x1f0
+ fl_classify+0x355/0x360 [cls_flower]
+ __tcf_classify+0x1fd/0x330
+ tcf_classify+0x21c/0x3c0
+ sch_handle_ingress.constprop.0+0x2c5/0x500
+ __netif_receive_skb_core.constprop.0+0xb25/0x1510
+ __netif_receive_skb_list_core+0x220/0x4c0
+ netif_receive_skb_list_internal+0x446/0x620
+ napi_complete_done+0x157/0x3d0
+ gro_cell_poll+0xcf/0x100
+ __napi_poll+0x65/0x310
+ net_rx_action+0x30c/0x5c0
+ __do_softirq+0x14f/0x491
+
+The ct may be dropped if a clash has been resolved, but it is still passed
+to the tcf_ct_flow_table_process_conn function for further usage. This
+issue can be fixed by retrieving the ct from the skb again after the
+conntrack entry has been confirmed.
+
+Fixes: 0cc254e5aa37 ("net/sched: act_ct: Offload connections with commit action")
+Co-developed-by: Gerald Yang <gerald.yang@canonical.com>
+Signed-off-by: Gerald Yang <gerald.yang@canonical.com>
+Signed-off-by: Chengen Du <chengen.du@canonical.com>
+Link: https://patch.msgid.link/20240710053747.13223-1-chengen.du@canonical.com
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/sched/act_ct.c | 8 ++++++++
+ 1 file changed, 8 insertions(+)
+
+diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c
+index 2a96d9c1db65b..6fa3cca87d346 100644
+--- a/net/sched/act_ct.c
++++ b/net/sched/act_ct.c
+@@ -1077,6 +1077,14 @@ TC_INDIRECT_SCOPE int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a,
+ */
+ if (nf_conntrack_confirm(skb) != NF_ACCEPT)
+ goto drop;
++
++ /* The ct may be dropped if a clash has been resolved,
++ * so it's necessary to retrieve it from skb again to
++ * prevent UAF.
++ */
++ ct = nf_ct_get(skb, &ctinfo);
++ if (!ct)
++ skip_add = true;
+ }
+
+ if (!skip_add)
+--
+2.43.0
+
--- /dev/null
+From bccfe53373e01f920cd5e98e6e221b5fa0743524 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 4 Jul 2024 08:41:57 +0200
+Subject: net, sunrpc: Remap EPERM in case of connection failure in
+ xs_tcp_setup_socket
+
+From: Daniel Borkmann <daniel@iogearbox.net>
+
+[ Upstream commit 626dfed5fa3bfb41e0dffd796032b555b69f9cde ]
+
+When using a BPF program on kernel_connect(), the call can return -EPERM. This
+causes xs_tcp_setup_socket() to loop forever, filling up the syslog and causing
+the kernel to potentially freeze up.
+
+Neil suggested:
+
+ This will propagate -EPERM up into other layers which might not be ready
+ to handle it. It might be safer to map EPERM to an error we would be more
+ likely to expect from the network system - such as ECONNREFUSED or ENETDOWN.
+
+ECONNREFUSED as the error seems reasonable. For programs, setting a
+different error can be out of reach (see the handling in 4fbac77d2d09),
+in particular on kernels which do not have f10d05966196 ("bpf: Make
+BPF_PROG_RUN_ARRAY return -err instead of allow boolean"); given that, it
+is better to simply remap for consistent behavior. UDP already handles
+EPERM in xs_udp_send_request().
+
+Fixes: d74bad4e74ee ("bpf: Hooks for sys_connect")
+Fixes: 4fbac77d2d09 ("bpf: Hooks for sys_bind")
+Co-developed-by: Lex Siegel <usiegl00@gmail.com>
+Signed-off-by: Lex Siegel <usiegl00@gmail.com>
+Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
+Cc: Neil Brown <neilb@suse.de>
+Cc: Trond Myklebust <trondmy@kernel.org>
+Cc: Anna Schumaker <anna@kernel.org>
+Link: https://github.com/cilium/cilium/issues/33395
+Link: https://lore.kernel.org/bpf/171374175513.12877.8993642908082014881@noble.neil.brown.name
+Link: https://patch.msgid.link/9069ec1d59e4b2129fc23433349fd5580ad43921.1720075070.git.daniel@iogearbox.net
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/sunrpc/xprtsock.c | 7 +++++++
+ 1 file changed, 7 insertions(+)
+
+diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
+index ce18716491c8f..b9121adef8b76 100644
+--- a/net/sunrpc/xprtsock.c
++++ b/net/sunrpc/xprtsock.c
+@@ -2442,6 +2442,13 @@ static void xs_tcp_setup_socket(struct work_struct *work)
+ transport->srcport = 0;
+ status = -EAGAIN;
+ break;
++ case -EPERM:
++ /* Happens, for instance, if a BPF program is preventing
++ * the connect. Remap the error so upper layers can better
++ * deal with it.
++ */
++ status = -ECONNREFUSED;
++ fallthrough;
+ case -EINVAL:
+ /* Happens, for instance, if the user specified a link
+ * local IPv6 address without a scope-id.
+--
+2.43.0
+
--- /dev/null
+From 03dd9b3bc83470fd6b30e3bade71af0730b645ab Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 11 Jul 2024 11:06:39 +0200
+Subject: netfilter: nf_tables: prefer nft_chain_validate
+
+From: Florian Westphal <fw@strlen.de>
+
+[ Upstream commit cff3bd012a9512ac5ed858d38e6ed65f6391008c ]
+
+nft_chain_validate already performs loop detection because a cycle will
+result in a call stack overflow (ctx->level >= NFT_JUMP_STACK_SIZE).
+
+It also follows maps via the ->validate callback in nft_lookup, so there
+appears to be no reason to iterate the maps again.
+
+nf_tables_check_loops() and all its helper functions can be removed.
+This improves ruleset load time significantly, from 23s down to 12s.
+
+This also fixes a crash bug. Old loop detection code can result in
+unbounded recursion:
+
+BUG: TASK stack guard page was hit at ....
+Oops: stack guard page: 0000 [#1] PREEMPT SMP KASAN
+CPU: 4 PID: 1539 Comm: nft Not tainted 6.10.0-rc5+ #1
+[..]
+
+with a suitable ruleset during validation of register stores.
+
+I can't see any actual reason to attempt to check for this from
+nft_validate_register_store(); at this point the transaction is still in
+progress, so we don't have a full picture of the rule graph.
+
+For nf-next it might make sense to either remove it or make this depend
+on table->validate_state in case we could catch an error earlier
+(for improved error reporting to userspace).
+
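+Roughly, the retained check amounts to depth-limited recursion, as in
+this sketch of nft_chain_validate():
+
+    if (ctx->level >= NFT_JUMP_STACK_SIZE)
+        return -EMLINK;           /* too deep: reject instead of recursing */
+    ...
+    /* expr->ops->validate() of jump/goto and lookup expressions recurses
+     * into nft_chain_validate() with ctx->level incremented */
+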
+Fixes: 20a69341f2d0 ("netfilter: nf_tables: add netlink set API")
+Signed-off-by: Florian Westphal <fw@strlen.de>
+Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/netfilter/nf_tables_api.c | 158 +++-------------------------------
+ 1 file changed, 13 insertions(+), 145 deletions(-)
+
+diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
+index 0f77ba3306c23..d129b826924eb 100644
+--- a/net/netfilter/nf_tables_api.c
++++ b/net/netfilter/nf_tables_api.c
+@@ -3823,6 +3823,15 @@ static void nf_tables_rule_release(const struct nft_ctx *ctx, struct nft_rule *r
+ nf_tables_rule_destroy(ctx, rule);
+ }
+
++/** nft_chain_validate - loop detection and hook validation
++ *
++ * @ctx: context containing call depth and base chain
++ * @chain: chain to validate
++ *
++ * Walk through the rules of the given chain and chase all jumps/gotos
++ * and set lookups until either the jump limit is hit or all reachable
++ * chains have been validated.
++ */
+ int nft_chain_validate(const struct nft_ctx *ctx, const struct nft_chain *chain)
+ {
+ struct nft_expr *expr, *last;
+@@ -3844,6 +3853,9 @@ int nft_chain_validate(const struct nft_ctx *ctx, const struct nft_chain *chain)
+ if (!expr->ops->validate)
+ continue;
+
++ /* This may call nft_chain_validate() recursively,
++ * callers that do so must increment ctx->level.
++ */
+ err = expr->ops->validate(ctx, expr, &data);
+ if (err < 0)
+ return err;
+@@ -10805,150 +10817,6 @@ int nft_chain_validate_hooks(const struct nft_chain *chain,
+ }
+ EXPORT_SYMBOL_GPL(nft_chain_validate_hooks);
+
+-/*
+- * Loop detection - walk through the ruleset beginning at the destination chain
+- * of a new jump until either the source chain is reached (loop) or all
+- * reachable chains have been traversed.
+- *
+- * The loop check is performed whenever a new jump verdict is added to an
+- * expression or verdict map or a verdict map is bound to a new chain.
+- */
+-
+-static int nf_tables_check_loops(const struct nft_ctx *ctx,
+- const struct nft_chain *chain);
+-
+-static int nft_check_loops(const struct nft_ctx *ctx,
+- const struct nft_set_ext *ext)
+-{
+- const struct nft_data *data;
+- int ret;
+-
+- data = nft_set_ext_data(ext);
+- switch (data->verdict.code) {
+- case NFT_JUMP:
+- case NFT_GOTO:
+- ret = nf_tables_check_loops(ctx, data->verdict.chain);
+- break;
+- default:
+- ret = 0;
+- break;
+- }
+-
+- return ret;
+-}
+-
+-static int nf_tables_loop_check_setelem(const struct nft_ctx *ctx,
+- struct nft_set *set,
+- const struct nft_set_iter *iter,
+- struct nft_elem_priv *elem_priv)
+-{
+- const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv);
+-
+- if (!nft_set_elem_active(ext, iter->genmask))
+- return 0;
+-
+- if (nft_set_ext_exists(ext, NFT_SET_EXT_FLAGS) &&
+- *nft_set_ext_flags(ext) & NFT_SET_ELEM_INTERVAL_END)
+- return 0;
+-
+- return nft_check_loops(ctx, ext);
+-}
+-
+-static int nft_set_catchall_loops(const struct nft_ctx *ctx,
+- struct nft_set *set)
+-{
+- u8 genmask = nft_genmask_next(ctx->net);
+- struct nft_set_elem_catchall *catchall;
+- struct nft_set_ext *ext;
+- int ret = 0;
+-
+- list_for_each_entry_rcu(catchall, &set->catchall_list, list) {
+- ext = nft_set_elem_ext(set, catchall->elem);
+- if (!nft_set_elem_active(ext, genmask))
+- continue;
+-
+- ret = nft_check_loops(ctx, ext);
+- if (ret < 0)
+- return ret;
+- }
+-
+- return ret;
+-}
+-
+-static int nf_tables_check_loops(const struct nft_ctx *ctx,
+- const struct nft_chain *chain)
+-{
+- const struct nft_rule *rule;
+- const struct nft_expr *expr, *last;
+- struct nft_set *set;
+- struct nft_set_binding *binding;
+- struct nft_set_iter iter;
+-
+- if (ctx->chain == chain)
+- return -ELOOP;
+-
+- if (fatal_signal_pending(current))
+- return -EINTR;
+-
+- list_for_each_entry(rule, &chain->rules, list) {
+- nft_rule_for_each_expr(expr, last, rule) {
+- struct nft_immediate_expr *priv;
+- const struct nft_data *data;
+- int err;
+-
+- if (strcmp(expr->ops->type->name, "immediate"))
+- continue;
+-
+- priv = nft_expr_priv(expr);
+- if (priv->dreg != NFT_REG_VERDICT)
+- continue;
+-
+- data = &priv->data;
+- switch (data->verdict.code) {
+- case NFT_JUMP:
+- case NFT_GOTO:
+- err = nf_tables_check_loops(ctx,
+- data->verdict.chain);
+- if (err < 0)
+- return err;
+- break;
+- default:
+- break;
+- }
+- }
+- }
+-
+- list_for_each_entry(set, &ctx->table->sets, list) {
+- if (!nft_is_active_next(ctx->net, set))
+- continue;
+- if (!(set->flags & NFT_SET_MAP) ||
+- set->dtype != NFT_DATA_VERDICT)
+- continue;
+-
+- list_for_each_entry(binding, &set->bindings, list) {
+- if (!(binding->flags & NFT_SET_MAP) ||
+- binding->chain != chain)
+- continue;
+-
+- iter.genmask = nft_genmask_next(ctx->net);
+- iter.type = NFT_ITER_UPDATE;
+- iter.skip = 0;
+- iter.count = 0;
+- iter.err = 0;
+- iter.fn = nf_tables_loop_check_setelem;
+-
+- set->ops->walk(ctx, set, &iter);
+- if (!iter.err)
+- iter.err = nft_set_catchall_loops(ctx, set);
+-
+- if (iter.err < 0)
+- return iter.err;
+- }
+- }
+-
+- return 0;
+-}
+-
+ /**
+ * nft_parse_u32_check - fetch u32 attribute and check for maximum value
+ *
+@@ -11061,7 +10929,7 @@ static int nft_validate_register_store(const struct nft_ctx *ctx,
+ if (data != NULL &&
+ (data->verdict.code == NFT_GOTO ||
+ data->verdict.code == NFT_JUMP)) {
+- err = nf_tables_check_loops(ctx, data->verdict.chain);
++ err = nft_chain_validate(ctx, data->verdict.chain);
+ if (err < 0)
+ return err;
+ }
+--
+2.43.0
+
--- /dev/null
+From a41ccb5bac485aafe516011a22511c2fa34dcf13 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 9 Jul 2024 02:02:26 +0200
+Subject: netfilter: nfnetlink_queue: drop bogus WARN_ON
+
+From: Florian Westphal <fw@strlen.de>
+
+[ Upstream commit 631a4b3ddc7831b20442c59c28b0476d0704c9af ]
+
+This happens when rules get flushed/deleted while a packet is out, so
+remove this WARN_ON.
+
+This WARN has existed in one form or another since v4.14; there is no
+need to backport this to older releases, hence use a more recent Fixes
+tag.
+
+Fixes: 3f8019688894 ("netfilter: move nf_reinject into nfnetlink_queue modules")
+Reported-by: kernel test robot <oliver.sang@intel.com>
+Closes: https://lore.kernel.org/oe-lkp/202407081453.11ac0f63-lkp@intel.com
+Signed-off-by: Florian Westphal <fw@strlen.de>
+Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/netfilter/nfnetlink_queue.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
+index f1c31757e4969..55e28e1da66ec 100644
+--- a/net/netfilter/nfnetlink_queue.c
++++ b/net/netfilter/nfnetlink_queue.c
+@@ -325,7 +325,7 @@ static void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict)
+ hooks = nf_hook_entries_head(net, pf, entry->state.hook);
+
+ i = entry->hook_index;
+- if (WARN_ON_ONCE(!hooks || i >= hooks->num_hook_entries)) {
++ if (!hooks || i >= hooks->num_hook_entries) {
+ kfree_skb_reason(skb, SKB_DROP_REASON_NETFILTER_DROP);
+ nf_queue_entry_free(entry);
+ return;
+--
+2.43.0
+
--- /dev/null
+From b3abb5c0c68a7b1ef3a9bc6fcc42da24af509433 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 5 Jul 2024 12:53:17 +0300
+Subject: octeontx2-af: Fix incorrect value output on error path in
+ rvu_check_rsrc_availability()
+
+From: Aleksandr Mishin <amishin@t-argos.ru>
+
+[ Upstream commit 442e26af9aa8115c96541026cbfeaaa76c85d178 ]
+
+In rvu_check_rsrc_availability(), in the case of an invalid SSOW req,
+incorrect data is printed to the error log: the 'req->sso' value is
+printed instead of 'req->ssow'. This looks like a "copy-paste" mistake.
+
+Fix this mistake by replacing 'req->sso' with 'req->ssow'.
+
+Found by Linux Verification Center (linuxtesting.org) with SVACE.
+
+Fixes: 746ea74241fa ("octeontx2-af: Add RVU block LF provisioning support")
+Signed-off-by: Aleksandr Mishin <amishin@t-argos.ru>
+Reviewed-by: Simon Horman <horms@kernel.org>
+Link: https://patch.msgid.link/20240705095317.12640-1-amishin@t-argos.ru
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/marvell/octeontx2/af/rvu.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu.c
+index ff78251f92d44..5f661e67ccbcf 100644
+--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu.c
++++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu.c
+@@ -1643,7 +1643,7 @@ static int rvu_check_rsrc_availability(struct rvu *rvu,
+ if (req->ssow > block->lf.max) {
+ dev_err(&rvu->pdev->dev,
+ "Func 0x%x: Invalid SSOW req, %d > max %d\n",
+- pcifunc, req->sso, block->lf.max);
++ pcifunc, req->ssow, block->lf.max);
+ return -EINVAL;
+ }
+ mappedlfs = rvu_get_rsrc_mapcount(pfvf, block->addr);
+--
+2.43.0
+
--- /dev/null
+From 58fe1d745546db3bf4b48d644a58cb4fb4a52c7e Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 8 Jul 2024 14:56:15 +0300
+Subject: ppp: reject claimed-as-LCP but actually malformed packets
+
+From: Dmitry Antipov <dmantipov@yandex.ru>
+
+[ Upstream commit f2aeb7306a898e1cbd03963d376f4b6656ca2b55 ]
+
+Since 'ppp_async_encode()' assumes valid LCP packets (with code
+from 1 to 7 inclusive), add 'ppp_check_packet()' to ensure that an
+LCP packet has an actual body beyond the PPP_LCP header bytes, and
+reject claimed-as-LCP but actually malformed data otherwise.
+
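+For reference, a sketch of the minimum length being enforced (constants
+as in the hunk below):
+
+    /* proto (2) | code (1) | identifier (1) | length (2) */
+    if (get_unaligned_be16(skb->data) == PPP_LCP &&
+        count < PPP_PROTO_LEN + PPP_LCP_HDRLEN)
+        return -EINVAL;    /* claims to be LCP but has no room for its header */
+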
+Reported-by: syzbot+ec0723ba9605678b14bf@syzkaller.appspotmail.com
+Closes: https://syzkaller.appspot.com/bug?extid=ec0723ba9605678b14bf
+Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
+Signed-off-by: Dmitry Antipov <dmantipov@yandex.ru>
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ppp/ppp_generic.c | 15 +++++++++++++++
+ 1 file changed, 15 insertions(+)
+
+diff --git a/drivers/net/ppp/ppp_generic.c b/drivers/net/ppp/ppp_generic.c
+index fe380fe196e7b..996dee54d751d 100644
+--- a/drivers/net/ppp/ppp_generic.c
++++ b/drivers/net/ppp/ppp_generic.c
+@@ -70,6 +70,7 @@
+ #define MPHDRLEN_SSN 4 /* ditto with short sequence numbers */
+
+ #define PPP_PROTO_LEN 2
++#define PPP_LCP_HDRLEN 4
+
+ /*
+ * An instance of /dev/ppp can be associated with either a ppp
+@@ -493,6 +494,15 @@ static ssize_t ppp_read(struct file *file, char __user *buf,
+ return ret;
+ }
+
++static bool ppp_check_packet(struct sk_buff *skb, size_t count)
++{
++ /* LCP packets must include LCP header which 4 bytes long:
++ * 1-byte code, 1-byte identifier, and 2-byte length.
++ */
++ return get_unaligned_be16(skb->data) != PPP_LCP ||
++ count >= PPP_PROTO_LEN + PPP_LCP_HDRLEN;
++}
++
+ static ssize_t ppp_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos)
+ {
+@@ -515,6 +525,11 @@ static ssize_t ppp_write(struct file *file, const char __user *buf,
+ kfree_skb(skb);
+ goto out;
+ }
++ ret = -EINVAL;
++ if (unlikely(!ppp_check_packet(skb, count))) {
++ kfree_skb(skb);
++ goto out;
++ }
+
+ switch (pf->kind) {
+ case INTERFACE:
+--
+2.43.0
+
--- /dev/null
+From 16aaf7559c680ebbbe579ac95f2f1ac0b1d112f9 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 28 Jun 2024 15:00:30 +0800
+Subject: scsi: ufs: core: Fix ufshcd_abort_one racing issue
+
+From: Peter Wang <peter.wang@mediatek.com>
+
+[ Upstream commit 74736103fb4123c71bf11fb7a6abe7c884c5269e ]
+
+When ufshcd_abort_one races with the completion ISR, the completed
+request's mq_hctx pointer is set to NULL by the ISR. Return success
+when the request has already been completed by the ISR, because
+ufshcd_abort_one does not need to do anything in that case.
+
+The racing flow is:
+
+Thread A
+ufshcd_err_handler step 1
+ ...
+ ufshcd_abort_one
+ ufshcd_try_to_abort_task
+ ufshcd_cmd_inflight(true) step 3
+ ufshcd_mcq_req_to_hwq
+ blk_mq_unique_tag
+ rq->mq_hctx->queue_num step 5
+
+Thread B
+ufs_mtk_mcq_intr(cq complete ISR) step 2
+ scsi_done
+ ...
+ __blk_mq_free_request
+ rq->mq_hctx = NULL; step 4
+
+Below is KE back trace.
+ ufshcd_try_to_abort_task: cmd at tag 41 not pending in the device.
+ ufshcd_try_to_abort_task: cmd at tag=41 is cleared.
+ Aborting tag 41 / CDB 0x28 succeeded
+ Unable to handle kernel NULL pointer dereference at virtual address 0000000000000194
+ pc : [0xffffffddd7a79bf8] blk_mq_unique_tag+0x8/0x14
+ lr : [0xffffffddd6155b84] ufshcd_mcq_req_to_hwq+0x1c/0x40 [ufs_mediatek_mod_ise]
+ do_mem_abort+0x58/0x118
+ el1_abort+0x3c/0x5c
+ el1h_64_sync_handler+0x54/0x90
+ el1h_64_sync+0x68/0x6c
+ blk_mq_unique_tag+0x8/0x14
+ ufshcd_err_handler+0xae4/0xfa8 [ufs_mediatek_mod_ise]
+ process_one_work+0x208/0x4fc
+ worker_thread+0x228/0x438
+ kthread+0x104/0x1d4
+ ret_from_fork+0x10/0x20
+
+Fixes: 93e6c0e19d5b ("scsi: ufs: core: Clear cmd if abort succeeds in MCQ mode")
+Suggested-by: Bart Van Assche <bvanassche@acm.org>
+Signed-off-by: Peter Wang <peter.wang@mediatek.com>
+Link: https://lore.kernel.org/r/20240628070030.30929-3-peter.wang@mediatek.com
+Reviewed-by: Bart Van Assche <bvanassche@acm.org>
+Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/ufs/core/ufshcd.c | 2 ++
+ 1 file changed, 2 insertions(+)
+
+diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c
+index f7d04f7c0017d..ad192b74536a2 100644
+--- a/drivers/ufs/core/ufshcd.c
++++ b/drivers/ufs/core/ufshcd.c
+@@ -6506,6 +6506,8 @@ static bool ufshcd_abort_one(struct request *rq, void *priv)
+ /* Release cmd in MCQ mode if abort succeeds */
+ if (is_mcq_enabled(hba) && (*ret == 0)) {
+ hwq = ufshcd_mcq_req_to_hwq(hba, scsi_cmd_to_rq(lrbp->cmd));
++ if (!hwq)
++ return 0;
+ spin_lock_irqsave(&hwq->cq_lock, flags);
+ if (ufshcd_cmd_inflight(lrbp->cmd))
+ ufshcd_release_scsi_cmd(hba, lrbp);
+--
+2.43.0
+
--- /dev/null
+From 13329858193e739ea296167c584b81c6e66abf52 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 28 Jun 2024 15:00:29 +0800
+Subject: scsi: ufs: core: Fix ufshcd_clear_cmd racing issue
+
+From: Peter Wang <peter.wang@mediatek.com>
+
+[ Upstream commit 9307a998cb9846a2557fdca286997430bee36a2a ]
+
+When ufshcd_clear_cmd races with the completion ISR, the completed
+request's mq_hctx pointer is set to NULL by the ISR, and
+ufshcd_clear_cmd's call to ufshcd_mcq_req_to_hwq then hits a NULL
+pointer kernel exception. Return success when the request has already
+been completed by the ISR, because the SQ does not need cleanup.
+
+The racing flow is:
+
+Thread A
+ufshcd_err_handler step 1
+ ufshcd_try_to_abort_task
+ ufshcd_cmd_inflight(true) step 3
+ ufshcd_clear_cmd
+ ...
+ ufshcd_mcq_req_to_hwq
+ blk_mq_unique_tag
+ rq->mq_hctx->queue_num step 5
+
+Thread B
+ufs_mtk_mcq_intr(cq complete ISR) step 2
+ scsi_done
+ ...
+ __blk_mq_free_request
+ rq->mq_hctx = NULL; step 4
+
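+A self-contained sketch of the hardened lookup this fix relies on (the
+structures are pared down for illustration and READ_ONCE() is
+approximated with a volatile read; only the NULL-tolerant pattern is
+the point):
+
+  struct blk_mq_hw_ctx { unsigned int queue_num; };
+  struct request       { struct blk_mq_hw_ctx *mq_hctx; };
+  struct ufs_hw_queue  { int unused; };
+  struct ufs_hba       { struct ufs_hw_queue uhq[32]; };
+
+  /* Read mq_hctx exactly once; the completion ISR may clear it at any time. */
+  static struct ufs_hw_queue *req_to_hwq(struct ufs_hba *hba, struct request *rq)
+  {
+          struct blk_mq_hw_ctx *hctx =
+                  *(struct blk_mq_hw_ctx * volatile *)&rq->mq_hctx;
+
+          return hctx ? &hba->uhq[hctx->queue_num] : NULL;
+  }
+
+Callers such as ufshcd_mcq_sq_cleanup() then treat a NULL return as
+"already completed by the ISR" and report success.
+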
+Below is KE back trace:
+
+ ufshcd_try_to_abort_task: cmd pending in the device. tag = 6
+ Unable to handle kernel NULL pointer dereference at virtual address 0000000000000194
+ pc : [0xffffffd589679bf8] blk_mq_unique_tag+0x8/0x14
+ lr : [0xffffffd5862f95b4] ufshcd_mcq_sq_cleanup+0x6c/0x1cc [ufs_mediatek_mod_ise]
+ Workqueue: ufs_eh_wq_0 ufshcd_err_handler [ufs_mediatek_mod_ise]
+ Call trace:
+ dump_backtrace+0xf8/0x148
+ show_stack+0x18/0x24
+ dump_stack_lvl+0x60/0x7c
+ dump_stack+0x18/0x3c
+ mrdump_common_die+0x24c/0x398 [mrdump]
+ ipanic_die+0x20/0x34 [mrdump]
+ notify_die+0x80/0xd8
+ die+0x94/0x2b8
+ __do_kernel_fault+0x264/0x298
+ do_page_fault+0xa4/0x4b8
+ do_translation_fault+0x38/0x54
+ do_mem_abort+0x58/0x118
+ el1_abort+0x3c/0x5c
+ el1h_64_sync_handler+0x54/0x90
+ el1h_64_sync+0x68/0x6c
+ blk_mq_unique_tag+0x8/0x14
+ ufshcd_clear_cmd+0x34/0x118 [ufs_mediatek_mod_ise]
+ ufshcd_try_to_abort_task+0x2c8/0x5b4 [ufs_mediatek_mod_ise]
+ ufshcd_err_handler+0xa7c/0xfa8 [ufs_mediatek_mod_ise]
+ process_one_work+0x208/0x4fc
+ worker_thread+0x228/0x438
+ kthread+0x104/0x1d4
+ ret_from_fork+0x10/0x20
+
+Fixes: 8d7290348992 ("scsi: ufs: mcq: Add supporting functions for MCQ abort")
+Suggested-by: Bart Van Assche <bvanassche@acm.org>
+Signed-off-by: Peter Wang <peter.wang@mediatek.com>
+Link: https://lore.kernel.org/r/20240628070030.30929-2-peter.wang@mediatek.com
+Reviewed-by: Bart Van Assche <bvanassche@acm.org>
+Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/ufs/core/ufs-mcq.c | 11 ++++++-----
+ 1 file changed, 6 insertions(+), 5 deletions(-)
+
+diff --git a/drivers/ufs/core/ufs-mcq.c b/drivers/ufs/core/ufs-mcq.c
+index 8944548c30fa1..c532416aec229 100644
+--- a/drivers/ufs/core/ufs-mcq.c
++++ b/drivers/ufs/core/ufs-mcq.c
+@@ -105,16 +105,15 @@ EXPORT_SYMBOL_GPL(ufshcd_mcq_config_mac);
+ * @hba: per adapter instance
+ * @req: pointer to the request to be issued
+ *
+- * Return: the hardware queue instance on which the request would
+- * be queued.
++ * Return: the hardware queue instance on which the request will be or has
++ * been queued. %NULL if the request has already been freed.
+ */
+ struct ufs_hw_queue *ufshcd_mcq_req_to_hwq(struct ufs_hba *hba,
+ struct request *req)
+ {
+- u32 utag = blk_mq_unique_tag(req);
+- u32 hwq = blk_mq_unique_tag_to_hwq(utag);
++ struct blk_mq_hw_ctx *hctx = READ_ONCE(req->mq_hctx);
+
+- return &hba->uhq[hwq];
++ return hctx ? &hba->uhq[hctx->queue_num] : NULL;
+ }
+
+ /**
+@@ -515,6 +514,8 @@ int ufshcd_mcq_sq_cleanup(struct ufs_hba *hba, int task_tag)
+ if (!cmd)
+ return -EINVAL;
+ hwq = ufshcd_mcq_req_to_hwq(hba, scsi_cmd_to_rq(cmd));
++ if (!hwq)
++ return 0;
+ } else {
+ hwq = hba->dev_cmd_queue;
+ }
+--
+2.43.0
+
--- /dev/null
+mm-prevent-derefencing-null-ptr-in-pfn_section_valid.patch
+scsi-ufs-core-fix-ufshcd_clear_cmd-racing-issue.patch
+scsi-ufs-core-fix-ufshcd_abort_one-racing-issue.patch
+vfio-pci-init-the-count-variable-in-collecting-hot-r.patch
+spi-axi-spi-engine-fix-sleep-calculation.patch
+cachefiles-propagate-errors-from-vfs_getxattr-to-avo.patch
+cachefiles-stop-sending-new-request-when-dropping-ob.patch
+cachefiles-cancel-all-requests-for-the-object-that-i.patch
+cachefiles-wait-for-ondemand_object_worker-to-finish.patch
+cachefiles-cyclic-allocation-of-msg_id-to-avoid-reus.patch
+cachefiles-add-missing-lock-protection-when-polling.patch
+net-dsa-introduce-dsa_phylink_to_port.patch
+net-dsa-allow-dsa-switch-drivers-to-provide-their-ow.patch
+net-dsa-lan9303-provide-own-phylink-mac-operations.patch
+dsa-lan9303-fix-mapping-between-dsa-port-number-and-.patch
+filelock-fix-potential-use-after-free-in-posix_lock_.patch
+fs-dcache-re-use-value-stored-to-dentry-d_flags-inst.patch
+vfs-don-t-mod-negative-dentry-count-when-on-shrinker.patch
+net-bcmasp-fix-error-code-in-probe.patch
+tcp-fix-incorrect-undo-caused-by-dsack-of-tlp-retran.patch
+bpf-fix-too-early-release-of-tcx_entry.patch
+net-phy-microchip-lan87xx-reinit-phy-after-cable-tes.patch
+skmsg-skip-zero-length-skb-in-sk_msg_recvmsg.patch
+octeontx2-af-fix-incorrect-value-output-on-error-pat.patch
+spi-don-t-unoptimize-message-in-spi_async.patch
+spi-add-defer_optimize_message-controller-flag.patch
+net-fix-rc7-s-__skb_datagram_iter.patch
+i40e-fix-xdp-program-unloading-while-removing-the-dr.patch
+net-ethernet-lantiq_etop-fix-double-free-in-detach.patch
+minixfs-fix-minixfs_rename-with-highmem.patch
+bpf-fix-order-of-args-in-call-to-bpf_map_kvcalloc.patch
+bpf-make-timer-data-struct-more-generic.patch
+bpf-replace-bpf_timer_init-with-a-generic-helper.patch
+bpf-fail-bpf_timer_cancel-when-callback-is-being-can.patch
+bpf-defer-work-in-bpf_timer_cancel_and_free.patch
+tcp-avoid-too-many-retransmit-packets.patch
+net-ethernet-mtk-star-emac-set-mac_managed_pm-when-p.patch
+ppp-reject-claimed-as-lcp-but-actually-malformed-pac.patch
+ethtool-netlink-do-not-return-sqi-value-if-link-is-d.patch
+netfilter-nfnetlink_queue-drop-bogus-warn_on.patch
+netfilter-nf_tables-prefer-nft_chain_validate.patch
+udp-set-sock_rcu_free-earlier-in-udp_lib_get_port.patch
+net-sched-fix-uaf-when-resolving-a-clash.patch
+net-sunrpc-remap-eperm-in-case-of-connection-failure.patch
--- /dev/null
+From be3a9a762578471efbc3466e6bf978333a1163fc Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 3 Jul 2024 16:39:31 +0800
+Subject: skmsg: Skip zero length skb in sk_msg_recvmsg
+
+From: Geliang Tang <tanggeliang@kylinos.cn>
+
+[ Upstream commit f0c18025693707ec344a70b6887f7450bf4c826b ]
+
+When running BPF selftests (./test_progs -t sockmap_basic) on a Loongarch
+platform, the following kernel panic occurs:
+
+ [...]
+ Oops[#1]:
+ CPU: 22 PID: 2824 Comm: test_progs Tainted: G OE 6.10.0-rc2+ #18
+ Hardware name: LOONGSON Dabieshan/Loongson-TC542F0, BIOS Loongson-UDK2018
+ ... ...
+ ra: 90000000048bf6c0 sk_msg_recvmsg+0x120/0x560
+ ERA: 9000000004162774 copy_page_to_iter+0x74/0x1c0
+ CRMD: 000000b0 (PLV0 -IE -DA +PG DACF=CC DACM=CC -WE)
+ PRMD: 0000000c (PPLV0 +PIE +PWE)
+ EUEN: 00000007 (+FPE +SXE +ASXE -BTE)
+ ECFG: 00071c1d (LIE=0,2-4,10-12 VS=7)
+ ESTAT: 00010000 [PIL] (IS= ECode=1 EsubCode=0)
+ BADV: 0000000000000040
+ PRID: 0014c011 (Loongson-64bit, Loongson-3C5000)
+ Modules linked in: bpf_testmod(OE) xt_CHECKSUM xt_MASQUERADE xt_conntrack
+ Process test_progs (pid: 2824, threadinfo=0000000000863a31, task=...)
+ Stack : ...
+ Call Trace:
+ [<9000000004162774>] copy_page_to_iter+0x74/0x1c0
+ [<90000000048bf6c0>] sk_msg_recvmsg+0x120/0x560
+ [<90000000049f2b90>] tcp_bpf_recvmsg_parser+0x170/0x4e0
+ [<90000000049aae34>] inet_recvmsg+0x54/0x100
+ [<900000000481ad5c>] sock_recvmsg+0x7c/0xe0
+ [<900000000481e1a8>] __sys_recvfrom+0x108/0x1c0
+ [<900000000481e27c>] sys_recvfrom+0x1c/0x40
+ [<9000000004c076ec>] do_syscall+0x8c/0xc0
+ [<9000000003731da4>] handle_syscall+0xc4/0x160
+ Code: ...
+ ---[ end trace 0000000000000000 ]---
+ Kernel panic - not syncing: Fatal exception
+ Kernel relocated by 0x3510000
+ .text @ 0x9000000003710000
+ .data @ 0x9000000004d70000
+ .bss @ 0x9000000006469400
+ ---[ end Kernel panic - not syncing: Fatal exception ]---
+ [...]
+
+This crash happens every time when running sockmap_skb_verdict_shutdown
+subtest in sockmap_basic.
+
+This crash happens because a NULL pointer is passed to page_address()
+in sk_msg_recvmsg(). Since page_address() is implemented differently
+depending on the architecture, page_address(NULL) triggers a panic on
+the LoongArch platform but not on x86. So this bug was hidden on x86
+for a while, but it is now exposed on LoongArch. The root cause is
+that a zero-length skb (skb->len == 0) was put on the queue.
+
+This zero-length skb is a TCP FIN packet, which was sent by shutdown(),
+invoked in test_sockmap_skb_verdict_shutdown():
+
+ shutdown(p1, SHUT_WR);
+
+In this case, in sk_psock_skb_ingress_enqueue(), num_sge is zero and no
+page is put into this sge, but this empty sge is still queued into the
+ingress_msg list.
+
+And in sk_msg_recvmsg(), this empty sge is used, and sg_page(sge)
+returns a NULL page. This NULL page is passed to copy_page_to_iter(),
+which hands it to kmap_local_page() and then page_address(), and the
+kernel panics.
+
+To solve this, skip the zero-length skb: in sk_msg_recvmsg(), if copy
+is zero, that means it is a zero-length skb, so skip invoking
+copy_page_to_iter(). We still rely on the EFAULT return triggered by
+copy_page_to_iter() to check for is_fin in tcp_bpf.c.
+
+Fixes: 604326b41a6f ("bpf, sockmap: convert to generic sk_msg interface")
+Suggested-by: John Fastabend <john.fastabend@gmail.com>
+Signed-off-by: Geliang Tang <tanggeliang@kylinos.cn>
+Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
+Reviewed-by: John Fastabend <john.fastabend@gmail.com>
+Link: https://lore.kernel.org/bpf/e3a16eacdc6740658ee02a33489b1b9d4912f378.1719992715.git.tanggeliang@kylinos.cn
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/core/skmsg.c | 3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+diff --git a/net/core/skmsg.c b/net/core/skmsg.c
+index fd20aae30be23..bbf40b9997138 100644
+--- a/net/core/skmsg.c
++++ b/net/core/skmsg.c
+@@ -434,7 +434,8 @@ int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg,
+ page = sg_page(sge);
+ if (copied + copy > len)
+ copy = len - copied;
+- copy = copy_page_to_iter(page, sge->offset, copy, iter);
++ if (copy)
++ copy = copy_page_to_iter(page, sge->offset, copy, iter);
+ if (!copy) {
+ copied = copied ? copied : -EFAULT;
+ goto out;
+--
+2.43.0
+
--- /dev/null
+From bd18cef36942a09fbb1dbf4748bb8afce992b43b Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 8 Jul 2024 20:05:29 -0500
+Subject: spi: add defer_optimize_message controller flag
+
+From: David Lechner <dlechner@baylibre.com>
+
+[ Upstream commit ca52aa4c60f76566601b42e935b8a78f0fb4f8eb ]
+
+Adding spi_optimize_message() broke the spi-mux driver because it
+calls spi_async() from its transfer_one_message() callback. This
+resulted in passing an incorrectly optimized message to the controller.
+For example, if the underlying controller has an optimize_message()
+callback, this would have not been called and can cause a crash when
+the underlying controller driver tries to transfer the message.
+
+Also, since the spi-mux driver swaps out the controller pointer by
+replacing msg->spi, __spi_unoptimize_message() was being called with a
+different controller than the one used in __spi_optimize_message(). This
+could cause a crash when attempting to free the message resources when
+__spi_unoptimize_message() is called in spi_finalize_current_message()
+since it is being called with a controller that did not allocate the
+resources.
+
+This is fixed by adding a defer_optimize_message flag for controllers.
+This flag causes all of the spi_[maybe_][un]optimize_message() calls to
+be no-ops (other than attaching a pointer to the spi device to the
+message).
+
+This allows the spi-mux driver to pass an unmodified message to
+spi_async() in spi_mux_transfer_one_message() after the spi device has
+been swapped out. This causes __spi_optimize_message() and
+__spi_unoptimize_message() to be called only once per message and with
+the correct/same controller in each case.
+
+Reported-by: Oleksij Rempel <o.rempel@pengutronix.de>
+Closes: https://lore.kernel.org/linux-spi/Zn6HMrYG2b7epUxT@pengutronix.de/
+Reported-by: Marc Kleine-Budde <mkl@pengutronix.de>
+Closes: https://lore.kernel.org/linux-spi/20240628-awesome-discerning-bear-1621f9-mkl@pengutronix.de/
+Fixes: 7b1d87af14d9 ("spi: add spi_optimize_message() APIs")
+Signed-off-by: David Lechner <dlechner@baylibre.com>
+Link: https://patch.msgid.link/20240708-spi-mux-fix-v1-2-6c8845193128@baylibre.com
+Signed-off-by: Mark Brown <broonie@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/spi/spi-mux.c | 1 +
+ drivers/spi/spi.c | 18 +++++++++++++++++-
+ include/linux/spi/spi.h | 4 ++++
+ 3 files changed, 22 insertions(+), 1 deletion(-)
+
+diff --git a/drivers/spi/spi-mux.c b/drivers/spi/spi-mux.c
+index bd988f53753e2..031b5795d1060 100644
+--- a/drivers/spi/spi-mux.c
++++ b/drivers/spi/spi-mux.c
+@@ -162,6 +162,7 @@ static int spi_mux_probe(struct spi_device *spi)
+ ctlr->bus_num = -1;
+ ctlr->dev.of_node = spi->dev.of_node;
+ ctlr->must_async = true;
++ ctlr->defer_optimize_message = true;
+
+ ret = devm_spi_register_controller(&spi->dev, ctlr);
+ if (ret)
+diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c
+index a1958e86f75c8..9304fd03bf764 100644
+--- a/drivers/spi/spi.c
++++ b/drivers/spi/spi.c
+@@ -2137,7 +2137,8 @@ static void __spi_unoptimize_message(struct spi_message *msg)
+ */
+ static void spi_maybe_unoptimize_message(struct spi_message *msg)
+ {
+- if (!msg->pre_optimized && msg->optimized)
++ if (!msg->pre_optimized && msg->optimized &&
++ !msg->spi->controller->defer_optimize_message)
+ __spi_unoptimize_message(msg);
+ }
+
+@@ -4285,6 +4286,11 @@ static int __spi_optimize_message(struct spi_device *spi,
+ static int spi_maybe_optimize_message(struct spi_device *spi,
+ struct spi_message *msg)
+ {
++ if (spi->controller->defer_optimize_message) {
++ msg->spi = spi;
++ return 0;
++ }
++
+ if (msg->pre_optimized)
+ return 0;
+
+@@ -4315,6 +4321,13 @@ int spi_optimize_message(struct spi_device *spi, struct spi_message *msg)
+ {
+ int ret;
+
++ /*
++ * Pre-optimization is not supported and optimization is deferred e.g.
++ * when using spi-mux.
++ */
++ if (spi->controller->defer_optimize_message)
++ return 0;
++
+ ret = __spi_optimize_message(spi, msg);
+ if (ret)
+ return ret;
+@@ -4341,6 +4354,9 @@ EXPORT_SYMBOL_GPL(spi_optimize_message);
+ */
+ void spi_unoptimize_message(struct spi_message *msg)
+ {
++ if (msg->spi->controller->defer_optimize_message)
++ return;
++
+ __spi_unoptimize_message(msg);
+ msg->pre_optimized = false;
+ }
+diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h
+index c459809efee4f..64a4deb18dd00 100644
+--- a/include/linux/spi/spi.h
++++ b/include/linux/spi/spi.h
+@@ -532,6 +532,9 @@ extern struct spi_device *spi_new_ancillary_device(struct spi_device *spi, u8 ch
+ * @queue_empty: signal green light for opportunistically skipping the queue
+ * for spi_sync transfers.
+ * @must_async: disable all fast paths in the core
++ * @defer_optimize_message: set to true if controller cannot pre-optimize messages
++ * and needs to defer the optimization step until the message is actually
++ * being transferred
+ *
+ * Each SPI controller can communicate with one or more @spi_device
+ * children. These make a small bus, sharing MOSI, MISO and SCK signals
+@@ -775,6 +778,7 @@ struct spi_controller {
+ /* Flag for enabling opportunistic skipping of the queue in spi_sync */
+ bool queue_empty;
+ bool must_async;
++ bool defer_optimize_message;
+ };
+
+ static inline void *spi_controller_get_devdata(struct spi_controller *ctlr)
+--
+2.43.0
+
--- /dev/null
+From 1461ddd3c3e87f93dc0743bd6dd4dd8f62c0d9c9 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 20 Jun 2024 11:43:58 -0500
+Subject: spi: axi-spi-engine: fix sleep calculation
+
+From: David Lechner <dlechner@baylibre.com>
+
+[ Upstream commit 40b3d0838a1ff242e61f341e49226074bbdd319f ]
+
+The sleep calculation was not taking into account increased delay when
+the SPI device is not running at the maximum SCLK frequency.
+
+Rounding down was fine when one SCLK tick took the same time as the
+instruction execution, but it rounds down too much when SCLK is
+slower. Change the rounding to round up instead, while still taking
+the instruction execution time into account, so that small delays
+remain accurate.
+
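+A worked example of the difference, as a standalone arithmetic sketch
+(the 40 MHz reference clock, 1 MHz transfer speed and 1500 ns delay are
+made-up illustrative values):
+
+  #include <stdio.h>
+
+  #define NSEC_PER_SEC 1000000000ULL
+
+  int main(void)
+  {
+          unsigned long long max_speed_hz = 40000000ULL; /* assumed ref clock */
+          unsigned long long sclk_hz = 1000000ULL;       /* 1000 ns per tick */
+          unsigned long long delay_ns = 1500ULL;         /* requested delay */
+          unsigned long long inst_ns =
+                  (NSEC_PER_SEC + max_speed_hz - 1) / max_speed_hz; /* 25 ns */
+
+          /* Old: round down -> 1 tick -> only ~1000 ns actually slept. */
+          unsigned long long old_ticks = delay_ns * sclk_hz / NSEC_PER_SEC;
+
+          /* New: subtract instruction time, round up -> 2 ticks -> ~2000 ns. */
+          unsigned long long new_ticks =
+                  ((delay_ns - inst_ns) * sclk_hz + NSEC_PER_SEC - 1) / NSEC_PER_SEC;
+
+          printf("old=%llu tick(s), new=%llu tick(s)\n", old_ticks, new_ticks);
+          return 0;
+  }
+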
+Fixes: be9070bcf670 ("spi: axi-spi-engine: fix sleep ticks calculation")
+Signed-off-by: David Lechner <dlechner@baylibre.com>
+Link: https://patch.msgid.link/20240620-spi-axi-spi-engine-fix-sleep-time-v1-1-b20b527924a0@baylibre.com
+Signed-off-by: Mark Brown <broonie@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/spi/spi-axi-spi-engine.c | 26 ++++++++++++++++++--------
+ 1 file changed, 18 insertions(+), 8 deletions(-)
+
+diff --git a/drivers/spi/spi-axi-spi-engine.c b/drivers/spi/spi-axi-spi-engine.c
+index e358ac5b45097..96a524772549e 100644
+--- a/drivers/spi/spi-axi-spi-engine.c
++++ b/drivers/spi/spi-axi-spi-engine.c
+@@ -164,16 +164,20 @@ static void spi_engine_gen_xfer(struct spi_engine_program *p, bool dry,
+ }
+
+ static void spi_engine_gen_sleep(struct spi_engine_program *p, bool dry,
+- int delay_ns, u32 sclk_hz)
++ int delay_ns, int inst_ns, u32 sclk_hz)
+ {
+ unsigned int t;
+
+- /* negative delay indicates error, e.g. from spi_delay_to_ns() */
+- if (delay_ns <= 0)
++ /*
++ * Negative delay indicates error, e.g. from spi_delay_to_ns(). And if
++ * delay is less than the instruction execution time, there is no need
++ * for an extra sleep instruction since the instruction execution time
++ * will already cover the required delay.
++ */
++ if (delay_ns < 0 || delay_ns <= inst_ns)
+ return;
+
+- /* rounding down since executing the instruction adds a couple of ticks delay */
+- t = DIV_ROUND_DOWN_ULL((u64)delay_ns * sclk_hz, NSEC_PER_SEC);
++ t = DIV_ROUND_UP_ULL((u64)(delay_ns - inst_ns) * sclk_hz, NSEC_PER_SEC);
+ while (t) {
+ unsigned int n = min(t, 256U);
+
+@@ -220,10 +224,16 @@ static void spi_engine_compile_message(struct spi_message *msg, bool dry,
+ struct spi_device *spi = msg->spi;
+ struct spi_controller *host = spi->controller;
+ struct spi_transfer *xfer;
+- int clk_div, new_clk_div;
++ int clk_div, new_clk_div, inst_ns;
+ bool keep_cs = false;
+ u8 bits_per_word = 0;
+
++ /*
++ * Take into account instruction execution time for more accurate sleep
++ * times, especially when the delay is small.
++ */
++ inst_ns = DIV_ROUND_UP(NSEC_PER_SEC, host->max_speed_hz);
++
+ clk_div = 1;
+
+ spi_engine_program_add_cmd(p, dry,
+@@ -252,7 +262,7 @@ static void spi_engine_compile_message(struct spi_message *msg, bool dry,
+
+ spi_engine_gen_xfer(p, dry, xfer);
+ spi_engine_gen_sleep(p, dry, spi_delay_to_ns(&xfer->delay, xfer),
+- xfer->effective_speed_hz);
++ inst_ns, xfer->effective_speed_hz);
+
+ if (xfer->cs_change) {
+ if (list_is_last(&xfer->transfer_list, &msg->transfers)) {
+@@ -262,7 +272,7 @@ static void spi_engine_compile_message(struct spi_message *msg, bool dry,
+ spi_engine_gen_cs(p, dry, spi, false);
+
+ spi_engine_gen_sleep(p, dry, spi_delay_to_ns(
+- &xfer->cs_change_delay, xfer),
++ &xfer->cs_change_delay, xfer), inst_ns,
+ xfer->effective_speed_hz);
+
+ if (!list_next_entry(xfer, transfer_list)->cs_off)
+--
+2.43.0
+
--- /dev/null
+From e886397569fa772a6773c21fb29fed1cd5866dc3 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 8 Jul 2024 20:05:28 -0500
+Subject: spi: don't unoptimize message in spi_async()
+
+From: David Lechner <dlechner@baylibre.com>
+
+[ Upstream commit c86a918b1bdba78fb155184f8d88dfba1e63335d ]
+
+Calling spi_maybe_unoptimize_message() in spi_async() is wrong because
+the message is likely to be in the queue and not transferred yet. This
+can corrupt the message while it is being used by the controller driver.
+
+spi_maybe_unoptimize_message() is already called in the correct place
+in spi_finalize_current_message() to balance the call to
+spi_maybe_optimize_message() in spi_async().
+
+Fixes: 7b1d87af14d9 ("spi: add spi_optimize_message() APIs")
+Signed-off-by: David Lechner <dlechner@baylibre.com>
+Link: https://patch.msgid.link/20240708-spi-mux-fix-v1-1-6c8845193128@baylibre.com
+Signed-off-by: Mark Brown <broonie@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/spi/spi.c | 2 --
+ 1 file changed, 2 deletions(-)
+
+diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c
+index c349d6012625a..a1958e86f75c8 100644
+--- a/drivers/spi/spi.c
++++ b/drivers/spi/spi.c
+@@ -4423,8 +4423,6 @@ int spi_async(struct spi_device *spi, struct spi_message *message)
+
+ spin_unlock_irqrestore(&ctlr->bus_lock_spinlock, flags);
+
+- spi_maybe_unoptimize_message(message);
+-
+ return ret;
+ }
+ EXPORT_SYMBOL_GPL(spi_async);
+--
+2.43.0
+
--- /dev/null
+From 6d0a55397594271363d675d5a0b9845b022f87fb Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 10 Jul 2024 00:14:01 +0000
+Subject: tcp: avoid too many retransmit packets
+
+From: Eric Dumazet <edumazet@google.com>
+
+[ Upstream commit 97a9063518f198ec0adb2ecb89789de342bb8283 ]
+
+If a TCP socket is using TCP_USER_TIMEOUT, and the other peer
+retracted its window to zero, tcp_retransmit_timer() can
+retransmit a packet every two jiffies (2 ms for HZ=1000),
+for about 4 minutes after TCP_USER_TIMEOUT has 'expired'.
+
+The fix is to make sure tcp_rtx_probe0_timed_out() takes
+icsk->icsk_user_timeout into account.
+
+Before blamed commit, the socket would not timeout after
+icsk->icsk_user_timeout, but would use standard exponential
+backoff for the retransmits.
+
+Also worth noting that before commit e89688e3e978 ("net: tcp:
+fix unexcepted socket die when snd_wnd is 0"), the issue
+would last 2 minutes instead of 4.
+
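+For reference, TCP_USER_TIMEOUT is the per-socket option, in
+milliseconds, that applications set to bound how long transmitted data
+may remain unacknowledged; a minimal usage sketch (the 30 s value is
+arbitrary):
+
+  #include <netinet/in.h>
+  #include <netinet/tcp.h>
+  #include <sys/socket.h>
+
+  /* Give up after 30 s without progress, including zero-window probing. */
+  static int set_user_timeout(int fd)
+  {
+          unsigned int timeout_ms = 30000;
+
+          return setsockopt(fd, IPPROTO_TCP, TCP_USER_TIMEOUT,
+                            &timeout_ms, sizeof(timeout_ms));
+  }
+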
+Fixes: b701a99e431d ("tcp: Add tcp_clamp_rto_to_user_timeout() helper to improve accuracy")
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Cc: Neal Cardwell <ncardwell@google.com>
+Reviewed-by: Jason Xing <kerneljasonxing@gmail.com>
+Reviewed-by: Jon Maxwell <jmaxwell37@gmail.com>
+Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
+Link: https://patch.msgid.link/20240710001402.2758273-1-edumazet@google.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/ipv4/tcp_timer.c | 15 +++++++++++++--
+ 1 file changed, 13 insertions(+), 2 deletions(-)
+
+diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
+index 22d25f63858b9..cceb4fabd4c85 100644
+--- a/net/ipv4/tcp_timer.c
++++ b/net/ipv4/tcp_timer.c
+@@ -481,15 +481,26 @@ static bool tcp_rtx_probe0_timed_out(const struct sock *sk,
+ const struct sk_buff *skb,
+ u32 rtx_delta)
+ {
++ const struct inet_connection_sock *icsk = inet_csk(sk);
++ u32 user_timeout = READ_ONCE(icsk->icsk_user_timeout);
+ const struct tcp_sock *tp = tcp_sk(sk);
+- const int timeout = TCP_RTO_MAX * 2;
++ int timeout = TCP_RTO_MAX * 2;
+ s32 rcv_delta;
+
++ if (user_timeout) {
++ /* If user application specified a TCP_USER_TIMEOUT,
++ * it does not want win 0 packets to 'reset the timer'
++ * while retransmits are not making progress.
++ */
++ if (rtx_delta > user_timeout)
++ return true;
++ timeout = min_t(u32, timeout, msecs_to_jiffies(user_timeout));
++ }
+ /* Note: timer interrupt might have been delayed by at least one jiffy,
+ * and tp->rcv_tstamp might very well have been written recently.
+ * rcv_delta can thus be negative.
+ */
+- rcv_delta = inet_csk(sk)->icsk_timeout - tp->rcv_tstamp;
++ rcv_delta = icsk->icsk_timeout - tp->rcv_tstamp;
+ if (rcv_delta <= timeout)
+ return false;
+
+--
+2.43.0
+
--- /dev/null
+From f15a7324f90226993462fa3abef149e1aa332d6c Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 3 Jul 2024 13:12:46 -0400
+Subject: tcp: fix incorrect undo caused by DSACK of TLP retransmit
+
+From: Neal Cardwell <ncardwell@google.com>
+
+[ Upstream commit 0ec986ed7bab6801faed1440e8839dcc710331ff ]
+
+Loss recovery undo_retrans bookkeeping had a long-standing bug where a
+DSACK from a spurious TLP retransmit packet could cause an erroneous
+undo of a fast recovery or RTO recovery that repaired a single
+really-lost packet (in a sequence range outside that of the TLP
+retransmit). Basically, because the loss recovery state machine didn't
+account for the fact that it sent a TLP retransmit, the DSACK for the
+TLP retransmit could erroneously be implicitly be interpreted as
+corresponding to the normal fast recovery or RTO recovery retransmit
+that plugged a real hole, thus resulting in an improper undo.
+
+For example, consider the following buggy scenario where there is a
+real packet loss but the congestion control response is improperly
+undone because of this bug:
+
++ send packets P1, P2, P3, P4
++ P1 is really lost
++ send TLP retransmit of P4
++ receive SACK for original P2, P3, P4
++ enter fast recovery, fast-retransmit P1, increment undo_retrans to 1
++ receive DSACK for TLP P4, decrement undo_retrans to 0, undo (bug!)
++ receive cumulative ACK for P1-P4 (fast retransmit plugged real hole)
+
+The fix: when we initialize undo machinery in tcp_init_undo(), if
+there is a TLP retransmit in flight, then increment tp->undo_retrans
+so that we make sure that we receive a DSACK corresponding to the TLP
+retransmit, as well as DSACKs for all later normal retransmits, before
+triggering a loss recovery undo. Note that we also have to move the
+line that clears tp->tlp_high_seq for RTO recovery, so that upon RTO
+we remember the tp->tlp_high_seq value until tcp_init_undo() and clear
+it only afterward.
+
+Also note that the bug dates back to the original 2013 TLP
+implementation, commit 6ba8a3b19e76 ("tcp: Tail loss probe (TLP)").
+
+However, this patch will only compile and work correctly with kernels
+that have tp->tlp_retrans, which was added only in v5.8 in 2020 in
+commit 76be93fc0702 ("tcp: allow at most one TLP probe per flight").
+So we associate this fix with that later commit.
+
+Fixes: 76be93fc0702 ("tcp: allow at most one TLP probe per flight")
+Signed-off-by: Neal Cardwell <ncardwell@google.com>
+Reviewed-by: Eric Dumazet <edumazet@google.com>
+Cc: Yuchung Cheng <ycheng@google.com>
+Cc: Kevin Yang <yyd@google.com>
+Link: https://patch.msgid.link/20240703171246.1739561-1-ncardwell.sw@gmail.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/ipv4/tcp_input.c | 11 ++++++++++-
+ net/ipv4/tcp_timer.c | 2 --
+ 2 files changed, 10 insertions(+), 3 deletions(-)
+
+diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
+index 7b692bcb61d4a..c765d479869dc 100644
+--- a/net/ipv4/tcp_input.c
++++ b/net/ipv4/tcp_input.c
+@@ -2126,8 +2126,16 @@ void tcp_clear_retrans(struct tcp_sock *tp)
+ static inline void tcp_init_undo(struct tcp_sock *tp)
+ {
+ tp->undo_marker = tp->snd_una;
++
+ /* Retransmission still in flight may cause DSACKs later. */
+- tp->undo_retrans = tp->retrans_out ? : -1;
++ /* First, account for regular retransmits in flight: */
++ tp->undo_retrans = tp->retrans_out;
++ /* Next, account for TLP retransmits in flight: */
++ if (tp->tlp_high_seq && tp->tlp_retrans)
++ tp->undo_retrans++;
++ /* Finally, avoid 0, because undo_retrans==0 means "can undo now": */
++ if (!tp->undo_retrans)
++ tp->undo_retrans = -1;
+ }
+
+ static bool tcp_is_rack(const struct sock *sk)
+@@ -2206,6 +2214,7 @@ void tcp_enter_loss(struct sock *sk)
+
+ tcp_set_ca_state(sk, TCP_CA_Loss);
+ tp->high_seq = tp->snd_nxt;
++ tp->tlp_high_seq = 0;
+ tcp_ecn_queue_cwr(tp);
+
+ /* F-RTO RFC5682 sec 3.1 step 1: retransmit SND.UNA if no previous
+diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
+index f96f68cf7961c..22d25f63858b9 100644
+--- a/net/ipv4/tcp_timer.c
++++ b/net/ipv4/tcp_timer.c
+@@ -534,8 +534,6 @@ void tcp_retransmit_timer(struct sock *sk)
+ if (WARN_ON_ONCE(!skb))
+ return;
+
+- tp->tlp_high_seq = 0;
+-
+ if (!tp->snd_wnd && !sock_flag(sk, SOCK_DEAD) &&
+ !((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))) {
+ /* Receiver dastardly shrinks window. Our retransmits
+--
+2.43.0
+
--- /dev/null
+From 70d06fb0d77b8fd23f9138e10d667a3a9c731cfe Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 9 Jul 2024 12:13:56 -0700
+Subject: udp: Set SOCK_RCU_FREE earlier in udp_lib_get_port().
+
+From: Kuniyuki Iwashima <kuniyu@amazon.com>
+
+[ Upstream commit 5c0b485a8c6116516f33925b9ce5b6104a6eadfd ]
+
+syzkaller triggered the warning [0] in udp_v4_early_demux().
+
+In udp_v[46]_early_demux() and sk_lookup(), we do not touch the refcount
+of the looked-up sk and use sock_pfree() as skb->destructor, so we check
+SOCK_RCU_FREE to ensure that the sk is safe to access during the RCU grace
+period.
+
+Currently, SOCK_RCU_FREE is flagged for a bound socket after being put
+into the hash table. Moreover, the SOCK_RCU_FREE check is done too early
+in udp_v[46]_early_demux() and sk_lookup(), so there could be a small race
+window:
+
+ CPU1 CPU2
+ ---- ----
+ udp_v4_early_demux() udp_lib_get_port()
+ | |- hlist_add_head_rcu()
+ |- sk = __udp4_lib_demux_lookup() |
+ |- DEBUG_NET_WARN_ON_ONCE(sk_is_refcounted(sk));
+ `- sock_set_flag(sk, SOCK_RCU_FREE)
+
+We had the same bug in TCP and fixed it in commit 871019b22d1b ("net:
+set SOCK_RCU_FREE before inserting socket into hashtable").
+
+Let's apply the same fix for UDP.
+
+[0]:
+WARNING: CPU: 0 PID: 11198 at net/ipv4/udp.c:2599 udp_v4_early_demux+0x481/0xb70 net/ipv4/udp.c:2599
+Modules linked in:
+CPU: 0 PID: 11198 Comm: syz-executor.1 Not tainted 6.9.0-g93bda33046e7 #13
+Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014
+RIP: 0010:udp_v4_early_demux+0x481/0xb70 net/ipv4/udp.c:2599
+Code: c5 7a 15 fe bb 01 00 00 00 44 89 e9 31 ff d3 e3 81 e3 bf ef ff ff 89 de e8 2c 74 15 fe 85 db 0f 85 02 06 00 00 e8 9f 7a 15 fe <0f> 0b e8 98 7a 15 fe 49 8d 7e 60 e8 4f 39 2f fe 49 c7 46 60 20 52
+RSP: 0018:ffffc9000ce3fa58 EFLAGS: 00010293
+RAX: 0000000000000000 RBX: 0000000000000000 RCX: ffffffff8318c92c
+RDX: ffff888036ccde00 RSI: ffffffff8318c2f1 RDI: 0000000000000001
+RBP: ffff88805a2dd6e0 R08: 0000000000000001 R09: 0000000000000000
+R10: 0000000000000000 R11: 0001ffffffffffff R12: ffff88805a2dd680
+R13: 0000000000000007 R14: ffff88800923f900 R15: ffff88805456004e
+FS: 00007fc449127640(0000) GS:ffff88807dc00000(0000) knlGS:0000000000000000
+CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+CR2: 00007fc449126e38 CR3: 000000003de4b002 CR4: 0000000000770ef0
+DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
+DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000600
+PKRU: 55555554
+Call Trace:
+ <TASK>
+ ip_rcv_finish_core.constprop.0+0xbdd/0xd20 net/ipv4/ip_input.c:349
+ ip_rcv_finish+0xda/0x150 net/ipv4/ip_input.c:447
+ NF_HOOK include/linux/netfilter.h:314 [inline]
+ NF_HOOK include/linux/netfilter.h:308 [inline]
+ ip_rcv+0x16c/0x180 net/ipv4/ip_input.c:569
+ __netif_receive_skb_one_core+0xb3/0xe0 net/core/dev.c:5624
+ __netif_receive_skb+0x21/0xd0 net/core/dev.c:5738
+ netif_receive_skb_internal net/core/dev.c:5824 [inline]
+ netif_receive_skb+0x271/0x300 net/core/dev.c:5884
+ tun_rx_batched drivers/net/tun.c:1549 [inline]
+ tun_get_user+0x24db/0x2c50 drivers/net/tun.c:2002
+ tun_chr_write_iter+0x107/0x1a0 drivers/net/tun.c:2048
+ new_sync_write fs/read_write.c:497 [inline]
+ vfs_write+0x76f/0x8d0 fs/read_write.c:590
+ ksys_write+0xbf/0x190 fs/read_write.c:643
+ __do_sys_write fs/read_write.c:655 [inline]
+ __se_sys_write fs/read_write.c:652 [inline]
+ __x64_sys_write+0x41/0x50 fs/read_write.c:652
+ x64_sys_call+0xe66/0x1990 arch/x86/include/generated/asm/syscalls_64.h:2
+ do_syscall_x64 arch/x86/entry/common.c:52 [inline]
+ do_syscall_64+0x4b/0x110 arch/x86/entry/common.c:83
+ entry_SYSCALL_64_after_hwframe+0x4b/0x53
+RIP: 0033:0x7fc44a68bc1f
+Code: 89 54 24 18 48 89 74 24 10 89 7c 24 08 e8 e9 cf f5 ff 48 8b 54 24 18 48 8b 74 24 10 41 89 c0 8b 7c 24 08 b8 01 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 31 44 89 c7 48 89 44 24 08 e8 3c d0 f5 ff 48
+RSP: 002b:00007fc449126c90 EFLAGS: 00000293 ORIG_RAX: 0000000000000001
+RAX: ffffffffffffffda RBX: 00000000004bc050 RCX: 00007fc44a68bc1f
+RDX: 0000000000000032 RSI: 00000000200000c0 RDI: 00000000000000c8
+RBP: 00000000004bc050 R08: 0000000000000000 R09: 0000000000000000
+R10: 0000000000000032 R11: 0000000000000293 R12: 0000000000000000
+R13: 000000000000000b R14: 00007fc44a5ec530 R15: 0000000000000000
+ </TASK>
+
+Fixes: 6acc9b432e67 ("bpf: Add helper to retrieve socket in BPF")
+Reported-by: syzkaller <syzkaller@googlegroups.com>
+Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
+Reviewed-by: Eric Dumazet <edumazet@google.com>
+Link: https://patch.msgid.link/20240709191356.24010-1-kuniyu@amazon.com
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/ipv4/udp.c | 4 +++-
+ 1 file changed, 3 insertions(+), 1 deletion(-)
+
+diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
+index 72d3bf136810d..fb71bf3b12b47 100644
+--- a/net/ipv4/udp.c
++++ b/net/ipv4/udp.c
+@@ -326,6 +326,8 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
+ goto fail_unlock;
+ }
+
++ sock_set_flag(sk, SOCK_RCU_FREE);
++
+ sk_add_node_rcu(sk, &hslot->head);
+ hslot->count++;
+ sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
+@@ -342,7 +344,7 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
+ hslot2->count++;
+ spin_unlock(&hslot2->lock);
+ }
+- sock_set_flag(sk, SOCK_RCU_FREE);
++
+ error = 0;
+ fail_unlock:
+ spin_unlock_bh(&hslot->lock);
+--
+2.43.0
+
--- /dev/null
+From 712fb7c5dae10b234c2fe336b8ca00c5ef7c8bca Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 9 Jul 2024 17:41:50 -0700
+Subject: vfio/pci: Init the count variable in collecting hot-reset devices
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Yi Liu <yi.l.liu@intel.com>
+
+[ Upstream commit 5a88a3f67e37e39f933b38ebb4985ba5822e9eca ]
+
+The count variable is used without initialization, which results in
+mistakes in the device counting and crashes userspace if the get hot
+reset info path is triggered.
+
+Fixes: f6944d4a0b87 ("vfio/pci: Collect hot-reset devices to local buffer")
+Link: https://bugzilla.kernel.org/show_bug.cgi?id=219010
+Reported-by: Žilvinas Žaltiena <zaltys@natrix.lt>
+Cc: Beld Zhang <beldzhang@gmail.com>
+Signed-off-by: Yi Liu <yi.l.liu@intel.com>
+Reviewed-by: Kevin Tian <kevin.tian@intel.com>
+Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
+Link: https://lore.kernel.org/r/20240710004150.319105-1-yi.l.liu@intel.com
+Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/vfio/pci/vfio_pci_core.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
+index d8c95cc16be81..ea36d2139590f 100644
+--- a/drivers/vfio/pci/vfio_pci_core.c
++++ b/drivers/vfio/pci/vfio_pci_core.c
+@@ -1260,7 +1260,7 @@ static int vfio_pci_ioctl_get_pci_hot_reset_info(
+ struct vfio_pci_hot_reset_info hdr;
+ struct vfio_pci_fill_info fill = {};
+ bool slot = false;
+- int ret, count;
++ int ret, count = 0;
+
+ if (copy_from_user(&hdr, arg, minsz))
+ return -EFAULT;
+--
+2.43.0
+
--- /dev/null
+From 36c99593ef5931337d41da7919778d9813ae8357 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 3 Jul 2024 08:13:01 -0400
+Subject: vfs: don't mod negative dentry count when on shrinker list
+
+From: Brian Foster <bfoster@redhat.com>
+
+[ Upstream commit aabfe57ebaa75841db47ea59091ec3c5a06d2f52 ]
+
+The nr_dentry_negative counter is intended to only account negative
+dentries that are present on the superblock LRU. Therefore, the LRU
+add, remove and isolate helpers modify the counter based on whether
+the dentry is negative, but the shrinker list related helpers do not
+modify the counter, and the paths that change a dentry between
+positive and negative only do so if DCACHE_LRU_LIST is set.
+
+The problem with this is that a dentry on a shrinker list still has
+DCACHE_LRU_LIST set to indicate ->d_lru is in use. The additional
+DCACHE_SHRINK_LIST flag denotes whether the dentry is on LRU or a
+shrink related list. Therefore if a relevant operation (i.e. unlink)
+occurs while a dentry is present on a shrinker list, and the
+associated codepath only checks for DCACHE_LRU_LIST, then it is
+technically possible to modify the negative dentry count for a
+dentry that is off the LRU. Since the shrinker list related helpers
+do not modify the negative dentry count (because non-LRU dentries
+should not be included in the count) when the dentry is ultimately
+removed from the shrinker list, this can cause the negative dentry
+count to become permanently inaccurate.
+
+This problem can be reproduced via a heavy file create/unlink vs.
+drop_caches workload. On an 80xcpu system, I start 80 tasks each
+running a 1k file create/delete loop, and one task spinning on
+drop_caches. After 10 minutes or so of runtime, the idle/clean cache
+negative dentry count increases from somewhere in the range of 5-10
+entries to several hundred (and increasingly grows beyond
+nr_dentry_unused).
+
+Tweak the logic in the paths that turn a dentry negative or positive
+to filter out the case where the dentry is present on a shrink
+related list. This allows the above workload to maintain an accurate
+negative dentry count.
+
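+A compact sketch of the filter both paths are switched to (the flag
+values here are illustrative, not the real dcache.h definitions):
+
+  #include <stdbool.h>
+
+  #define DCACHE_LRU_LIST    0x1
+  #define DCACHE_SHRINK_LIST 0x2
+
+  /* Count only dentries whose ->d_lru sits on the superblock LRU itself. */
+  static bool counts_as_lru_negative(unsigned int flags)
+  {
+          return (flags & (DCACHE_LRU_LIST | DCACHE_SHRINK_LIST)) ==
+                 DCACHE_LRU_LIST;
+  }
+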
+Fixes: af0c9af1b3f6 ("fs/dcache: Track & report number of negative dentries")
+Signed-off-by: Brian Foster <bfoster@redhat.com>
+Link: https://lore.kernel.org/r/20240703121301.247680-1-bfoster@redhat.com
+Acked-by: Ian Kent <ikent@redhat.com>
+Reviewed-by: Josef Bacik <josef@toxicpanda.com>
+Reviewed-by: Waiman Long <longman@redhat.com>
+Signed-off-by: Christian Brauner <brauner@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/dcache.c | 12 +++++++++---
+ 1 file changed, 9 insertions(+), 3 deletions(-)
+
+diff --git a/fs/dcache.c b/fs/dcache.c
+index 407095188f83a..66515fbc9dd70 100644
+--- a/fs/dcache.c
++++ b/fs/dcache.c
+@@ -355,7 +355,11 @@ static inline void __d_clear_type_and_inode(struct dentry *dentry)
+ flags &= ~DCACHE_ENTRY_TYPE;
+ WRITE_ONCE(dentry->d_flags, flags);
+ dentry->d_inode = NULL;
+- if (flags & DCACHE_LRU_LIST)
++ /*
++ * The negative counter only tracks dentries on the LRU. Don't inc if
++ * d_lru is on another list.
++ */
++ if ((flags & (DCACHE_LRU_LIST|DCACHE_SHRINK_LIST)) == DCACHE_LRU_LIST)
+ this_cpu_inc(nr_dentry_negative);
+ }
+
+@@ -1844,9 +1848,11 @@ static void __d_instantiate(struct dentry *dentry, struct inode *inode)
+
+ spin_lock(&dentry->d_lock);
+ /*
+- * Decrement negative dentry count if it was in the LRU list.
++ * The negative counter only tracks dentries on the LRU. Don't dec if
++ * d_lru is on another list.
+ */
+- if (dentry->d_flags & DCACHE_LRU_LIST)
++ if ((dentry->d_flags &
++ (DCACHE_LRU_LIST|DCACHE_SHRINK_LIST)) == DCACHE_LRU_LIST)
+ this_cpu_dec(nr_dentry_negative);
+ hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry);
+ raw_write_seqcount_begin(&dentry->d_seq);
+--
+2.43.0
+