git.ipfire.org Git - thirdparty/kernel/linux.git/commitdiff
net: sched: avoid qdisc_reset_all_tx_gt() vs dequeue race for lockless qdiscs
author: Koichiro Den <den@valinux.co.jp>
Sat, 28 Feb 2026 14:53:07 +0000 (23:53 +0900)
committer: Jakub Kicinski <kuba@kernel.org>
Thu, 5 Mar 2026 01:43:45 +0000 (17:43 -0800)
When shrinking the number of real tx queues,
netif_set_real_num_tx_queues() calls qdisc_reset_all_tx_gt() to flush
qdiscs for queues which will no longer be used.

qdisc_reset_all_tx_gt() currently serializes qdisc_reset() with
qdisc_lock(). However, for lockless qdiscs, the dequeue path is
serialized by qdisc_run_begin/end() using qdisc->seqlock instead, so
qdisc_reset() can run concurrently with __qdisc_run() and free skbs
while they are still being dequeued, leading to UAF.

This can easily be reproduced on e.g. virtio-net by imposing heavy
traffic while frequently changing the number of queue pairs:

  iperf3 -ub0 -c $peer -t 0 &
  while :; do
    ethtool -L eth0 combined 1
    ethtool -L eth0 combined 2
  done

With KASAN enabled, this leads to reports like:

  BUG: KASAN: slab-use-after-free in __qdisc_run+0x133f/0x1760
  ...
  Call Trace:
   <TASK>
   ...
   __qdisc_run+0x133f/0x1760
   __dev_queue_xmit+0x248f/0x3550
   ip_finish_output2+0xa42/0x2110
   ip_output+0x1a7/0x410
   ip_send_skb+0x2e6/0x480
   udp_send_skb+0xb0a/0x1590
   udp_sendmsg+0x13c9/0x1fc0
   ...
   </TASK>

  Allocated by task 1270 on cpu 5 at 44.558414s:
   ...
   alloc_skb_with_frags+0x84/0x7c0
   sock_alloc_send_pskb+0x69a/0x830
   __ip_append_data+0x1b86/0x48c0
   ip_make_skb+0x1e8/0x2b0
   udp_sendmsg+0x13a6/0x1fc0
   ...

  Freed by task 1306 on cpu 3 at 44.558445s:
   ...
   kmem_cache_free+0x117/0x5e0
   pfifo_fast_reset+0x14d/0x580
   qdisc_reset+0x9e/0x5f0
   netif_set_real_num_tx_queues+0x303/0x840
   virtnet_set_channels+0x1bf/0x260 [virtio_net]
   ethnl_set_channels+0x684/0xae0
   ethnl_default_set_doit+0x31a/0x890
   ...

Serialize qdisc_reset_all_tx_gt() against the lockless dequeue path by
taking qdisc->seqlock for TCQ_F_NOLOCK qdiscs, matching the
serialization model already used by dev_reset_queue().

Additionally clear the __QDISC_STATE_MISSED and __QDISC_STATE_DRAINING
bits after reset so the qdisc state reflects an empty queue, avoiding
needless re-scheduling.

Fixes: 6b3ba9146fe6 ("net: sched: allow qdiscs to handle locking")
Signed-off-by: Koichiro Den <den@valinux.co.jp>
Link: https://patch.msgid.link/20260228145307.3955532-1-den@valinux.co.jp
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
include/net/sch_generic.h

index c3a7268b567e0abf3f38290cd4e3fa7cd0601e36..d5d55cb21686dd6f5cba4b03039e047316969f5e 100644 (file)
@@ -778,13 +778,23 @@ static inline bool skb_skip_tc_classify(struct sk_buff *skb)
 static inline void qdisc_reset_all_tx_gt(struct net_device *dev, unsigned int i)
 {
        struct Qdisc *qdisc;
+       bool nolock;
 
        for (; i < dev->num_tx_queues; i++) {
                qdisc = rtnl_dereference(netdev_get_tx_queue(dev, i)->qdisc);
                if (qdisc) {
+                       nolock = qdisc->flags & TCQ_F_NOLOCK;
+
+                       if (nolock)
+                               spin_lock_bh(&qdisc->seqlock);
                        spin_lock_bh(qdisc_lock(qdisc));
                        qdisc_reset(qdisc);
                        spin_unlock_bh(qdisc_lock(qdisc));
+                       if (nolock) {
+                               clear_bit(__QDISC_STATE_MISSED, &qdisc->state);
+                               clear_bit(__QDISC_STATE_DRAINING, &qdisc->state);
+                               spin_unlock_bh(&qdisc->seqlock);
+                       }
                }
        }
 }