net: dev_queue_xmit() llist adoption

author Eric Dumazet <edumazet@google.com>

Tue, 14 Oct 2025 17:19:07 +0000 (17:19 +0000)

committer Jakub Kicinski <kuba@kernel.org>

Thu, 16 Oct 2025 23:25:10 +0000 (16:25 -0700)
author Eric Dumazet <edumazet@google.com>
Tue, 14 Oct 2025 17:19:07 +0000 (17:19 +0000)
committer Jakub Kicinski <kuba@kernel.org>
Thu, 16 Oct 2025 23:25:10 +0000 (16:25 -0700)
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h

index 31561291bc92fd70d4d3ca8f5f7dbc4c94c895a0..94966692ccdf51db085c236319705aecba8c30cf 100644 (file)
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -115,7 +115,9 @@ struct Qdisc {
         struct Qdisc            *next_sched;
         struct sk_buff_head     skb_bad_txq;
  
-       spinlock_t              busylock ____cacheline_aligned_in_smp;
+       atomic_long_t           defer_count ____cacheline_aligned_in_smp;
+       struct llist_head       defer_list;
+
         spinlock_t              seqlock;
  
         struct rcu_head         rcu;
diff --git a/net/core/dev.c b/net/core/dev.c

index 1d8e7a76d83b6780631ce64b1b6ce9db41cdbb13..821e7c718924405d0a7c10e41f677b98aa2d070b 100644 (file)
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4125,9 +4125,10 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
                                  struct net_device *dev,
                                  struct netdev_queue *txq)
  {
+       struct sk_buff *next, *to_free = NULL;
         spinlock_t *root_lock = qdisc_lock(q);
-       struct sk_buff *to_free = NULL;
-       bool contended;
+       struct llist_node *ll_list, *first_n;
+       unsigned long defer_count = 0;
         int rc;
  
         qdisc_calculate_pkt_len(skb, q);
@@ -4167,61 +4168,81 @@ no_lock_out:
                 return rc;
         }
  
-       /*
-        * Heuristic to force contended enqueues to serialize on a
-        * separate lock before trying to get qdisc main lock.
-        * This permits qdisc->running owner to get the lock more
-        * often and dequeue packets faster.
-        * On PREEMPT_RT it is possible to preempt the qdisc owner during xmit
-        * and then other tasks will only enqueue packets. The packets will be
-        * sent after the qdisc owner is scheduled again. To prevent this
-        * scenario the task always serialize on the lock.
+       /* Open code llist_add(&skb->ll_node, &q->defer_list) + queue limit.
+        * In the try_cmpxchg() loop, we want to increment q->defer_count
+        * at most once to limit the number of skbs in defer_list.
+        * We perform the defer_count increment only if the list is not empty,
+        * because some arches have slow atomic_long_inc_return().
+        */
+       first_n = READ_ONCE(q->defer_list.first);
+       do {
+               if (first_n && !defer_count) {
+                       defer_count = atomic_long_inc_return(&q->defer_count);
+                       if (unlikely(defer_count > q->limit)) {
+                               kfree_skb_reason(skb, SKB_DROP_REASON_QDISC_DROP);
+                               return NET_XMIT_DROP;
+                       }
+               }
+               skb->ll_node.next = first_n;
+       } while (!try_cmpxchg(&q->defer_list.first, &first_n, &skb->ll_node));
+
+       /* If defer_list was not empty, we know the cpu which queued
+        * the first skb will process the whole list for us.
          */
-       contended = qdisc_is_running(q) || IS_ENABLED(CONFIG_PREEMPT_RT);
-       if (unlikely(contended))
-               spin_lock(&q->busylock);
+       if (first_n)
+               return NET_XMIT_SUCCESS;
  
         spin_lock(root_lock);
+
+       ll_list = llist_del_all(&q->defer_list);
+       /* There is a small race because defer_count is not cleared atomically
+        * with the prior llist_del_all(). This means defer_list could grow
+        * over q->limit.
+        */
+       atomic_long_set(&q->defer_count, 0);
+
+       ll_list = llist_reverse_order(ll_list);
+
         if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
-               __qdisc_drop(skb, &to_free);
+               llist_for_each_entry_safe(skb, next, ll_list, ll_node)
+                       __qdisc_drop(skb, &to_free);
                 rc = NET_XMIT_DROP;
-       } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
-                  qdisc_run_begin(q)) {
+               goto unlock;
+       }
+       if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
+           !llist_next(ll_list) && qdisc_run_begin(q)) {
                 /*
                  * This is a work-conserving queue; there are no old skbs
                  * waiting to be sent out; and the qdisc is not running -
                  * xmit the skb directly.
                  */
  
+               DEBUG_NET_WARN_ON_ONCE(skb != llist_entry(ll_list,
+                                                         struct sk_buff,
+                                                         ll_node));
                 qdisc_bstats_update(q, skb);
-
-               if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
-                       if (unlikely(contended)) {
-                               spin_unlock(&q->busylock);
-                               contended = false;
-                       }
+               if (sch_direct_xmit(skb, q, dev, txq, root_lock, true))
                         __qdisc_run(q);
-               }
-
                 qdisc_run_end(q);
                 rc = NET_XMIT_SUCCESS;
         } else {
-               rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
-               if (qdisc_run_begin(q)) {
-                       if (unlikely(contended)) {
-                               spin_unlock(&q->busylock);
-                               contended = false;
-                       }
-                       __qdisc_run(q);
-                       qdisc_run_end(q);
+               int count = 0;
+
+               llist_for_each_entry_safe(skb, next, ll_list, ll_node) {
+                       prefetch(next);
+                       skb_mark_not_on_list(skb);
+                       rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
+                       count++;
                 }
+               qdisc_run(q);
+               if (count != 1)
+                       rc = NET_XMIT_SUCCESS;
         }
+unlock:
         spin_unlock(root_lock);
         if (unlikely(to_free))
                 kfree_skb_list_reason(to_free,
                                       tcf_get_drop_reason(to_free));
-       if (unlikely(contended))
-               spin_unlock(&q->busylock);
         return rc;
  }
  
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c

index dfa8e8e667d24a435b0c9cb3c1f05c8075f63e89..d9a98d02a55fc361a223f3201e37b6a2b698bb5e 100644 (file)
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -666,7 +666,6 @@ struct Qdisc noop_qdisc = {
         .ops            =       &noop_qdisc_ops,
         .q.lock         =       __SPIN_LOCK_UNLOCKED(noop_qdisc.q.lock),
         .dev_queue      =       &noop_netdev_queue,
-       .busylock       =       __SPIN_LOCK_UNLOCKED(noop_qdisc.busylock),
         .gso_skb = {
                 .next = (struct sk_buff *)&noop_qdisc.gso_skb,
                 .prev = (struct sk_buff *)&noop_qdisc.gso_skb,
@@ -970,10 +969,6 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
                 }
         }
  
-       spin_lock_init(&sch->busylock);
-       lockdep_set_class(&sch->busylock,
-                         dev->qdisc_tx_busylock ?: &qdisc_tx_busylock);
-
         /* seqlock has the same scope of busylock, for NOLOCK qdisc */
         spin_lock_init(&sch->seqlock);
         lockdep_set_class(&sch->seqlock,
author	Eric Dumazet <edumazet@google.com>
	Tue, 14 Oct 2025 17:19:07 +0000 (17:19 +0000)
committer	Jakub Kicinski <kuba@kernel.org>
	Thu, 16 Oct 2025 23:25:10 +0000 (16:25 -0700)
include/net/sch_generic.h		patch \| blob \| blame \| history
net/core/dev.c		patch \| blob \| blame \| history
net/sched/sch_generic.c		patch \| blob \| blame \| history