This is a follow-up to commit 726e9e8b94b9 ("tcp: refine
skb->ooo_okay setting") and to the prior commit in this series
("net: control skb->ooo_okay from skb_set_owner_w()").
skb->ooo_okay might never be set for bulk flows that always
have at least one skb in a qdisc queue or NIC queue,
especially if TX completion is delayed because of a stressed cpu.
The so-called "strange attractors" have caused many performance
issues (see for instance commit 9b462d02d6dd ("tcp: TCP Small Queues
and strange attractors")); we need to do better.
We have tried very hard to avoid reorders because TCP did not
handle them well a decade ago.
Use the new net.core.txq_reselection_ms sysctl to let
flows follow XPS and select a more efficient queue.
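For example, to have a flow's TX queue re-evaluated against XPS once
its cached mapping is more than one second old (assuming the sysctl
defaults to 0, i.e. reselection disabled; it is per netns, since
sk_tx_queue_get() reads it via sock_net(sk)):

# force re-evaluation of TX queue mappings older than 1000 ms
sysctl -w net.core.txq_reselection_ms=1000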
After this patch, we no longer have to make sure threads are
pinned to cpus; they can now be migrated without adding too much
spinlock/qdisc/TX completion pressure.
The TX completion part was problematic, because it added false sharing
on various socket fields, and also false sharing and spinlock
contention in mm layers. Calling skb_orphan() from ndo_start_xmit()
is unfortunately not an option.
Notes for later:
1) Move sk->sk_tx_queue_mapping closer to sk_tx_queue_mapping_jiffies
   for better cache locality.
2) Study if commit 9b462d02d6dd ("tcp: TCP Small Queues and strange
   attractors") could be revised.
Tested:
Used a host with 32 TX queues, shared by groups of 8 cores.
XPS setup:
echo ff >/sys/class/net/eth1/queues/tx-0/xps_cpus
echo ff00 >/sys/class/net/eth1/queues/tx-1/xps_cpus
echo ff0000 >/sys/class/net/eth1/queues/tx-2/xps_cpus
echo ff000000 >/sys/class/net/eth1/queues/tx-3/xps_cpus
echo ff,00000000 >/sys/class/net/eth1/queues/tx-4/xps_cpus
echo ff00,00000000 >/sys/class/net/eth1/queues/tx-5/xps_cpus
echo ff0000,00000000 >/sys/class/net/eth1/queues/tx-6/xps_cpus
echo ff000000,00000000 >/sys/class/net/eth1/queues/tx-7/xps_cpus
...
Launched tcp_stream with 15 threads and 1000 flows, initially affined to cores 0-15:
taskset -c 0-15 tcp_stream -T15 -F1000 -l1000 -c -H target_host
Checked that only queues 0 and 1 are used, as instructed by XPS:
tc -s qdisc show dev eth1|grep backlog|grep -v "backlog 0b 0p"
backlog 123489410b 1890p
backlog 69809026b 1064p
backlog 52401054b 805p
Then forced each thread to run on cpus 1,9,17,25,33,41,49,57,65,73,81,89,97,105,113,121:
C=1;PID=`pidof tcp_stream`;for P in `ls /proc/$PID/task`; do taskset -pc $C $P; C=$(($C + 8));done
Set txq_reselection_ms to 1000
echo 1000 > /proc/sys/net/core/txq_reselection_ms
Checked that the flows have migrated nicely:
tc -s qdisc show dev eth1|grep backlog|grep -v "backlog 0b 0p"
backlog 130508314b 1916p
backlog 8584380b 126p
backlog 8584380b 126p
backlog 8379990b 123p
backlog 8584380b 126p
backlog 8487484b 125p
backlog 8584380b 126p
backlog 8448120b 124p
backlog 8584380b 126p
backlog 8720640b 128p
backlog 8856900b 130p
backlog 8584380b 126p
backlog 8652510b 127p
backlog 8448120b 124p
backlog 8516250b 125p
backlog 7834950b 115p
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>
Link: https://patch.msgid.link/20251013152234.842065-5-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
* @sk_bind_phc: SO_TIMESTAMPING bind PHC index of PTP virtual clock
* for timestamping
* @sk_tskey: counter to disambiguate concurrent tstamp requests
+ * @sk_tx_queue_mapping_jiffies: time in jiffies of last @sk_tx_queue_mapping refresh.
* @sk_zckey: counter to order MSG_ZEROCOPY notifications
* @sk_socket: Identd and reporting IO signals
* @sk_user_data: RPC layer private data. Write-protected by @sk_callback_lock.
unsigned long sk_pacing_rate; /* bytes per second */
atomic_t sk_zckey;
atomic_t sk_tskey;
+ unsigned long sk_tx_queue_mapping_jiffies;
__cacheline_group_end(sock_write_tx);
__cacheline_group_begin(sock_read_tx);
/* Paired with READ_ONCE() in sk_tx_queue_get() and
 * other WRITE_ONCE() because socket lock might not be held.
 */
- WRITE_ONCE(sk->sk_tx_queue_mapping, tx_queue);
+ if (READ_ONCE(sk->sk_tx_queue_mapping) != tx_queue) {
+ WRITE_ONCE(sk->sk_tx_queue_mapping, tx_queue);
+ WRITE_ONCE(sk->sk_tx_queue_mapping_jiffies, jiffies);
+ return;
+ }
+
+ /* Refresh sk_tx_queue_mapping_jiffies if too old. */
+ if (time_is_before_jiffies(READ_ONCE(sk->sk_tx_queue_mapping_jiffies) + HZ))
+ WRITE_ONCE(sk->sk_tx_queue_mapping_jiffies, jiffies);
}
#define NO_QUEUE_MAPPING USHRT_MAX
WRITE_ONCE(sk->sk_tx_queue_mapping, NO_QUEUE_MAPPING);
}
-static inline int sk_tx_queue_get(const struct sock *sk)
-{
- if (sk) {
- /* Paired with WRITE_ONCE() in sk_tx_queue_clear()
- * and sk_tx_queue_set().
- */
- int val = READ_ONCE(sk->sk_tx_queue_mapping);
-
- if (val != NO_QUEUE_MAPPING)
- return val;
- }
- return -1;
-}
+int sk_tx_queue_get(const struct sock *sk);
static inline void __sk_rx_queue_set(struct sock *sk,
const struct sk_buff *skb,
}
EXPORT_SYMBOL(dev_pick_tx_zero);
+int sk_tx_queue_get(const struct sock *sk)
+{
+ int resel, val;
+
+ if (!sk)
+ return -1;
+ /* Paired with WRITE_ONCE() in sk_tx_queue_clear()
+ * and sk_tx_queue_set().
+ */
+ val = READ_ONCE(sk->sk_tx_queue_mapping);
+
+ if (val == NO_QUEUE_MAPPING)
+ return -1;
+
+ if (!sk_fullsock(sk))
+ return val;
+
+ resel = READ_ONCE(sock_net(sk)->core.sysctl_txq_reselection);
+ if (resel && time_is_before_jiffies(
+ READ_ONCE(sk->sk_tx_queue_mapping_jiffies) + resel))
+ return -1;
+
+ return val;
+}
+EXPORT_SYMBOL(sk_tx_queue_get);
+
u16 netdev_pick_tx(struct net_device *dev, struct sk_buff *skb,
struct net_device *sb_dev)
{
if (new_index < 0)
new_index = skb_tx_hash(dev, sb_dev, skb);
- if (queue_index != new_index && sk &&
- sk_fullsock(sk) &&
+ if (sk && sk_fullsock(sk) &&
rcu_access_pointer(sk->sk_dst_cache))
sk_tx_queue_set(sk, new_index);