]> git.ipfire.org Git - thirdparty/kernel/stable.git/commitdiff
net: xsk: update tx queue consumer immediately after transmission
authorJason Xing <kernelxing@tencent.com>
Thu, 3 Jul 2025 14:17:11 +0000 (22:17 +0800)
committerJakub Kicinski <kuba@kernel.org>
Wed, 9 Jul 2025 01:28:07 +0000 (18:28 -0700)
For AF_XDP, the return value of the sendto() syscall doesn't reflect how many
descs were handled in the kernel. One use case is a user-space application
that wants to know the number of transmitted skbs and then decide whether to
continue sending — say, was it stopped due to the max tx budget?

The following formula can be used after sending to learn how many
skbs/descs the kernel took care of:

  tx_queue.consumers_before - tx_queue.consumers_after

Prior to this patch, in non-zc mode, the consumer of the tx queue was
not immediately updated at the end of each sendto syscall when an error
occurred, which left the consumer value out of date from the perspective
of user space. This patch therefore performs a store operation to propagate
the cached value to the shared value to handle the problem.

Besides the explicit errors appearing in the while() loop in
__xsk_generic_xmit(), there are a few possible error cases that might
be neglected in the following call trace:
__xsk_generic_xmit()
    xskq_cons_peek_desc()
        xskq_cons_read_desc()
    xskq_cons_is_valid_desc()
These will also cause a premature exit from the while() loop even if not
all the descs have been consumed.

Based on the above analysis, using @sent_frame covers all the possible
cases where the global consumer state might otherwise end up out of date
after __xsk_generic_xmit() finishes.

The patch also adds a common helper, __xsk_tx_release(), to stay aligned
with the zero-copy-mode usage in xsk_tx_release().

Signed-off-by: Jason Xing <kernelxing@tencent.com>
Acked-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Acked-by: Stanislav Fomichev <sdf@fomichev.me>
Link: https://patch.msgid.link/20250703141712.33190-2-kerneljasonxing@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
net/xdp/xsk.c

index 72c000c0ae5f57e6b710deeb74717ff965d9f8f1..bd61b0bc9c2420f7b79dc550252e7628219198da 100644 (file)
@@ -300,6 +300,13 @@ static bool xsk_tx_writeable(struct xdp_sock *xs)
        return true;
 }
 
+static void __xsk_tx_release(struct xdp_sock *xs)
+{
+       __xskq_cons_release(xs->tx);
+       if (xsk_tx_writeable(xs))
+               xs->sk.sk_write_space(&xs->sk);
+}
+
 static bool xsk_is_bound(struct xdp_sock *xs)
 {
        if (READ_ONCE(xs->state) == XSK_BOUND) {
@@ -407,11 +414,8 @@ void xsk_tx_release(struct xsk_buff_pool *pool)
        struct xdp_sock *xs;
 
        rcu_read_lock();
-       list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
-               __xskq_cons_release(xs->tx);
-               if (xsk_tx_writeable(xs))
-                       xs->sk.sk_write_space(&xs->sk);
-       }
+       list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list)
+               __xsk_tx_release(xs);
        rcu_read_unlock();
 }
 EXPORT_SYMBOL(xsk_tx_release);
@@ -858,8 +862,7 @@ static int __xsk_generic_xmit(struct sock *sk)
 
 out:
        if (sent_frame)
-               if (xsk_tx_writeable(xs))
-                       sk->sk_write_space(sk);
+               __xsk_tx_release(xs);
 
        mutex_unlock(&xs->mutex);
        return err;