git.ipfire.org Git - thirdparty/kernel/stable.git/commitdiff
tcp: avoid premature drops in tcp_add_backlog()
author: Eric Dumazet <edumazet@google.com>
Tue, 23 Apr 2024 12:56:20 +0000 (12:56 +0000)
committer: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Thu, 30 May 2024 07:49:16 +0000 (09:49 +0200)
[ Upstream commit ec00ed472bdb7d0af840da68c8c11bff9f4d9caa ]

While testing TCP performance with the latest trees,
I saw suspicious SOCKET_BACKLOG drops.

tcp_add_backlog() computes its limit with:

    limit = (u32)READ_ONCE(sk->sk_rcvbuf) +
            (u32)(READ_ONCE(sk->sk_sndbuf) >> 1);
    limit += 64 * 1024;

This does not take into account that sk->sk_backlog.len
is reset only at the very end of __release_sock().

Both sk->sk_backlog.len and sk->sk_rmem_alloc could reach
sk_rcvbuf in normal conditions.

We should double the sk->sk_rcvbuf contribution in the formula
to absorb bubbles in the backlog, which happen more often
for very fast flows.

This change maintains decent protection against abuses.

Fixes: c377411f2494 ("net: sk_add_backlog() take rmem_alloc into account")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Link: https://lore.kernel.org/r/20240423125620.3309458-1-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
net/ipv4/tcp_ipv4.c

index 68a065c0e5081f07373076c7c37c5f942710504f..abd47159d7e4d216045fe4ac3d9e098145ca85d5 100644 (file)
@@ -2000,7 +2000,7 @@ int tcp_v4_early_demux(struct sk_buff *skb)
 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
                     enum skb_drop_reason *reason)
 {
-       u32 limit, tail_gso_size, tail_gso_segs;
+       u32 tail_gso_size, tail_gso_segs;
        struct skb_shared_info *shinfo;
        const struct tcphdr *th;
        struct tcphdr *thtail;
@@ -2009,6 +2009,7 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
        bool fragstolen;
        u32 gso_segs;
        u32 gso_size;
+       u64 limit;
        int delta;
 
        /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
@@ -2106,7 +2107,13 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
        __skb_push(skb, hdrlen);
 
 no_coalesce:
-       limit = (u32)READ_ONCE(sk->sk_rcvbuf) + (u32)(READ_ONCE(sk->sk_sndbuf) >> 1);
+       /* sk->sk_backlog.len is reset only at the end of __release_sock().
+        * Both sk->sk_backlog.len and sk->sk_rmem_alloc could reach
+        * sk_rcvbuf in normal conditions.
+        */
+       limit = ((u64)READ_ONCE(sk->sk_rcvbuf)) << 1;
+
+       limit += ((u32)READ_ONCE(sk->sk_sndbuf)) >> 1;
 
        /* Only socket owner can try to collapse/prune rx queues
         * to reduce memory overhead, so add a little headroom here.
@@ -2114,6 +2121,8 @@ no_coalesce:
         */
        limit += 64 * 1024;
 
+       limit = min_t(u64, limit, UINT_MAX);
+
        if (unlikely(sk_add_backlog(sk, skb, limit))) {
                bh_unlock_sock(sk);
                *reason = SKB_DROP_REASON_SOCKET_BACKLOG;