git.ipfire.org Git - thirdparty/kernel/stable.git/commitdiff
tcp: fix tcp_tso_should_defer() vs large RTT
author: Eric Dumazet <edumazet@google.com>
Sat, 11 Oct 2025 11:57:42 +0000 (11:57 +0000)
committer: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Wed, 29 Oct 2025 13:03:08 +0000 (14:03 +0100)
[ Upstream commit 295ce1eb36ae47dc862d6c8a1012618a25516208 ]

Neal reported that using neper tcp_stream with TCP_TX_DELAY
set to 50ms would often lead to flows stuck in a small cwnd mode,
regardless of the congestion control.

While tcp_stream sets TCP_TX_DELAY too late after the connect(),
it highlighted two kernel bugs.

The following heuristic in tcp_tso_should_defer() seems wrong
for large RTT:

delta = tp->tcp_clock_cache - head->tstamp;
/* If next ACK is likely to come too late (half srtt), do not defer */
if ((s64)(delta - (u64)NSEC_PER_USEC * (tp->srtt_us >> 4)) < 0)
      goto send_now;

If next ACK is expected to come in more than 1 ms, we should
not defer because we prefer a smooth ACK clocking.

While the blamed commit was a step in the right direction, it was not
generic enough.

Another patch fixing TCP_TX_DELAY for established flows
will be proposed when net-next reopens.

Fixes: 50c8339e9299 ("tcp: tso: restore IW10 after TSO autosizing")
Reported-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Neal Cardwell <ncardwell@google.com>
Tested-by: Neal Cardwell <ncardwell@google.com>
Link: https://patch.msgid.link/20251011115742.1245771-1-edumazet@google.com
[pabeni@redhat.com: fixed whitespace issue]
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
net/ipv4/tcp_output.c

index 3a66d0c7d015cfbd38f5719235b026ad3262fe67..dd63832c11fd7d566b31ef47ee34d56324918779 100644 (file)
@@ -2180,7 +2180,8 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
                                 u32 max_segs)
 {
        const struct inet_connection_sock *icsk = inet_csk(sk);
-       u32 send_win, cong_win, limit, in_flight;
+       u32 send_win, cong_win, limit, in_flight, threshold;
+       u64 srtt_in_ns, expected_ack, how_far_is_the_ack;
        struct tcp_sock *tp = tcp_sk(sk);
        struct sk_buff *head;
        int win_divisor;
@@ -2242,9 +2243,19 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
        head = tcp_rtx_queue_head(sk);
        if (!head)
                goto send_now;
-       delta = tp->tcp_clock_cache - head->tstamp;
-       /* If next ACK is likely to come too late (half srtt), do not defer */
-       if ((s64)(delta - (u64)NSEC_PER_USEC * (tp->srtt_us >> 4)) < 0)
+
+       srtt_in_ns = (u64)(NSEC_PER_USEC >> 3) * tp->srtt_us;
+       /* When is the ACK expected ? */
+       expected_ack = head->tstamp + srtt_in_ns;
+       /* How far from now is the ACK expected ? */
+       how_far_is_the_ack = expected_ack - tp->tcp_clock_cache;
+
+       /* If next ACK is likely to come too late,
+        * ie in more than min(1ms, half srtt), do not defer.
+        */
+       threshold = min(srtt_in_ns >> 1, NSEC_PER_MSEC);
+
+       if ((s64)(how_far_is_the_ack - threshold) > 0)
                goto send_now;
 
        /* Ok, it looks like it is advisable to defer.