git.ipfire.org Git - thirdparty/linux.git/commitdiff
tcp: implement RFC 7323 window retraction receiver requirements
author Simon Baatz <gmbnomis@gmail.com>
Mon, 9 Mar 2026 08:02:26 +0000 (09:02 +0100)
committer Jakub Kicinski <kuba@kernel.org>
Sat, 14 Mar 2026 15:01:49 +0000 (08:01 -0700)
By default, the Linux TCP implementation does not shrink the
advertised window (RFC 7323 calls this "window retraction") with the
following exceptions:

- When an incoming segment cannot be added due to the receive buffer
  running out of memory. Since commit 8c670bdfa58e ("tcp: correct
  handling of extreme memory squeeze") a zero window will be
  advertised in this case. It turns out that reaching the required
  memory pressure is easy when window scaling is in use. In the
  simplest case, sending a sufficient number of segments smaller than
  the scale factor to a receiver that does not read data is enough.

- Commit b650d953cd39 ("tcp: enforce receive buffer memory limits by
  allowing the tcp window to shrink") addressed the "eating memory"
  problem by introducing a sysctl knob that allows shrinking the
  window before running out of memory.

However, RFC 7323 does not only state that shrinking the window is
necessary in some cases, it also formulates requirements for TCP
implementations when doing so (Section 2.4).

This commit addresses the receiver-side requirements: After retracting
the window, the peer may have a snd_nxt that lies within a previously
advertised window but is now beyond the retracted window. This means
that all incoming segments (including pure ACKs) will be rejected
until the application happens to read enough data to let the peer's
snd_nxt be in window again (which may be never).

To comply with RFC 7323, the receiver MUST honor any segment that
would have been in window for any ACK sent by the receiver and, when
window scaling is in effect, SHOULD track the maximum window sequence
number it has advertised. This patch tracks that maximum window
sequence number rcv_mwnd_seq throughout the connection and uses it in
tcp_sequence() when deciding whether a segment is acceptable.

rcv_mwnd_seq is updated together with rcv_wup and rcv_wnd in
tcp_select_window(). If we count tcp_sequence() as fast path, it is
read in the fast path. Therefore, rcv_mwnd_seq is put into rcv_wnd's
cacheline group.

The logic for handling received data in tcp_data_queue() is already
sufficient and does not need to be updated.

Signed-off-by: Simon Baatz <gmbnomis@gmail.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20260309-tcp_rfc7323_retract_wnd_rfc-v3-1-4c7f96b1ec69@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Documentation/networking/net_cachelines/tcp_sock.rst
include/linux/tcp.h
include/net/tcp.h
net/ipv4/tcp.c
net/ipv4/tcp_fastopen.c
net/ipv4/tcp_input.c
net/ipv4/tcp_minisocks.c
net/ipv4/tcp_output.c
tools/testing/selftests/net/packetdrill/tcp_rcv_big_endseq.pkt

index 563daea10d6c5c074f004cb1b8574f5392157abb..fecf61166a54ee2f64bcef5312c81dcc4aa9a124 100644 (file)
@@ -121,6 +121,7 @@ u64                           delivered_mstamp        read_write
 u32                           rate_delivered                              read_mostly         tcp_rate_gen
 u32                           rate_interval_us                            read_mostly         rate_delivered,rate_app_limited
 u32                           rcv_wnd                 read_write          read_mostly         tcp_select_window,tcp_receive_window,tcp_fast_path_check
+u32                           rcv_mwnd_seq            read_write                              tcp_select_window
 u32                           write_seq               read_write                              tcp_rate_check_app_limited,tcp_write_queue_empty,tcp_skb_entail,forced_push,tcp_mark_push
 u32                           notsent_lowat           read_mostly                             tcp_stream_memory_free
 u32                           pushed_seq              read_write                              tcp_mark_push,forced_push
index bcebc4f07532f0e099d9b0751db7d3eedff2ee5d..6982f10e826b4004f210ea22c94f0488e52184d1 100644 (file)
@@ -316,6 +316,9 @@ struct tcp_sock {
                                        */
        u32     app_limited;    /* limited until "delivered" reaches this val */
        u32     rcv_wnd;        /* Current receiver window              */
+       u32     rcv_mwnd_seq;   /* Maximum window sequence number (RFC 7323,
+                                * section 2.4, receiver requirements)
+                                */
        u32     rcv_tstamp;     /* timestamp of last received ACK (for keepalives) */
 /*
  *      Options received (usually on last packet, some only on SYN packets).
index 48dffcca0a71b70d0c0fd49d89a66c4f6ae72a58..f87bdacb5a6995422851e88cfb65734702c84093 100644 (file)
@@ -934,6 +934,28 @@ static inline u32 tcp_receive_window(const struct tcp_sock *tp)
        return (u32) win;
 }
 
+/* Compute the maximum receive window we ever advertised.
+ * Rcv_nxt can be after the window if our peer push more data
+ * than the offered window.
+ */
+static inline u32 tcp_max_receive_window(const struct tcp_sock *tp)
+{
+       s32 win = tp->rcv_mwnd_seq - tp->rcv_nxt;
+
+       if (win < 0)
+               win = 0;
+       return (u32) win;
+}
+
+/* Check if we need to update the maximum receive window sequence number */
+static inline void tcp_update_max_rcv_wnd_seq(struct tcp_sock *tp)
+{
+       u32 wre = tp->rcv_wup + tp->rcv_wnd;
+
+       if (after(wre, tp->rcv_mwnd_seq))
+               tp->rcv_mwnd_seq = wre;
+}
+
 /* Choose a new window, without checks for shrinking, and without
  * scaling applied to the result.  The caller does these things
  * if necessary.  This is a "raw" window selection.
index ed6f6712f06076dc33af61947782bde436dde15e..516087c622ade78883ca41e4f883740e305035a0 100644 (file)
@@ -3561,6 +3561,7 @@ static int tcp_repair_set_window(struct tcp_sock *tp, sockptr_t optbuf, int len)
 
        tp->rcv_wnd     = opt.rcv_wnd;
        tp->rcv_wup     = opt.rcv_wup;
+       tp->rcv_mwnd_seq = opt.rcv_wup + opt.rcv_wnd;
 
        return 0;
 }
@@ -5275,6 +5276,7 @@ static void __init tcp_struct_check(void)
        CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, received_ecn_bytes);
        CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, app_limited);
        CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rcv_wnd);
+       CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rcv_mwnd_seq);
        CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rcv_tstamp);
        CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rx_opt);
 
index 9fdc19accafd23c6ab74bd82f7a7d82de1d60b90..4e389d609f919c17435509c5007bc3b2a13eac6c 100644 (file)
@@ -377,6 +377,7 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk,
 
        tcp_rsk(req)->rcv_nxt = tp->rcv_nxt;
        tp->rcv_wup = tp->rcv_nxt;
+       tp->rcv_mwnd_seq = tp->rcv_wup + tp->rcv_wnd;
        /* tcp_conn_request() is sending the SYNACK,
         * and queues the child into listener accept queue.
         */
index 71ac69b7b75e4919f69631a4894421fa4e417c95..2e1b237608150c2e9c9baf73cf047ed0823ca555 100644 (file)
@@ -4808,20 +4808,18 @@ static enum skb_drop_reason tcp_sequence(const struct sock *sk,
                                         const struct tcphdr *th)
 {
        const struct tcp_sock *tp = tcp_sk(sk);
-       u32 seq_limit;
 
        if (before(end_seq, tp->rcv_wup))
                return SKB_DROP_REASON_TCP_OLD_SEQUENCE;
 
-       seq_limit = tp->rcv_nxt + tcp_receive_window(tp);
-       if (unlikely(after(end_seq, seq_limit))) {
+       if (unlikely(after(end_seq, tp->rcv_nxt + tcp_max_receive_window(tp)))) {
                /* Some stacks are known to handle FIN incorrectly; allow the
                 * FIN to extend beyond the window and check it in detail later.
                 */
-               if (!after(end_seq - th->fin, seq_limit))
+               if (!after(end_seq - th->fin, tp->rcv_nxt + tcp_receive_window(tp)))
                        return SKB_NOT_DROPPED_YET;
 
-               if (after(seq, seq_limit))
+               if (after(seq, tp->rcv_nxt + tcp_max_receive_window(tp)))
                        return SKB_DROP_REASON_TCP_INVALID_SEQUENCE;
 
                /* Only accept this packet if receive queue is empty. */
@@ -6903,6 +6901,7 @@ consume:
                 */
                WRITE_ONCE(tp->rcv_nxt, TCP_SKB_CB(skb)->seq + 1);
                tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;
+               tp->rcv_mwnd_seq = tp->rcv_wup + tp->rcv_wnd;
 
                /* RFC1323: The window in SYN & SYN/ACK segments is
                 * never scaled.
@@ -7015,6 +7014,7 @@ consume:
                WRITE_ONCE(tp->rcv_nxt, TCP_SKB_CB(skb)->seq + 1);
                WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
                tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;
+               tp->rcv_mwnd_seq = tp->rcv_wup + tp->rcv_wnd;
 
                /* RFC1323: The window in SYN & SYN/ACK segments is
                 * never scaled.
index dafb63b923d0d08cb1a0e9a37d8ec025386a960a..d350d794a959720853ffd8937cfdc34c03e2ce30 100644 (file)
@@ -604,6 +604,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
        newtp->window_clamp = req->rsk_window_clamp;
        newtp->rcv_ssthresh = req->rsk_rcv_wnd;
        newtp->rcv_wnd = req->rsk_rcv_wnd;
+       newtp->rcv_mwnd_seq = newtp->rcv_wup + req->rsk_rcv_wnd;
        newtp->rx_opt.wscale_ok = ireq->wscale_ok;
        if (newtp->rx_opt.wscale_ok) {
                newtp->rx_opt.snd_wscale = ireq->snd_wscale;
index 34a25ef610060988c0c0350ca4b97a112f04ddcb..35c3b0ab5a0cb714155d5720fe56888f71aecced 100644 (file)
@@ -293,6 +293,7 @@ static u16 tcp_select_window(struct sock *sk)
                tp->pred_flags = 0;
                tp->rcv_wnd = 0;
                tp->rcv_wup = tp->rcv_nxt;
+               tcp_update_max_rcv_wnd_seq(tp);
                return 0;
        }
 
@@ -316,6 +317,7 @@ static u16 tcp_select_window(struct sock *sk)
 
        tp->rcv_wnd = new_win;
        tp->rcv_wup = tp->rcv_nxt;
+       tcp_update_max_rcv_wnd_seq(tp);
 
        /* Make sure we do not exceed the maximum possible
         * scaled window.
@@ -4165,6 +4167,7 @@ static void tcp_connect_init(struct sock *sk)
        else
                tp->rcv_tstamp = tcp_jiffies32;
        tp->rcv_wup = tp->rcv_nxt;
+       tp->rcv_mwnd_seq = tp->rcv_nxt + tp->rcv_wnd;
        WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
 
        inet_csk(sk)->icsk_rto = tcp_timeout_init(sk);
index 6c0f32c40f19be2a750fc9d69bbf64250cd7b525..12882be10f2e0cf19e6bc7bd2479b27c11ce8ac0 100644 (file)
@@ -36,7 +36,7 @@
 
   +0 read(4, ..., 100000) = 4000
 
-// If queue is empty, accept a packet even if its end_seq is above wup + rcv_wnd
+// If queue is empty, accept a packet even if its end_seq is above rcv_mwnd_seq
   +0 < P. 4001:54001(50000) ack 1 win 257
    * > .  1:1(0) ack 54001 win 0