From: Eric Dumazet
Date: Wed, 21 Jan 2026 09:59:22 +0000 (+0000)
Subject: tcp: move tcp_rate_gen to tcp_input.c
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=b814bdcecd7990d85d42e19cff6ce0c12f146330;p=thirdparty%2Fkernel%2Flinux.git

tcp: move tcp_rate_gen to tcp_input.c

This function is called from a single caller, in the TCP fast path.
Move it to tcp_input.c so that the compiler can inline it.

$ scripts/bloat-o-meter -t vmlinux.old vmlinux.new
add/remove: 0/2 grow/shrink: 1/0 up/down: 226/-300 (-74)
Function                 old     new   delta
tcp_ack                 5405    5631    +226
__pfx_tcp_rate_gen        16       -     -16
tcp_rate_gen             284       -    -284
Total: Before=22566536, After=22566462, chg -0.00%

Signed-off-by: Eric Dumazet
Reviewed-by: Neal Cardwell
Link: https://patch.msgid.link/20260121095923.3134639-2-edumazet@google.com
Signed-off-by: Jakub Kicinski
---

diff --git a/include/net/tcp.h b/include/net/tcp.h
index b38327606454..9345f1757169 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1356,8 +1356,6 @@ static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event)
 void tcp_set_ca_state(struct sock *sk, const u8 ca_state);
 
 /* From tcp_rate.c */
-void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost,
-		  bool is_sack_reneg, struct rate_sample *rs);
 void tcp_rate_check_app_limited(struct sock *sk);
 
 static inline bool tcp_skb_sent_after(u64 t1, u64 t2, u32 seq1, u32 seq2)
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index dc8e256321b0..9e91ddbc6253 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1637,6 +1637,116 @@ static u8 tcp_sacktag_one(struct sock *sk,
 	return sacked;
 }
 
+/* The bandwidth estimator estimates the rate at which the network
+ * can currently deliver outbound data packets for this flow. At a high
+ * level, it operates by taking a delivery rate sample for each ACK.
+ *
+ * A rate sample records the rate at which the network delivered packets
+ * for this flow, calculated over the time interval between the transmission
+ * of a data packet and the acknowledgment of that packet.
+ *
+ * Specifically, over the interval between each transmit and corresponding ACK,
+ * the estimator generates a delivery rate sample. Typically it uses the rate
+ * at which packets were acknowledged. However, the approach of using only the
+ * acknowledgment rate faces a challenge under the prevalent ACK decimation or
+ * compression: packets can temporarily appear to be delivered much quicker
+ * than the bottleneck rate. Since it is physically impossible to do that in a
+ * sustained fashion, when the estimator notices that the ACK rate is faster
+ * than the transmit rate, it uses the latter:
+ *
+ *	send_rate = #pkts_delivered/(last_snd_time - first_snd_time)
+ *	ack_rate  = #pkts_delivered/(last_ack_time - first_ack_time)
+ *	bw = min(send_rate, ack_rate)
+ *
+ * Notice the estimator essentially estimates the goodput, not always the
+ * network bottleneck link rate when the sending or receiving is limited by
+ * other factors like applications or receiver window limits. The estimator
+ * deliberately avoids using the inter-packet spacing approach because that
+ * approach requires a large number of samples and sophisticated filtering.
+ *
+ * TCP flows can often be application-limited in request/response workloads.
+ * The estimator marks a bandwidth sample as application-limited if there
+ * was some moment during the sampled window of packets when there was no data
+ * ready to send in the write queue.
+ */
+
+/* Update the connection delivery information and generate a rate sample. */
+static void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost,
+			 bool is_sack_reneg, struct rate_sample *rs)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	u32 snd_us, ack_us;
+
+	/* Clear app limited if bubble is acked and gone. */
+	if (tp->app_limited && after(tp->delivered, tp->app_limited))
+		tp->app_limited = 0;
+
+	/* TODO: there are multiple places throughout tcp_ack() to get
+	 * current time. Refactor the code using a new "tcp_acktag_state"
+	 * to carry current time, flags, stats like "tcp_sacktag_state".
+	 */
+	if (delivered)
+		tp->delivered_mstamp = tp->tcp_mstamp;
+
+	rs->acked_sacked = delivered;	/* freshly ACKed or SACKed */
+	rs->losses = lost;		/* freshly marked lost */
+	/* Return an invalid sample if no timing information is available or
+	 * in recovery from loss with SACK reneging. Rate samples taken during
+	 * a SACK reneging event may overestimate bw by including packets that
+	 * were SACKed before the reneg.
+	 */
+	if (!rs->prior_mstamp || is_sack_reneg) {
+		rs->delivered = -1;
+		rs->interval_us = -1;
+		return;
+	}
+	rs->delivered = tp->delivered - rs->prior_delivered;
+
+	rs->delivered_ce = tp->delivered_ce - rs->prior_delivered_ce;
+	/* delivered_ce occupies less than 32 bits in the skb control block */
+	rs->delivered_ce &= TCPCB_DELIVERED_CE_MASK;
+
+	/* Model sending data and receiving ACKs as separate pipeline phases
+	 * for a window. Usually the ACK phase is longer, but with ACK
+	 * compression the send phase can be longer. To be safe we use the
+	 * longer phase.
+	 */
+	snd_us = rs->interval_us;			/* send phase */
+	ack_us = tcp_stamp_us_delta(tp->tcp_mstamp,
+				    rs->prior_mstamp); /* ack phase */
+	rs->interval_us = max(snd_us, ack_us);
+
+	/* Record both segment send and ack receive intervals */
+	rs->snd_interval_us = snd_us;
+	rs->rcv_interval_us = ack_us;
+
+	/* Normally we expect interval_us >= min-rtt.
+	 * Note that rate may still be over-estimated when a spuriously
+	 * retransmitted skb was first (s)acked because "interval_us"
+	 * is under-estimated (up to an RTT). However continuously
+	 * measuring the delivery rate during loss recovery is crucial
+	 * for connections suffering heavy or prolonged losses.
+	 */
+	if (unlikely(rs->interval_us < tcp_min_rtt(tp))) {
+		if (!rs->is_retrans)
+			pr_debug("tcp rate: %ld %d %u %u %u\n",
+				 rs->interval_us, rs->delivered,
+				 inet_csk(sk)->icsk_ca_state,
+				 tp->rx_opt.sack_ok, tcp_min_rtt(tp));
+		rs->interval_us = -1;
+		return;
+	}
+
+	/* Record the last non-app-limited or the highest app-limited bw */
+	if (!rs->is_app_limited ||
+	    ((u64)rs->delivered * tp->rate_interval_us >=
+	     (u64)tp->rate_delivered * rs->interval_us)) {
+		tp->rate_delivered = rs->delivered;
+		tp->rate_interval_us = rs->interval_us;
+		tp->rate_app_limited = rs->is_app_limited;
+	}
+}
+
 /* When an skb is sacked or acked, we fill in the rate sample with the (prior)
  * delivery information when the skb was last transmitted.
  *
diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c
index f0f2ef377043..272806ba3b4e 100644
--- a/net/ipv4/tcp_rate.c
+++ b/net/ipv4/tcp_rate.c
@@ -1,116 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0-only
 #include <net/tcp.h>
 
-/* The bandwidth estimator estimates the rate at which the network
- * can currently deliver outbound data packets for this flow. At a high
- * level, it operates by taking a delivery rate sample for each ACK.
- *
- * A rate sample records the rate at which the network delivered packets
- * for this flow, calculated over the time interval between the transmission
- * of a data packet and the acknowledgment of that packet.
- *
- * Specifically, over the interval between each transmit and corresponding ACK,
- * the estimator generates a delivery rate sample. Typically it uses the rate
- * at which packets were acknowledged. However, the approach of using only the
- * acknowledgment rate faces a challenge under the prevalent ACK decimation or
- * compression: packets can temporarily appear to be delivered much quicker
- * than the bottleneck rate. Since it is physically impossible to do that in a
- * sustained fashion, when the estimator notices that the ACK rate is faster
- * than the transmit rate, it uses the latter:
- *
- *	send_rate = #pkts_delivered/(last_snd_time - first_snd_time)
- *	ack_rate  = #pkts_delivered/(last_ack_time - first_ack_time)
- *	bw = min(send_rate, ack_rate)
- *
- * Notice the estimator essentially estimates the goodput, not always the
- * network bottleneck link rate when the sending or receiving is limited by
- * other factors like applications or receiver window limits. The estimator
- * deliberately avoids using the inter-packet spacing approach because that
- * approach requires a large number of samples and sophisticated filtering.
- *
- * TCP flows can often be application-limited in request/response workloads.
- * The estimator marks a bandwidth sample as application-limited if there
- * was some moment during the sampled window of packets when there was no data
- * ready to send in the write queue.
- */
-
-/* Update the connection delivery information and generate a rate sample. */
-void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost,
-		  bool is_sack_reneg, struct rate_sample *rs)
-{
-	struct tcp_sock *tp = tcp_sk(sk);
-	u32 snd_us, ack_us;
-
-	/* Clear app limited if bubble is acked and gone. */
-	if (tp->app_limited && after(tp->delivered, tp->app_limited))
-		tp->app_limited = 0;
-
-	/* TODO: there are multiple places throughout tcp_ack() to get
-	 * current time. Refactor the code using a new "tcp_acktag_state"
-	 * to carry current time, flags, stats like "tcp_sacktag_state".
-	 */
-	if (delivered)
-		tp->delivered_mstamp = tp->tcp_mstamp;
-
-	rs->acked_sacked = delivered;	/* freshly ACKed or SACKed */
-	rs->losses = lost;		/* freshly marked lost */
-	/* Return an invalid sample if no timing information is available or
-	 * in recovery from loss with SACK reneging. Rate samples taken during
-	 * a SACK reneging event may overestimate bw by including packets that
-	 * were SACKed before the reneg.
-	 */
-	if (!rs->prior_mstamp || is_sack_reneg) {
-		rs->delivered = -1;
-		rs->interval_us = -1;
-		return;
-	}
-	rs->delivered = tp->delivered - rs->prior_delivered;
-
-	rs->delivered_ce = tp->delivered_ce - rs->prior_delivered_ce;
-	/* delivered_ce occupies less than 32 bits in the skb control block */
-	rs->delivered_ce &= TCPCB_DELIVERED_CE_MASK;
-
-	/* Model sending data and receiving ACKs as separate pipeline phases
-	 * for a window. Usually the ACK phase is longer, but with ACK
-	 * compression the send phase can be longer. To be safe we use the
-	 * longer phase.
- */
-	snd_us = rs->interval_us;			/* send phase */
-	ack_us = tcp_stamp_us_delta(tp->tcp_mstamp,
-				    rs->prior_mstamp); /* ack phase */
-	rs->interval_us = max(snd_us, ack_us);
-
-	/* Record both segment send and ack receive intervals */
-	rs->snd_interval_us = snd_us;
-	rs->rcv_interval_us = ack_us;
-
-	/* Normally we expect interval_us >= min-rtt.
-	 * Note that rate may still be over-estimated when a spuriously
-	 * retransmistted skb was first (s)acked because "interval_us"
-	 * is under-estimated (up to an RTT). However continuously
-	 * measuring the delivery rate during loss recovery is crucial
-	 * for connections suffer heavy or prolonged losses.
-	 */
-	if (unlikely(rs->interval_us < tcp_min_rtt(tp))) {
-		if (!rs->is_retrans)
-			pr_debug("tcp rate: %ld %d %u %u %u\n",
-				 rs->interval_us, rs->delivered,
-				 inet_csk(sk)->icsk_ca_state,
-				 tp->rx_opt.sack_ok, tcp_min_rtt(tp));
-		rs->interval_us = -1;
-		return;
-	}
-
-	/* Record the last non-app-limited or the highest app-limited bw */
-	if (!rs->is_app_limited ||
-	    ((u64)rs->delivered * tp->rate_interval_us >=
-	     (u64)tp->rate_delivered * rs->interval_us)) {
-		tp->rate_delivered = rs->delivered;
-		tp->rate_interval_us = rs->interval_us;
-		tp->rate_app_limited = rs->is_app_limited;
-	}
-}
-
 /* If a gap is detected between sends, mark the socket application-limited. */
 void tcp_rate_check_app_limited(struct sock *sk)
 {
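
For readers outside the kernel tree: the heart of the moved comment block is the
bw = min(send_rate, ack_rate) rule, which is equivalent to dividing the delivered
count by the longer of the send and ACK phases — exactly what the
max(snd_us, ack_us) line in tcp_rate_gen() implements. The standalone C sketch
below illustrates only that arithmetic; demo_rate_sample and demo_delivery_rate()
are hypothetical names invented for this illustration, not kernel APIs, and the
struct merely mirrors the snd_interval_us/rcv_interval_us fields of the kernel's
struct rate_sample, assuming microsecond timestamps.

#include <stdint.h>
#include <stdio.h>

/* Hypothetical stand-in for the interval fields of struct rate_sample. */
struct demo_rate_sample {
	uint32_t delivered;        /* packets (S)ACKed over the interval */
	uint64_t snd_interval_us;  /* last_snd_time - first_snd_time */
	uint64_t rcv_interval_us;  /* last_ack_time - first_ack_time */
};

/* Delivery rate in packets per second, or 0 for an unusable sample.
 * Dividing by the max of the two intervals is the same as taking
 * min(send_rate, ack_rate): it guards against ACK compression making
 * the path look faster than the bottleneck.
 */
static uint64_t demo_delivery_rate(const struct demo_rate_sample *rs)
{
	uint64_t interval_us = rs->snd_interval_us > rs->rcv_interval_us ?
			       rs->snd_interval_us : rs->rcv_interval_us;

	if (!interval_us || !rs->delivered)
		return 0;
	return (uint64_t)rs->delivered * 1000000 / interval_us;
}

int main(void)
{
	/* 100 packets sent over 10 ms but ACKed in a compressed 2 ms burst. */
	struct demo_rate_sample rs = {
		.delivered = 100,
		.snd_interval_us = 10000,
		.rcv_interval_us = 2000,
	};

	printf("estimated rate: %llu pkts/s\n",
	       (unsigned long long)demo_delivery_rate(&rs));
	return 0;
}

Working the example through: 100 packets over max(10000, 2000) us yields
10,000 pkts/s; trusting the 2 ms ACK burst alone would have reported
50,000 pkts/s, the ACK-compression overestimate the comment block warns about.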