]> git.ipfire.org Git - thirdparty/kernel/stable-queue.git/blame - releases/4.19.35/tcp-ensure-dctcp-reacts-to-losses.patch
Linux 4.19.35
[thirdparty/kernel/stable-queue.git] / releases / 4.19.35 / tcp-ensure-dctcp-reacts-to-losses.patch
CommitLineData
a9fba688
SL
1From b877af05cdab4a74ba52e6a8ef6efefc6d242a23 Mon Sep 17 00:00:00 2001
2From: Koen De Schepper <koen.de_schepper@nokia-bell-labs.com>
3Date: Thu, 4 Apr 2019 12:24:02 +0000
4Subject: tcp: Ensure DCTCP reacts to losses
5MIME-Version: 1.0
6Content-Type: text/plain; charset=UTF-8
7Content-Transfer-Encoding: 8bit
8
9[ Upstream commit aecfde23108b8e637d9f5c5e523b24fb97035dc3 ]
10
11RFC8257 §3.5 explicitly states that "A DCTCP sender MUST react to
12loss episodes in the same way as conventional TCP".
13
14Currently, Linux DCTCP performs no cwnd reduction when losses
15are encountered. Optionally, the dctcp_clamp_alpha_on_loss resets
16alpha to its maximal value if a RTO happens. This behavior
17is sub-optimal for at least two reasons: i) it ignores losses
18triggering fast retransmissions; and ii) it causes unnecessarily large
19cwnd reduction in the future if the loss was isolated as it resets
20the historical term of DCTCP's alpha EWMA to its maximal value (i.e.,
21denoting a total congestion). The second reason has an especially
22noticeable effect when using DCTCP in high BDP environments, where
23alpha normally stays at low values.
24
25This patch replaces the clamping of alpha by setting ssthresh to
26half of cwnd for both fast retransmissions and RTOs, at most once
27per RTT. Consequently, the dctcp_clamp_alpha_on_loss module parameter
28has been removed.
29
30The table below shows experimental results where we measured the
31drop probability of a PIE AQM (not applying ECN marks) at a
32bottleneck in the presence of a single TCP flow with either the
33alpha-clamping option enabled or the cwnd halving proposed by this
34patch. Results using reno or cubic are given for comparison.
35
36 | Link | RTT | Drop
37 TCP CC | speed | base+AQM | probability
38 ==================|=========|==========|============
39 CUBIC | 40Mbps | 7+20ms | 0.21%
40 RENO | | | 0.19%
41 DCTCP-CLAMP-ALPHA | | | 25.80%
42 DCTCP-HALVE-CWND | | | 0.22%
43 ------------------|---------|----------|------------
44 CUBIC | 100Mbps | 7+20ms | 0.03%
45 RENO | | | 0.02%
46 DCTCP-CLAMP-ALPHA | | | 23.30%
47 DCTCP-HALVE-CWND | | | 0.04%
48 ------------------|---------|----------|------------
49 CUBIC | 800Mbps | 1+1ms | 0.04%
50 RENO | | | 0.05%
51 DCTCP-CLAMP-ALPHA | | | 18.70%
52 DCTCP-HALVE-CWND | | | 0.06%
53
54We see that, without halving its cwnd for all sources of loss,
55DCTCP drives the AQM to large drop probabilities in order to keep
56the queue length under control (i.e., it repeatedly faces RTOs).
57Instead, if DCTCP reacts to all sources of loss, it can then be
58controlled by the AQM using drop levels similar to those of cubic or reno.
59
60Signed-off-by: Koen De Schepper <koen.de_schepper@nokia-bell-labs.com>
61Signed-off-by: Olivier Tilmans <olivier.tilmans@nokia-bell-labs.com>
62Cc: Bob Briscoe <research@bobbriscoe.net>
63Cc: Lawrence Brakmo <brakmo@fb.com>
64Cc: Florian Westphal <fw@strlen.de>
65Cc: Daniel Borkmann <borkmann@iogearbox.net>
66Cc: Yuchung Cheng <ycheng@google.com>
67Cc: Neal Cardwell <ncardwell@google.com>
68Cc: Eric Dumazet <edumazet@google.com>
69Cc: Andrew Shewmaker <agshew@gmail.com>
70Cc: Glenn Judd <glenn.judd@morganstanley.com>
71Acked-by: Florian Westphal <fw@strlen.de>
72Acked-by: Neal Cardwell <ncardwell@google.com>
73Acked-by: Daniel Borkmann <daniel@iogearbox.net>
74Signed-off-by: David S. Miller <davem@davemloft.net>
75Signed-off-by: Sasha Levin <sashal@kernel.org>
76---
77 net/ipv4/tcp_dctcp.c | 36 ++++++++++++++++++------------------
78 1 file changed, 18 insertions(+), 18 deletions(-)
79
80diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c
81index ca61e2a659e7..5205c5a5d8d5 100644
82--- a/net/ipv4/tcp_dctcp.c
83+++ b/net/ipv4/tcp_dctcp.c
84@@ -66,11 +66,6 @@ static unsigned int dctcp_alpha_on_init __read_mostly = DCTCP_MAX_ALPHA;
85 module_param(dctcp_alpha_on_init, uint, 0644);
86 MODULE_PARM_DESC(dctcp_alpha_on_init, "parameter for initial alpha value");
87
88-static unsigned int dctcp_clamp_alpha_on_loss __read_mostly;
89-module_param(dctcp_clamp_alpha_on_loss, uint, 0644);
90-MODULE_PARM_DESC(dctcp_clamp_alpha_on_loss,
91- "parameter for clamping alpha on loss");
92-
93 static struct tcp_congestion_ops dctcp_reno;
94
95 static void dctcp_reset(const struct tcp_sock *tp, struct dctcp *ca)
96@@ -211,21 +206,23 @@ static void dctcp_update_alpha(struct sock *sk, u32 flags)
97 }
98 }
99
100-static void dctcp_state(struct sock *sk, u8 new_state)
101+static void dctcp_react_to_loss(struct sock *sk)
102 {
103- if (dctcp_clamp_alpha_on_loss && new_state == TCP_CA_Loss) {
104- struct dctcp *ca = inet_csk_ca(sk);
105+ struct dctcp *ca = inet_csk_ca(sk);
106+ struct tcp_sock *tp = tcp_sk(sk);
107
108- /* If this extension is enabled, we clamp dctcp_alpha to
109- * max on packet loss; the motivation is that dctcp_alpha
110- * is an indicator to the extend of congestion and packet
111- * loss is an indicator of extreme congestion; setting
112- * this in practice turned out to be beneficial, and
113- * effectively assumes total congestion which reduces the
114- * window by half.
115- */
116- ca->dctcp_alpha = DCTCP_MAX_ALPHA;
117- }
118+ ca->loss_cwnd = tp->snd_cwnd;
119+ tp->snd_ssthresh = max(tp->snd_cwnd >> 1U, 2U);
120+}
121+
122+static void dctcp_state(struct sock *sk, u8 new_state)
123+{
124+ if (new_state == TCP_CA_Recovery &&
125+ new_state != inet_csk(sk)->icsk_ca_state)
126+ dctcp_react_to_loss(sk);
127+ /* We handle RTO in dctcp_cwnd_event to ensure that we perform only
128+ * one loss-adjustment per RTT.
129+ */
130 }
131
132 static void dctcp_cwnd_event(struct sock *sk, enum tcp_ca_event ev)
133@@ -237,6 +234,9 @@ static void dctcp_cwnd_event(struct sock *sk, enum tcp_ca_event ev)
134 case CA_EVENT_ECN_NO_CE:
135 dctcp_ce_state_1_to_0(sk);
136 break;
137+ case CA_EVENT_LOSS:
138+ dctcp_react_to_loss(sk);
139+ break;
140 default:
141 /* Don't care for the rest. */
142 break;
143--
1442.19.1
145