]>
Commit | Line | Data |
---|---|---|
a9fba688 SL |
1 | From b877af05cdab4a74ba52e6a8ef6efefc6d242a23 Mon Sep 17 00:00:00 2001 |
2 | From: Koen De Schepper <koen.de_schepper@nokia-bell-labs.com> | |
3 | Date: Thu, 4 Apr 2019 12:24:02 +0000 | |
4 | Subject: tcp: Ensure DCTCP reacts to losses | |
5 | MIME-Version: 1.0 | |
6 | Content-Type: text/plain; charset=UTF-8 | |
7 | Content-Transfer-Encoding: 8bit | |
8 | ||
9 | [ Upstream commit aecfde23108b8e637d9f5c5e523b24fb97035dc3 ] | |
10 | ||
11 | RFC8257 §3.5 explicitly states that "A DCTCP sender MUST react to | |
12 | loss episodes in the same way as conventional TCP". | |
13 | ||
14 | Currently, Linux DCTCP performs no cwnd reduction when losses | |
15 | are encountered. Optionally, the dctcp_clamp_alpha_on_loss resets | |
16 | alpha to its maximal value if a RTO happens. This behavior | |
17 | is sub-optimal for at least two reasons: i) it ignores losses | |
18 | triggering fast retransmissions; and ii) it causes unnecessary large | |
19 | cwnd reduction in the future if the loss was isolated as it resets | |
20 | the historical term of DCTCP's alpha EWMA to its maximal value (i.e., | |
21 | denoting a total congestion). The second reason has an especially | |
22 | noticeable effect when using DCTCP in high BDP environments, where | |
23 | alpha normally stays at low values. | |
24 | ||
25 | This patch replaces the clamping of alpha by setting ssthresh to | |
26 | half of cwnd for both fast retransmissions and RTOs, at most once | |
27 | per RTT. Consequently, the dctcp_clamp_alpha_on_loss module parameter | |
28 | has been removed. | |
29 | ||
30 | The table below shows experimental results where we measured the | |
31 | drop probability of a PIE AQM (not applying ECN marks) at a | |
32 | bottleneck in the presence of a single TCP flow with either the | |
33 | alpha-clamping option enabled or the cwnd halving proposed by this | |
34 | patch. Results using reno or cubic are given for comparison. | |
35 | ||
36 | | Link | RTT | Drop | |
37 | TCP CC | speed | base+AQM | probability | |
38 | ==================|=========|==========|============ | |
39 | CUBIC | 40Mbps | 7+20ms | 0.21% | |
40 | RENO | | | 0.19% | |
41 | DCTCP-CLAMP-ALPHA | | | 25.80% | |
42 | DCTCP-HALVE-CWND | | | 0.22% | |
43 | ------------------|---------|----------|------------ | |
44 | CUBIC | 100Mbps | 7+20ms | 0.03% | |
45 | RENO | | | 0.02% | |
46 | DCTCP-CLAMP-ALPHA | | | 23.30% | |
47 | DCTCP-HALVE-CWND | | | 0.04% | |
48 | ------------------|---------|----------|------------ | |
49 | CUBIC | 800Mbps | 1+1ms | 0.04% | |
50 | RENO | | | 0.05% | |
51 | DCTCP-CLAMP-ALPHA | | | 18.70% | |
52 | DCTCP-HALVE-CWND | | | 0.06% | |
53 | ||
54 | We see that, without halving its cwnd for all sources of losses, | |
55 | DCTCP drives the AQM to large drop probabilities in order to keep | |
56 | the queue length under control (i.e., it repeatedly faces RTOs). | |
57 | Instead, if DCTCP reacts to all sources of losses, it can then be | |
58 | controlled by the AQM using similar drop levels to cubic or reno. | |
59 | ||
60 | Signed-off-by: Koen De Schepper <koen.de_schepper@nokia-bell-labs.com> | |
61 | Signed-off-by: Olivier Tilmans <olivier.tilmans@nokia-bell-labs.com> | |
62 | Cc: Bob Briscoe <research@bobbriscoe.net> | |
63 | Cc: Lawrence Brakmo <brakmo@fb.com> | |
64 | Cc: Florian Westphal <fw@strlen.de> | |
65 | Cc: Daniel Borkmann <borkmann@iogearbox.net> | |
66 | Cc: Yuchung Cheng <ycheng@google.com> | |
67 | Cc: Neal Cardwell <ncardwell@google.com> | |
68 | Cc: Eric Dumazet <edumazet@google.com> | |
69 | Cc: Andrew Shewmaker <agshew@gmail.com> | |
70 | Cc: Glenn Judd <glenn.judd@morganstanley.com> | |
71 | Acked-by: Florian Westphal <fw@strlen.de> | |
72 | Acked-by: Neal Cardwell <ncardwell@google.com> | |
73 | Acked-by: Daniel Borkmann <daniel@iogearbox.net> | |
74 | Signed-off-by: David S. Miller <davem@davemloft.net> | |
75 | Signed-off-by: Sasha Levin <sashal@kernel.org> | |
76 | --- | |
77 | net/ipv4/tcp_dctcp.c | 36 ++++++++++++++++++------------------ | |
78 | 1 file changed, 18 insertions(+), 18 deletions(-) | |
79 | ||
80 | diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c | |
81 | index ca61e2a659e7..5205c5a5d8d5 100644 | |
82 | --- a/net/ipv4/tcp_dctcp.c | |
83 | +++ b/net/ipv4/tcp_dctcp.c | |
84 | @@ -66,11 +66,6 @@ static unsigned int dctcp_alpha_on_init __read_mostly = DCTCP_MAX_ALPHA; | |
85 | module_param(dctcp_alpha_on_init, uint, 0644); | |
86 | MODULE_PARM_DESC(dctcp_alpha_on_init, "parameter for initial alpha value"); | |
87 | ||
88 | -static unsigned int dctcp_clamp_alpha_on_loss __read_mostly; | |
89 | -module_param(dctcp_clamp_alpha_on_loss, uint, 0644); | |
90 | -MODULE_PARM_DESC(dctcp_clamp_alpha_on_loss, | |
91 | - "parameter for clamping alpha on loss"); | |
92 | - | |
93 | static struct tcp_congestion_ops dctcp_reno; | |
94 | ||
95 | static void dctcp_reset(const struct tcp_sock *tp, struct dctcp *ca) | |
96 | @@ -211,21 +206,23 @@ static void dctcp_update_alpha(struct sock *sk, u32 flags) | |
97 | } | |
98 | } | |
99 | ||
100 | -static void dctcp_state(struct sock *sk, u8 new_state) | |
101 | +static void dctcp_react_to_loss(struct sock *sk) | |
102 | { | |
103 | - if (dctcp_clamp_alpha_on_loss && new_state == TCP_CA_Loss) { | |
104 | - struct dctcp *ca = inet_csk_ca(sk); | |
105 | + struct dctcp *ca = inet_csk_ca(sk); | |
106 | + struct tcp_sock *tp = tcp_sk(sk); | |
107 | ||
108 | - /* If this extension is enabled, we clamp dctcp_alpha to | |
109 | - * max on packet loss; the motivation is that dctcp_alpha | |
110 | - * is an indicator to the extend of congestion and packet | |
111 | - * loss is an indicator of extreme congestion; setting | |
112 | - * this in practice turned out to be beneficial, and | |
113 | - * effectively assumes total congestion which reduces the | |
114 | - * window by half. | |
115 | - */ | |
116 | - ca->dctcp_alpha = DCTCP_MAX_ALPHA; | |
117 | - } | |
118 | + ca->loss_cwnd = tp->snd_cwnd; | |
119 | + tp->snd_ssthresh = max(tp->snd_cwnd >> 1U, 2U); | |
120 | +} | |
121 | + | |
122 | +static void dctcp_state(struct sock *sk, u8 new_state) | |
123 | +{ | |
124 | + if (new_state == TCP_CA_Recovery && | |
125 | + new_state != inet_csk(sk)->icsk_ca_state) | |
126 | + dctcp_react_to_loss(sk); | |
127 | + /* We handle RTO in dctcp_cwnd_event to ensure that we perform only | |
128 | + * one loss-adjustment per RTT. | |
129 | + */ | |
130 | } | |
131 | ||
132 | static void dctcp_cwnd_event(struct sock *sk, enum tcp_ca_event ev) | |
133 | @@ -237,6 +234,9 @@ static void dctcp_cwnd_event(struct sock *sk, enum tcp_ca_event ev) | |
134 | case CA_EVENT_ECN_NO_CE: | |
135 | dctcp_ce_state_1_to_0(sk); | |
136 | break; | |
137 | + case CA_EVENT_LOSS: | |
138 | + dctcp_react_to_loss(sk); | |
139 | + break; | |
140 | default: | |
141 | /* Don't care for the rest. */ | |
142 | break; | |
143 | -- | |
144 | 2.19.1 | |
145 |