--- /dev/null
+From b877af05cdab4a74ba52e6a8ef6efefc6d242a23 Mon Sep 17 00:00:00 2001
+From: Koen De Schepper <koen.de_schepper@nokia-bell-labs.com>
+Date: Thu, 4 Apr 2019 12:24:02 +0000
+Subject: tcp: Ensure DCTCP reacts to losses
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+[ Upstream commit aecfde23108b8e637d9f5c5e523b24fb97035dc3 ]
+
+RFC8257 §3.5 explicitly states that "A DCTCP sender MUST react to
+loss episodes in the same way as conventional TCP".
+
+Currently, Linux DCTCP performs no cwnd reduction when losses
+are encountered. Optionally, the dctcp_clamp_alpha_on_loss module
+parameter resets alpha to its maximal value if an RTO happens. This behavior
+is sub-optimal for at least two reasons: i) it ignores losses
+triggering fast retransmissions; and ii) it causes unnecessarily large
+cwnd reductions in the future if the loss was isolated as it resets
+the historical term of DCTCP's alpha EWMA to its maximal value (i.e.,
+denoting a total congestion). The second reason has an especially
+noticeable effect when using DCTCP in high BDP environments, where
+alpha normally stays at low values.
+
+This patch replaces the clamping of alpha by setting ssthresh to
+half of cwnd for both fast retransmissions and RTOs, at most once
+per RTT. Consequently, the dctcp_clamp_alpha_on_loss module parameter
+has been removed.
+
+The table below shows experimental results where we measured the
+drop probability of a PIE AQM (not applying ECN marks) at a
+bottleneck in the presence of a single TCP flow with either the
+alpha-clamping option enabled or the cwnd halving proposed by this
+patch. Results using reno or cubic are given for comparison.
+
+                   | Link    |   RTT    |    Drop
+ TCP CC            | speed   | base+AQM | probability
+ ==================|=========|==========|============
+ CUBIC             |  40Mbps |  7+20ms  |    0.21%
+ RENO              |         |          |    0.19%
+ DCTCP-CLAMP-ALPHA |         |          |   25.80%
+ DCTCP-HALVE-CWND  |         |          |    0.22%
+ ------------------|---------|----------|------------
+ CUBIC             | 100Mbps |  7+20ms  |    0.03%
+ RENO              |         |          |    0.02%
+ DCTCP-CLAMP-ALPHA |         |          |   23.30%
+ DCTCP-HALVE-CWND  |         |          |    0.04%
+ ------------------|---------|----------|------------
+ CUBIC             | 800Mbps |  1+1ms   |    0.04%
+ RENO              |         |          |    0.05%
+ DCTCP-CLAMP-ALPHA |         |          |   18.70%
+ DCTCP-HALVE-CWND  |         |          |    0.06%
+
+We see that, without halving its cwnd for all sources of loss,
+DCTCP drives the AQM to large drop probabilities in order to keep
+the queue length under control (i.e., it repeatedly faces RTOs).
+Instead, if DCTCP reacts to all sources of loss, it can then be
+controlled by the AQM using drop levels similar to those of cubic
+or reno.
+
+Signed-off-by: Koen De Schepper <koen.de_schepper@nokia-bell-labs.com>
+Signed-off-by: Olivier Tilmans <olivier.tilmans@nokia-bell-labs.com>
+Cc: Bob Briscoe <research@bobbriscoe.net>
+Cc: Lawrence Brakmo <brakmo@fb.com>
+Cc: Florian Westphal <fw@strlen.de>
+Cc: Daniel Borkmann <borkmann@iogearbox.net>
+Cc: Yuchung Cheng <ycheng@google.com>
+Cc: Neal Cardwell <ncardwell@google.com>
+Cc: Eric Dumazet <edumazet@google.com>
+Cc: Andrew Shewmaker <agshew@gmail.com>
+Cc: Glenn Judd <glenn.judd@morganstanley.com>
+Acked-by: Florian Westphal <fw@strlen.de>
+Acked-by: Neal Cardwell <ncardwell@google.com>
+Acked-by: Daniel Borkmann <daniel@iogearbox.net>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/ipv4/tcp_dctcp.c | 36 ++++++++++++++++++------------------
+ 1 file changed, 18 insertions(+), 18 deletions(-)
+
+diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c
+index ca61e2a659e7..5205c5a5d8d5 100644
+--- a/net/ipv4/tcp_dctcp.c
++++ b/net/ipv4/tcp_dctcp.c
+@@ -66,11 +66,6 @@ static unsigned int dctcp_alpha_on_init __read_mostly = DCTCP_MAX_ALPHA;
+ module_param(dctcp_alpha_on_init, uint, 0644);
+ MODULE_PARM_DESC(dctcp_alpha_on_init, "parameter for initial alpha value");
+
+-static unsigned int dctcp_clamp_alpha_on_loss __read_mostly;
+-module_param(dctcp_clamp_alpha_on_loss, uint, 0644);
+-MODULE_PARM_DESC(dctcp_clamp_alpha_on_loss,
+- "parameter for clamping alpha on loss");
+-
+ static struct tcp_congestion_ops dctcp_reno;
+
+ static void dctcp_reset(const struct tcp_sock *tp, struct dctcp *ca)
+@@ -211,21 +206,23 @@ static void dctcp_update_alpha(struct sock *sk, u32 flags)
+ }
+ }
+
+-static void dctcp_state(struct sock *sk, u8 new_state)
++static void dctcp_react_to_loss(struct sock *sk)
+ {
+- if (dctcp_clamp_alpha_on_loss && new_state == TCP_CA_Loss) {
+- struct dctcp *ca = inet_csk_ca(sk);
++ struct dctcp *ca = inet_csk_ca(sk);
++ struct tcp_sock *tp = tcp_sk(sk);
+
+- /* If this extension is enabled, we clamp dctcp_alpha to
+- * max on packet loss; the motivation is that dctcp_alpha
+- * is an indicator to the extend of congestion and packet
+- * loss is an indicator of extreme congestion; setting
+- * this in practice turned out to be beneficial, and
+- * effectively assumes total congestion which reduces the
+- * window by half.
+- */
+- ca->dctcp_alpha = DCTCP_MAX_ALPHA;
+- }
++ ca->loss_cwnd = tp->snd_cwnd;
++ tp->snd_ssthresh = max(tp->snd_cwnd >> 1U, 2U);
++}
++
++static void dctcp_state(struct sock *sk, u8 new_state)
++{
++ if (new_state == TCP_CA_Recovery &&
++ new_state != inet_csk(sk)->icsk_ca_state)
++ dctcp_react_to_loss(sk);
++ /* We handle RTO in dctcp_cwnd_event to ensure that we perform only
++ * one loss-adjustment per RTT.
++ */
+ }
+
+ static void dctcp_cwnd_event(struct sock *sk, enum tcp_ca_event ev)
+@@ -237,6 +234,9 @@ static void dctcp_cwnd_event(struct sock *sk, enum tcp_ca_event ev)
+ case CA_EVENT_ECN_NO_CE:
+ dctcp_ce_state_1_to_0(sk);
+ break;
++ case CA_EVENT_LOSS:
++ dctcp_react_to_loss(sk);
++ break;
+ default:
+ /* Don't care for the rest. */
+ break;
+--
+2.19.1
+