net patches from davem for 4.19
diff --git a/queue-4.19/tcp-ensure-dctcp-reacts-to-losses.patch b/queue-4.19/tcp-ensure-dctcp-reacts-to-losses.patch
new file mode 100644 (file)
index 0000000..e177bbd
--- /dev/null
@@ -0,0 +1,145 @@
+From b877af05cdab4a74ba52e6a8ef6efefc6d242a23 Mon Sep 17 00:00:00 2001
+From: Koen De Schepper <koen.de_schepper@nokia-bell-labs.com>
+Date: Thu, 4 Apr 2019 12:24:02 +0000
+Subject: tcp: Ensure DCTCP reacts to losses
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+[ Upstream commit aecfde23108b8e637d9f5c5e523b24fb97035dc3 ]
+
+RFC8257 §3.5 explicitly states that "A DCTCP sender MUST react to
+loss episodes in the same way as conventional TCP".
+
+Currently, Linux DCTCP performs no cwnd reduction when losses
+are encountered. Optionally, the dctcp_clamp_alpha_on_loss module
+parameter resets alpha to its maximal value if an RTO happens. This
+behavior is sub-optimal for at least two reasons: i) it ignores
+losses triggering fast retransmissions; and ii) it causes an
+unnecessarily large cwnd reduction in the future if the loss was
+isolated, as it resets the historical term of DCTCP's alpha EWMA
+to its maximal value (i.e., denoting total congestion). The second
+reason has an especially noticeable effect when using DCTCP in
+high-BDP environments, where alpha normally stays at low values.
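+
+For intuition, here is a stand-alone sketch (hypothetical user-space
+C, not part of this patch) of the arithmetic involved: tcp_dctcp.c
+keeps alpha in 10-bit fixed point (DCTCP_MAX_ALPHA == 1024), picks
+ssthresh on ECN feedback as cwnd - (cwnd * alpha >> 11), and decays
+alpha by alpha >> dctcp_shift_g (default g = 1/16) per observation
+window when no CE marks arrive. Once alpha is clamped to max, the
+next reductions stay close to a full halving for several RTTs:
+
+        #include <stdio.h>
+
+        #define DCTCP_MAX_ALPHA 1024U   /* alpha == 1.0 in 10-bit fixed point */
+        #define SHIFT_G         4U      /* EWMA gain g == 1/16 */
+
+        int main(void)
+        {
+                unsigned int alpha = DCTCP_MAX_ALPHA;   /* just clamped on loss */
+                unsigned int cwnd = 100;                /* segments */
+
+                for (int rtt = 0; rtt < 8; rtt++) {
+                        /* ssthresh DCTCP would pick on ECN feedback
+                         * this RTT: cwnd - cwnd * alpha / 2.
+                         */
+                        unsigned int ssthresh = cwnd - ((cwnd * alpha) >> 11);
+
+                        printf("rtt %d alpha %4u -> ssthresh %u\n",
+                               rtt, alpha, ssthresh);
+                        alpha -= alpha >> SHIFT_G;      /* decay, no CE marks */
+                }
+                return 0;
+        }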
+
+This patch replaces the clamping of alpha by setting ssthresh to
+half of cwnd for both fast retransmissions and RTOs, at most once
+per RTT. Consequently, the dctcp_clamp_alpha_on_loss module parameter
+has been removed.
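+
+As a stand-alone illustration of the new reaction and its
+once-per-RTT gating (hypothetical user-space C; the enum, struct,
+and function names are invented, see the hunks below for the actual
+kernel code):
+
+        #include <stdio.h>
+
+        enum ca_state { CA_OPEN, CA_RECOVERY, CA_LOSS };
+
+        struct conn {
+                unsigned int cwnd;
+                unsigned int ssthresh;
+                enum ca_state state;
+        };
+
+        /* Conventional-TCP loss reaction: ssthresh = max(cwnd / 2, 2). */
+        static void react_to_loss(struct conn *c)
+        {
+                unsigned int half = c->cwnd >> 1;
+
+                c->ssthresh = half > 2 ? half : 2;
+        }
+
+        static void set_state(struct conn *c, enum ca_state new_state)
+        {
+                /* Fast retransmit: react only when *entering* recovery,
+                 * so a burst of retransmissions in one RTT causes a
+                 * single adjustment; an RTO is reported separately via
+                 * the loss cwnd event.
+                 */
+                if (new_state == CA_RECOVERY && c->state != CA_RECOVERY)
+                        react_to_loss(c);
+                c->state = new_state;
+        }
+
+        int main(void)
+        {
+                struct conn c = { .cwnd = 100, .state = CA_OPEN };
+
+                set_state(&c, CA_RECOVERY); /* first loss: ssthresh = 50 */
+                set_state(&c, CA_RECOVERY); /* still in recovery: no-op */
+                printf("ssthresh %u\n", c.ssthresh);
+                return 0;
+        }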
+
+The table below shows experimental results where we measured the
+drop probability of a PIE AQM (not applying ECN marks) at a
+bottleneck in the presence of a single TCP flow with either the
+alpha-clamping option enabled or the cwnd halving proposed by this
+patch. Results using reno or cubic are given for comparison.
+
+                          |  Link   |   RTT    |    Drop
+                 TCP CC   |  speed  | base+AQM | probability
+        ==================|=========|==========|============
+                    CUBIC |  40Mbps |  7+20ms  |    0.21%
+                     RENO |         |          |    0.19%
+        DCTCP-CLAMP-ALPHA |         |          |   25.80%
+         DCTCP-HALVE-CWND |         |          |    0.22%
+        ------------------|---------|----------|------------
+                    CUBIC | 100Mbps |  7+20ms  |    0.03%
+                     RENO |         |          |    0.02%
+        DCTCP-CLAMP-ALPHA |         |          |   23.30%
+         DCTCP-HALVE-CWND |         |          |    0.04%
+        ------------------|---------|----------|------------
+                    CUBIC | 800Mbps |   1+1ms  |    0.04%
+                     RENO |         |          |    0.05%
+        DCTCP-CLAMP-ALPHA |         |          |   18.70%
+         DCTCP-HALVE-CWND |         |          |    0.06%
+
+We see that, without halving its cwnd for all sources of losses,
+DCTCP drives the AQM to large drop probabilities in order to keep
+the queue length under control (i.e., it repeatedly faces RTOs).
+Instead, if DCTCP reacts to all sources of losses, it can then be
+controlled by the AQM using drop levels similar to those of cubic
+or reno.
+
+Signed-off-by: Koen De Schepper <koen.de_schepper@nokia-bell-labs.com>
+Signed-off-by: Olivier Tilmans <olivier.tilmans@nokia-bell-labs.com>
+Cc: Bob Briscoe <research@bobbriscoe.net>
+Cc: Lawrence Brakmo <brakmo@fb.com>
+Cc: Florian Westphal <fw@strlen.de>
+Cc: Daniel Borkmann <borkmann@iogearbox.net>
+Cc: Yuchung Cheng <ycheng@google.com>
+Cc: Neal Cardwell <ncardwell@google.com>
+Cc: Eric Dumazet <edumazet@google.com>
+Cc: Andrew Shewmaker <agshew@gmail.com>
+Cc: Glenn Judd <glenn.judd@morganstanley.com>
+Acked-by: Florian Westphal <fw@strlen.de>
+Acked-by: Neal Cardwell <ncardwell@google.com>
+Acked-by: Daniel Borkmann <daniel@iogearbox.net>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/ipv4/tcp_dctcp.c | 36 ++++++++++++++++++------------------
+ 1 file changed, 18 insertions(+), 18 deletions(-)
+
+diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c
+index ca61e2a659e7..5205c5a5d8d5 100644
+--- a/net/ipv4/tcp_dctcp.c
++++ b/net/ipv4/tcp_dctcp.c
+@@ -66,11 +66,6 @@ static unsigned int dctcp_alpha_on_init __read_mostly = DCTCP_MAX_ALPHA;
+ module_param(dctcp_alpha_on_init, uint, 0644);
+ MODULE_PARM_DESC(dctcp_alpha_on_init, "parameter for initial alpha value");
+ 
+-static unsigned int dctcp_clamp_alpha_on_loss __read_mostly;
+-module_param(dctcp_clamp_alpha_on_loss, uint, 0644);
+-MODULE_PARM_DESC(dctcp_clamp_alpha_on_loss,
+-               "parameter for clamping alpha on loss");
+-
+ static struct tcp_congestion_ops dctcp_reno;
+ 
+ static void dctcp_reset(const struct tcp_sock *tp, struct dctcp *ca)
+@@ -211,21 +206,23 @@ static void dctcp_update_alpha(struct sock *sk, u32 flags)
+       }
+ }
+ 
+-static void dctcp_state(struct sock *sk, u8 new_state)
++static void dctcp_react_to_loss(struct sock *sk)
+ {
+-      if (dctcp_clamp_alpha_on_loss && new_state == TCP_CA_Loss) {
+-              struct dctcp *ca = inet_csk_ca(sk);
++      struct dctcp *ca = inet_csk_ca(sk);
++      struct tcp_sock *tp = tcp_sk(sk);
+ 
+-              /* If this extension is enabled, we clamp dctcp_alpha to
+-               * max on packet loss; the motivation is that dctcp_alpha
+-               * is an indicator to the extend of congestion and packet
+-               * loss is an indicator of extreme congestion; setting
+-               * this in practice turned out to be beneficial, and
+-               * effectively assumes total congestion which reduces the
+-               * window by half.
+-               */
+-              ca->dctcp_alpha = DCTCP_MAX_ALPHA;
+-      }
++      ca->loss_cwnd = tp->snd_cwnd;
++      tp->snd_ssthresh = max(tp->snd_cwnd >> 1U, 2U);
++}
++
++static void dctcp_state(struct sock *sk, u8 new_state)
++{
++      if (new_state == TCP_CA_Recovery &&
++          new_state != inet_csk(sk)->icsk_ca_state)
++              dctcp_react_to_loss(sk);
++      /* We handle RTO in dctcp_cwnd_event to ensure that we perform only
++       * one loss-adjustment per RTT.
++       */
+ }
+ 
+ static void dctcp_cwnd_event(struct sock *sk, enum tcp_ca_event ev)
+@@ -237,6 +234,9 @@ static void dctcp_cwnd_event(struct sock *sk, enum tcp_ca_event ev)
+       case CA_EVENT_ECN_NO_CE:
+               dctcp_ce_state_1_to_0(sk);
+               break;
++      case CA_EVENT_LOSS:
++              dctcp_react_to_loss(sk);
++              break;
+       default:
+               /* Don't care for the rest. */
+               break;
+-- 
+2.19.1
+