--- /dev/null
+From foo@baz Mon 17 Jun 2019 06:56:41 PM CEST
+From: Eric Dumazet <edumazet@google.com>
+Date: Sat, 15 Jun 2019 17:44:24 -0700
+Subject: tcp: add tcp_min_snd_mss sysctl
+
+From: Eric Dumazet <edumazet@google.com>
+
+commit 5f3e2bf008c2221478101ee72f5cb4654b9fc363 upstream.
+
+Some TCP peers announce a very small MSS option in their SYN and/or
+SYN/ACK messages.
+
+This forces the stack to send packets with a very high network/cpu
+overhead.
+
+Linux has enforced a minimal value of 48. Since this value includes
+the size of TCP options, and the options can consume up to 40 bytes,
+each segment may carry as little as 8 bytes of payload.
+
+In some cases, it can be useful to increase the minimal value
+to a saner value.
+
+We still keep the default at 48 (TCP_MIN_SND_MSS), for compatibility
+reasons.
+
+Note that TCP_MAXSEG socket option enforces a minimal value
+of (TCP_MIN_MSS). David Miller increased this minimal value
+in commit c39508d6f118 ("tcp: Make TCP_MAXSEG minimum more correct.")
+from 64 to 88.
+
+We might in the future merge TCP_MIN_SND_MSS and TCP_MIN_MSS.
+
+CVE-2019-11479 -- tcp mss hardcoded to 48
+
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Suggested-by: Jonathan Looney <jtl@netflix.com>
+Acked-by: Neal Cardwell <ncardwell@google.com>
+Cc: Yuchung Cheng <ycheng@google.com>
+Cc: Tyler Hicks <tyhicks@canonical.com>
+Cc: Bruce Curtis <brucec@netflix.com>
+Cc: Jonathan Lemon <jonathan.lemon@gmail.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ Documentation/networking/ip-sysctl.txt | 8 ++++++++
+ include/net/netns/ipv4.h | 1 +
+ net/ipv4/sysctl_net_ipv4.c | 11 +++++++++++
+ net/ipv4/tcp_ipv4.c | 1 +
+ net/ipv4/tcp_output.c | 3 +--
+ 5 files changed, 22 insertions(+), 2 deletions(-)
+
+--- a/Documentation/networking/ip-sysctl.txt
++++ b/Documentation/networking/ip-sysctl.txt
+@@ -241,6 +241,14 @@ tcp_base_mss - INTEGER
+ Path MTU discovery (MTU probing). If MTU probing is enabled,
+ this is the initial MSS used by the connection.
+
++tcp_min_snd_mss - INTEGER
++ TCP SYN and SYNACK messages usually advertise an ADVMSS option,
++ as described in RFC 1122 and RFC 6691.
++ If this ADVMSS option is smaller than tcp_min_snd_mss,
++ it is silently capped to tcp_min_snd_mss.
++
++ Default : 48 (at least 8 bytes of payload per segment)
++
+ tcp_congestion_control - STRING
+ Set the congestion control algorithm to be used for new
+ connections. The algorithm "reno" is always available, but
+--- a/include/net/netns/ipv4.h
++++ b/include/net/netns/ipv4.h
+@@ -107,6 +107,7 @@ struct netns_ipv4 {
+ #endif
+ int sysctl_tcp_mtu_probing;
+ int sysctl_tcp_base_mss;
++ int sysctl_tcp_min_snd_mss;
+ int sysctl_tcp_probe_threshold;
+ u32 sysctl_tcp_probe_interval;
+
+--- a/net/ipv4/sysctl_net_ipv4.c
++++ b/net/ipv4/sysctl_net_ipv4.c
+@@ -37,6 +37,8 @@ static int ip_local_port_range_min[] = {
+ static int ip_local_port_range_max[] = { 65535, 65535 };
+ static int tcp_adv_win_scale_min = -31;
+ static int tcp_adv_win_scale_max = 31;
++static int tcp_min_snd_mss_min = TCP_MIN_SND_MSS;
++static int tcp_min_snd_mss_max = 65535;
+ static int ip_privileged_port_min;
+ static int ip_privileged_port_max = 65535;
+ static int ip_ttl_min = 1;
+@@ -944,6 +946,15 @@ static struct ctl_table ipv4_net_table[]
+ .proc_handler = proc_dointvec,
+ },
+ {
++ .procname = "tcp_min_snd_mss",
++ .data = &init_net.ipv4.sysctl_tcp_min_snd_mss,
++ .maxlen = sizeof(int),
++ .mode = 0644,
++ .proc_handler = proc_dointvec_minmax,
++ .extra1 = &tcp_min_snd_mss_min,
++ .extra2 = &tcp_min_snd_mss_max,
++ },
++ {
+ .procname = "tcp_probe_threshold",
+ .data = &init_net.ipv4.sysctl_tcp_probe_threshold,
+ .maxlen = sizeof(int),
+--- a/net/ipv4/tcp_ipv4.c
++++ b/net/ipv4/tcp_ipv4.c
+@@ -2477,6 +2477,7 @@ static int __net_init tcp_sk_init(struct
+ net->ipv4.sysctl_tcp_ecn_fallback = 1;
+
+ net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
++ net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
+ net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
+ net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
+
+--- a/net/ipv4/tcp_output.c
++++ b/net/ipv4/tcp_output.c
+@@ -1447,8 +1447,7 @@ static inline int __tcp_mtu_to_mss(struc
+ mss_now -= icsk->icsk_ext_hdr_len;
+
+ /* Then reserve room for full set of TCP options and 8 bytes of data */
+- if (mss_now < TCP_MIN_SND_MSS)
+- mss_now = TCP_MIN_SND_MSS;
++ mss_now = max(mss_now, sock_net(sk)->ipv4.sysctl_tcp_min_snd_mss);
+ return mss_now;
+ }
+
--- /dev/null
+From foo@baz Mon 17 Jun 2019 06:56:41 PM CEST
+From: Eric Dumazet <edumazet@google.com>
+Date: Sat, 15 Jun 2019 17:47:27 -0700
+Subject: tcp: enforce tcp_min_snd_mss in tcp_mtu_probing()
+
+From: Eric Dumazet <edumazet@google.com>
+
+commit 967c05aee439e6e5d7d805e195b3a20ef5c433d6 upstream.
+
+If mtu probing is enabled tcp_mtu_probing() could very well end up
+with a too small MSS.
+
+Use the new sysctl tcp_min_snd_mss to make sure MSS search
+is performed in an acceptable range.
+
+CVE-2019-11479 -- tcp mss hardcoded to 48
+
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Reported-by: Jonathan Lemon <jonathan.lemon@gmail.com>
+Cc: Jonathan Looney <jtl@netflix.com>
+Acked-by: Neal Cardwell <ncardwell@google.com>
+Cc: Yuchung Cheng <ycheng@google.com>
+Cc: Tyler Hicks <tyhicks@canonical.com>
+Cc: Bruce Curtis <brucec@netflix.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ipv4/tcp_timer.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/net/ipv4/tcp_timer.c
++++ b/net/ipv4/tcp_timer.c
+@@ -141,6 +141,7 @@ static void tcp_mtu_probing(struct inet_
+ mss = tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low) >> 1;
+ mss = min(net->ipv4.sysctl_tcp_base_mss, mss);
+ mss = max(mss, 68 - tp->tcp_header_len);
++ mss = max(mss, net->ipv4.sysctl_tcp_min_snd_mss);
+ icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss);
+ tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
+ }
--- /dev/null
+From foo@baz Mon 17 Jun 2019 06:56:41 PM CEST
+From: Eric Dumazet <edumazet@google.com>
+Date: Sat, 15 Jun 2019 17:31:03 -0700
+Subject: tcp: limit payload size of sacked skbs
+
+From: Eric Dumazet <edumazet@google.com>
+
+commit 3b4929f65b0d8249f19a50245cd88ed1a2f78cff upstream.
+
+Jonathan Looney reported that TCP can trigger the following crash
+in tcp_shifted_skb() :
+
+ BUG_ON(tcp_skb_pcount(skb) < pcount);
+
+This can happen if the remote peer has advertized the smallest
+MSS that linux TCP accepts : 48
+
+An skb can hold 17 fragments, and each fragment can hold 32KB
+on x86, or 64KB on PowerPC.
+
+This means that the 16bit width of TCP_SKB_CB(skb)->tcp_gso_segs
+can overflow.
+
+Note that tcp_sendmsg() builds skbs with less than 64KB
+of payload, so this problem needs SACK to be enabled.
+SACK blocks allow TCP to coalesce multiple skbs in the retransmit
+queue, thus filling the 17 fragments to maximal capacity.
+
+CVE-2019-11477 -- u16 overflow of TCP_SKB_CB(skb)->tcp_gso_segs
+
+Backport notes, provided by Joao Martins <joao.m.martins@oracle.com>
+
+v4.15, i.e. since commit 737ff314563 ("tcp: use sequence distance to
+detect reordering"), switched from packet-based FACK tracking to
+sequence-based tracking.
+
+v4.14 and older still have the old logic, so tcp_shift_skb_data()
+needs to retain its original logic and keep @fack_count in sync. In
+other words, we keep incrementing pcount by tcp_skb_pcount(skb) so that
+it can later be used to update fack_count. To make this more explicit,
+we track the pcount of the next skb being shifted in @next_pcount, which
+also avoids invoking tcp_skb_pcount(skb) repeatedly.
+
+Fixes: 832d11c5cd07 ("tcp: Try to restore large SKBs while SACK processing")
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Reported-by: Jonathan Looney <jtl@netflix.com>
+Acked-by: Neal Cardwell <ncardwell@google.com>
+Reviewed-by: Tyler Hicks <tyhicks@canonical.com>
+Cc: Yuchung Cheng <ycheng@google.com>
+Cc: Bruce Curtis <brucec@netflix.com>
+Cc: Jonathan Lemon <jonathan.lemon@gmail.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/tcp.h | 4 ++++
+ include/net/tcp.h | 2 ++
+ net/ipv4/tcp.c | 1 +
+ net/ipv4/tcp_input.c | 28 ++++++++++++++++++++++------
+ net/ipv4/tcp_output.c | 6 +++---
+ 5 files changed, 32 insertions(+), 9 deletions(-)
+
+--- a/include/linux/tcp.h
++++ b/include/linux/tcp.h
+@@ -450,4 +450,8 @@ static inline u16 tcp_mss_clamp(const st
+
+ return (user_mss && user_mss < mss) ? user_mss : mss;
+ }
++
++int tcp_skb_shift(struct sk_buff *to, struct sk_buff *from, int pcount,
++ int shiftlen);
++
+ #endif /* _LINUX_TCP_H */
+--- a/include/net/tcp.h
++++ b/include/net/tcp.h
+@@ -57,6 +57,8 @@ void tcp_time_wait(struct sock *sk, int
+
+ #define MAX_TCP_HEADER (128 + MAX_HEADER)
+ #define MAX_TCP_OPTION_SPACE 40
++#define TCP_MIN_SND_MSS 48
++#define TCP_MIN_GSO_SIZE (TCP_MIN_SND_MSS - MAX_TCP_OPTION_SPACE)
+
+ /*
+ * Never offer a window over 32767 without using window scaling. Some
+--- a/net/ipv4/tcp.c
++++ b/net/ipv4/tcp.c
+@@ -3480,6 +3480,7 @@ void __init tcp_init(void)
+ unsigned long limit;
+ unsigned int i;
+
++ BUILD_BUG_ON(TCP_MIN_SND_MSS <= MAX_TCP_OPTION_SPACE);
+ BUILD_BUG_ON(sizeof(struct tcp_skb_cb) >
+ FIELD_SIZEOF(struct sk_buff, cb));
+
+--- a/net/ipv4/tcp_input.c
++++ b/net/ipv4/tcp_input.c
+@@ -1329,7 +1329,7 @@ static bool tcp_shifted_skb(struct sock
+ TCP_SKB_CB(skb)->seq += shifted;
+
+ tcp_skb_pcount_add(prev, pcount);
+- BUG_ON(tcp_skb_pcount(skb) < pcount);
++ WARN_ON_ONCE(tcp_skb_pcount(skb) < pcount);
+ tcp_skb_pcount_add(skb, -pcount);
+
+ /* When we're adding to gso_segs == 1, gso_size will be zero,
+@@ -1396,6 +1396,21 @@ static int skb_can_shift(const struct sk
+ return !skb_headlen(skb) && skb_is_nonlinear(skb);
+ }
+
++int tcp_skb_shift(struct sk_buff *to, struct sk_buff *from,
++ int pcount, int shiftlen)
++{
++ /* TCP min gso_size is 8 bytes (TCP_MIN_GSO_SIZE)
++ * Since TCP_SKB_CB(skb)->tcp_gso_segs is 16 bits, we need
++ * to make sure not storing more than 65535 * 8 bytes per skb,
++ * even if current MSS is bigger.
++ */
++ if (unlikely(to->len + shiftlen >= 65535 * TCP_MIN_GSO_SIZE))
++ return 0;
++ if (unlikely(tcp_skb_pcount(to) + pcount > 65535))
++ return 0;
++ return skb_shift(to, from, shiftlen);
++}
++
+ /* Try collapsing SACK blocks spanning across multiple skbs to a single
+ * skb.
+ */
+@@ -1407,6 +1422,7 @@ static struct sk_buff *tcp_shift_skb_dat
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *prev;
+ int mss;
++ int next_pcount;
+ int pcount = 0;
+ int len;
+ int in_sack;
+@@ -1504,7 +1520,7 @@ static struct sk_buff *tcp_shift_skb_dat
+ if (!after(TCP_SKB_CB(skb)->seq + len, tp->snd_una))
+ goto fallback;
+
+- if (!skb_shift(prev, skb, len))
++ if (!tcp_skb_shift(prev, skb, pcount, len))
+ goto fallback;
+ if (!tcp_shifted_skb(sk, skb, state, pcount, len, mss, dup_sack))
+ goto out;
+@@ -1523,11 +1539,11 @@ static struct sk_buff *tcp_shift_skb_dat
+ goto out;
+
+ len = skb->len;
+- if (skb_shift(prev, skb, len)) {
+- pcount += tcp_skb_pcount(skb);
+- tcp_shifted_skb(sk, skb, state, tcp_skb_pcount(skb), len, mss, 0);
++ next_pcount = tcp_skb_pcount(skb);
++ if (tcp_skb_shift(prev, skb, next_pcount, len)) {
++ pcount += next_pcount;
++ tcp_shifted_skb(sk, skb, state, next_pcount, len, mss, 0);
+ }
+-
+ out:
+ state->fack_count += pcount;
+ return prev;
+--- a/net/ipv4/tcp_output.c
++++ b/net/ipv4/tcp_output.c
+@@ -1442,8 +1442,8 @@ static inline int __tcp_mtu_to_mss(struc
+ mss_now -= icsk->icsk_ext_hdr_len;
+
+ /* Then reserve room for full set of TCP options and 8 bytes of data */
+- if (mss_now < 48)
+- mss_now = 48;
++ if (mss_now < TCP_MIN_SND_MSS)
++ mss_now = TCP_MIN_SND_MSS;
+ return mss_now;
+ }
+
+@@ -2724,7 +2724,7 @@ static bool tcp_collapse_retrans(struct
+ if (next_skb_size <= skb_availroom(skb))
+ skb_copy_bits(next_skb, 0, skb_put(skb, next_skb_size),
+ next_skb_size);
+- else if (!skb_shift(skb, next_skb, next_skb_size))
++ else if (!tcp_skb_shift(skb, next_skb, 1, next_skb_size))
+ return false;
+ }
+ tcp_highest_sack_replace(sk, next_skb, skb);
--- /dev/null
+From foo@baz Mon 17 Jun 2019 06:56:41 PM CEST
+From: Eric Dumazet <edumazet@google.com>
+Date: Thu, 5 Oct 2017 22:21:25 -0700
+Subject: tcp: reduce tcp_fastretrans_alert() verbosity
+
+From: Eric Dumazet <edumazet@google.com>
+
+commit 8ba6ddaaf86c4c6814774e4e4ef158b732bd9f9f upstream.
+
+With upcoming rb-tree implementation, the checks will trigger
+more often, and this is expected.
+
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Cc: Amit Shah <amit@infradead.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ipv4/tcp_input.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/net/ipv4/tcp_input.c
++++ b/net/ipv4/tcp_input.c
+@@ -2810,9 +2810,9 @@ static void tcp_fastretrans_alert(struct
+ bool do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) &&
+ (tcp_fackets_out(tp) > tp->reordering));
+
+- if (WARN_ON(!tp->packets_out && tp->sacked_out))
++ if (!tp->packets_out && tp->sacked_out)
+ tp->sacked_out = 0;
+- if (WARN_ON(!tp->sacked_out && tp->fackets_out))
++ if (!tp->sacked_out && tp->fackets_out)
+ tp->fackets_out = 0;
+
+ /* Now state machine starts.
--- /dev/null
+From foo@baz Mon 17 Jun 2019 06:56:41 PM CEST
+From: Eric Dumazet <edumazet@google.com>
+Date: Sat, 15 Jun 2019 17:40:56 -0700
+Subject: tcp: tcp_fragment() should apply sane memory limits
+
+From: Eric Dumazet <edumazet@google.com>
+
+commit f070ef2ac66716357066b683fb0baf55f8191a2e upstream.
+
+Jonathan Looney reported that a malicious peer can force a sender
+to fragment its retransmit queue into tiny skbs, inflating memory
+usage and/or overflowing 32bit counters.
+
+TCP allows an application to queue up to sk_sndbuf bytes,
+so we need to give some allowance for non malicious splitting
+of retransmit queue.
+
+A new SNMP counter is added to monitor how many times TCP
+refused to split an skb because the allowance was exceeded.
+
+Note that this counter might increase in the case applications
+use SO_SNDBUF socket option to lower sk_sndbuf.
+
+CVE-2019-11478 : tcp_fragment, prevent fragmenting a packet when the
+ socket already has more than twice the allowed space (sk_sndbuf) queued
+
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Reported-by: Jonathan Looney <jtl@netflix.com>
+Acked-by: Neal Cardwell <ncardwell@google.com>
+Acked-by: Yuchung Cheng <ycheng@google.com>
+Reviewed-by: Tyler Hicks <tyhicks@canonical.com>
+Cc: Bruce Curtis <brucec@netflix.com>
+Cc: Jonathan Lemon <jonathan.lemon@gmail.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/uapi/linux/snmp.h | 1 +
+ net/ipv4/proc.c | 1 +
+ net/ipv4/tcp_output.c | 5 +++++
+ 3 files changed, 7 insertions(+)
+
+--- a/include/uapi/linux/snmp.h
++++ b/include/uapi/linux/snmp.h
+@@ -278,6 +278,7 @@ enum
+ LINUX_MIB_TCPKEEPALIVE, /* TCPKeepAlive */
+ LINUX_MIB_TCPMTUPFAIL, /* TCPMTUPFail */
+ LINUX_MIB_TCPMTUPSUCCESS, /* TCPMTUPSuccess */
++ LINUX_MIB_TCPWQUEUETOOBIG, /* TCPWqueueTooBig */
+ __LINUX_MIB_MAX
+ };
+
+--- a/net/ipv4/proc.c
++++ b/net/ipv4/proc.c
+@@ -299,6 +299,7 @@ static const struct snmp_mib snmp4_net_l
+ SNMP_MIB_ITEM("TCPKeepAlive", LINUX_MIB_TCPKEEPALIVE),
+ SNMP_MIB_ITEM("TCPMTUPFail", LINUX_MIB_TCPMTUPFAIL),
+ SNMP_MIB_ITEM("TCPMTUPSuccess", LINUX_MIB_TCPMTUPSUCCESS),
++ SNMP_MIB_ITEM("TCPWqueueTooBig", LINUX_MIB_TCPWQUEUETOOBIG),
+ SNMP_MIB_SENTINEL
+ };
+
+--- a/net/ipv4/tcp_output.c
++++ b/net/ipv4/tcp_output.c
+@@ -1274,6 +1274,11 @@ int tcp_fragment(struct sock *sk, struct
+ if (nsize < 0)
+ nsize = 0;
+
++ if (unlikely((sk->sk_wmem_queued >> 1) > sk->sk_sndbuf)) {
++ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPWQUEUETOOBIG);
++ return -ENOMEM;
++ }
++
+ if (skb_unclone(skb, gfp))
+ return -ENOMEM;
+