git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
4.4-stable patches
author Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Fri, 18 Nov 2016 10:20:58 +0000 (11:20 +0100)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Fri, 18 Nov 2016 10:20:58 +0000 (11:20 +0100)
added patches:
bgmac-stop-clearing-dma-receive-control-register-right-after-it-is-set.patch
dccp-do-not-send-reset-to-already-closed-sockets.patch
dccp-fix-out-of-bound-access-in-dccp_v4_err.patch
dctcp-avoid-bogus-doubling-of-cwnd-after-loss.patch
fib_trie-correct-proc-net-route-off-by-one-error.patch
ip6_tunnel-clear-ip6cb-in-ip6tunnel_xmit.patch
ipv4-use-new_gw-for-redirect-neigh-lookup.patch
ipv6-dccp-add-missing-bind_conflict-to-dccp_ipv6_mapped.patch
ipv6-dccp-fix-out-of-bound-access-in-dccp_v6_err.patch
net-__skb_flow_dissect-must-cap-its-return-value.patch
net-clear-sk_err_soft-in-sk_clone_lock.patch
net-mangle-zero-checksum-in-skb_checksum_help.patch
sctp-assign-assoc_id-earlier-in-__sctp_connect.patch
sock-fix-sendmmsg-for-partial-sendmsg.patch
tcp-fix-potential-memory-corruption.patch
tcp-take-care-of-truncations-done-by-sk_filter.patch

17 files changed:
queue-4.4/bgmac-stop-clearing-dma-receive-control-register-right-after-it-is-set.patch [new file with mode: 0644]
queue-4.4/dccp-do-not-send-reset-to-already-closed-sockets.patch [new file with mode: 0644]
queue-4.4/dccp-fix-out-of-bound-access-in-dccp_v4_err.patch [new file with mode: 0644]
queue-4.4/dctcp-avoid-bogus-doubling-of-cwnd-after-loss.patch [new file with mode: 0644]
queue-4.4/fib_trie-correct-proc-net-route-off-by-one-error.patch [new file with mode: 0644]
queue-4.4/ip6_tunnel-clear-ip6cb-in-ip6tunnel_xmit.patch [new file with mode: 0644]
queue-4.4/ipv4-use-new_gw-for-redirect-neigh-lookup.patch [new file with mode: 0644]
queue-4.4/ipv6-dccp-add-missing-bind_conflict-to-dccp_ipv6_mapped.patch [new file with mode: 0644]
queue-4.4/ipv6-dccp-fix-out-of-bound-access-in-dccp_v6_err.patch [new file with mode: 0644]
queue-4.4/net-__skb_flow_dissect-must-cap-its-return-value.patch [new file with mode: 0644]
queue-4.4/net-clear-sk_err_soft-in-sk_clone_lock.patch [new file with mode: 0644]
queue-4.4/net-mangle-zero-checksum-in-skb_checksum_help.patch [new file with mode: 0644]
queue-4.4/sctp-assign-assoc_id-earlier-in-__sctp_connect.patch [new file with mode: 0644]
queue-4.4/series [new file with mode: 0644]
queue-4.4/sock-fix-sendmmsg-for-partial-sendmsg.patch [new file with mode: 0644]
queue-4.4/tcp-fix-potential-memory-corruption.patch [new file with mode: 0644]
queue-4.4/tcp-take-care-of-truncations-done-by-sk_filter.patch [new file with mode: 0644]

diff --git a/queue-4.4/bgmac-stop-clearing-dma-receive-control-register-right-after-it-is-set.patch b/queue-4.4/bgmac-stop-clearing-dma-receive-control-register-right-after-it-is-set.patch
new file mode 100644 (file)
index 0000000..9e05e70
--- /dev/null
@@ -0,0 +1,59 @@
+From foo@baz Fri Nov 18 11:09:43 CET 2016
+From: Andy Gospodarek <gospo@broadcom.com>
+Date: Mon, 31 Oct 2016 13:32:03 -0400
+Subject: bgmac: stop clearing DMA receive control register right after it is set
+
+From: Andy Gospodarek <gospo@broadcom.com>
+
+
+[ Upstream commit fcdefccac976ee51dd6071832b842d8fb41c479c ]
+
+Current bgmac code initializes some DMA settings in the receive control
+register for some hardware and then immediately clears those settings.
+Not clearing those settings results in ~420Mbps *improvement* in
+throughput; this system can now receive frames at line-rate on Broadcom
+5871x hardware compared to ~520Mbps today.  I also tested a few other
+values but found there to be no discernible difference in CPU
+utilization even if burst size and prefetching values are different.
+
+On the hardware tested there was no need to keep the code that cleared
+all but bits 16-17, but since there is a wide variety of hardware that
+used this driver (I did not look at all hardware docs for hardware using
+this IP block), I find it wise to move this call up and clear bits just
+after reading the default value from the hardware rather than completely
+removing it.
+
+This is a good candidate for -stable >=3.14 since that is when the code
+that was supposed to improve performance (but did not) was introduced.
+
+Signed-off-by: Andy Gospodarek <gospo@broadcom.com>
+Fixes: 56ceecde1f29 ("bgmac: initialize the DMA controller of core...")
+Cc: Hauke Mehrtens <hauke@hauke-m.de>
+Acked-by: Hauke Mehrtens <hauke@hauke-m.de>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/ethernet/broadcom/bgmac.c |    5 ++++-
+ 1 file changed, 4 insertions(+), 1 deletion(-)
+
+--- a/drivers/net/ethernet/broadcom/bgmac.c
++++ b/drivers/net/ethernet/broadcom/bgmac.c
+@@ -314,6 +314,10 @@ static void bgmac_dma_rx_enable(struct b
+       u32 ctl;
+       ctl = bgmac_read(bgmac, ring->mmio_base + BGMAC_DMA_RX_CTL);
++
++      /* preserve ONLY bits 16-17 from current hardware value */
++      ctl &= BGMAC_DMA_RX_ADDREXT_MASK;
++
+       if (bgmac->core->id.rev >= 4) {
+               ctl &= ~BGMAC_DMA_RX_BL_MASK;
+               ctl |= BGMAC_DMA_RX_BL_128 << BGMAC_DMA_RX_BL_SHIFT;
+@@ -324,7 +328,6 @@ static void bgmac_dma_rx_enable(struct b
+               ctl &= ~BGMAC_DMA_RX_PT_MASK;
+               ctl |= BGMAC_DMA_RX_PT_1 << BGMAC_DMA_RX_PT_SHIFT;
+       }
+-      ctl &= BGMAC_DMA_RX_ADDREXT_MASK;
+       ctl |= BGMAC_DMA_RX_ENABLE;
+       ctl |= BGMAC_DMA_RX_PARITY_DISABLE;
+       ctl |= BGMAC_DMA_RX_OVERFLOW_CONT;
diff --git a/queue-4.4/dccp-do-not-send-reset-to-already-closed-sockets.patch b/queue-4.4/dccp-do-not-send-reset-to-already-closed-sockets.patch
new file mode 100644 (file)
index 0000000..b434e49
--- /dev/null
@@ -0,0 +1,74 @@
+From foo@baz Fri Nov 18 11:09:43 CET 2016
+From: Eric Dumazet <edumazet@google.com>
+Date: Wed, 2 Nov 2016 18:04:24 -0700
+Subject: dccp: do not send reset to already closed sockets
+
+From: Eric Dumazet <edumazet@google.com>
+
+
+[ Upstream commit 346da62cc186c4b4b1ac59f87f4482b47a047388 ]
+
+Andrey reported the following warning while fuzzing with syzkaller:
+
+WARNING: CPU: 1 PID: 21072 at net/dccp/proto.c:83 dccp_set_state+0x229/0x290
+Kernel panic - not syncing: panic_on_warn set ...
+
+CPU: 1 PID: 21072 Comm: syz-executor Not tainted 4.9.0-rc1+ #293
+Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
+ ffff88003d4c7738 ffffffff81b474f4 0000000000000003 dffffc0000000000
+ ffffffff844f8b00 ffff88003d4c7804 ffff88003d4c7800 ffffffff8140c06a
+ 0000000041b58ab3 ffffffff8479ab7d ffffffff8140beae ffffffff8140cd00
+Call Trace:
+ [<     inline     >] __dump_stack lib/dump_stack.c:15
+ [<ffffffff81b474f4>] dump_stack+0xb3/0x10f lib/dump_stack.c:51
+ [<ffffffff8140c06a>] panic+0x1bc/0x39d kernel/panic.c:179
+ [<ffffffff8111125c>] __warn+0x1cc/0x1f0 kernel/panic.c:542
+ [<ffffffff8111144c>] warn_slowpath_null+0x2c/0x40 kernel/panic.c:585
+ [<ffffffff8389e5d9>] dccp_set_state+0x229/0x290 net/dccp/proto.c:83
+ [<ffffffff838a0aa2>] dccp_close+0x612/0xc10 net/dccp/proto.c:1016
+ [<ffffffff8316bf1f>] inet_release+0xef/0x1c0 net/ipv4/af_inet.c:415
+ [<ffffffff82b6e89e>] sock_release+0x8e/0x1d0 net/socket.c:570
+ [<ffffffff82b6e9f6>] sock_close+0x16/0x20 net/socket.c:1017
+ [<ffffffff815256ad>] __fput+0x29d/0x720 fs/file_table.c:208
+ [<ffffffff81525bb5>] ____fput+0x15/0x20 fs/file_table.c:244
+ [<ffffffff811727d8>] task_work_run+0xf8/0x170 kernel/task_work.c:116
+ [<     inline     >] exit_task_work include/linux/task_work.h:21
+ [<ffffffff8111bc53>] do_exit+0x883/0x2ac0 kernel/exit.c:828
+ [<ffffffff811221fe>] do_group_exit+0x10e/0x340 kernel/exit.c:931
+ [<ffffffff81143c94>] get_signal+0x634/0x15a0 kernel/signal.c:2307
+ [<ffffffff81054aad>] do_signal+0x8d/0x1a30 arch/x86/kernel/signal.c:807
+ [<ffffffff81003a05>] exit_to_usermode_loop+0xe5/0x130
+arch/x86/entry/common.c:156
+ [<     inline     >] prepare_exit_to_usermode arch/x86/entry/common.c:190
+ [<ffffffff81006298>] syscall_return_slowpath+0x1a8/0x1e0
+arch/x86/entry/common.c:259
+ [<ffffffff83fc1a62>] entry_SYSCALL_64_fastpath+0xc0/0xc2
+Dumping ftrace buffer:
+   (ftrace buffer empty)
+Kernel Offset: disabled
+
+Fix this the same way we did for TCP in commit 565b7b2d2e63
+("tcp: do not send reset to already closed sockets")
+
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Reported-by: Andrey Konovalov <andreyknvl@google.com>
+Tested-by: Andrey Konovalov <andreyknvl@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/dccp/proto.c |    4 ++++
+ 1 file changed, 4 insertions(+)
+
+--- a/net/dccp/proto.c
++++ b/net/dccp/proto.c
+@@ -1009,6 +1009,10 @@ void dccp_close(struct sock *sk, long ti
+               __kfree_skb(skb);
+       }
++      /* If socket has been already reset kill it. */
++      if (sk->sk_state == DCCP_CLOSED)
++              goto adjudge_to_death;
++
+       if (data_was_unread) {
+               /* Unread data was tossed, send an appropriate Reset Code */
+               DCCP_WARN("ABORT with %u bytes unread\n", data_was_unread);
diff --git a/queue-4.4/dccp-fix-out-of-bound-access-in-dccp_v4_err.patch b/queue-4.4/dccp-fix-out-of-bound-access-in-dccp_v4_err.patch
new file mode 100644 (file)
index 0000000..056dd21
--- /dev/null
@@ -0,0 +1,56 @@
+From foo@baz Fri Nov 18 11:09:43 CET 2016
+From: Eric Dumazet <edumazet@google.com>
+Date: Wed, 2 Nov 2016 19:00:40 -0700
+Subject: dccp: fix out of bound access in dccp_v4_err()
+
+From: Eric Dumazet <edumazet@google.com>
+
+
+[ Upstream commit 6706a97fec963d6cb3f7fc2978ec1427b4651214 ]
+
+dccp_v4_err() does not use pskb_may_pull() and might access garbage.
+
+We only need 4 bytes at the beginning of the DCCP header, like TCP,
+so the 8 bytes pulled in icmp_socket_deliver() are more than enough.
+
+This patch might allow us to process more ICMP messages, as some routers
+are still limiting the size of reflected bytes to 28 (RFC 792), instead
+of extended lengths (RFC 1812 4.3.2.3)
+
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/dccp/ipv4.c |   14 ++++++++------
+ 1 file changed, 8 insertions(+), 6 deletions(-)
+
+--- a/net/dccp/ipv4.c
++++ b/net/dccp/ipv4.c
+@@ -235,7 +235,7 @@ static void dccp_v4_err(struct sk_buff *
+ {
+       const struct iphdr *iph = (struct iphdr *)skb->data;
+       const u8 offset = iph->ihl << 2;
+-      const struct dccp_hdr *dh = (struct dccp_hdr *)(skb->data + offset);
++      const struct dccp_hdr *dh;
+       struct dccp_sock *dp;
+       struct inet_sock *inet;
+       const int type = icmp_hdr(skb)->type;
+@@ -245,11 +245,13 @@ static void dccp_v4_err(struct sk_buff *
+       int err;
+       struct net *net = dev_net(skb->dev);
+-      if (skb->len < offset + sizeof(*dh) ||
+-          skb->len < offset + __dccp_basic_hdr_len(dh)) {
+-              ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
+-              return;
+-      }
++      /* Only need dccph_dport & dccph_sport which are the first
++       * 4 bytes in dccp header.
++       * Our caller (icmp_socket_deliver()) already pulled 8 bytes for us.
++       */
++      BUILD_BUG_ON(offsetofend(struct dccp_hdr, dccph_sport) > 8);
++      BUILD_BUG_ON(offsetofend(struct dccp_hdr, dccph_dport) > 8);
++      dh = (struct dccp_hdr *)(skb->data + offset);
+       sk = __inet_lookup_established(net, &dccp_hashinfo,
+                                      iph->daddr, dh->dccph_dport,
diff --git a/queue-4.4/dctcp-avoid-bogus-doubling-of-cwnd-after-loss.patch b/queue-4.4/dctcp-avoid-bogus-doubling-of-cwnd-after-loss.patch
new file mode 100644 (file)
index 0000000..22ed725
--- /dev/null
@@ -0,0 +1,87 @@
+From foo@baz Fri Nov 18 11:09:43 CET 2016
+From: Florian Westphal <fw@strlen.de>
+Date: Fri, 28 Oct 2016 18:43:11 +0200
+Subject: dctcp: avoid bogus doubling of cwnd after loss
+
+From: Florian Westphal <fw@strlen.de>
+
+
+[ Upstream commit ce6dd23329b1ee6a794acf5f7e40f8e89b8317ee ]
+
+If a congestion control module doesn't provide .undo_cwnd function,
+tcp_undo_cwnd_reduction() will set cwnd to
+
+   tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh << 1);
+
+... which makes sense for reno (it sets ssthresh to half the current cwnd),
+but it makes no sense for dctcp, which sets ssthresh based on the current
+congestion estimate.
+
+This can cause severe growth of cwnd (eventually overflowing u32).
+
+Fix this by saving last cwnd on loss and restore cwnd based on that,
+similar to cubic and other algorithms.
+
+Fixes: e3118e8359bb7c ("net: tcp: add DCTCP congestion control algorithm")
+Cc: Lawrence Brakmo <brakmo@fb.com>
+Cc: Andrew Shewmaker <agshew@gmail.com>
+Cc: Glenn Judd <glenn.judd@morganstanley.com>
+Acked-by: Daniel Borkmann <daniel@iogearbox.net>
+Signed-off-by: Florian Westphal <fw@strlen.de>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ipv4/tcp_dctcp.c |   13 ++++++++++++-
+ 1 file changed, 12 insertions(+), 1 deletion(-)
+
+--- a/net/ipv4/tcp_dctcp.c
++++ b/net/ipv4/tcp_dctcp.c
+@@ -56,6 +56,7 @@ struct dctcp {
+       u32 next_seq;
+       u32 ce_state;
+       u32 delayed_ack_reserved;
++      u32 loss_cwnd;
+ };
+ static unsigned int dctcp_shift_g __read_mostly = 4; /* g = 1/2^4 */
+@@ -96,6 +97,7 @@ static void dctcp_init(struct sock *sk)
+               ca->dctcp_alpha = min(dctcp_alpha_on_init, DCTCP_MAX_ALPHA);
+               ca->delayed_ack_reserved = 0;
++              ca->loss_cwnd = 0;
+               ca->ce_state = 0;
+               dctcp_reset(tp, ca);
+@@ -111,9 +113,10 @@ static void dctcp_init(struct sock *sk)
+ static u32 dctcp_ssthresh(struct sock *sk)
+ {
+-      const struct dctcp *ca = inet_csk_ca(sk);
++      struct dctcp *ca = inet_csk_ca(sk);
+       struct tcp_sock *tp = tcp_sk(sk);
++      ca->loss_cwnd = tp->snd_cwnd;
+       return max(tp->snd_cwnd - ((tp->snd_cwnd * ca->dctcp_alpha) >> 11U), 2U);
+ }
+@@ -308,12 +311,20 @@ static size_t dctcp_get_info(struct sock
+       return 0;
+ }
++static u32 dctcp_cwnd_undo(struct sock *sk)
++{
++      const struct dctcp *ca = inet_csk_ca(sk);
++
++      return max(tcp_sk(sk)->snd_cwnd, ca->loss_cwnd);
++}
++
+ static struct tcp_congestion_ops dctcp __read_mostly = {
+       .init           = dctcp_init,
+       .in_ack_event   = dctcp_update_alpha,
+       .cwnd_event     = dctcp_cwnd_event,
+       .ssthresh       = dctcp_ssthresh,
+       .cong_avoid     = tcp_reno_cong_avoid,
++      .undo_cwnd      = dctcp_cwnd_undo,
+       .set_state      = dctcp_state,
+       .get_info       = dctcp_get_info,
+       .flags          = TCP_CONG_NEEDS_ECN,
diff --git a/queue-4.4/fib_trie-correct-proc-net-route-off-by-one-error.patch b/queue-4.4/fib_trie-correct-proc-net-route-off-by-one-error.patch
new file mode 100644 (file)
index 0000000..53fa29c
--- /dev/null
@@ -0,0 +1,102 @@
+From foo@baz Fri Nov 18 11:09:43 CET 2016
+From: Alexander Duyck <alexander.h.duyck@intel.com>
+Date: Fri, 4 Nov 2016 15:11:57 -0400
+Subject: fib_trie: Correct /proc/net/route off by one error
+
+From: Alexander Duyck <alexander.h.duyck@intel.com>
+
+
+[ Upstream commit fd0285a39b1cb496f60210a9a00ad33a815603e7 ]
+
+The display of /proc/net/route has had a couple issues due to the fact that
+when I originally rewrote most of fib_trie I made it so that the iterator
+was tracking the next value to use instead of the current.
+
+In addition it had an off by 1 error where I was tracking the first piece
+of data as position 0, even though in reality that belonged to the
+SEQ_START_TOKEN.
+
+This patch updates the code so the iterator tracks the last reported
+position and key instead of the next expected position and key.  In
+addition it shifts things so that all of the leaves start at 1 instead of
+trying to report leaves starting with offset 0 as being valid.  With these
+two issues addressed this should resolve any off by one errors that were
+present in the display of /proc/net/route.
+
+Fixes: 25b97c016b26 ("ipv4: off-by-one in continuation handling in /proc/net/route")
+Cc: Andy Whitcroft <apw@canonical.com>
+Reported-by: Jason Baron <jbaron@akamai.com>
+Tested-by: Jason Baron <jbaron@akamai.com>
+Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ipv4/fib_trie.c |   21 +++++++++------------
+ 1 file changed, 9 insertions(+), 12 deletions(-)
+
+--- a/net/ipv4/fib_trie.c
++++ b/net/ipv4/fib_trie.c
+@@ -2456,22 +2456,19 @@ static struct key_vector *fib_route_get_
+       struct key_vector *l, **tp = &iter->tnode;
+       t_key key;
+-      /* use cache location of next-to-find key */
++      /* use cached location of previously found key */
+       if (iter->pos > 0 && pos >= iter->pos) {
+-              pos -= iter->pos;
+               key = iter->key;
+       } else {
+-              iter->pos = 0;
++              iter->pos = 1;
+               key = 0;
+       }
+-      while ((l = leaf_walk_rcu(tp, key)) != NULL) {
++      pos -= iter->pos;
++
++      while ((l = leaf_walk_rcu(tp, key)) && (pos-- > 0)) {
+               key = l->key + 1;
+               iter->pos++;
+-
+-              if (--pos <= 0)
+-                      break;
+-
+               l = NULL;
+               /* handle unlikely case of a key wrap */
+@@ -2480,7 +2477,7 @@ static struct key_vector *fib_route_get_
+       }
+       if (l)
+-              iter->key = key;        /* remember it */
++              iter->key = l->key;     /* remember it */
+       else
+               iter->pos = 0;          /* forget it */
+@@ -2508,7 +2505,7 @@ static void *fib_route_seq_start(struct
+               return fib_route_get_idx(iter, *pos);
+       iter->pos = 0;
+-      iter->key = 0;
++      iter->key = KEY_MAX;
+       return SEQ_START_TOKEN;
+ }
+@@ -2517,7 +2514,7 @@ static void *fib_route_seq_next(struct s
+ {
+       struct fib_route_iter *iter = seq->private;
+       struct key_vector *l = NULL;
+-      t_key key = iter->key;
++      t_key key = iter->key + 1;
+       ++*pos;
+@@ -2526,7 +2523,7 @@ static void *fib_route_seq_next(struct s
+               l = leaf_walk_rcu(&iter->tnode, key);
+       if (l) {
+-              iter->key = l->key + 1;
++              iter->key = l->key;
+               iter->pos++;
+       } else {
+               iter->pos = 0;
diff --git a/queue-4.4/ip6_tunnel-clear-ip6cb-in-ip6tunnel_xmit.patch b/queue-4.4/ip6_tunnel-clear-ip6cb-in-ip6tunnel_xmit.patch
new file mode 100644 (file)
index 0000000..702fe79
--- /dev/null
@@ -0,0 +1,36 @@
+From foo@baz Fri Nov 18 11:09:43 CET 2016
+From: Eli Cooper <elicooper@gmx.com>
+Date: Tue, 1 Nov 2016 23:45:12 +0800
+Subject: ip6_tunnel: Clear IP6CB in ip6tunnel_xmit()
+
+From: Eli Cooper <elicooper@gmx.com>
+
+
+[ Upstream commit 23f4ffedb7d751c7e298732ba91ca75d224bc1a6 ]
+
+skb->cb may contain data from previous layers. In the observed scenario,
+the garbage data were misinterpreted as IP6CB(skb)->frag_max_size, so
+that small packets sent through the tunnel are mistakenly fragmented.
+
+This patch unconditionally clears the control buffer in ip6tunnel_xmit(),
+which affects ip6_tunnel, ip6_udp_tunnel and ip6_gre. Currently none of
+these tunnels set IP6CB(skb)->flags, otherwise it needs to be done earlier.
+
+Cc: stable@vger.kernel.org
+Signed-off-by: Eli Cooper <elicooper@gmx.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/net/ip6_tunnel.h |    1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/include/net/ip6_tunnel.h
++++ b/include/net/ip6_tunnel.h
+@@ -86,6 +86,7 @@ static inline void ip6tunnel_xmit(struct
+       struct net_device_stats *stats = &dev->stats;
+       int pkt_len, err;
++      memset(skb->cb, 0, sizeof(struct inet6_skb_parm));
+       pkt_len = skb->len - skb_inner_network_offset(skb);
+       err = ip6_local_out(dev_net(skb_dst(skb)->dev), sk, skb);
diff --git a/queue-4.4/ipv4-use-new_gw-for-redirect-neigh-lookup.patch b/queue-4.4/ipv4-use-new_gw-for-redirect-neigh-lookup.patch
new file mode 100644 (file)
index 0000000..865530e
--- /dev/null
@@ -0,0 +1,51 @@
+From foo@baz Fri Nov 18 11:09:43 CET 2016
+From: Stephen Suryaputra Lin <stephen.suryaputra.lin@gmail.com>
+Date: Thu, 10 Nov 2016 11:16:15 -0500
+Subject: ipv4: use new_gw for redirect neigh lookup
+
+From: Stephen Suryaputra Lin <stephen.suryaputra.lin@gmail.com>
+
+
+[ Upstream commit 969447f226b451c453ddc83cac6144eaeac6f2e3 ]
+
+In v2.6, ip_rt_redirect() calls arp_bind_neighbour() which returns 0
+and then the state of the neigh for the new_gw is checked. If the state
+isn't valid then the redirected route is deleted. This behavior is
+maintained up to v3.5.7 by check_peer_redirect() because rt->rt_gateway
+is assigned to peer->redirect_learned.a4 before calling
+ipv4_neigh_lookup().
+
+After commit 5943634fc559 ("ipv4: Maintain redirect and PMTU info in
+struct rtable again."), ipv4_neigh_lookup() is performed without the
+rt_gateway assigned to the new_gw. In the case when rt_gateway (old_gw)
+isn't zero, the function uses it as the key. The neigh is most likely
+valid since the old_gw is the one that sends the ICMP redirect message.
+Then the new_gw is assigned to fib_nh_exception. The problem is: the
+new_gw ARP may never get resolved and the traffic is blackholed.
+
+So, use the new_gw for neigh lookup.
+
+Changes from v1:
+ - use __ipv4_neigh_lookup instead (per Eric Dumazet).
+
+Fixes: 5943634fc559 ("ipv4: Maintain redirect and PMTU info in struct rtable again.")
+Signed-off-by: Stephen Suryaputra Lin <ssurya@ieee.org>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ipv4/route.c |    4 +++-
+ 1 file changed, 3 insertions(+), 1 deletion(-)
+
+--- a/net/ipv4/route.c
++++ b/net/ipv4/route.c
+@@ -747,7 +747,9 @@ static void __ip_do_redirect(struct rtab
+                       goto reject_redirect;
+       }
+-      n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
++      n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
++      if (!n)
++              n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
+       if (!IS_ERR(n)) {
+               if (!(n->nud_state & NUD_VALID)) {
+                       neigh_event_send(n, NULL);
diff --git a/queue-4.4/ipv6-dccp-add-missing-bind_conflict-to-dccp_ipv6_mapped.patch b/queue-4.4/ipv6-dccp-add-missing-bind_conflict-to-dccp_ipv6_mapped.patch
new file mode 100644 (file)
index 0000000..b7c62a7
--- /dev/null
@@ -0,0 +1,35 @@
+From foo@baz Fri Nov 18 11:09:43 CET 2016
+From: Eric Dumazet <edumazet@google.com>
+Date: Thu, 3 Nov 2016 08:59:46 -0700
+Subject: ipv6: dccp: add missing bind_conflict to dccp_ipv6_mapped
+
+From: Eric Dumazet <edumazet@google.com>
+
+
+[ Upstream commit 990ff4d84408fc55942ca6644f67e361737b3d8e ]
+
+While fuzzing kernel with syzkaller, Andrey reported a nasty crash
+in inet6_bind() caused by DCCP lacking a required method.
+
+Fixes: ab1e0a13d7029 ("[SOCK] proto: Add hashinfo member to struct proto")
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Reported-by: Andrey Konovalov <andreyknvl@google.com>
+Tested-by: Andrey Konovalov <andreyknvl@google.com>
+Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
+Acked-by: Arnaldo Carvalho de Melo <acme@redhat.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/dccp/ipv6.c |    1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/net/dccp/ipv6.c
++++ b/net/dccp/ipv6.c
+@@ -948,6 +948,7 @@ static const struct inet_connection_sock
+       .getsockopt        = ipv6_getsockopt,
+       .addr2sockaddr     = inet6_csk_addr2sockaddr,
+       .sockaddr_len      = sizeof(struct sockaddr_in6),
++      .bind_conflict     = inet6_csk_bind_conflict,
+ #ifdef CONFIG_COMPAT
+       .compat_setsockopt = compat_ipv6_setsockopt,
+       .compat_getsockopt = compat_ipv6_getsockopt,
diff --git a/queue-4.4/ipv6-dccp-fix-out-of-bound-access-in-dccp_v6_err.patch b/queue-4.4/ipv6-dccp-fix-out-of-bound-access-in-dccp_v6_err.patch
new file mode 100644 (file)
index 0000000..203d0e4
--- /dev/null
@@ -0,0 +1,53 @@
+From foo@baz Fri Nov 18 11:09:43 CET 2016
+From: Eric Dumazet <edumazet@google.com>
+Date: Wed, 2 Nov 2016 20:30:48 -0700
+Subject: ipv6: dccp: fix out of bound access in dccp_v6_err()
+
+From: Eric Dumazet <edumazet@google.com>
+
+
+[ Upstream commit 1aa9d1a0e7eefcc61696e147d123453fc0016005 ]
+
+dccp_v6_err() does not use pskb_may_pull() and might access garbage.
+
+We only need 4 bytes at the beginning of the DCCP header, like TCP,
+so the 8 bytes pulled in icmpv6_notify() are more than enough.
+
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/dccp/ipv6.c |   15 ++++++++-------
+ 1 file changed, 8 insertions(+), 7 deletions(-)
+
+--- a/net/dccp/ipv6.c
++++ b/net/dccp/ipv6.c
+@@ -70,7 +70,7 @@ static void dccp_v6_err(struct sk_buff *
+                       u8 type, u8 code, int offset, __be32 info)
+ {
+       const struct ipv6hdr *hdr = (const struct ipv6hdr *)skb->data;
+-      const struct dccp_hdr *dh = (struct dccp_hdr *)(skb->data + offset);
++      const struct dccp_hdr *dh;
+       struct dccp_sock *dp;
+       struct ipv6_pinfo *np;
+       struct sock *sk;
+@@ -78,12 +78,13 @@ static void dccp_v6_err(struct sk_buff *
+       __u64 seq;
+       struct net *net = dev_net(skb->dev);
+-      if (skb->len < offset + sizeof(*dh) ||
+-          skb->len < offset + __dccp_basic_hdr_len(dh)) {
+-              ICMP6_INC_STATS_BH(net, __in6_dev_get(skb->dev),
+-                                 ICMP6_MIB_INERRORS);
+-              return;
+-      }
++      /* Only need dccph_dport & dccph_sport which are the first
++       * 4 bytes in dccp header.
++       * Our caller (icmpv6_notify()) already pulled 8 bytes for us.
++       */
++      BUILD_BUG_ON(offsetofend(struct dccp_hdr, dccph_sport) > 8);
++      BUILD_BUG_ON(offsetofend(struct dccp_hdr, dccph_dport) > 8);
++      dh = (struct dccp_hdr *)(skb->data + offset);
+       sk = __inet6_lookup_established(net, &dccp_hashinfo,
+                                       &hdr->daddr, dh->dccph_dport,
diff --git a/queue-4.4/net-__skb_flow_dissect-must-cap-its-return-value.patch b/queue-4.4/net-__skb_flow_dissect-must-cap-its-return-value.patch
new file mode 100644 (file)
index 0000000..7208c71
--- /dev/null
@@ -0,0 +1,59 @@
+From foo@baz Fri Nov 18 11:09:43 CET 2016
+From: Eric Dumazet <edumazet@google.com>
+Date: Wed, 9 Nov 2016 16:04:46 -0800
+Subject: net: __skb_flow_dissect() must cap its return value
+
+From: Eric Dumazet <edumazet@google.com>
+
+
+[ Upstream commit 34fad54c2537f7c99d07375e50cb30aa3c23bd83 ]
+
+After Tom's patch, the thoff field could point past the end of the buffer,
+which could fool some callers.
+
+If an skb was provided, skb->len should be the upper limit.
+If not, hlen is supposed to be the upper limit.
+
+Fixes: a6e544b0a88b ("flow_dissector: Jump to exit code in __skb_flow_dissect")
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Reported-by: Yibin Yang <yibyang@cisco.com>
+Acked-by: Alexander Duyck <alexander.h.duyck@intel.com>
+Acked-by: Willem de Bruijn <willemb@google.com>
+Acked-by: Alexei Starovoitov <ast@kernel.org>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/core/flow_dissector.c |   11 ++++++++---
+ 1 file changed, 8 insertions(+), 3 deletions(-)
+
+--- a/net/core/flow_dissector.c
++++ b/net/core/flow_dissector.c
+@@ -131,7 +131,7 @@ bool __skb_flow_dissect(const struct sk_
+       struct flow_dissector_key_tags *key_tags;
+       struct flow_dissector_key_keyid *key_keyid;
+       u8 ip_proto = 0;
+-      bool ret = false;
++      bool ret;
+       if (!data) {
+               data = skb->data;
+@@ -492,12 +492,17 @@ ip_proto_again:
+ out_good:
+       ret = true;
+-out_bad:
++      key_control->thoff = (u16)nhoff;
++out:
+       key_basic->n_proto = proto;
+       key_basic->ip_proto = ip_proto;
+-      key_control->thoff = (u16)nhoff;
+       return ret;
++
++out_bad:
++      ret = false;
++      key_control->thoff = min_t(u16, nhoff, skb ? skb->len : hlen);
++      goto out;
+ }
+ EXPORT_SYMBOL(__skb_flow_dissect);
diff --git a/queue-4.4/net-clear-sk_err_soft-in-sk_clone_lock.patch b/queue-4.4/net-clear-sk_err_soft-in-sk_clone_lock.patch
new file mode 100644 (file)
index 0000000..32c9f39
--- /dev/null
@@ -0,0 +1,34 @@
+From foo@baz Fri Nov 18 11:09:43 CET 2016
+From: Eric Dumazet <edumazet@google.com>
+Date: Fri, 28 Oct 2016 13:40:24 -0700
+Subject: net: clear sk_err_soft in sk_clone_lock()
+
+From: Eric Dumazet <edumazet@google.com>
+
+
+[ Upstream commit e551c32d57c88923f99f8f010e89ca7ed0735e83 ]
+
+At accept() time, it is possible the parent has a non zero
+sk_err_soft, leftover from a prior error.
+
+Make sure we do not leave this value in the child, as it
+makes future getsockopt(SO_ERROR) calls quite unreliable.
+
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/core/sock.c |    1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/net/core/sock.c
++++ b/net/core/sock.c
+@@ -1562,6 +1562,7 @@ struct sock *sk_clone_lock(const struct
+               }
+               newsk->sk_err      = 0;
++              newsk->sk_err_soft = 0;
+               newsk->sk_priority = 0;
+               newsk->sk_incoming_cpu = raw_smp_processor_id();
+               atomic64_set(&newsk->sk_cookie, 0);
diff --git a/queue-4.4/net-mangle-zero-checksum-in-skb_checksum_help.patch b/queue-4.4/net-mangle-zero-checksum-in-skb_checksum_help.patch
new file mode 100644 (file)
index 0000000..fea39ab
--- /dev/null
@@ -0,0 +1,41 @@
+From foo@baz Fri Nov 18 11:09:43 CET 2016
+From: Eric Dumazet <edumazet@google.com>
+Date: Sat, 29 Oct 2016 11:02:36 -0700
+Subject: net: mangle zero checksum in skb_checksum_help()
+
+From: Eric Dumazet <edumazet@google.com>
+
+
+[ Upstream commit 4f2e4ad56a65f3b7d64c258e373cb71e8d2499f4 ]
+
+Sending zero checksum is ok for TCP, but not for UDP.
+
+UDPv6 receiver should by default drop a frame with a 0 checksum,
+and UDPv4 would not verify the checksum and might accept a corrupted
+packet.
+
+Simply replace such checksum by 0xffff, regardless of transport.
+
+This error was caught on SIT tunnels, but seems generic.
+
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Cc: Maciej Żenczykowski <maze@google.com>
+Cc: Willem de Bruijn <willemb@google.com>
+Acked-by: Maciej Żenczykowski <maze@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/core/dev.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/net/core/dev.c
++++ b/net/core/dev.c
+@@ -2462,7 +2462,7 @@ int skb_checksum_help(struct sk_buff *sk
+                       goto out;
+       }
+-      *(__sum16 *)(skb->data + offset) = csum_fold(csum);
++      *(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0;
+ out_set_summed:
+       skb->ip_summed = CHECKSUM_NONE;
+ out:
diff --git a/queue-4.4/sctp-assign-assoc_id-earlier-in-__sctp_connect.patch b/queue-4.4/sctp-assign-assoc_id-earlier-in-__sctp_connect.patch
new file mode 100644 (file)
index 0000000..958936c
--- /dev/null
@@ -0,0 +1,57 @@
+From foo@baz Fri Nov 18 11:09:43 CET 2016
+From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
+Date: Thu, 3 Nov 2016 17:03:41 -0200
+Subject: sctp: assign assoc_id earlier in __sctp_connect
+
+From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
+
+
+[ Upstream commit 7233bc84a3aeda835d334499dc00448373caf5c0 ]
+
+sctp_wait_for_connect() currently already holds the asoc to keep it
+alive during the sleep, in case another thread releases it. But Andrey
+Konovalov and Dmitry Vyukov reported a use-after-free in such a
+situation.
+
+The problem is that __sctp_connect() doesn't get a ref on the asoc and will
+do a read on the asoc after calling sctp_wait_for_connect(), but by then
+another thread may have closed it and the _put in sctp_wait_for_connect()
+will actually release it, causing the use-after-free.
+
+The fix is to do the read before waiting for the connect rather than
+after, which avoids this issue as the socket is still locked by then.
+There should be no issue in returning the asoc id in case of failure, as
+the application shouldn't rely on that number in such situations
+anyway.
+
+This issue doesn't exist in sctp_sendmsg() path.
+
+Reported-by: Dmitry Vyukov <dvyukov@google.com>
+Reported-by: Andrey Konovalov <andreyknvl@google.com>
+Tested-by: Andrey Konovalov <andreyknvl@google.com>
+Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
+Reviewed-by: Xin Long <lucien.xin@gmail.com>
+Acked-by: Neil Horman <nhorman@tuxdriver.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/sctp/socket.c |    7 +++++--
+ 1 file changed, 5 insertions(+), 2 deletions(-)
+
+--- a/net/sctp/socket.c
++++ b/net/sctp/socket.c
+@@ -1212,9 +1212,12 @@ static int __sctp_connect(struct sock *s
+       timeo = sock_sndtimeo(sk, f_flags & O_NONBLOCK);
+-      err = sctp_wait_for_connect(asoc, &timeo);
+-      if ((err == 0 || err == -EINPROGRESS) && assoc_id)
++      if (assoc_id)
+               *assoc_id = asoc->assoc_id;
++      err = sctp_wait_for_connect(asoc, &timeo);
++      /* Note: the asoc may be freed after the return of
++       * sctp_wait_for_connect.
++       */
+       /* Don't free association on exit. */
+       asoc = NULL;
diff --git a/queue-4.4/series b/queue-4.4/series
new file mode 100644 (file)
index 0000000..5fffc5e
--- /dev/null
@@ -0,0 +1,16 @@
+dctcp-avoid-bogus-doubling-of-cwnd-after-loss.patch
+net-clear-sk_err_soft-in-sk_clone_lock.patch
+net-mangle-zero-checksum-in-skb_checksum_help.patch
+bgmac-stop-clearing-dma-receive-control-register-right-after-it-is-set.patch
+ip6_tunnel-clear-ip6cb-in-ip6tunnel_xmit.patch
+tcp-fix-potential-memory-corruption.patch
+dccp-do-not-send-reset-to-already-closed-sockets.patch
+dccp-fix-out-of-bound-access-in-dccp_v4_err.patch
+ipv6-dccp-fix-out-of-bound-access-in-dccp_v6_err.patch
+ipv6-dccp-add-missing-bind_conflict-to-dccp_ipv6_mapped.patch
+sctp-assign-assoc_id-earlier-in-__sctp_connect.patch
+fib_trie-correct-proc-net-route-off-by-one-error.patch
+sock-fix-sendmmsg-for-partial-sendmsg.patch
+net-__skb_flow_dissect-must-cap-its-return-value.patch
+ipv4-use-new_gw-for-redirect-neigh-lookup.patch
+tcp-take-care-of-truncations-done-by-sk_filter.patch
diff --git a/queue-4.4/sock-fix-sendmmsg-for-partial-sendmsg.patch b/queue-4.4/sock-fix-sendmmsg-for-partial-sendmsg.patch
new file mode 100644 (file)
index 0000000..f8a5799
--- /dev/null
@@ -0,0 +1,49 @@
+From foo@baz Fri Nov 18 11:09:43 CET 2016
+From: Soheil Hassas Yeganeh <soheil@google.com>
+Date: Fri, 4 Nov 2016 15:36:49 -0400
+Subject: sock: fix sendmmsg for partial sendmsg
+
+From: Soheil Hassas Yeganeh <soheil@google.com>
+
+
+[ Upstream commit 3023898b7d4aac65987bd2f485cc22390aae6f78 ]
+
+Do not send the next message in sendmmsg for partial sendmsg
+invocations.
+
+sendmmsg assumes that it can continue sending the next message
+when the return value of the individual sendmsg invocations
+is positive. It results in corrupting the data for TCP,
+SCTP, and UNIX streams.
+
+For example, sendmmsg([["abcd"], ["efgh"]]) can result in a stream
+of "aefgh" if the first sendmsg invocation sends only the first
+byte while the second sendmsg goes through.
+
+Datagram sockets either send the entire datagram or fail, so
+this patch affects only sockets of type SOCK_STREAM and
+SOCK_SEQPACKET.
+
+Fixes: 228e548e6020 ("net: Add sendmmsg socket system call")
+Signed-off-by: Soheil Hassas Yeganeh <soheil@google.com>
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Signed-off-by: Willem de Bruijn <willemb@google.com>
+Signed-off-by: Neal Cardwell <ncardwell@google.com>
+Acked-by: Maciej Żenczykowski <maze@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/socket.c |    2 ++
+ 1 file changed, 2 insertions(+)
+
+--- a/net/socket.c
++++ b/net/socket.c
+@@ -2041,6 +2041,8 @@ int __sys_sendmmsg(int fd, struct mmsghd
+               if (err)
+                       break;
+               ++datagrams;
++              if (msg_data_left(&msg_sys))
++                      break;
+       }
+       fput_light(sock->file, fput_needed);
diff --git a/queue-4.4/tcp-fix-potential-memory-corruption.patch b/queue-4.4/tcp-fix-potential-memory-corruption.patch
new file mode 100644 (file)
index 0000000..a2d1373
--- /dev/null
@@ -0,0 +1,40 @@
+From foo@baz Fri Nov 18 11:09:43 CET 2016
+From: Eric Dumazet <edumazet@google.com>
+Date: Wed, 2 Nov 2016 07:53:17 -0700
+Subject: tcp: fix potential memory corruption
+
+From: Eric Dumazet <edumazet@google.com>
+
+
+[ Upstream commit ac9e70b17ecd7c6e933ff2eaf7ab37429e71bf4d ]
+
+Imagine the initial value of max_skb_frags is 17, and the last
+skb in the write queue has 15 frags.
+
+Then max_skb_frags is lowered to 14 or a smaller value.
+
+tcp_sendmsg() will then be allowed to add additional page frags
+and eventually go past MAX_SKB_FRAGS, overflowing struct
+skb_shared_info.
+
+Fixes: 5f74f82ea34c ("net:Add sysctl_max_skb_frags")
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Cc: Hans Westgaard Ry <hans.westgaard.ry@oracle.com>
+Cc: Håkon Bugge <haakon.bugge@oracle.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ipv4/tcp.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/net/ipv4/tcp.c
++++ b/net/ipv4/tcp.c
+@@ -1212,7 +1212,7 @@ new_segment:
+                       if (!skb_can_coalesce(skb, i, pfrag->page,
+                                             pfrag->offset)) {
+-                              if (i == sysctl_max_skb_frags || !sg) {
++                              if (i >= sysctl_max_skb_frags || !sg) {
+                                       tcp_mark_push(tp, skb);
+                                       goto new_segment;
+                               }
diff --git a/queue-4.4/tcp-take-care-of-truncations-done-by-sk_filter.patch b/queue-4.4/tcp-take-care-of-truncations-done-by-sk_filter.patch
new file mode 100644 (file)
index 0000000..f4b3aee
--- /dev/null
@@ -0,0 +1,159 @@
+From foo@baz Fri Nov 18 11:09:43 CET 2016
+From: Eric Dumazet <edumazet@google.com>
+Date: Thu, 10 Nov 2016 13:12:35 -0800
+Subject: tcp: take care of truncations done by sk_filter()
+
+From: Eric Dumazet <edumazet@google.com>
+
+
+[ Upstream commit ac6e780070e30e4c35bd395acfe9191e6268bdd3 ]
+
+With syzkaller's help, Marco Grassi found a bug in the TCP stack,
+crashing in tcp_collapse().
+
+Root cause is that sk_filter() can truncate the incoming skb,
+but TCP stack was not really expecting this to happen.
+It probably was expecting a simple DROP or ACCEPT behavior.
+
+We first need to make sure no part of TCP header could be removed.
+Then we need to adjust TCP_SKB_CB(skb)->end_seq
+
+Many thanks to syzkaller team and Marco for giving us a reproducer.
+
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Reported-by: Marco Grassi <marco.gra@gmail.com>
+Reported-by: Vladis Dronov <vdronov@redhat.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/filter.h |    6 +++++-
+ include/net/tcp.h      |    1 +
+ net/core/filter.c      |   10 +++++-----
+ net/ipv4/tcp_ipv4.c    |   19 ++++++++++++++++++-
+ net/ipv6/tcp_ipv6.c    |    6 ++++--
+ 5 files changed, 33 insertions(+), 9 deletions(-)
+
+--- a/include/linux/filter.h
++++ b/include/linux/filter.h
+@@ -421,7 +421,11 @@ static inline void bpf_prog_unlock_ro(st
+ }
+ #endif /* CONFIG_DEBUG_SET_MODULE_RONX */
+-int sk_filter(struct sock *sk, struct sk_buff *skb);
++int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap);
++static inline int sk_filter(struct sock *sk, struct sk_buff *skb)
++{
++      return sk_filter_trim_cap(sk, skb, 1);
++}
+ int bpf_prog_select_runtime(struct bpf_prog *fp);
+ void bpf_prog_free(struct bpf_prog *fp);
+--- a/include/net/tcp.h
++++ b/include/net/tcp.h
+@@ -1156,6 +1156,7 @@ static inline void tcp_prequeue_init(str
+ }
+ bool tcp_prequeue(struct sock *sk, struct sk_buff *skb);
++int tcp_filter(struct sock *sk, struct sk_buff *skb);
+ #undef STATE_TRACE
+--- a/net/core/filter.c
++++ b/net/core/filter.c
+@@ -52,9 +52,10 @@
+ #include <net/dst.h>
+ /**
+- *    sk_filter - run a packet through a socket filter
++ *    sk_filter_trim_cap - run a packet through a socket filter
+  *    @sk: sock associated with &sk_buff
+  *    @skb: buffer to filter
++ *    @cap: limit on how short the eBPF program may trim the packet
+  *
+  * Run the eBPF program and then cut skb->data to correct size returned by
+  * the program. If pkt_len is 0 we toss packet. If skb->len is smaller
+@@ -63,7 +64,7 @@
+  * be accepted or -EPERM if the packet should be tossed.
+  *
+  */
+-int sk_filter(struct sock *sk, struct sk_buff *skb)
++int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap)
+ {
+       int err;
+       struct sk_filter *filter;
+@@ -84,14 +85,13 @@ int sk_filter(struct sock *sk, struct sk
+       filter = rcu_dereference(sk->sk_filter);
+       if (filter) {
+               unsigned int pkt_len = bpf_prog_run_save_cb(filter->prog, skb);
+-
+-              err = pkt_len ? pskb_trim(skb, pkt_len) : -EPERM;
++              err = pkt_len ? pskb_trim(skb, max(cap, pkt_len)) : -EPERM;
+       }
+       rcu_read_unlock();
+       return err;
+ }
+-EXPORT_SYMBOL(sk_filter);
++EXPORT_SYMBOL(sk_filter_trim_cap);
+ static u64 __skb_get_pay_offset(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
+ {
+--- a/net/ipv4/tcp_ipv4.c
++++ b/net/ipv4/tcp_ipv4.c
+@@ -1533,6 +1533,21 @@ bool tcp_prequeue(struct sock *sk, struc
+ }
+ EXPORT_SYMBOL(tcp_prequeue);
++int tcp_filter(struct sock *sk, struct sk_buff *skb)
++{
++      struct tcphdr *th = (struct tcphdr *)skb->data;
++      unsigned int eaten = skb->len;
++      int err;
++
++      err = sk_filter_trim_cap(sk, skb, th->doff * 4);
++      if (!err) {
++              eaten -= skb->len;
++              TCP_SKB_CB(skb)->end_seq -= eaten;
++      }
++      return err;
++}
++EXPORT_SYMBOL(tcp_filter);
++
+ /*
+  *    From tcp_input.c
+  */
+@@ -1638,8 +1653,10 @@ process:
+       nf_reset(skb);
+-      if (sk_filter(sk, skb))
++      if (tcp_filter(sk, skb))
+               goto discard_and_relse;
++      th = (const struct tcphdr *)skb->data;
++      iph = ip_hdr(skb);
+       skb->dev = NULL;
+--- a/net/ipv6/tcp_ipv6.c
++++ b/net/ipv6/tcp_ipv6.c
+@@ -1214,7 +1214,7 @@ static int tcp_v6_do_rcv(struct sock *sk
+       if (skb->protocol == htons(ETH_P_IP))
+               return tcp_v4_do_rcv(sk, skb);
+-      if (sk_filter(sk, skb))
++      if (tcp_filter(sk, skb))
+               goto discard;
+       /*
+@@ -1438,8 +1438,10 @@ process:
+       if (tcp_v6_inbound_md5_hash(sk, skb))
+               goto discard_and_relse;
+-      if (sk_filter(sk, skb))
++      if (tcp_filter(sk, skb))
+               goto discard_and_relse;
++      th = (const struct tcphdr *)skb->data;
++      hdr = ipv6_hdr(skb);
+       skb->dev = NULL;