--- /dev/null
+From foo@baz Wed Apr 29 12:00:35 CEST 2015
+From: =?UTF-8?q?Sebastian=20P=C3=B6hn?= <sebastian.poehn@gmail.com>
+Date: Mon, 20 Apr 2015 09:19:20 +0200
+Subject: ip_forward: Drop frames with attached skb->sk
+
+From: =?UTF-8?q?Sebastian=20P=C3=B6hn?= <sebastian.poehn@gmail.com>
+
+[ Upstream commit 2ab957492d13bb819400ac29ae55911d50a82a13 ]
+
+Initial discussion was:
+[FYI] xfrm: Don't lookup sk_policy for timewait sockets
+
+Forwarded frames should not have a socket attached. Timewait (tw)
+sockets in particular will lead to panics later on in the stack.
+
+This was observed with TPROXY assigning a tw socket while policy
+routing was broken (misconfigured). As a result the frame entered
+the forwarding path instead of the input path. We cannot solve this
+in TPROXY, as it cannot know that policy routing is broken.
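+
+A minimal, compilable stand-alone model of the guard added below; struct
+skb_model and forward_or_drop() are illustrative stand-ins for the real
+sk_buff and ip_forward(), not kernel code:
+
+  #include <stdio.h>
+
+  /* Stand-in for struct sk_buff; only the field that matters here. */
+  struct skb_model {
+      void *sk;   /* non-NULL means a socket is still attached */
+  };
+
+  /* Mirrors the new check at the top of ip_forward(): a frame that still
+   * carries a socket reference is dropped instead of forwarded.
+   */
+  static int forward_or_drop(const struct skb_model *skb)
+  {
+      if (skb->sk) {
+          puts("drop: skb->sk attached (e.g. a tw socket set by TPROXY)");
+          return -1;
+      }
+      puts("forward");
+      return 0;
+  }
+
+  int main(void)
+  {
+      struct skb_model plain = { .sk = NULL };
+      struct skb_model misrouted = { .sk = &plain };
+
+      forward_or_drop(&plain);
+      forward_or_drop(&misrouted);
+      return 0;
+  }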
+
+v2:
+Remove useless comment
+
+Signed-off-by: Sebastian Poehn <sebastian.poehn@gmail.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ipv4/ip_forward.c | 3 +++
+ 1 file changed, 3 insertions(+)
+
+--- a/net/ipv4/ip_forward.c
++++ b/net/ipv4/ip_forward.c
+@@ -127,6 +127,9 @@ int ip_forward(struct sk_buff *skb)
+ struct rtable *rt; /* Route we use */
+ struct ip_options *opt = &(IPCB(skb)->opt);
+
++ if (unlikely(skb->sk))
++ goto drop;
++
+ if (skb_warn_if_lro(skb))
+ goto drop;
+
--- /dev/null
+From foo@baz Wed Apr 29 12:00:35 CEST 2015
+From: Eric Dumazet <edumazet@google.com>
+Date: Wed, 22 Apr 2015 07:33:36 -0700
+Subject: net: do not deplete pfmemalloc reserve
+
+From: Eric Dumazet <edumazet@google.com>
+
+[ Upstream commit 79930f5892e134c6da1254389577fffb8bd72c66 ]
+
+build_skb() should look at the page pfmemalloc status.
+If it is set, it means the page allocator allocated this page in the
+expectation it would help to free other pages. The networking stack
+can honor that only if skb->pfmemalloc is also set.
+
+Also, we must refrain from using high-order pages from the pfmemalloc
+reserve, so __page_frag_refill() must also use __GFP_NOMEMALLOC for
+them. Under memory pressure, using order-0 pages is probably the best
+strategy.
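+
+A stand-alone C model of the two decisions above; the MODEL_GFP_* values
+and the helper names are illustrative stand-ins, not the real GFP bits or
+kernel functions:
+
+  #include <stdbool.h>
+  #include <stdio.h>
+
+  #define MODEL_GFP_COMP        0x1u
+  #define MODEL_GFP_NOWARN      0x2u
+  #define MODEL_GFP_NOMEMALLOC  0x4u  /* never dip into the pfmemalloc reserve */
+
+  /* __page_frag_refill() side: any high-order attempt adds __GFP_NOMEMALLOC,
+   * so only order-0 pages may come from the reserve.
+   */
+  unsigned int refill_gfp(unsigned int base_gfp, unsigned int order)
+  {
+      unsigned int gfp = base_gfp;
+
+      if (order)
+          gfp |= MODEL_GFP_COMP | MODEL_GFP_NOWARN | MODEL_GFP_NOMEMALLOC;
+      return gfp;
+  }
+
+  /* build_skb() side: a page-fragment head inherits the page's pfmemalloc
+   * status, so the stack knows the memory came from the emergency reserve.
+   */
+  bool skb_is_pfmemalloc(unsigned int frag_size, bool page_pfmemalloc)
+  {
+      return frag_size != 0 && page_pfmemalloc;
+  }
+
+  int main(void)
+  {
+      printf("order-3 flags: %#x\n", refill_gfp(0, 3));
+      printf("order-0 flags: %#x\n", refill_gfp(0, 0));
+      printf("pfmemalloc skb: %d\n", skb_is_pfmemalloc(2048, true));
+      return 0;
+  }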
+
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/core/skbuff.c | 9 +++++++--
+ 1 file changed, 7 insertions(+), 2 deletions(-)
+
+--- a/net/core/skbuff.c
++++ b/net/core/skbuff.c
+@@ -309,7 +309,11 @@ struct sk_buff *build_skb(void *data, un
+
+ memset(skb, 0, offsetof(struct sk_buff, tail));
+ skb->truesize = SKB_TRUESIZE(size);
+- skb->head_frag = frag_size != 0;
++ if (frag_size) {
++ skb->head_frag = 1;
++ if (virt_to_head_page(data)->pfmemalloc)
++ skb->pfmemalloc = 1;
++ }
+ atomic_set(&skb->users, 1);
+ skb->head = data;
+ skb->data = data;
+@@ -352,7 +356,8 @@ refill:
+ gfp_t gfp = gfp_mask;
+
+ if (order)
+- gfp |= __GFP_COMP | __GFP_NOWARN;
++ gfp |= __GFP_COMP | __GFP_NOWARN |
++ __GFP_NOMEMALLOC;
+ nc->frag.page = alloc_pages(gfp, order);
+ if (likely(nc->frag.page))
+ break;
--- /dev/null
+From foo@baz Wed Apr 29 12:00:35 CEST 2015
+From: Eric Dumazet <edumazet@google.com>
+Date: Fri, 24 Apr 2015 16:05:01 -0700
+Subject: net: fix crash in build_skb()
+
+From: Eric Dumazet <edumazet@google.com>
+
+[ Upstream commit 2ea2f62c8bda242433809c7f4e9eae1c52c40bbe ]
+
+When I added pfmemalloc support in build_skb(), I forgot netlink
+was using build_skb() with a vmalloc() area.
+
+In this patch I introduce __build_skb() for netlink use, and
+build_skb() becomes a wrapper handling both skb->head_frag and
+skb->pfmemalloc.
+
+This means netlink no longer has to hack skb->head_frag.
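+
+A compilable sketch of the resulting split (user-space model, the model_*
+names are stand-ins): the head_frag/pfmemalloc handling, which needs a
+page-backed buffer, lives only in the wrapper and never runs on netlink's
+vmalloc()ed data:
+
+  #include <stdbool.h>
+  #include <stdlib.h>
+
+  struct skb_model {
+      void *head;
+      bool head_frag;
+      bool pfmemalloc;
+  };
+
+  /* Model of __build_skb(): makes no assumption about where @data came
+   * from, so it is safe for netlink's vmalloc()ed buffers.
+   */
+  struct skb_model *model_build_skb_core(void *data)
+  {
+      struct skb_model *skb = calloc(1, sizeof(*skb));
+
+      if (skb)
+          skb->head = data;
+      return skb;
+  }
+
+  /* Model of build_skb(): only for kmalloc()ed or page-fragment heads, so
+   * it may derive pfmemalloc from the backing page. In the kernel this is
+   * virt_to_head_page(data)->pfmemalloc, the lookup that crashes below when
+   * handed a vmalloc() address.
+   */
+  struct skb_model *model_build_skb(void *data, unsigned int frag_size,
+                                    bool page_pfmemalloc)
+  {
+      struct skb_model *skb = model_build_skb_core(data);
+
+      if (skb && frag_size) {
+          skb->head_frag = true;
+          skb->pfmemalloc = page_pfmemalloc;
+      }
+      return skb;
+  }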
+
+[ 1567.700067] kernel BUG at arch/x86/mm/physaddr.c:26!
+[ 1567.700067] invalid opcode: 0000 [#1] PREEMPT SMP KASAN
+[ 1567.700067] Dumping ftrace buffer:
+[ 1567.700067] (ftrace buffer empty)
+[ 1567.700067] Modules linked in:
+[ 1567.700067] CPU: 9 PID: 16186 Comm: trinity-c182 Not tainted 4.0.0-next-20150424-sasha-00037-g4796e21 #2167
+[ 1567.700067] task: ffff880127efb000 ti: ffff880246770000 task.ti: ffff880246770000
+[ 1567.700067] RIP: __phys_addr (arch/x86/mm/physaddr.c:26 (discriminator 3))
+[ 1567.700067] RSP: 0018:ffff8802467779d8 EFLAGS: 00010202
+[ 1567.700067] RAX: 000041000ed8e000 RBX: ffffc9008ed8e000 RCX: 000000000000002c
+[ 1567.700067] RDX: 0000000000000004 RSI: 0000000000000000 RDI: ffffffffb3fd6049
+[ 1567.700067] RBP: ffff8802467779f8 R08: 0000000000000019 R09: ffff8801d0168000
+[ 1567.700067] R10: ffff8801d01680c7 R11: ffffed003a02d019 R12: ffffc9000ed8e000
+[ 1567.700067] R13: 0000000000000f40 R14: 0000000000001180 R15: ffffc9000ed8e000
+[ 1567.700067] FS: 00007f2a7da3f700(0000) GS:ffff8801d1000000(0000) knlGS:0000000000000000
+[ 1567.700067] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+[ 1567.700067] CR2: 0000000000738308 CR3: 000000022e329000 CR4: 00000000000007e0
+[ 1567.700067] Stack:
+[ 1567.700067] ffffc9000ed8e000 ffff8801d0168000 ffffc9000ed8e000 ffff8801d0168000
+[ 1567.700067] ffff880246777a28 ffffffffad7c0a21 0000000000001080 ffff880246777c08
+[ 1567.700067] ffff88060d302e68 ffff880246777b58 ffff880246777b88 ffffffffad9a6821
+[ 1567.700067] Call Trace:
+[ 1567.700067] build_skb (include/linux/mm.h:508 net/core/skbuff.c:316)
+[ 1567.700067] netlink_sendmsg (net/netlink/af_netlink.c:1633 net/netlink/af_netlink.c:2329)
+[ 1567.774369] ? sched_clock_cpu (kernel/sched/clock.c:311)
+[ 1567.774369] ? netlink_unicast (net/netlink/af_netlink.c:2273)
+[ 1567.774369] ? netlink_unicast (net/netlink/af_netlink.c:2273)
+[ 1567.774369] sock_sendmsg (net/socket.c:614 net/socket.c:623)
+[ 1567.774369] sock_write_iter (net/socket.c:823)
+[ 1567.774369] ? sock_sendmsg (net/socket.c:806)
+[ 1567.774369] __vfs_write (fs/read_write.c:479 fs/read_write.c:491)
+[ 1567.774369] ? get_lock_stats (kernel/locking/lockdep.c:249)
+[ 1567.774369] ? default_llseek (fs/read_write.c:487)
+[ 1567.774369] ? vtime_account_user (kernel/sched/cputime.c:701)
+[ 1567.774369] ? rw_verify_area (fs/read_write.c:406 (discriminator 4))
+[ 1567.774369] vfs_write (fs/read_write.c:539)
+[ 1567.774369] SyS_write (fs/read_write.c:586 fs/read_write.c:577)
+[ 1567.774369] ? SyS_read (fs/read_write.c:577)
+[ 1567.774369] ? __this_cpu_preempt_check (lib/smp_processor_id.c:63)
+[ 1567.774369] ? trace_hardirqs_on_caller (kernel/locking/lockdep.c:2594 kernel/locking/lockdep.c:2636)
+[ 1567.774369] ? trace_hardirqs_on_thunk (arch/x86/lib/thunk_64.S:42)
+[ 1567.774369] system_call_fastpath (arch/x86/kernel/entry_64.S:261)
+
+Fixes: 79930f5892e ("net: do not deplete pfmemalloc reserve")
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Reported-by: Sasha Levin <sasha.levin@oracle.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/skbuff.h | 1 +
+ net/core/skbuff.c | 31 ++++++++++++++++++++++---------
+ net/netlink/af_netlink.c | 6 ++----
+ 3 files changed, 25 insertions(+), 13 deletions(-)
+
+--- a/include/linux/skbuff.h
++++ b/include/linux/skbuff.h
+@@ -661,6 +661,7 @@ bool skb_try_coalesce(struct sk_buff *to
+
+ struct sk_buff *__alloc_skb(unsigned int size, gfp_t priority, int flags,
+ int node);
++struct sk_buff *__build_skb(void *data, unsigned int frag_size);
+ struct sk_buff *build_skb(void *data, unsigned int frag_size);
+ static inline struct sk_buff *alloc_skb(unsigned int size,
+ gfp_t priority)
+--- a/net/core/skbuff.c
++++ b/net/core/skbuff.c
+@@ -278,13 +278,14 @@ nodata:
+ EXPORT_SYMBOL(__alloc_skb);
+
+ /**
+- * build_skb - build a network buffer
++ * __build_skb - build a network buffer
+ * @data: data buffer provided by caller
+- * @frag_size: size of fragment, or 0 if head was kmalloced
++ * @frag_size: size of data, or 0 if head was kmalloced
+ *
+ * Allocate a new &sk_buff. Caller provides space holding head and
+ * skb_shared_info. @data must have been allocated by kmalloc() only if
+- * @frag_size is 0, otherwise data should come from the page allocator.
++ * @frag_size is 0, otherwise data should come from the page allocator
++ * or vmalloc()
+ * The return is the new skb buffer.
+ * On a failure the return is %NULL, and @data is not freed.
+ * Notes :
+@@ -295,7 +296,7 @@ EXPORT_SYMBOL(__alloc_skb);
+ * before giving packet to stack.
+ * RX rings only contains data buffers, not full skbs.
+ */
+-struct sk_buff *build_skb(void *data, unsigned int frag_size)
++struct sk_buff *__build_skb(void *data, unsigned int frag_size)
+ {
+ struct skb_shared_info *shinfo;
+ struct sk_buff *skb;
+@@ -309,11 +310,6 @@ struct sk_buff *build_skb(void *data, un
+
+ memset(skb, 0, offsetof(struct sk_buff, tail));
+ skb->truesize = SKB_TRUESIZE(size);
+- if (frag_size) {
+- skb->head_frag = 1;
+- if (virt_to_head_page(data)->pfmemalloc)
+- skb->pfmemalloc = 1;
+- }
+ atomic_set(&skb->users, 1);
+ skb->head = data;
+ skb->data = data;
+@@ -330,6 +326,23 @@ struct sk_buff *build_skb(void *data, un
+
+ return skb;
+ }
++
++/* build_skb() is wrapper over __build_skb(), that specifically
++ * takes care of skb->head and skb->pfmemalloc
++ * This means that if @frag_size is not zero, then @data must be backed
++ * by a page fragment, not kmalloc() or vmalloc()
++ */
++struct sk_buff *build_skb(void *data, unsigned int frag_size)
++{
++ struct sk_buff *skb = __build_skb(data, frag_size);
++
++ if (skb && frag_size) {
++ skb->head_frag = 1;
++ if (virt_to_head_page(data)->pfmemalloc)
++ skb->pfmemalloc = 1;
++ }
++ return skb;
++}
+ EXPORT_SYMBOL(build_skb);
+
+ struct netdev_alloc_cache {
+--- a/net/netlink/af_netlink.c
++++ b/net/netlink/af_netlink.c
+@@ -1624,13 +1624,11 @@ static struct sk_buff *netlink_alloc_lar
+ if (data == NULL)
+ return NULL;
+
+- skb = build_skb(data, size);
++ skb = __build_skb(data, size);
+ if (skb == NULL)
+ vfree(data);
+- else {
+- skb->head_frag = 0;
++ else
+ skb->destructor = netlink_skb_destructor;
+- }
+
+ return skb;
+ }
--- /dev/null
+From foo@baz Wed Apr 29 12:00:35 CEST 2015
+From: Eric Dumazet <edumazet@google.com>
+Date: Thu, 23 Apr 2015 10:42:39 -0700
+Subject: tcp: avoid looping in tcp_send_fin()
+
+From: Eric Dumazet <edumazet@google.com>
+
+[ Upstream commit 845704a535e9b3c76448f52af1b70e4422ea03fd ]
+
+The presence of an unbounded loop in tcp_send_fin() has always been hard
+to explain when analyzing crash dumps involving gigantic dying processes
+with millions of sockets.
+
+Let's try a different strategy:
+
+In case of memory pressure, try to add the FIN flag to the last packet
+in the write queue, even if that packet was already sent. The TCP
+stack will then be able to deliver this FIN after a timeout event.
+Note that because this FIN is delivered by a retransmit, it also
+carries a Push flag with our current implementation.
+
+By checking sk_under_memory_pressure(), we anticipate that cooking
+many FIN packets might deplete tcp memory.
+
+In the case where we could not allocate a packet, even with a __GFP_WAIT
+allocation, not sending a FIN seems quite reasonable if it allows us
+to get rid of this socket, free memory, and not block the process from
+eventually doing other useful work.
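+
+A stand-alone model of the resulting decision flow; the boolean parameters
+stand in for tcp_write_queue_tail(), tcp_send_head() and
+sk_under_memory_pressure():
+
+  #include <stdbool.h>
+
+  enum fin_action { FIN_COALESCE, FIN_NEW_SKB, FIN_GIVE_UP };
+
+  /* - Piggyback the FIN on the tail skb if it is still unsent or if we are
+   *   under memory pressure (a later retransmit then carries the FIN).
+   * - Otherwise allocate a fresh skb for the FIN.
+   * - If that allocation fails, coalesce when a tail skb exists, else give
+   *   up instead of looping forever.
+   */
+  enum fin_action fin_decision(bool tail_skb_exists, bool tail_unsent,
+                               bool memory_pressure, bool alloc_ok)
+  {
+      if (tail_skb_exists && (tail_unsent || memory_pressure))
+          return FIN_COALESCE;
+
+      if (alloc_ok)
+          return FIN_NEW_SKB;
+
+      return tail_skb_exists ? FIN_COALESCE : FIN_GIVE_UP;
+  }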
+
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ipv4/tcp_output.c | 50 +++++++++++++++++++++++++++++---------------------
+ 1 file changed, 29 insertions(+), 21 deletions(-)
+
+--- a/net/ipv4/tcp_output.c
++++ b/net/ipv4/tcp_output.c
+@@ -2597,7 +2597,8 @@ begin_fwd:
+
+ /* We allow to exceed memory limits for FIN packets to expedite
+ * connection tear down and (memory) recovery.
+- * Otherwise tcp_send_fin() could loop forever.
++ * Otherwise tcp_send_fin() could be tempted to either delay FIN
++ * or even be forced to close flow without any FIN.
+ */
+ static void sk_forced_wmem_schedule(struct sock *sk, int size)
+ {
+@@ -2610,33 +2611,40 @@ static void sk_forced_wmem_schedule(stru
+ sk_memory_allocated_add(sk, amt, &status);
+ }
+
+-/* Send a fin. The caller locks the socket for us. This cannot be
+- * allowed to fail queueing a FIN frame under any circumstances.
++/* Send a FIN. The caller locks the socket for us.
++ * We should try to send a FIN packet really hard, but eventually give up.
+ */
+ void tcp_send_fin(struct sock *sk)
+ {
++ struct sk_buff *skb, *tskb = tcp_write_queue_tail(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+- struct sk_buff *skb = tcp_write_queue_tail(sk);
+- int mss_now;
+
+- /* Optimization, tack on the FIN if we have a queue of
+- * unsent frames. But be careful about outgoing SACKS
+- * and IP options.
++ /* Optimization, tack on the FIN if we have one skb in write queue and
++ * this skb was not yet sent, or we are under memory pressure.
++ * Note: in the latter case, FIN packet will be sent after a timeout,
++ * as TCP stack thinks it has already been transmitted.
+ */
+- mss_now = tcp_current_mss(sk);
+-
+- if (tcp_send_head(sk) != NULL) {
+- TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_FIN;
+- TCP_SKB_CB(skb)->end_seq++;
++ if (tskb && (tcp_send_head(sk) || sk_under_memory_pressure(sk))) {
++coalesce:
++ TCP_SKB_CB(tskb)->tcp_flags |= TCPHDR_FIN;
++ TCP_SKB_CB(tskb)->end_seq++;
+ tp->write_seq++;
++ if (!tcp_send_head(sk)) {
++ /* This means tskb was already sent.
++ * Pretend we included the FIN on previous transmit.
++ * We need to set tp->snd_nxt to the value it would have
++ * if FIN had been sent. This is because retransmit path
++ * does not change tp->snd_nxt.
++ */
++ tp->snd_nxt++;
++ return;
++ }
+ } else {
+- /* Socket is locked, keep trying until memory is available. */
+- for (;;) {
+- skb = alloc_skb_fclone(MAX_TCP_HEADER,
+- sk->sk_allocation);
+- if (skb)
+- break;
+- yield();
++ skb = alloc_skb_fclone(MAX_TCP_HEADER, sk->sk_allocation);
++ if (unlikely(!skb)) {
++ if (tskb)
++ goto coalesce;
++ return;
+ }
+ skb_reserve(skb, MAX_TCP_HEADER);
+ sk_forced_wmem_schedule(sk, skb->truesize);
+@@ -2645,7 +2653,7 @@ void tcp_send_fin(struct sock *sk)
+ TCPHDR_ACK | TCPHDR_FIN);
+ tcp_queue_skb(sk, skb);
+ }
+- __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_OFF);
++ __tcp_push_pending_frames(sk, tcp_current_mss(sk), TCP_NAGLE_OFF);
+ }
+
+ /* We get here when a process closes a file descriptor (either due to
--- /dev/null
+From foo@baz Wed Apr 29 12:00:35 CEST 2015
+From: Eric Dumazet <edumazet@google.com>
+Date: Tue, 21 Apr 2015 18:32:24 -0700
+Subject: tcp: fix possible deadlock in tcp_send_fin()
+
+From: Eric Dumazet <edumazet@google.com>
+
+[ Upstream commit d83769a580f1132ac26439f50068a29b02be535e ]
+
+Using sk_stream_alloc_skb() in tcp_send_fin() is dangerous when
+a huge process is killed by the OOM killer and tcp_mem[2] is hit.
+
+To be able to free memory we need to make progress, so this
+patch allows FIN packets to ignore tcp_mem[2], provided the
+skb allocation itself succeeded.
+
+In a follow-up patch, we might abort the tcp_send_fin() infinite loop
+when TIF_MEMDIE is set on this thread, as the memory allocator has
+already done its best to get extra memory.
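+
+A simplified stand-alone model of the forced accounting helper introduced
+below; MODEL_SK_MEM_QUANTUM and struct sock_model are illustrative
+stand-ins, assuming the usual page-sized accounting unit:
+
+  #define MODEL_SK_MEM_QUANTUM 4096
+
+  struct sock_model {
+      int sk_forward_alloc;   /* bytes already reserved for this socket */
+      long proto_pages;       /* stand-in for the protocol-wide counter */
+  };
+
+  /* Model of sk_forced_wmem_schedule(): reserve enough forward-allocation
+   * for @size bytes without ever failing, even if this pushes TCP over
+   * tcp_mem[2]; that is what lets the FIN skb be queued while a dying
+   * process still holds huge amounts of memory.
+   */
+  void forced_wmem_schedule(struct sock_model *sk, int size)
+  {
+      int amt;
+
+      if (size <= sk->sk_forward_alloc)
+          return;
+      /* sk_mem_pages(): round up to whole accounting quanta. */
+      amt = (size + MODEL_SK_MEM_QUANTUM - 1) / MODEL_SK_MEM_QUANTUM;
+      sk->sk_forward_alloc += amt * MODEL_SK_MEM_QUANTUM;
+      sk->proto_pages += amt; /* sk_memory_allocated_add() */
+  }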
+
+This patch reverts d22e15371811 ("tcp: fix tcp fin memory accounting")
+
+Fixes: d22e15371811 ("tcp: fix tcp fin memory accounting")
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ipv4/tcp_output.c | 20 +++++++++++++++++++-
+ 1 file changed, 19 insertions(+), 1 deletion(-)
+
+--- a/net/ipv4/tcp_output.c
++++ b/net/ipv4/tcp_output.c
+@@ -2595,6 +2595,21 @@ begin_fwd:
+ }
+ }
+
++/* We allow to exceed memory limits for FIN packets to expedite
++ * connection tear down and (memory) recovery.
++ * Otherwise tcp_send_fin() could loop forever.
++ */
++static void sk_forced_wmem_schedule(struct sock *sk, int size)
++{
++ int amt, status;
++
++ if (size <= sk->sk_forward_alloc)
++ return;
++ amt = sk_mem_pages(size);
++ sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
++ sk_memory_allocated_add(sk, amt, &status);
++}
++
+ /* Send a fin. The caller locks the socket for us. This cannot be
+ * allowed to fail queueing a FIN frame under any circumstances.
+ */
+@@ -2617,11 +2632,14 @@ void tcp_send_fin(struct sock *sk)
+ } else {
+ /* Socket is locked, keep trying until memory is available. */
+ for (;;) {
+- skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation);
++ skb = alloc_skb_fclone(MAX_TCP_HEADER,
++ sk->sk_allocation);
+ if (skb)
+ break;
+ yield();
+ }
++ skb_reserve(skb, MAX_TCP_HEADER);
++ sk_forced_wmem_schedule(sk, skb->truesize);
+ /* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */
+ tcp_init_nondata_skb(skb, tp->write_seq,
+ TCPHDR_ACK | TCPHDR_FIN);