From: Greg Kroah-Hartman Date: Tue, 4 Feb 2014 17:13:38 +0000 (-0800) Subject: 3.10-stable patches X-Git-Tag: v3.4.79~1^2~27 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=6dd6a9a2019a53080765df4998da08d631d95c9b;p=thirdparty%2Fkernel%2Fstable-queue.git 3.10-stable patches added patches: bnx2x-fix-dma-unmapping-of-tso-split-bds.patch bpf-do-not-use-reciprocal-divide.patch fib_frontend-fix-possible-null-pointer-dereference.patch ieee802154-fix-memory-leak-in-ieee802154_add_iface.patch inet_diag-fix-inet_diag_dump_icsk-timewait-socket-state-logic.patch ip6tnl-fix-double-free-of-fb_tnl_dev-on-exit.patch ip_tunnel-clear-ipcb-in-ip_tunnel_xmit-in-case-dst_link_failure-is-called.patch net-avoid-reference-counter-overflows-on-fib_rules-in-multicast-forwarding.patch net-fix-memory-leak-if-tproxy-used-with-tcp-early-demux.patch net-rds-fix-per-cpu-helper-usage.patch net-via-rhine-fix-tx_timeout-handling.patch revert-ip6tnl-fix-use-after-free-of-fb_tnl_dev.patch s390-bpf-jit-fix-32-bit-divisions-use-unsigned-divide-instructions.patch sit-fix-double-free-of-fb_tunnel_dev-on-exit.patch tcp-metrics-avoid-duplicate-entries-with-the-same-destination-ip.patch xen-netfront-fix-resource-leak-in-netfront.patch --- diff --git a/queue-3.10/bnx2x-fix-dma-unmapping-of-tso-split-bds.patch b/queue-3.10/bnx2x-fix-dma-unmapping-of-tso-split-bds.patch new file mode 100644 index 00000000000..84338f360a5 --- /dev/null +++ b/queue-3.10/bnx2x-fix-dma-unmapping-of-tso-split-bds.patch @@ -0,0 +1,73 @@ +From foo@baz Tue Feb 4 09:07:36 PST 2014 +From: Michal Schmidt +Date: Thu, 9 Jan 2014 14:36:27 +0100 +Subject: bnx2x: fix DMA unmapping of TSO split BDs + +From: Michal Schmidt + +[ Upstream commit 95e92fd40c967c363ad66b2fd1ce4dcd68132e54 ] + +bnx2x triggers warnings with CONFIG_DMA_API_DEBUG=y: + + WARNING: CPU: 0 PID: 2253 at lib/dma-debug.c:887 check_unmap+0xf8/0x920() + bnx2x 0000:28:00.0: DMA-API: device driver frees DMA memory with + different size [device address=0x00000000da2b389e] 
[map size=1490 bytes] + [unmap size=66 bytes] + +The reason is that bnx2x splits a TSO BD into two BDs (headers + data) +using one DMA mapping for both, but it uses only the length of the first +BD when unmapping. + +This patch fixes the bug by unmapping the whole length of the two BDs. + +Signed-off-by: Michal Schmidt +Reviewed-by: Eric Dumazet +Acked-by: Dmitry Kravkov +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c | 13 +++++++++---- + 1 file changed, 9 insertions(+), 4 deletions(-) + +--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c ++++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c +@@ -153,6 +153,7 @@ static u16 bnx2x_free_tx_pkt(struct bnx2 + struct sk_buff *skb = tx_buf->skb; + u16 bd_idx = TX_BD(tx_buf->first_bd), new_cons; + int nbd; ++ u16 split_bd_len = 0; + + /* prefetch skb end pointer to speedup dev_kfree_skb() */ + prefetch(&skb->end); +@@ -160,10 +161,7 @@ static u16 bnx2x_free_tx_pkt(struct bnx2 + DP(NETIF_MSG_TX_DONE, "fp[%d]: pkt_idx %d buff @(%p)->skb %p\n", + txdata->txq_index, idx, tx_buf, skb); + +- /* unmap first bd */ + tx_start_bd = &txdata->tx_desc_ring[bd_idx].start_bd; +- dma_unmap_single(&bp->pdev->dev, BD_UNMAP_ADDR(tx_start_bd), +- BD_UNMAP_LEN(tx_start_bd), DMA_TO_DEVICE); + + + nbd = le16_to_cpu(tx_start_bd->nbd) - 1; +@@ -182,12 +180,19 @@ static u16 bnx2x_free_tx_pkt(struct bnx2 + --nbd; + bd_idx = TX_BD(NEXT_TX_IDX(bd_idx)); + +- /* ...and the TSO split header bd since they have no mapping */ ++ /* TSO headers+data bds share a common mapping. 
See bnx2x_tx_split() */ + if (tx_buf->flags & BNX2X_TSO_SPLIT_BD) { ++ tx_data_bd = &txdata->tx_desc_ring[bd_idx].reg_bd; ++ split_bd_len = BD_UNMAP_LEN(tx_data_bd); + --nbd; + bd_idx = TX_BD(NEXT_TX_IDX(bd_idx)); + } + ++ /* unmap first bd */ ++ dma_unmap_single(&bp->pdev->dev, BD_UNMAP_ADDR(tx_start_bd), ++ BD_UNMAP_LEN(tx_start_bd) + split_bd_len, ++ DMA_TO_DEVICE); ++ + /* now free frags */ + while (nbd > 0) { + diff --git a/queue-3.10/bpf-do-not-use-reciprocal-divide.patch b/queue-3.10/bpf-do-not-use-reciprocal-divide.patch new file mode 100644 index 00000000000..c50d1b128c8 --- /dev/null +++ b/queue-3.10/bpf-do-not-use-reciprocal-divide.patch @@ -0,0 +1,220 @@ +From foo@baz Tue Feb 4 09:07:36 PST 2014 +From: Eric Dumazet +Date: Wed, 15 Jan 2014 06:50:07 -0800 +Subject: bpf: do not use reciprocal divide + +From: Eric Dumazet + +[ Upstream commit aee636c4809fa54848ff07a899b326eb1f9987a2 ] + +At first Jakub Zawadzki noticed that some divisions by reciprocal_divide +were not correct. (off by one in some cases) +http://www.wireshark.org/~darkjames/reciprocal-buggy.c + +He could also show this with BPF: +http://www.wireshark.org/~darkjames/set-and-dump-filter-k-bug.c + +The reciprocal divide in linux kernel is not generic enough, +lets remove its use in BPF, as it is not worth the pain with +current cpus. + +Signed-off-by: Eric Dumazet +Reported-by: Jakub Zawadzki +Cc: Mircea Gherzan +Cc: Daniel Borkmann +Cc: Hannes Frederic Sowa +Cc: Matt Evans +Cc: Martin Schwidefsky +Cc: Heiko Carstens +Cc: David S. Miller +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + arch/arm/net/bpf_jit_32.c | 6 +++--- + arch/powerpc/net/bpf_jit_comp.c | 7 ++++--- + arch/s390/net/bpf_jit_comp.c | 17 ++++++++++++----- + arch/sparc/net/bpf_jit_comp.c | 17 ++++++++++++++--- + arch/x86/net/bpf_jit_comp.c | 14 ++++++++++---- + net/core/filter.c | 30 ++---------------------------- + 6 files changed, 45 insertions(+), 46 deletions(-) + +--- a/arch/arm/net/bpf_jit_32.c ++++ b/arch/arm/net/bpf_jit_32.c +@@ -637,10 +637,10 @@ load_ind: + emit(ARM_MUL(r_A, r_A, r_X), ctx); + break; + case BPF_S_ALU_DIV_K: +- /* current k == reciprocal_value(userspace k) */ ++ if (k == 1) ++ break; + emit_mov_i(r_scratch, k, ctx); +- /* A = top 32 bits of the product */ +- emit(ARM_UMULL(r_scratch, r_A, r_A, r_scratch), ctx); ++ emit_udiv(r_A, r_A, r_scratch, ctx); + break; + case BPF_S_ALU_DIV_X: + update_on_xread(ctx); +--- a/arch/powerpc/net/bpf_jit_comp.c ++++ b/arch/powerpc/net/bpf_jit_comp.c +@@ -209,10 +209,11 @@ static int bpf_jit_build_body(struct sk_ + } + PPC_DIVWU(r_A, r_A, r_X); + break; +- case BPF_S_ALU_DIV_K: /* A = reciprocal_divide(A, K); */ ++ case BPF_S_ALU_DIV_K: /* A /= K */ ++ if (K == 1) ++ break; + PPC_LI32(r_scratch1, K); +- /* Top 32 bits of 64bit result -> A */ +- PPC_MULHWU(r_A, r_A, r_scratch1); ++ PPC_DIVWU(r_A, r_A, r_scratch1); + break; + case BPF_S_ALU_AND_X: + ctx->seen |= SEEN_XREG; +--- a/arch/s390/net/bpf_jit_comp.c ++++ b/arch/s390/net/bpf_jit_comp.c +@@ -338,11 +338,13 @@ static int bpf_jit_insn(struct bpf_jit * + /* dr %r4,%r12 */ + EMIT2(0x1d4c); + break; +- case BPF_S_ALU_DIV_K: /* A = reciprocal_divide(A, K) */ +- /* m %r4,(%r13) */ +- EMIT4_DISP(0x5c40d000, EMIT_CONST(K)); +- /* lr %r5,%r4 */ +- EMIT2(0x1854); ++ case BPF_S_ALU_DIV_K: /* A /= K */ ++ if (K == 1) ++ break; ++ /* lhi %r4,0 */ ++ EMIT4(0xa7480000); ++ /* d %r4,(%r13) */ ++ EMIT4_DISP(0x5d40d000, EMIT_CONST(K)); + break; + case BPF_S_ALU_MOD_X: /* A %= X */ + jit->seen |= SEEN_XREG | SEEN_RET0; +@@ -358,6 +360,11 @@ 
static int bpf_jit_insn(struct bpf_jit * + EMIT2(0x1854); + break; + case BPF_S_ALU_MOD_K: /* A %= K */ ++ if (K == 1) { ++ /* lhi %r5,0 */ ++ EMIT4(0xa7580000); ++ break; ++ } + /* lhi %r4,0 */ + EMIT4(0xa7480000); + /* d %r4,(%r13) */ +--- a/arch/sparc/net/bpf_jit_comp.c ++++ b/arch/sparc/net/bpf_jit_comp.c +@@ -497,9 +497,20 @@ void bpf_jit_compile(struct sk_filter *f + case BPF_S_ALU_MUL_K: /* A *= K */ + emit_alu_K(MUL, K); + break; +- case BPF_S_ALU_DIV_K: /* A /= K */ +- emit_alu_K(MUL, K); +- emit_read_y(r_A); ++ case BPF_S_ALU_DIV_K: /* A /= K with K != 0*/ ++ if (K == 1) ++ break; ++ emit_write_y(G0); ++#ifdef CONFIG_SPARC32 ++ /* The Sparc v8 architecture requires ++ * three instructions between a %y ++ * register write and the first use. ++ */ ++ emit_nop(); ++ emit_nop(); ++ emit_nop(); ++#endif ++ emit_alu_K(DIV, K); + break; + case BPF_S_ALU_DIV_X: /* A /= X; */ + emit_cmpi(r_X, 0); +--- a/arch/x86/net/bpf_jit_comp.c ++++ b/arch/x86/net/bpf_jit_comp.c +@@ -324,15 +324,21 @@ void bpf_jit_compile(struct sk_filter *f + EMIT2(0x89, 0xd0); /* mov %edx,%eax */ + break; + case BPF_S_ALU_MOD_K: /* A %= K; */ ++ if (K == 1) { ++ CLEAR_A(); ++ break; ++ } + EMIT2(0x31, 0xd2); /* xor %edx,%edx */ + EMIT1(0xb9);EMIT(K, 4); /* mov imm32,%ecx */ + EMIT2(0xf7, 0xf1); /* div %ecx */ + EMIT2(0x89, 0xd0); /* mov %edx,%eax */ + break; +- case BPF_S_ALU_DIV_K: /* A = reciprocal_divide(A, K); */ +- EMIT3(0x48, 0x69, 0xc0); /* imul imm32,%rax,%rax */ +- EMIT(K, 4); +- EMIT4(0x48, 0xc1, 0xe8, 0x20); /* shr $0x20,%rax */ ++ case BPF_S_ALU_DIV_K: /* A /= K */ ++ if (K == 1) ++ break; ++ EMIT2(0x31, 0xd2); /* xor %edx,%edx */ ++ EMIT1(0xb9);EMIT(K, 4); /* mov imm32,%ecx */ ++ EMIT2(0xf7, 0xf1); /* div %ecx */ + break; + case BPF_S_ALU_AND_X: + seen |= SEEN_XREG; +--- a/net/core/filter.c ++++ b/net/core/filter.c +@@ -36,7 +36,6 @@ + #include + #include + #include +-#include + #include + #include + #include +@@ -166,7 +165,7 @@ unsigned int sk_run_filter(const struct + A /= X; 
+ continue; + case BPF_S_ALU_DIV_K: +- A = reciprocal_divide(A, K); ++ A /= K; + continue; + case BPF_S_ALU_MOD_X: + if (X == 0) +@@ -553,11 +552,6 @@ int sk_chk_filter(struct sock_filter *fi + /* Some instructions need special checks */ + switch (code) { + case BPF_S_ALU_DIV_K: +- /* check for division by zero */ +- if (ftest->k == 0) +- return -EINVAL; +- ftest->k = reciprocal_value(ftest->k); +- break; + case BPF_S_ALU_MOD_K: + /* check for division by zero */ + if (ftest->k == 0) +@@ -853,27 +847,7 @@ void sk_decode_filter(struct sock_filter + to->code = decodes[code]; + to->jt = filt->jt; + to->jf = filt->jf; +- +- if (code == BPF_S_ALU_DIV_K) { +- /* +- * When loaded this rule user gave us X, which was +- * translated into R = r(X). Now we calculate the +- * RR = r(R) and report it back. If next time this +- * value is loaded and RRR = r(RR) is calculated +- * then the R == RRR will be true. +- * +- * One exception. X == 1 translates into R == 0 and +- * we can't calculate RR out of it with r(). 
+- */ +- +- if (filt->k == 0) +- to->k = 1; +- else +- to->k = reciprocal_value(filt->k); +- +- BUG_ON(reciprocal_value(to->k) != filt->k); +- } else +- to->k = filt->k; ++ to->k = filt->k; + } + + int sk_get_filter(struct sock *sk, struct sock_filter __user *ubuf, unsigned int len) diff --git a/queue-3.10/fib_frontend-fix-possible-null-pointer-dereference.patch b/queue-3.10/fib_frontend-fix-possible-null-pointer-dereference.patch new file mode 100644 index 00000000000..36d5ee30589 --- /dev/null +++ b/queue-3.10/fib_frontend-fix-possible-null-pointer-dereference.patch @@ -0,0 +1,35 @@ +From foo@baz Tue Feb 4 09:07:36 PST 2014 +From: Oliver Hartkopp +Date: Thu, 23 Jan 2014 10:19:34 +0100 +Subject: fib_frontend: fix possible NULL pointer dereference + +From: Oliver Hartkopp + +[ Upstream commit a0065f266a9b5d51575535a25c15ccbeed9a9966 ] + +The two commits 0115e8e30d (net: remove delay at device dismantle) and +748e2d9396a (net: reinstate rtnl in call_netdevice_notifiers()) silently +removed a NULL pointer check for in_dev since Linux 3.7. + +This patch re-introduces this check as it causes crashing the kernel when +setting small mtu values on non-ip capable netdevices. + +Signed-off-by: Oliver Hartkopp +Acked-by: Eric Dumazet +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv4/fib_frontend.c | 2 ++ + 1 file changed, 2 insertions(+) + +--- a/net/ipv4/fib_frontend.c ++++ b/net/ipv4/fib_frontend.c +@@ -1049,6 +1049,8 @@ static int fib_netdev_event(struct notif + } + + in_dev = __in_dev_get_rtnl(dev); ++ if (!in_dev) ++ return NOTIFY_DONE; + + switch (event) { + case NETDEV_UP: diff --git a/queue-3.10/ieee802154-fix-memory-leak-in-ieee802154_add_iface.patch b/queue-3.10/ieee802154-fix-memory-leak-in-ieee802154_add_iface.patch new file mode 100644 index 00000000000..8a458d3848d --- /dev/null +++ b/queue-3.10/ieee802154-fix-memory-leak-in-ieee802154_add_iface.patch @@ -0,0 +1,34 @@ +From foo@baz Tue Feb 4 09:07:36 PST 2014 +From: Christian Engelmayer +Date: Sat, 11 Jan 2014 22:19:30 +0100 +Subject: ieee802154: Fix memory leak in ieee802154_add_iface() + +From: Christian Engelmayer + +[ Upstream commit 267d29a69c6af39445f36102a832b25ed483f299 ] + +Fix a memory leak in the ieee802154_add_iface() error handling path. +Detected by Coverity: CID 710490. + +Signed-off-by: Christian Engelmayer +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/ieee802154/nl-phy.c | 6 ++++-- + 1 file changed, 4 insertions(+), 2 deletions(-) + +--- a/net/ieee802154/nl-phy.c ++++ b/net/ieee802154/nl-phy.c +@@ -224,8 +224,10 @@ static int ieee802154_add_iface(struct s + + if (info->attrs[IEEE802154_ATTR_DEV_TYPE]) { + type = nla_get_u8(info->attrs[IEEE802154_ATTR_DEV_TYPE]); +- if (type >= __IEEE802154_DEV_MAX) +- return -EINVAL; ++ if (type >= __IEEE802154_DEV_MAX) { ++ rc = -EINVAL; ++ goto nla_put_failure; ++ } + } + + dev = phy->add_iface(phy, devname, type); diff --git a/queue-3.10/inet_diag-fix-inet_diag_dump_icsk-timewait-socket-state-logic.patch b/queue-3.10/inet_diag-fix-inet_diag_dump_icsk-timewait-socket-state-logic.patch new file mode 100644 index 00000000000..87cdb6c7b0e --- /dev/null +++ b/queue-3.10/inet_diag-fix-inet_diag_dump_icsk-timewait-socket-state-logic.patch @@ -0,0 +1,62 @@ +From foo@baz Tue Feb 4 09:07:36 PST 2014 +From: Neal Cardwell +Date: Sun, 2 Feb 2014 20:40:13 -0500 +Subject: inet_diag: fix inet_diag_dump_icsk() timewait socket state logic + +From: Neal Cardwell + +[ Based upon upstream commit 70315d22d3c7383f9a508d0aab21e2eb35b2303a ] + +Fix inet_diag_dump_icsk() to reflect the fact that both TIME_WAIT and +FIN_WAIT2 connections are represented by inet_timewait_sock (not just +TIME_WAIT). Thus: + +(a) We need to iterate through the time_wait buckets if the user wants +either TIME_WAIT or FIN_WAIT2. (Before fixing this, "ss -nemoi state +fin-wait-2" would not return any sockets, even if there were some in +FIN_WAIT2.) + +(b) We need to check tw_substate to see if the user wants to dump +sockets in the particular substate (TIME_WAIT or FIN_WAIT2) that a +given connection is in. (Before fixing this, "ss -nemoi state +time-wait" would actually return sockets in state FIN_WAIT2.) 
+ +An analogous fix is in v3.13: 70315d22d3c7383f9a508d0aab21e2eb35b2303a +("inet_diag: fix inet_diag_dump_icsk() to use correct state for +timewait sockets") but that patch is quite different because 3.13 code +is very different in this area due to the unification of TCP hash +tables in 05dbc7b ("tcp/dccp: remove twchain") in v3.13-rc1. + +I tested that this applies cleanly between v3.3 and v3.12, and tested +that it works in both 3.3 and 3.12. It does not apply cleanly to 3.2 +and earlier (though it makes semantic sense), and semantically is not +the right fix for 3.13 and beyond (as mentioned above). + +Signed-off-by: Neal Cardwell +Cc: Eric Dumazet +Acked-by: Eric Dumazet +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv4/inet_diag.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +--- a/net/ipv4/inet_diag.c ++++ b/net/ipv4/inet_diag.c +@@ -961,7 +961,7 @@ next_normal: + ++num; + } + +- if (r->idiag_states & TCPF_TIME_WAIT) { ++ if (r->idiag_states & (TCPF_TIME_WAIT | TCPF_FIN_WAIT2)) { + struct inet_timewait_sock *tw; + + inet_twsk_for_each(tw, node, +@@ -971,6 +971,8 @@ next_normal: + + if (num < s_num) + goto next_dying; ++ if (!(r->idiag_states & (1 << tw->tw_substate))) ++ goto next_dying; + if (r->sdiag_family != AF_UNSPEC && + tw->tw_family != r->sdiag_family) + goto next_dying; diff --git a/queue-3.10/ip6tnl-fix-double-free-of-fb_tnl_dev-on-exit.patch b/queue-3.10/ip6tnl-fix-double-free-of-fb_tnl_dev-on-exit.patch new file mode 100644 index 00000000000..4f6038a9ec5 --- /dev/null +++ b/queue-3.10/ip6tnl-fix-double-free-of-fb_tnl_dev-on-exit.patch @@ -0,0 +1,77 @@ +From foo@baz Tue Feb 4 09:07:36 PST 2014 +From: Nicolas Dichtel +Date: Fri, 31 Jan 2014 09:24:06 +0100 +Subject: ip6tnl: fix double free of fb_tnl_dev on exit + +From: Nicolas Dichtel + +[ No relevant upstream commit. ] + +This problem was fixed upstream by commit 1e9f3d6f1c40 ("ip6tnl: fix use after +free of fb_tnl_dev"). 
+The upstream patch depends on upstream commit 0bd8762824e7 ("ip6tnl: add x-netns +support"), which was not backported into 3.10 branch. + +First, explain the problem: when the ip6_tunnel module is unloaded, +ip6_tunnel_cleanup() is called. +rmmod ip6_tunnel +=> ip6_tunnel_cleanup() + => rtnl_link_unregister() + => __rtnl_kill_links() + => for_each_netdev(net, dev) { + if (dev->rtnl_link_ops == ops) + ops->dellink(dev, &list_kill); + } +At this point, the FB device is deleted (and all ip6tnl tunnels). + => unregister_pernet_device() + => unregister_pernet_operations() + => ops_exit_list() + => ip6_tnl_exit_net() + => ip6_tnl_destroy_tunnels() + => t = rtnl_dereference(ip6n->tnls_wc[0]); + unregister_netdevice_queue(t->dev, &list); +We delete the FB device a second time here! + +The previous fix removes these lines, which fixes this double free. But the patch +introduces a memory leak when a netns is destroyed, because the FB device is +never deleted. By adding an rtnl ops which deletes all ip6tnl devices except +the FB device, we can keep this explicit removal in ip6_tnl_destroy_tunnels(). + +CC: Steven Rostedt +CC: Willem de Bruijn +Signed-off-by: Nicolas Dichtel +Reported-by: Steven Rostedt +Tested-by: Steven Rostedt (and our entire MRG team) +Tested-by: "Luis Claudio R.
Goncalves" +Tested-by: John Kacur +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv6/ip6_tunnel.c | 10 ++++++++++ + 1 file changed, 10 insertions(+) + +--- a/net/ipv6/ip6_tunnel.c ++++ b/net/ipv6/ip6_tunnel.c +@@ -1617,6 +1617,15 @@ static int ip6_tnl_changelink(struct net + return ip6_tnl_update(t, &p); + } + ++static void ip6_tnl_dellink(struct net_device *dev, struct list_head *head) ++{ ++ struct net *net = dev_net(dev); ++ struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); ++ ++ if (dev != ip6n->fb_tnl_dev) ++ unregister_netdevice_queue(dev, head); ++} ++ + static size_t ip6_tnl_get_size(const struct net_device *dev) + { + return +@@ -1681,6 +1690,7 @@ static struct rtnl_link_ops ip6_link_ops + .validate = ip6_tnl_validate, + .newlink = ip6_tnl_newlink, + .changelink = ip6_tnl_changelink, ++ .dellink = ip6_tnl_dellink, + .get_size = ip6_tnl_get_size, + .fill_info = ip6_tnl_fill_info, + }; diff --git a/queue-3.10/ip_tunnel-clear-ipcb-in-ip_tunnel_xmit-in-case-dst_link_failure-is-called.patch b/queue-3.10/ip_tunnel-clear-ipcb-in-ip_tunnel_xmit-in-case-dst_link_failure-is-called.patch new file mode 100644 index 00000000000..996da3add6d --- /dev/null +++ b/queue-3.10/ip_tunnel-clear-ipcb-in-ip_tunnel_xmit-in-case-dst_link_failure-is-called.patch @@ -0,0 +1,36 @@ +From foo@baz Tue Feb 4 09:07:36 PST 2014 +From: Duan Jiong +Date: Thu, 23 Jan 2014 14:00:25 +0800 +Subject: ip_tunnel: clear IPCB in ip_tunnel_xmit() in case dst_link_failure() is called + +From: Duan Jiong + +[ Upstream commit 11c21a307d79ea5f6b6fc0d3dfdeda271e5e65f6 ] + +commit a622260254ee48("ip_tunnel: fix kernel panic with icmp_dest_unreach") +clear IPCB in ip_tunnel_xmit() , or else skb->cb[] may contain garbage from +GSO segmentation layer. + +But commit 0e6fbc5b6c621("ip_tunnels: extend iptunnel_xmit()") refactor codes, +and it clear IPCB behind the dst_link_failure(). 
+ +So clear IPCB in ip_tunnel_xmit() just like commit a622260254ee48("ip_tunnel: +fix kernel panic with icmp_dest_unreach"). + +Signed-off-by: Duan Jiong +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv4/ip_tunnel.c | 1 + + 1 file changed, 1 insertion(+) + +--- a/net/ipv4/ip_tunnel.c ++++ b/net/ipv4/ip_tunnel.c +@@ -636,6 +636,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, + tunnel->err_time + IPTUNNEL_ERR_TIMEO)) { + tunnel->err_count--; + ++ memset(IPCB(skb), 0, sizeof(*IPCB(skb))); + dst_link_failure(skb); + } else + tunnel->err_count = 0; diff --git a/queue-3.10/net-avoid-reference-counter-overflows-on-fib_rules-in-multicast-forwarding.patch b/queue-3.10/net-avoid-reference-counter-overflows-on-fib_rules-in-multicast-forwarding.patch new file mode 100644 index 00000000000..609ee8a25ca --- /dev/null +++ b/queue-3.10/net-avoid-reference-counter-overflows-on-fib_rules-in-multicast-forwarding.patch @@ -0,0 +1,73 @@ +From foo@baz Tue Feb 4 09:07:36 PST 2014 +From: Hannes Frederic Sowa +Date: Mon, 13 Jan 2014 02:45:22 +0100 +Subject: net: avoid reference counter overflows on fib_rules in multicast forwarding + +From: Hannes Frederic Sowa + +[ Upstream commit 95f4a45de1a0f172b35451fc52283290adb21f6e ] + +Bob Falken reported that after 4G packets, multicast forwarding stopped +working. This was because of a rule reference counter overflow which +freed the rule as soon as the overflow happened. + +This patch solves this by adding the FIB_LOOKUP_NOREF flag to +fib_rules_lookup calls. This is safe even from non-rcu locked sections +as in this case the flag only implies not taking a reference to the rule, +which we don't need at all. + +Rules only hold references to the namespace, which are guaranteed to be +available during the call of the non-rcu protected function reg_vif_xmit +because of the interface reference which itself holds a reference to +the net namespace.
+ +Fixes: f0ad0860d01e47 ("ipv4: ipmr: support multiple tables") +Fixes: d1db275dd3f6e4 ("ipv6: ip6mr: support multiple tables") +Reported-by: Bob Falken +Cc: Patrick McHardy +Cc: Thomas Graf +Cc: Julian Anastasov +Cc: Eric Dumazet +Signed-off-by: Hannes Frederic Sowa +Acked-by: Eric Dumazet +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv4/ipmr.c | 7 +++++-- + net/ipv6/ip6mr.c | 7 +++++-- + 2 files changed, 10 insertions(+), 4 deletions(-) + +--- a/net/ipv4/ipmr.c ++++ b/net/ipv4/ipmr.c +@@ -157,9 +157,12 @@ static struct mr_table *ipmr_get_table(s + static int ipmr_fib_lookup(struct net *net, struct flowi4 *flp4, + struct mr_table **mrt) + { +- struct ipmr_result res; +- struct fib_lookup_arg arg = { .result = &res, }; + int err; ++ struct ipmr_result res; ++ struct fib_lookup_arg arg = { ++ .result = &res, ++ .flags = FIB_LOOKUP_NOREF, ++ }; + + err = fib_rules_lookup(net->ipv4.mr_rules_ops, + flowi4_to_flowi(flp4), 0, &arg); +--- a/net/ipv6/ip6mr.c ++++ b/net/ipv6/ip6mr.c +@@ -141,9 +141,12 @@ static struct mr6_table *ip6mr_get_table + static int ip6mr_fib_lookup(struct net *net, struct flowi6 *flp6, + struct mr6_table **mrt) + { +- struct ip6mr_result res; +- struct fib_lookup_arg arg = { .result = &res, }; + int err; ++ struct ip6mr_result res; ++ struct fib_lookup_arg arg = { ++ .result = &res, ++ .flags = FIB_LOOKUP_NOREF, ++ }; + + err = fib_rules_lookup(net->ipv6.mr6_rules_ops, + flowi6_to_flowi(flp6), 0, &arg); diff --git a/queue-3.10/net-fix-memory-leak-if-tproxy-used-with-tcp-early-demux.patch b/queue-3.10/net-fix-memory-leak-if-tproxy-used-with-tcp-early-demux.patch new file mode 100644 index 00000000000..903a033a14d --- /dev/null +++ b/queue-3.10/net-fix-memory-leak-if-tproxy-used-with-tcp-early-demux.patch @@ -0,0 +1,96 @@ +From foo@baz Tue Feb 4 09:07:36 PST 2014 +From: Holger Eitzenberger +Date: Mon, 27 Jan 2014 10:33:18 +0100 +Subject: net: Fix memory leak if TPROXY used with TCP early demux + +From: Holger 
Eitzenberger + +[ Upstream commit a452ce345d63ddf92cd101e4196569f8718ad319 ] + +I see a memory leak when using a transparent HTTP proxy using TPROXY +together with TCP early demux and Kernel v3.8.13.15 (Ubuntu stable): + +unreferenced object 0xffff88008cba4a40 (size 1696): + comm "softirq", pid 0, jiffies 4294944115 (age 8907.520s) + hex dump (first 32 bytes): + 0a e0 20 6a 40 04 1b 37 92 be 32 e2 e8 b4 00 00 .. j@..7..2..... + 02 00 07 01 00 00 00 00 00 00 00 00 00 00 00 00 ................ + backtrace: + [] kmem_cache_alloc+0xad/0xb9 + [] sk_prot_alloc+0x29/0xc5 + [] sk_clone_lock+0x14/0x283 + [] inet_csk_clone_lock+0xf/0x7b + [] netlink_broadcast+0x14/0x16 + [] tcp_create_openreq_child+0x1b/0x4c3 + [] tcp_v4_syn_recv_sock+0x38/0x25d + [] tcp_check_req+0x25c/0x3d0 + [] tcp_v4_do_rcv+0x287/0x40e + [] ip_route_input_noref+0x843/0xa55 + [] tcp_v4_rcv+0x4c9/0x725 + [] ip_local_deliver_finish+0xe9/0x154 + [] __netif_receive_skb+0x4b2/0x514 + [] process_backlog+0xee/0x1c5 + [] net_rx_action+0xa7/0x200 + [] add_interrupt_randomness+0x39/0x157 + +But there are many more, resulting in the machine going OOM after some +days. + +From looking at the TPROXY code, and with help from Florian, I see +that the memory leak is introduced in tcp_v4_early_demux(): + + void tcp_v4_early_demux(struct sk_buff *skb) + { + /* ... */ + + iph = ip_hdr(skb); + th = tcp_hdr(skb); + + if (th->doff < sizeof(struct tcphdr) / 4) + return; + + sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo, + iph->saddr, th->source, + iph->daddr, ntohs(th->dest), + skb->skb_iif); + if (sk) { + skb->sk = sk; + +where the socket is assigned unconditionally to skb->sk, also bumping +the refcnt on it. This is problematic, because in our case the skb +has already a socket assigned in the TPROXY target. This then results +in the leak I see. + +The very same issue seems to be with IPv6, but haven't tested. + +Reviewed-by: Florian Westphal +Signed-off-by: Holger Eitzenberger +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv4/ip_input.c | 2 +- + net/ipv6/ip6_input.c | 2 +- + 2 files changed, 2 insertions(+), 2 deletions(-) + +--- a/net/ipv4/ip_input.c ++++ b/net/ipv4/ip_input.c +@@ -313,7 +313,7 @@ static int ip_rcv_finish(struct sk_buff + const struct iphdr *iph = ip_hdr(skb); + struct rtable *rt; + +- if (sysctl_ip_early_demux && !skb_dst(skb)) { ++ if (sysctl_ip_early_demux && !skb_dst(skb) && skb->sk == NULL) { + const struct net_protocol *ipprot; + int protocol = iph->protocol; + +--- a/net/ipv6/ip6_input.c ++++ b/net/ipv6/ip6_input.c +@@ -49,7 +49,7 @@ + + int ip6_rcv_finish(struct sk_buff *skb) + { +- if (sysctl_ip_early_demux && !skb_dst(skb)) { ++ if (sysctl_ip_early_demux && !skb_dst(skb) && skb->sk == NULL) { + const struct inet6_protocol *ipprot; + + ipprot = rcu_dereference(inet6_protos[ipv6_hdr(skb)->nexthdr]); diff --git a/queue-3.10/net-rds-fix-per-cpu-helper-usage.patch b/queue-3.10/net-rds-fix-per-cpu-helper-usage.patch new file mode 100644 index 00000000000..43382dcf4cf --- /dev/null +++ b/queue-3.10/net-rds-fix-per-cpu-helper-usage.patch @@ -0,0 +1,52 @@ +From foo@baz Tue Feb 4 09:07:36 PST 2014 +From: Gerald Schaefer +Date: Thu, 16 Jan 2014 16:54:48 +0100 +Subject: net: rds: fix per-cpu helper usage + +From: Gerald Schaefer + +[ Upstream commit c196403b79aa241c3fefb3ee5bb328aa7c5cc860 ] + +commit ae4b46e9d "net: rds: use this_cpu_* per-cpu helper" broke per-cpu +handling for rds. chpfirst is the result of __this_cpu_read(), so it is +an absolute pointer and not __percpu. Therefore, __this_cpu_write() +should not operate on chpfirst, but rather on cache->percpu->first, just +like __this_cpu_read() did before. + +Signed-off-by: Gerald Schaefer +Signed-off-by: David S.
Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/rds/ib_recv.c | 7 +++---- + 1 file changed, 3 insertions(+), 4 deletions(-) + +--- a/net/rds/ib_recv.c ++++ b/net/rds/ib_recv.c +@@ -421,8 +421,7 @@ static void rds_ib_recv_cache_put(struct + struct rds_ib_refill_cache *cache) + { + unsigned long flags; +- struct list_head *old; +- struct list_head __percpu *chpfirst; ++ struct list_head *old, *chpfirst; + + local_irq_save(flags); + +@@ -432,7 +431,7 @@ static void rds_ib_recv_cache_put(struct + else /* put on front */ + list_add_tail(new_item, chpfirst); + +- __this_cpu_write(chpfirst, new_item); ++ __this_cpu_write(cache->percpu->first, new_item); + __this_cpu_inc(cache->percpu->count); + + if (__this_cpu_read(cache->percpu->count) < RDS_IB_RECYCLE_BATCH_COUNT) +@@ -452,7 +451,7 @@ static void rds_ib_recv_cache_put(struct + } while (old); + + +- __this_cpu_write(chpfirst, NULL); ++ __this_cpu_write(cache->percpu->first, NULL); + __this_cpu_write(cache->percpu->count, 0); + end: + local_irq_restore(flags); diff --git a/queue-3.10/net-via-rhine-fix-tx_timeout-handling.patch b/queue-3.10/net-via-rhine-fix-tx_timeout-handling.patch new file mode 100644 index 00000000000..29ce8feaba9 --- /dev/null +++ b/queue-3.10/net-via-rhine-fix-tx_timeout-handling.patch @@ -0,0 +1,34 @@ +From foo@baz Tue Feb 4 09:07:36 PST 2014 +From: Richard Weinberger +Date: Tue, 14 Jan 2014 22:46:36 +0100 +Subject: net,via-rhine: Fix tx_timeout handling + +From: Richard Weinberger + +[ Upstream commit a926592f5e4e900f3fa903298c4619a131e60963 ] + +rhine_reset_task() misses to disable the tx scheduler upon reset, +this can lead to a crash if work is still scheduled while we're resetting +the tx queue. + +Fixes: +[ 93.591707] BUG: unable to handle kernel NULL pointer dereference at 0000004c +[ 93.595514] IP: [] rhine_napipoll+0x491/0x6 + +Signed-off-by: Richard Weinberger +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/via/via-rhine.c | 1 + + 1 file changed, 1 insertion(+) + +--- a/drivers/net/ethernet/via/via-rhine.c ++++ b/drivers/net/ethernet/via/via-rhine.c +@@ -1611,6 +1611,7 @@ static void rhine_reset_task(struct work + goto out_unlock; + + napi_disable(&rp->napi); ++ netif_tx_disable(dev); + spin_lock_bh(&rp->lock); + + /* clear all descriptors */ diff --git a/queue-3.10/revert-ip6tnl-fix-use-after-free-of-fb_tnl_dev.patch b/queue-3.10/revert-ip6tnl-fix-use-after-free-of-fb_tnl_dev.patch new file mode 100644 index 00000000000..a16888ff548 --- /dev/null +++ b/queue-3.10/revert-ip6tnl-fix-use-after-free-of-fb_tnl_dev.patch @@ -0,0 +1,35 @@ +From foo@baz Tue Feb 4 09:07:36 PST 2014 +From: Nicolas Dichtel +Date: Fri, 31 Jan 2014 09:24:05 +0100 +Subject: Revert "ip6tnl: fix use after free of fb_tnl_dev" + +From: Nicolas Dichtel + +[ No relevant upstream commit. ] + +This reverts commit 22c3ec552c29cf4bd4a75566088950fe57d860c4. + +This patch is not the right fix, it introduces a memory leak when a netns is +destroyed (the FB device is never deleted). + +Signed-off-by: Nicolas Dichtel +Reported-by: Steven Rostedt +Tested-by: Steven Rostedt (and our entire MRG team) +Tested-by: "Luis Claudio R. 
Goncalves" +Tested-by: John Kacur +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv6/ip6_tunnel.c | 2 ++ + 1 file changed, 2 insertions(+) + +--- a/net/ipv6/ip6_tunnel.c ++++ b/net/ipv6/ip6_tunnel.c +@@ -1711,6 +1711,8 @@ static void __net_exit ip6_tnl_destroy_t + } + } + ++ t = rtnl_dereference(ip6n->tnls_wc[0]); ++ unregister_netdevice_queue(t->dev, &list); + unregister_netdevice_many(&list); + } + diff --git a/queue-3.10/s390-bpf-jit-fix-32-bit-divisions-use-unsigned-divide-instructions.patch b/queue-3.10/s390-bpf-jit-fix-32-bit-divisions-use-unsigned-divide-instructions.patch new file mode 100644 index 00000000000..ff853a809db --- /dev/null +++ b/queue-3.10/s390-bpf-jit-fix-32-bit-divisions-use-unsigned-divide-instructions.patch @@ -0,0 +1,74 @@ +From foo@baz Tue Feb 4 09:07:36 PST 2014 +From: Heiko Carstens +Date: Fri, 17 Jan 2014 09:37:15 +0100 +Subject: s390/bpf,jit: fix 32 bit divisions, use unsigned divide instructions + +From: Heiko Carstens + +[ Upstream commit 3af57f78c38131b7a66e2b01e06fdacae01992a3 ] + +The s390 bpf jit compiler emits the signed divide instructions "dr" and "d" +for unsigned divisions. +This can cause problems: the dividend will be zero extended to a 64 bit value +and the divisor is the 32 bit signed value as specified A or X accumulator, +even though A and X are supposed to be treated as unsigned values. + +The divide instructions will generate an exception if the result cannot be +expressed with a 32 bit signed value. +This is the case if e.g. the dividend is 0xffffffff and the divisor either 1 +or also 0xffffffff (signed: -1). + +To avoid all these issues simply use unsigned divide instructions. + +Signed-off-by: Heiko Carstens +Signed-off-by: David S.
Miller +Signed-off-by: Greg Kroah-Hartman +--- + arch/s390/net/bpf_jit_comp.c | 16 ++++++++-------- + 1 file changed, 8 insertions(+), 8 deletions(-) + +--- a/arch/s390/net/bpf_jit_comp.c ++++ b/arch/s390/net/bpf_jit_comp.c +@@ -335,16 +335,16 @@ static int bpf_jit_insn(struct bpf_jit * + EMIT4_PCREL(0xa7840000, (jit->ret0_ip - jit->prg)); + /* lhi %r4,0 */ + EMIT4(0xa7480000); +- /* dr %r4,%r12 */ +- EMIT2(0x1d4c); ++ /* dlr %r4,%r12 */ ++ EMIT4(0xb997004c); + break; + case BPF_S_ALU_DIV_K: /* A /= K */ + if (K == 1) + break; + /* lhi %r4,0 */ + EMIT4(0xa7480000); +- /* d %r4,(%r13) */ +- EMIT4_DISP(0x5d40d000, EMIT_CONST(K)); ++ /* dl %r4,(%r13) */ ++ EMIT6_DISP(0xe340d000, 0x0097, EMIT_CONST(K)); + break; + case BPF_S_ALU_MOD_X: /* A %= X */ + jit->seen |= SEEN_XREG | SEEN_RET0; +@@ -354,8 +354,8 @@ static int bpf_jit_insn(struct bpf_jit * + EMIT4_PCREL(0xa7840000, (jit->ret0_ip - jit->prg)); + /* lhi %r4,0 */ + EMIT4(0xa7480000); +- /* dr %r4,%r12 */ +- EMIT2(0x1d4c); ++ /* dlr %r4,%r12 */ ++ EMIT4(0xb997004c); + /* lr %r5,%r4 */ + EMIT2(0x1854); + break; +@@ -367,8 +367,8 @@ static int bpf_jit_insn(struct bpf_jit * + } + /* lhi %r4,0 */ + EMIT4(0xa7480000); +- /* d %r4,(%r13) */ +- EMIT4_DISP(0x5d40d000, EMIT_CONST(K)); ++ /* dl %r4,(%r13) */ ++ EMIT6_DISP(0xe340d000, 0x0097, EMIT_CONST(K)); + /* lr %r5,%r4 */ + EMIT2(0x1854); + break; diff --git a/queue-3.10/series b/queue-3.10/series index ef9af12e52a..ad0ca5308bb 100644 --- a/queue-3.10/series +++ b/queue-3.10/series @@ -71,3 +71,19 @@ ext4-avoid-clearing-beyond-i_blocks-when-truncating-an-inline-data-file.patch vfs-is-mounted-should-be-testing-mnt_ns-for-null-or-error.patch bcache-data-corruption-fix.patch hp_accel-add-a-new-pnp-id-hpq6007-for-new-hp-laptops.patch +bnx2x-fix-dma-unmapping-of-tso-split-bds.patch +inet_diag-fix-inet_diag_dump_icsk-timewait-socket-state-logic.patch +ieee802154-fix-memory-leak-in-ieee802154_add_iface.patch 
+net-avoid-reference-counter-overflows-on-fib_rules-in-multicast-forwarding.patch +net-via-rhine-fix-tx_timeout-handling.patch +net-rds-fix-per-cpu-helper-usage.patch +tcp-metrics-avoid-duplicate-entries-with-the-same-destination-ip.patch +bpf-do-not-use-reciprocal-divide.patch +s390-bpf-jit-fix-32-bit-divisions-use-unsigned-divide-instructions.patch +ip_tunnel-clear-ipcb-in-ip_tunnel_xmit-in-case-dst_link_failure-is-called.patch +fib_frontend-fix-possible-null-pointer-dereference.patch +net-fix-memory-leak-if-tproxy-used-with-tcp-early-demux.patch +xen-netfront-fix-resource-leak-in-netfront.patch +sit-fix-double-free-of-fb_tunnel_dev-on-exit.patch +revert-ip6tnl-fix-use-after-free-of-fb_tnl_dev.patch +ip6tnl-fix-double-free-of-fb_tnl_dev-on-exit.patch diff --git a/queue-3.10/sit-fix-double-free-of-fb_tunnel_dev-on-exit.patch b/queue-3.10/sit-fix-double-free-of-fb_tunnel_dev-on-exit.patch new file mode 100644 index 00000000000..c22a52c0fc4 --- /dev/null +++ b/queue-3.10/sit-fix-double-free-of-fb_tunnel_dev-on-exit.patch @@ -0,0 +1,77 @@ +From foo@baz Tue Feb 4 09:07:36 PST 2014 +From: Nicolas Dichtel +Date: Fri, 31 Jan 2014 09:24:04 +0100 +Subject: sit: fix double free of fb_tunnel_dev on exit + +From: Nicolas Dichtel + +[ No relevant upstream commit. ] + +This problem was fixed upstream by commit 9434266f2c64 ("sit: fix use after free +of fb_tunnel_dev"). +The upstream patch depends on upstream commit 5e6700b3bf98 ("sit: add support of +x-netns"), which was not backported into 3.10 branch. + +First, explain the problem: when the sit module is unloaded, sit_cleanup() is +called. +rmmod sit +=> sit_cleanup() + => rtnl_link_unregister() + => __rtnl_kill_links() + => for_each_netdev(net, dev) { + if (dev->rtnl_link_ops == ops) + ops->dellink(dev, &list_kill); + } +At this point, the FB device is deleted (and all sit tunnels). 
+ => unregister_pernet_device() + => unregister_pernet_operations() + => ops_exit_list() + => sit_exit_net() + => sit_destroy_tunnels() + In this function, no tunnel is found. + => unregister_netdevice_queue(sitn->fb_tunnel_dev, &list); +We delete the FB device a second time here! + +Because we cannot simply remove the second deletion (sit_exit_net() must remove +the FB device when a netns is deleted), we add an rtnl ops which delete all sit +device excepting the FB device and thus we can keep the explicit deletion in +sit_exit_net(). + +CC: Steven Rostedt +Signed-off-by: Nicolas Dichtel +Acked-by: Willem de Bruijn +Reported-by: Steven Rostedt +Tested-by: Steven Rostedt (and our entire MRG team) +Tested-by: "Luis Claudio R. Goncalves" +Tested-by: John Kacur +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv6/sit.c | 10 ++++++++++ + 1 file changed, 10 insertions(+) + +--- a/net/ipv6/sit.c ++++ b/net/ipv6/sit.c +@@ -1507,6 +1507,15 @@ static const struct nla_policy ipip6_pol + #endif + }; + ++static void ipip6_dellink(struct net_device *dev, struct list_head *head) ++{ ++ struct net *net = dev_net(dev); ++ struct sit_net *sitn = net_generic(net, sit_net_id); ++ ++ if (dev != sitn->fb_tunnel_dev) ++ unregister_netdevice_queue(dev, head); ++} ++ + static struct rtnl_link_ops sit_link_ops __read_mostly = { + .kind = "sit", + .maxtype = IFLA_IPTUN_MAX, +@@ -1517,6 +1526,7 @@ static struct rtnl_link_ops sit_link_ops + .changelink = ipip6_changelink, + .get_size = ipip6_get_size, + .fill_info = ipip6_fill_info, ++ .dellink = ipip6_dellink, + }; + + static struct xfrm_tunnel sit_handler __read_mostly = { diff --git a/queue-3.10/tcp-metrics-avoid-duplicate-entries-with-the-same-destination-ip.patch b/queue-3.10/tcp-metrics-avoid-duplicate-entries-with-the-same-destination-ip.patch new file mode 100644 index 00000000000..786c415393c --- /dev/null +++ b/queue-3.10/tcp-metrics-avoid-duplicate-entries-with-the-same-destination-ip.patch @@ -0,0 +1,128 @@ +From foo@baz Tue Feb 4 
09:07:36 PST 2014 +From: Christoph Paasch +Date: Thu, 16 Jan 2014 20:01:21 +0100 +Subject: tcp: metrics: Avoid duplicate entries with the same destination-IP + +From: Christoph Paasch + +[ Upstream commit 77f99ad16a07aa062c2d30fae57b1fee456f6ef6 ] + +Because the tcp-metrics is an RCU-list, it may be that two +soft-interrupts are inside __tcp_get_metrics() for the same +destination-IP at the same time. If this destination-IP is not yet part of +the tcp-metrics, both soft-interrupts will end up in tcpm_new and create +a new entry for this IP. +So, we will have two tcp-metrics with the same destination-IP in the list. + +This patch checks twice __tcp_get_metrics(). First without holding the +lock, then while holding the lock. The second one is there to confirm +that the entry has not been added by another soft-irq while waiting for +the spin-lock. + +Fixes: 51c5d0c4b169b (tcp: Maintain dynamic metrics in local cache.) +Signed-off-by: Christoph Paasch +Reviewed-by: Eric Dumazet +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv4/tcp_metrics.c | 51 ++++++++++++++++++++++++++++++------------------- + 1 file changed, 32 insertions(+), 19 deletions(-) + +--- a/net/ipv4/tcp_metrics.c ++++ b/net/ipv4/tcp_metrics.c +@@ -22,6 +22,9 @@ + + int sysctl_tcp_nometrics_save __read_mostly; + ++static struct tcp_metrics_block *__tcp_get_metrics(const struct inetpeer_addr *addr, ++ struct net *net, unsigned int hash); ++ + struct tcp_fastopen_metrics { + u16 mss; + u16 syn_loss:10; /* Recurring Fast Open SYN losses */ +@@ -130,16 +133,41 @@ static void tcpm_suck_dst(struct tcp_met + } + } + ++#define TCP_METRICS_TIMEOUT (60 * 60 * HZ) ++ ++static void tcpm_check_stamp(struct tcp_metrics_block *tm, struct dst_entry *dst) ++{ ++ if (tm && unlikely(time_after(jiffies, tm->tcpm_stamp + TCP_METRICS_TIMEOUT))) ++ tcpm_suck_dst(tm, dst, false); ++} ++ ++#define TCP_METRICS_RECLAIM_DEPTH 5 ++#define TCP_METRICS_RECLAIM_PTR (struct tcp_metrics_block *) 0x1UL ++ + static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst, + struct inetpeer_addr *addr, +- unsigned int hash, +- bool reclaim) ++ unsigned int hash) + { + struct tcp_metrics_block *tm; + struct net *net; ++ bool reclaim = false; + + spin_lock_bh(&tcp_metrics_lock); + net = dev_net(dst->dev); ++ ++ /* While waiting for the spin-lock the cache might have been populated ++ * with this entry and so we have to check again. 
++ */ ++ tm = __tcp_get_metrics(addr, net, hash); ++ if (tm == TCP_METRICS_RECLAIM_PTR) { ++ reclaim = true; ++ tm = NULL; ++ } ++ if (tm) { ++ tcpm_check_stamp(tm, dst); ++ goto out_unlock; ++ } ++ + if (unlikely(reclaim)) { + struct tcp_metrics_block *oldest; + +@@ -169,17 +197,6 @@ out_unlock: + return tm; + } + +-#define TCP_METRICS_TIMEOUT (60 * 60 * HZ) +- +-static void tcpm_check_stamp(struct tcp_metrics_block *tm, struct dst_entry *dst) +-{ +- if (tm && unlikely(time_after(jiffies, tm->tcpm_stamp + TCP_METRICS_TIMEOUT))) +- tcpm_suck_dst(tm, dst, false); +-} +- +-#define TCP_METRICS_RECLAIM_DEPTH 5 +-#define TCP_METRICS_RECLAIM_PTR (struct tcp_metrics_block *) 0x1UL +- + static struct tcp_metrics_block *tcp_get_encode(struct tcp_metrics_block *tm, int depth) + { + if (tm) +@@ -280,7 +297,6 @@ static struct tcp_metrics_block *tcp_get + struct inetpeer_addr addr; + unsigned int hash; + struct net *net; +- bool reclaim; + + addr.family = sk->sk_family; + switch (addr.family) { +@@ -300,13 +316,10 @@ static struct tcp_metrics_block *tcp_get + hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log); + + tm = __tcp_get_metrics(&addr, net, hash); +- reclaim = false; +- if (tm == TCP_METRICS_RECLAIM_PTR) { +- reclaim = true; ++ if (tm == TCP_METRICS_RECLAIM_PTR) + tm = NULL; +- } + if (!tm && create) +- tm = tcpm_new(dst, &addr, hash, reclaim); ++ tm = tcpm_new(dst, &addr, hash); + else + tcpm_check_stamp(tm, dst); + diff --git a/queue-3.10/xen-netfront-fix-resource-leak-in-netfront.patch b/queue-3.10/xen-netfront-fix-resource-leak-in-netfront.patch new file mode 100644 index 00000000000..0f838a47bb4 --- /dev/null +++ b/queue-3.10/xen-netfront-fix-resource-leak-in-netfront.patch @@ -0,0 +1,204 @@ +From foo@baz Tue Feb 4 09:07:36 PST 2014 +From: Annie Li +Date: Tue, 28 Jan 2014 11:35:42 +0800 +Subject: xen-netfront: fix resource leak in netfront + +From: Annie Li + +[ Upstream commit cefe0078eea52af17411eb1248946a94afb84ca5 ] + +This patch removes grant transfer 
releasing code from netfront, and uses +gnttab_end_foreign_access to end grant access since +gnttab_end_foreign_access_ref may fail when the grant entry is +currently used for reading or writing. + +* clean up grant transfer code kept from old netfront(2.6.18) which grants +pages for access/map and transfer. But grant transfer is deprecated in current +netfront, so remove corresponding release code for transfer. + +* fix resource leak, release grant access (through gnttab_end_foreign_access) +and skb for tx/rx path, use get_page to ensure page is released when grant +access is completed successfully. + +Xen-blkfront/xen-tpmfront/xen-pcifront also have similar issue, but patches +for them will be created separately. + +V6: Correct subject line and commit message. + +V5: Remove unnecessary change in xennet_end_access. + +V4: Revert put_page in gnttab_end_foreign_access, and keep netfront change in +single patch. + +V3: Changes as suggestion from David Vrabel, ensure pages are not freed until +grant access is ended. + +V2: Improve patch comments. + +Signed-off-by: Annie Li +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/xen-netfront.c | 88 +++++++++++++-------------------------------- + 1 file changed, 26 insertions(+), 62 deletions(-) + +--- a/drivers/net/xen-netfront.c ++++ b/drivers/net/xen-netfront.c +@@ -107,6 +107,7 @@ struct netfront_info { + } tx_skbs[NET_TX_RING_SIZE]; + grant_ref_t gref_tx_head; + grant_ref_t grant_tx_ref[NET_TX_RING_SIZE]; ++ struct page *grant_tx_page[NET_TX_RING_SIZE]; + unsigned tx_skb_freelist; + + spinlock_t rx_lock ____cacheline_aligned_in_smp; +@@ -386,6 +387,7 @@ static void xennet_tx_buf_gc(struct net_ + gnttab_release_grant_reference( + &np->gref_tx_head, np->grant_tx_ref[id]); + np->grant_tx_ref[id] = GRANT_INVALID_REF; ++ np->grant_tx_page[id] = NULL; + add_id_to_freelist(&np->tx_skb_freelist, np->tx_skbs, id); + dev_kfree_skb_irq(skb); + } +@@ -442,6 +444,7 @@ static void xennet_make_frags(struct sk_ + gnttab_grant_foreign_access_ref(ref, np->xbdev->otherend_id, + mfn, GNTMAP_readonly); + ++ np->grant_tx_page[id] = virt_to_page(data); + tx->gref = np->grant_tx_ref[id] = ref; + tx->offset = offset; + tx->size = len; +@@ -487,6 +490,7 @@ static void xennet_make_frags(struct sk_ + np->xbdev->otherend_id, + mfn, GNTMAP_readonly); + ++ np->grant_tx_page[id] = page; + tx->gref = np->grant_tx_ref[id] = ref; + tx->offset = offset; + tx->size = bytes; +@@ -586,6 +590,7 @@ static int xennet_start_xmit(struct sk_b + mfn = virt_to_mfn(data); + gnttab_grant_foreign_access_ref( + ref, np->xbdev->otherend_id, mfn, GNTMAP_readonly); ++ np->grant_tx_page[id] = virt_to_page(data); + tx->gref = np->grant_tx_ref[id] = ref; + tx->offset = offset; + tx->size = len; +@@ -1115,10 +1120,11 @@ static void xennet_release_tx_bufs(struc + continue; + + skb = np->tx_skbs[i].skb; +- gnttab_end_foreign_access_ref(np->grant_tx_ref[i], +- GNTMAP_readonly); +- gnttab_release_grant_reference(&np->gref_tx_head, +- np->grant_tx_ref[i]); ++ get_page(np->grant_tx_page[i]); ++ 
gnttab_end_foreign_access(np->grant_tx_ref[i], ++ GNTMAP_readonly, ++ (unsigned long)page_address(np->grant_tx_page[i])); ++ np->grant_tx_page[i] = NULL; + np->grant_tx_ref[i] = GRANT_INVALID_REF; + add_id_to_freelist(&np->tx_skb_freelist, np->tx_skbs, i); + dev_kfree_skb_irq(skb); +@@ -1127,78 +1133,35 @@ static void xennet_release_tx_bufs(struc + + static void xennet_release_rx_bufs(struct netfront_info *np) + { +- struct mmu_update *mmu = np->rx_mmu; +- struct multicall_entry *mcl = np->rx_mcl; +- struct sk_buff_head free_list; +- struct sk_buff *skb; +- unsigned long mfn; +- int xfer = 0, noxfer = 0, unused = 0; + int id, ref; + +- dev_warn(&np->netdev->dev, "%s: fix me for copying receiver.\n", +- __func__); +- return; +- +- skb_queue_head_init(&free_list); +- + spin_lock_bh(&np->rx_lock); + + for (id = 0; id < NET_RX_RING_SIZE; id++) { +- ref = np->grant_rx_ref[id]; +- if (ref == GRANT_INVALID_REF) { +- unused++; +- continue; +- } ++ struct sk_buff *skb; ++ struct page *page; + + skb = np->rx_skbs[id]; +- mfn = gnttab_end_foreign_transfer_ref(ref); +- gnttab_release_grant_reference(&np->gref_rx_head, ref); +- np->grant_rx_ref[id] = GRANT_INVALID_REF; +- +- if (0 == mfn) { +- skb_shinfo(skb)->nr_frags = 0; +- dev_kfree_skb(skb); +- noxfer++; ++ if (!skb) + continue; +- } + +- if (!xen_feature(XENFEAT_auto_translated_physmap)) { +- /* Remap the page. 
*/ +- const struct page *page = +- skb_frag_page(&skb_shinfo(skb)->frags[0]); +- unsigned long pfn = page_to_pfn(page); +- void *vaddr = page_address(page); ++ ref = np->grant_rx_ref[id]; ++ if (ref == GRANT_INVALID_REF) ++ continue; + +- MULTI_update_va_mapping(mcl, (unsigned long)vaddr, +- mfn_pte(mfn, PAGE_KERNEL), +- 0); +- mcl++; +- mmu->ptr = ((u64)mfn << PAGE_SHIFT) +- | MMU_MACHPHYS_UPDATE; +- mmu->val = pfn; +- mmu++; ++ page = skb_frag_page(&skb_shinfo(skb)->frags[0]); + +- set_phys_to_machine(pfn, mfn); +- } +- __skb_queue_tail(&free_list, skb); +- xfer++; +- } +- +- dev_info(&np->netdev->dev, "%s: %d xfer, %d noxfer, %d unused\n", +- __func__, xfer, noxfer, unused); ++ /* gnttab_end_foreign_access() needs a page ref until ++ * foreign access is ended (which may be deferred). ++ */ ++ get_page(page); ++ gnttab_end_foreign_access(ref, 0, ++ (unsigned long)page_address(page)); ++ np->grant_rx_ref[id] = GRANT_INVALID_REF; + +- if (xfer) { +- if (!xen_feature(XENFEAT_auto_translated_physmap)) { +- /* Do all the remapping work and M2P updates. */ +- MULTI_mmu_update(mcl, np->rx_mmu, mmu - np->rx_mmu, +- NULL, DOMID_SELF); +- mcl++; +- HYPERVISOR_multicall(np->rx_mcl, mcl - np->rx_mcl); +- } ++ kfree_skb(skb); + } + +- __skb_queue_purge(&free_list); +- + spin_unlock_bh(&np->rx_lock); + } + +@@ -1333,6 +1296,7 @@ static struct net_device *xennet_create_ + for (i = 0; i < NET_RX_RING_SIZE; i++) { + np->rx_skbs[i] = NULL; + np->grant_rx_ref[i] = GRANT_INVALID_REF; ++ np->grant_tx_page[i] = NULL; + } + + /* A grant for every tx ring slot */