From: Greg Kroah-Hartman Date: Wed, 11 Sep 2013 16:13:50 +0000 (-0700) Subject: 3.4-stable patches X-Git-Tag: v3.0.96~10 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=8dc8de6aec819b2f8b01481e3f801e553fedb43c;p=thirdparty%2Fkernel%2Fstable-queue.git 3.4-stable patches added patches: 8139cp-fix-skb-leak-in-rx_status_loop-failure-path.patch af_key-initialize-satype-in-key_notify_policy_flush.patch bonding-modify-only-neigh_parms-owned-by-us.patch fib_trie-remove-potential-out-of-bound-access.patch htb-fix-sign-extension-bug.patch icmpv6-treat-dest-unreachable-codes-5-and-6-as-eacces-not-eproto.patch ipv6-don-t-depend-on-per-socket-memory-for-neighbour-discovery-messages.patch ipv6-don-t-stop-backtracking-in-fib6_lookup_1-if-subtree-does-not-match.patch ipv6-drop-packets-with-multiple-fragmentation-headers.patch ipv6-remove-max_addresses-check-from-ipv6_create_tempaddr.patch macvtap-do-not-zerocopy-if-iov-needs-more-pages-than-max_skb_frags.patch neighbour-populate-neigh_parms-on-alloc-before-calling-ndo_neigh_setup.patch net-bridge-convert-mldv2-query-mrc-into-msecs_to_jiffies-for-max_delay.patch net-check-net.core.somaxconn-sysctl-values.patch net-ipv6-tcp-fix-potential-use-after-free-in-tcp_v6_do_rcv.patch tcp-cubic-fix-bug-in-bictcp_acked.patch tcp-cubic-fix-overflow-error-in-bictcp_update.patch tipc-fix-lockdep-warning-during-bearer-initialization.patch tun-signedness-bug-in-tun_get_user.patch vhost-zerocopy-poll-vq-in-zerocopy-callback.patch --- diff --git a/queue-3.4/8139cp-fix-skb-leak-in-rx_status_loop-failure-path.patch b/queue-3.4/8139cp-fix-skb-leak-in-rx_status_loop-failure-path.patch new file mode 100644 index 00000000000..0ead4b05271 --- /dev/null +++ b/queue-3.4/8139cp-fix-skb-leak-in-rx_status_loop-failure-path.patch @@ -0,0 +1,29 @@ +From b2fb347f0dae2ffea9234d3c6b4fd6ad4b75fe81 Mon Sep 17 00:00:00 2001 +From: Dave Jones +Date: Fri, 9 Aug 2013 11:16:34 -0700 +Subject: 8139cp: Fix skb leak in rx_status_loop failure path. 
+ +From: Dave Jones + +[ Upstream commit d06f5187469eee1b2932c02fd093d113cfc60d5e ] + +Introduced in cf3c4c03060b688cbc389ebc5065ebcce5653e96 +("8139cp: Add dma_mapping_error checking") + +Signed-off-by: Dave Jones +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/realtek/8139cp.c | 1 + + 1 file changed, 1 insertion(+) + +--- a/drivers/net/ethernet/realtek/8139cp.c ++++ b/drivers/net/ethernet/realtek/8139cp.c +@@ -524,6 +524,7 @@ rx_status_loop: + PCI_DMA_FROMDEVICE); + if (dma_mapping_error(&cp->pdev->dev, new_mapping)) { + dev->stats.rx_dropped++; ++ kfree_skb(new_skb); + goto rx_next; + } + diff --git a/queue-3.4/af_key-initialize-satype-in-key_notify_policy_flush.patch b/queue-3.4/af_key-initialize-satype-in-key_notify_policy_flush.patch new file mode 100644 index 00000000000..004bd147d41 --- /dev/null +++ b/queue-3.4/af_key-initialize-satype-in-key_notify_policy_flush.patch @@ -0,0 +1,29 @@ +From 930e232cc73bdf918a0896ffc458902ab8897a88 Mon Sep 17 00:00:00 2001 +From: Nicolas Dichtel +Date: Mon, 18 Feb 2013 16:24:20 +0100 +Subject: af_key: initialize satype in key_notify_policy_flush() + +From: Nicolas Dichtel + +[ Upstream commit 85dfb745ee40232876663ae206cba35f24ab2a40 ] + +This field was left uninitialized. Some user daemons perform check against this +field. 
+ +Signed-off-by: Nicolas Dichtel +Signed-off-by: Steffen Klassert +Signed-off-by: Greg Kroah-Hartman +--- + net/key/af_key.c | 1 + + 1 file changed, 1 insertion(+) + +--- a/net/key/af_key.c ++++ b/net/key/af_key.c +@@ -1704,6 +1704,7 @@ static int key_notify_sa_flush(const str + hdr->sadb_msg_pid = c->pid; + hdr->sadb_msg_version = PF_KEY_V2; + hdr->sadb_msg_errno = (uint8_t) 0; ++ hdr->sadb_msg_satype = SADB_SATYPE_UNSPEC; + hdr->sadb_msg_len = (sizeof(struct sadb_msg) / sizeof(uint64_t)); + hdr->sadb_msg_reserved = 0; + diff --git a/queue-3.4/bonding-modify-only-neigh_parms-owned-by-us.patch b/queue-3.4/bonding-modify-only-neigh_parms-owned-by-us.patch new file mode 100644 index 00000000000..cce2f8d3616 --- /dev/null +++ b/queue-3.4/bonding-modify-only-neigh_parms-owned-by-us.patch @@ -0,0 +1,40 @@ +From f2c884d19bed59210a90449397ea9d34de0240ba Mon Sep 17 00:00:00 2001 +From: Veaceslav Falico +Date: Fri, 2 Aug 2013 19:07:39 +0200 +Subject: bonding: modify only neigh_parms owned by us + +From: Veaceslav Falico + +[ Upstream commit 9918d5bf329d0dc5bb2d9d293bcb772bdb626e65 ] + +Otherwise, on neighbour creation, bond_neigh_init() will be called with a +foreign netdev. + +Signed-off-by: Veaceslav Falico +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/bonding/bond_main.c | 8 +++++++- + 1 file changed, 7 insertions(+), 1 deletion(-) + +--- a/drivers/net/bonding/bond_main.c ++++ b/drivers/net/bonding/bond_main.c +@@ -3750,11 +3750,17 @@ static int bond_neigh_init(struct neighb + * The bonding ndo_neigh_setup is called at init time beofre any + * slave exists. So we must declare proxy setup function which will + * be used at run time to resolve the actual slave neigh param setup. ++ * ++ * It's also called by master devices (such as vlans) to setup their ++ * underlying devices. In that case - do nothing, we're already set up from ++ * our init. 
+ */ + static int bond_neigh_setup(struct net_device *dev, + struct neigh_parms *parms) + { +- parms->neigh_setup = bond_neigh_init; ++ /* modify only our neigh_parms */ ++ if (parms->dev == dev) ++ parms->neigh_setup = bond_neigh_init; + + return 0; + } diff --git a/queue-3.4/fib_trie-remove-potential-out-of-bound-access.patch b/queue-3.4/fib_trie-remove-potential-out-of-bound-access.patch new file mode 100644 index 00000000000..0163ea43ca6 --- /dev/null +++ b/queue-3.4/fib_trie-remove-potential-out-of-bound-access.patch @@ -0,0 +1,51 @@ +From 28561e4df3d4d37ea5dbf01cd8623a5fe2dbb369 Mon Sep 17 00:00:00 2001 +From: Eric Dumazet +Date: Mon, 5 Aug 2013 11:18:49 -0700 +Subject: fib_trie: remove potential out of bound access + +From: Eric Dumazet + +[ Upstream commit aab515d7c32a34300312416c50314e755ea6f765 ] + +AddressSanitizer [1] dynamic checker pointed a potential +out of bound access in leaf_walk_rcu() + +We could allocate one more slot in tnode_new() to leave the prefetch() +in-place but it looks not worth the pain. + +Bug added in commit 82cfbb008572b ("[IPV4] fib_trie: iterator recode") + +[1] : +https://code.google.com/p/address-sanitizer/wiki/AddressSanitizerForKernel + +Reported-by: Andrey Konovalov +Signed-off-by: Eric Dumazet +Cc: Dmitry Vyukov +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv4/fib_trie.c | 5 +---- + 1 file changed, 1 insertion(+), 4 deletions(-) + +--- a/net/ipv4/fib_trie.c ++++ b/net/ipv4/fib_trie.c +@@ -71,7 +71,6 @@ + #include + #include + #include +-#include + #include + #include + #include +@@ -1772,10 +1771,8 @@ static struct leaf *leaf_walk_rcu(struct + if (!c) + continue; + +- if (IS_LEAF(c)) { +- prefetch(rcu_dereference_rtnl(p->child[idx])); ++ if (IS_LEAF(c)) + return (struct leaf *) c; +- } + + /* Rescan start scanning in new node */ + p = (struct tnode *) c; diff --git a/queue-3.4/htb-fix-sign-extension-bug.patch b/queue-3.4/htb-fix-sign-extension-bug.patch new file mode 100644 index 00000000000..02472c0ae14 --- /dev/null +++ b/queue-3.4/htb-fix-sign-extension-bug.patch @@ -0,0 +1,39 @@ +From 470ca701447611fa3276dec1994b2be6fb3e3746 Mon Sep 17 00:00:00 2001 +From: stephen hemminger +Date: Thu, 1 Aug 2013 22:32:07 -0700 +Subject: htb: fix sign extension bug + +From: stephen hemminger + +[ Upstream commit cbd375567f7e4811b1c721f75ec519828ac6583f ] + +When userspace passes a large priority value +the assignment of the unsigned value hopt->prio +to signed int cl->prio causes cl->prio to become negative and the +comparison is with TC_HTB_NUMPRIO is always false. + +The result is that HTB crashes by referencing outside +the array when processing packets. With this patch the large value +wraps around like other values outside the normal range. + +See: https://bugzilla.kernel.org/show_bug.cgi?id=60669 + +Signed-off-by: Stephen Hemminger +Acked-by: Eric Dumazet +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/sched/sch_htb.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/net/sched/sch_htb.c ++++ b/net/sched/sch_htb.c +@@ -86,7 +86,7 @@ struct htb_class { + unsigned int children; + struct htb_class *parent; /* parent class */ + +- int prio; /* these two are used only by leaves... 
*/ ++ u32 prio; /* these two are used only by leaves... */ + int quantum; /* but stored for parent-to-leaf return */ + + union { diff --git a/queue-3.4/icmpv6-treat-dest-unreachable-codes-5-and-6-as-eacces-not-eproto.patch b/queue-3.4/icmpv6-treat-dest-unreachable-codes-5-and-6-as-eacces-not-eproto.patch new file mode 100644 index 00000000000..a149be719ba --- /dev/null +++ b/queue-3.4/icmpv6-treat-dest-unreachable-codes-5-and-6-as-eacces-not-eproto.patch @@ -0,0 +1,72 @@ +From a2681be0cbd9c7152cecd6c9a1ab5bfaa10e2471 Mon Sep 17 00:00:00 2001 +From: Jiri Bohac +Date: Fri, 30 Aug 2013 11:18:45 +0200 +Subject: ICMPv6: treat dest unreachable codes 5 and 6 as EACCES, not EPROTO + +From: Jiri Bohac + +[ Upstream commit 61e76b178dbe7145e8d6afa84bb4ccea71918994 ] + +RFC 4443 has defined two additional codes for ICMPv6 type 1 (destination +unreachable) messages: + 5 - Source address failed ingress/egress policy + 6 - Reject route to destination + +Now they are treated as protocol error and icmpv6_err_convert() converts them +to EPROTO. + +RFC 4443 says: + "Codes 5 and 6 are more informative subsets of code 1." + +Treat codes 5 and 6 as code 1 (EACCES) + +Btw, connect() returning -EPROTO confuses firefox, so that fallback to +other/IPv4 addresses does not work: +https://bugzilla.mozilla.org/show_bug.cgi?id=910773 + +Signed-off-by: Jiri Bohac +Acked-by: Hannes Frederic Sowa +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + include/linux/icmpv6.h | 2 ++ + net/ipv6/icmp.c | 10 +++++++++- + 2 files changed, 11 insertions(+), 1 deletion(-) + +--- a/include/linux/icmpv6.h ++++ b/include/linux/icmpv6.h +@@ -123,6 +123,8 @@ static inline struct icmp6hdr *icmp6_hdr + #define ICMPV6_NOT_NEIGHBOUR 2 + #define ICMPV6_ADDR_UNREACH 3 + #define ICMPV6_PORT_UNREACH 4 ++#define ICMPV6_POLICY_FAIL 5 ++#define ICMPV6_REJECT_ROUTE 6 + + /* + * Codes for Time Exceeded +--- a/net/ipv6/icmp.c ++++ b/net/ipv6/icmp.c +@@ -917,6 +917,14 @@ static const struct icmp6_err { + .err = ECONNREFUSED, + .fatal = 1, + }, ++ { /* POLICY_FAIL */ ++ .err = EACCES, ++ .fatal = 1, ++ }, ++ { /* REJECT_ROUTE */ ++ .err = EACCES, ++ .fatal = 1, ++ }, + }; + + int icmpv6_err_convert(u8 type, u8 code, int *err) +@@ -928,7 +936,7 @@ int icmpv6_err_convert(u8 type, u8 code, + switch (type) { + case ICMPV6_DEST_UNREACH: + fatal = 1; +- if (code <= ICMPV6_PORT_UNREACH) { ++ if (code < ARRAY_SIZE(tab_unreach)) { + *err = tab_unreach[code].err; + fatal = tab_unreach[code].fatal; + } diff --git a/queue-3.4/ipv6-don-t-depend-on-per-socket-memory-for-neighbour-discovery-messages.patch b/queue-3.4/ipv6-don-t-depend-on-per-socket-memory-for-neighbour-discovery-messages.patch new file mode 100644 index 00000000000..c73748e1946 --- /dev/null +++ b/queue-3.4/ipv6-don-t-depend-on-per-socket-memory-for-neighbour-discovery-messages.patch @@ -0,0 +1,81 @@ +From b124cefb7fb9d7d58f1b7579bedfd8b8cdad2b11 Mon Sep 17 00:00:00 2001 +From: Thomas Graf +Date: Tue, 3 Sep 2013 13:37:01 +0200 +Subject: ipv6: Don't depend on per socket memory for neighbour discovery messages + +From: Thomas Graf + +[ Upstream commit 25a6e6b84fba601eff7c28d30da8ad7cfbef0d43 ] + +Allocating skbs when sending out neighbour discovery messages +currently uses sock_alloc_send_skb() based on a per net namespace +socket and thus share a socket wmem buffer space. 
+ +If a netdevice is temporarily unable to transmit due to carrier +loss or for other reasons, the queued up ndisc messages will consume +all of the wmem space and will thus prevent from any more skbs to +be allocated even for netdevices that are able to transmit packets. + +The number of neighbour discovery messages sent is very limited, +use of alloc_skb() bypasses the socket wmem buffer size enforcement +while the manual call to skb_set_owner_w() maintains the socket +reference needed for the IPv6 output path. + +This patch has originally been posted by Eric Dumazet in a modified +form. + +Signed-off-by: Thomas Graf +Cc: Eric Dumazet +Cc: Hannes Frederic Sowa +Cc: Stephen Warren +Cc: Fabio Estevam +Tested-by: Fabio Estevam +Tested-by: Stephen Warren +Acked-by: Hannes Frederic Sowa +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv6/ndisc.c | 16 +++++++++------- + 1 file changed, 9 insertions(+), 7 deletions(-) + +--- a/net/ipv6/ndisc.c ++++ b/net/ipv6/ndisc.c +@@ -441,7 +441,6 @@ struct sk_buff *ndisc_build_skb(struct n + int hlen = LL_RESERVED_SPACE(dev); + int tlen = dev->needed_tailroom; + int len; +- int err; + u8 *opt; + + if (!dev->addr_len) +@@ -451,14 +450,12 @@ struct sk_buff *ndisc_build_skb(struct n + if (llinfo) + len += ndisc_opt_addr_space(dev); + +- skb = sock_alloc_send_skb(sk, +- (MAX_HEADER + sizeof(struct ipv6hdr) + +- len + hlen + tlen), +- 1, &err); ++ skb = alloc_skb((MAX_HEADER + sizeof(struct ipv6hdr) + ++ len + hlen + tlen), GFP_ATOMIC); + if (!skb) { + ND_PRINTK0(KERN_ERR +- "ICMPv6 ND: %s() failed to allocate an skb, err=%d.\n", +- __func__, err); ++ "ICMPv6 ND: %s() failed to allocate an skb.\n", ++ __func__); + return NULL; + } + +@@ -486,6 +483,11 @@ struct sk_buff *ndisc_build_skb(struct n + csum_partial(hdr, + len, 0)); + ++ /* Manually assign socket ownership as we avoid calling ++ * sock_alloc_send_pskb() to bypass wmem buffer limits ++ */ ++ skb_set_owner_w(skb, sk); ++ + return skb; + } + diff
--git a/queue-3.4/ipv6-don-t-stop-backtracking-in-fib6_lookup_1-if-subtree-does-not-match.patch b/queue-3.4/ipv6-don-t-stop-backtracking-in-fib6_lookup_1-if-subtree-does-not-match.patch new file mode 100644 index 00000000000..d2332bd5328 --- /dev/null +++ b/queue-3.4/ipv6-don-t-stop-backtracking-in-fib6_lookup_1-if-subtree-does-not-match.patch @@ -0,0 +1,58 @@ +From 2d8153827908cca60567ab7dd7abe92affca8823 Mon Sep 17 00:00:00 2001 +From: Hannes Frederic Sowa +Date: Wed, 7 Aug 2013 02:34:31 +0200 +Subject: ipv6: don't stop backtracking in fib6_lookup_1 if subtree does not match + +From: Hannes Frederic Sowa + +[ Upstream commit 3e3be275851bc6fc90bfdcd732cd95563acd982b ] + +In case a subtree did not match we currently stop backtracking and return +NULL (root table from fib_lookup). This could yield in invalid routing +table lookups when using subtrees. + +Instead continue to backtrack until a valid subtree or node is found +and return this match. + +Also remove unneeded NULL check. + +Reported-by: Teco Boot +Cc: YOSHIFUJI Hideaki +Cc: David Lamparter +Cc: +Signed-off-by: Hannes Frederic Sowa +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv6/ip6_fib.c | 16 ++++++++++++---- + 1 file changed, 12 insertions(+), 4 deletions(-) + +--- a/net/ipv6/ip6_fib.c ++++ b/net/ipv6/ip6_fib.c +@@ -949,14 +949,22 @@ static struct fib6_node * fib6_lookup_1( + + if (ipv6_prefix_equal(&key->addr, args->addr, key->plen)) { + #ifdef CONFIG_IPV6_SUBTREES +- if (fn->subtree) +- fn = fib6_lookup_1(fn->subtree, args + 1); ++ if (fn->subtree) { ++ struct fib6_node *sfn; ++ sfn = fib6_lookup_1(fn->subtree, ++ args + 1); ++ if (!sfn) ++ goto backtrack; ++ fn = sfn; ++ } + #endif +- if (!fn || fn->fn_flags & RTN_RTINFO) ++ if (fn->fn_flags & RTN_RTINFO) + return fn; + } + } +- ++#ifdef CONFIG_IPV6_SUBTREES ++backtrack: ++#endif + if (fn->fn_flags & RTN_ROOT) + break; + diff --git a/queue-3.4/ipv6-drop-packets-with-multiple-fragmentation-headers.patch b/queue-3.4/ipv6-drop-packets-with-multiple-fragmentation-headers.patch new file mode 100644 index 00000000000..c189b665130 --- /dev/null +++ b/queue-3.4/ipv6-drop-packets-with-multiple-fragmentation-headers.patch @@ -0,0 +1,63 @@ +From 301d2f6834afe6f4049b9193a85b05bbb65ffb6a Mon Sep 17 00:00:00 2001 +From: Hannes Frederic Sowa +Date: Fri, 16 Aug 2013 13:30:07 +0200 +Subject: ipv6: drop packets with multiple fragmentation headers + +From: Hannes Frederic Sowa + +[ Upstream commit f46078cfcd77fa5165bf849f5e568a7ac5fa569c ] + +It is not allowed for an ipv6 packet to contain multiple fragmentation +headers. So discard packets which were already reassembled by +fragmentation logic and send back a parameter problem icmp. + +The updates for RFC 6980 will come in later, I have to do a bit more +research here. + +Cc: YOSHIFUJI Hideaki +Signed-off-by: Hannes Frederic Sowa +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + include/linux/ipv6.h | 1 + + net/ipv6/reassembly.c | 5 +++++ + 2 files changed, 6 insertions(+) + +--- a/include/linux/ipv6.h ++++ b/include/linux/ipv6.h +@@ -260,6 +260,7 @@ struct inet6_skb_parm { + #define IP6SKB_XFRM_TRANSFORMED 1 + #define IP6SKB_FORWARDED 2 + #define IP6SKB_REROUTED 4 ++#define IP6SKB_FRAGMENTED 16 + }; + + #define IP6CB(skb) ((struct inet6_skb_parm*)((skb)->cb)) +--- a/net/ipv6/reassembly.c ++++ b/net/ipv6/reassembly.c +@@ -516,6 +516,7 @@ static int ip6_frag_reasm(struct frag_qu + head->tstamp = fq->q.stamp; + ipv6_hdr(head)->payload_len = htons(payload_len); + IP6CB(head)->nhoff = nhoff; ++ IP6CB(head)->flags |= IP6SKB_FRAGMENTED; + + /* Yes, and fold redundant checksum back. 8) */ + if (head->ip_summed == CHECKSUM_COMPLETE) +@@ -551,6 +552,9 @@ static int ipv6_frag_rcv(struct sk_buff + const struct ipv6hdr *hdr = ipv6_hdr(skb); + struct net *net = dev_net(skb_dst(skb)->dev); + ++ if (IP6CB(skb)->flags & IP6SKB_FRAGMENTED) ++ goto fail_hdr; ++ + IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_REASMREQDS); + + /* Jumbo payload inhibits frag. 
header */ +@@ -571,6 +575,7 @@ static int ipv6_frag_rcv(struct sk_buff + ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_REASMOKS); + + IP6CB(skb)->nhoff = (u8 *)fhdr - skb_network_header(skb); ++ IP6CB(skb)->flags |= IP6SKB_FRAGMENTED; + return 1; + } + diff --git a/queue-3.4/ipv6-remove-max_addresses-check-from-ipv6_create_tempaddr.patch b/queue-3.4/ipv6-remove-max_addresses-check-from-ipv6_create_tempaddr.patch new file mode 100644 index 00000000000..30143fd3ee6 --- /dev/null +++ b/queue-3.4/ipv6-remove-max_addresses-check-from-ipv6_create_tempaddr.patch @@ -0,0 +1,64 @@ +From 79ada7773990ac5e464479790e8b4dc8ab0d48ac Mon Sep 17 00:00:00 2001 +From: Hannes Frederic Sowa +Date: Fri, 16 Aug 2013 13:02:27 +0200 +Subject: ipv6: remove max_addresses check from ipv6_create_tempaddr + +From: Hannes Frederic Sowa + +[ Upstream commit 4b08a8f1bd8cb4541c93ec170027b4d0782dab52 ] + +Because of the max_addresses check attackers were able to disable privacy +extensions on an interface by creating enough autoconfigured addresses: + + + +But the check is not actually needed: max_addresses protects the +kernel to install too many ipv6 addresses on an interface and guards +addrconf_prefix_rcv to install further addresses as soon as this limit +is reached. We only generate temporary addresses in direct response of +a new address showing up. As soon as we filled up the maximum number of +addresses of an interface, we stop installing more addresses and thus +also stop generating more temp addresses. + +Even if the attacker tries to generate a lot of temporary addresses +by announcing a prefix and removing it again (lifetime == 0) we won't +install more temp addresses, because the temporary addresses do count +to the maximum number of addresses, thus we would stop installing new +autoconfigured addresses when the limit is reached. + +This patch fixes CVE-2013-0343 (but other layer-2 attacks are still +possible). + +Thanks to Ding Tianhong to bring this topic up again. 
+ +Signed-off-by: Hannes Frederic Sowa +Cc: Ding Tianhong +Cc: George Kargiotakis +Cc: P J P +Cc: YOSHIFUJI Hideaki +Acked-by: Ding Tianhong +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv6/addrconf.c | 10 ++++------ + 1 file changed, 4 insertions(+), 6 deletions(-) + +--- a/net/ipv6/addrconf.c ++++ b/net/ipv6/addrconf.c +@@ -910,12 +910,10 @@ retry: + if (ifp->flags & IFA_F_OPTIMISTIC) + addr_flags |= IFA_F_OPTIMISTIC; + +- ift = !max_addresses || +- ipv6_count_addresses(idev) < max_addresses ? +- ipv6_add_addr(idev, &addr, tmp_plen, +- ipv6_addr_type(&addr)&IPV6_ADDR_SCOPE_MASK, +- addr_flags) : NULL; +- if (!ift || IS_ERR(ift)) { ++ ift = ipv6_add_addr(idev, &addr, tmp_plen, ++ ipv6_addr_type(&addr)&IPV6_ADDR_SCOPE_MASK, ++ addr_flags); ++ if (IS_ERR(ift)) { + in6_ifa_put(ifp); + in6_dev_put(idev); + printk(KERN_INFO diff --git a/queue-3.4/macvtap-do-not-zerocopy-if-iov-needs-more-pages-than-max_skb_frags.patch b/queue-3.4/macvtap-do-not-zerocopy-if-iov-needs-more-pages-than-max_skb_frags.patch new file mode 100644 index 00000000000..4c85bf24dcb --- /dev/null +++ b/queue-3.4/macvtap-do-not-zerocopy-if-iov-needs-more-pages-than-max_skb_frags.patch @@ -0,0 +1,123 @@ +From 11613badc2b7ff4c08b8503ea2580d70117b995a Mon Sep 17 00:00:00 2001 +From: Jason Wang +Date: Tue, 6 Aug 2013 17:29:19 +0800 +Subject: macvtap: do not zerocopy if iov needs more pages than MAX_SKB_FRAGS + +From: Jason Wang + +commit ece793fcfc417b3925844be88a6a6dc82ae8f7c6 upstream. + +We try to linearize part of the skb when the number of iov is greater than +MAX_SKB_FRAGS. This is not enough since each single vector may occupy more than +one pages, so zerocopy_sg_fromiovec() may still fail and may break the guest +network. + +Solve this problem by calculate the pages needed for iov before trying to do +zerocopy and switch to use copy instead of zerocopy if it needs more than +MAX_SKB_FRAGS. 
+ +This is done through introducing a new helper to count the pages for iov, and +call uarg->callback() manually when switching from zerocopy to copy to notify +vhost. + +We can do further optimization on top. + +This bug were introduced from b92946e2919134ebe2a4083e4302236295ea2a73 +(macvtap: zerocopy: validate vectors before building skb). + +Cc: Michael S. Tsirkin +Signed-off-by: Jason Wang +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/macvtap.c | 62 +++++++++++++++++++++++++++++--------------------- + 1 file changed, 37 insertions(+), 25 deletions(-) + +--- a/drivers/net/macvtap.c ++++ b/drivers/net/macvtap.c +@@ -642,6 +642,28 @@ static int macvtap_skb_to_vnet_hdr(const + return 0; + } + ++static unsigned long iov_pages(const struct iovec *iv, int offset, ++ unsigned long nr_segs) ++{ ++ unsigned long seg, base; ++ int pages = 0, len, size; ++ ++ while (nr_segs && (offset >= iv->iov_len)) { ++ offset -= iv->iov_len; ++ ++iv; ++ --nr_segs; ++ } ++ ++ for (seg = 0; seg < nr_segs; seg++) { ++ base = (unsigned long)iv[seg].iov_base + offset; ++ len = iv[seg].iov_len - offset; ++ size = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT; ++ pages += size; ++ offset = 0; ++ } ++ ++ return pages; ++} + + /* Get packet from user space buffer */ + static ssize_t macvtap_get_user(struct macvtap_queue *q, struct msghdr *m, +@@ -688,31 +710,15 @@ static ssize_t macvtap_get_user(struct m + if (unlikely(count > UIO_MAXIOV)) + goto err; + +- if (m && m->msg_control && sock_flag(&q->sk, SOCK_ZEROCOPY)) +- zerocopy = true; +- +- if (zerocopy) { +- /* Userspace may produce vectors with count greater than +- * MAX_SKB_FRAGS, so we need to linearize parts of the skb +- * to let the rest of data to be fit in the frags. 
+- */ +- if (count > MAX_SKB_FRAGS) { +- copylen = iov_length(iv, count - MAX_SKB_FRAGS); +- if (copylen < vnet_hdr_len) +- copylen = 0; +- else +- copylen -= vnet_hdr_len; +- } +- /* There are 256 bytes to be copied in skb, so there is enough +- * room for skb expand head in case it is used. +- * The rest buffer is mapped from userspace. +- */ +- if (copylen < vnet_hdr.hdr_len) +- copylen = vnet_hdr.hdr_len; +- if (!copylen) +- copylen = GOODCOPY_LEN; ++ if (m && m->msg_control && sock_flag(&q->sk, SOCK_ZEROCOPY)) { ++ copylen = vnet_hdr.hdr_len ? vnet_hdr.hdr_len : GOODCOPY_LEN; + linear = copylen; +- } else { ++ if (iov_pages(iv, vnet_hdr_len + copylen, count) ++ <= MAX_SKB_FRAGS) ++ zerocopy = true; ++ } ++ ++ if (!zerocopy) { + copylen = len; + linear = vnet_hdr.hdr_len; + } +@@ -724,9 +730,15 @@ static ssize_t macvtap_get_user(struct m + + if (zerocopy) + err = zerocopy_sg_from_iovec(skb, iv, vnet_hdr_len, count); +- else ++ else { + err = skb_copy_datagram_from_iovec(skb, 0, iv, vnet_hdr_len, + len); ++ if (!err && m && m->msg_control) { ++ struct ubuf_info *uarg = m->msg_control; ++ uarg->callback(uarg); ++ } ++ } ++ + if (err) + goto err_kfree; + diff --git a/queue-3.4/neighbour-populate-neigh_parms-on-alloc-before-calling-ndo_neigh_setup.patch b/queue-3.4/neighbour-populate-neigh_parms-on-alloc-before-calling-ndo_neigh_setup.patch new file mode 100644 index 00000000000..f72dcc9ed13 --- /dev/null +++ b/queue-3.4/neighbour-populate-neigh_parms-on-alloc-before-calling-ndo_neigh_setup.patch @@ -0,0 +1,44 @@ +From 864c43185acc3b0cb04dd2d15d4f1a5a8ff6a557 Mon Sep 17 00:00:00 2001 +From: Veaceslav Falico +Date: Fri, 2 Aug 2013 19:07:38 +0200 +Subject: neighbour: populate neigh_parms on alloc before calling ndo_neigh_setup + +From: Veaceslav Falico + +[ Upstream commit 63134803a6369dcf7dddf7f0d5e37b9566b308d2 ] + +dev->ndo_neigh_setup() might need some of the values of neigh_parms, so +populate them before calling it. 
+ +Signed-off-by: Veaceslav Falico +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/core/neighbour.c | 10 ++++++---- + 1 file changed, 6 insertions(+), 4 deletions(-) + +--- a/net/core/neighbour.c ++++ b/net/core/neighbour.c +@@ -1442,16 +1442,18 @@ struct neigh_parms *neigh_parms_alloc(st + atomic_set(&p->refcnt, 1); + p->reachable_time = + neigh_rand_reach_time(p->base_reachable_time); ++ dev_hold(dev); ++ p->dev = dev; ++ write_pnet(&p->net, hold_net(net)); ++ p->sysctl_table = NULL; + + if (ops->ndo_neigh_setup && ops->ndo_neigh_setup(dev, p)) { ++ release_net(net); ++ dev_put(dev); + kfree(p); + return NULL; + } + +- dev_hold(dev); +- p->dev = dev; +- write_pnet(&p->net, hold_net(net)); +- p->sysctl_table = NULL; + write_lock_bh(&tbl->lock); + p->next = tbl->parms.next; + tbl->parms.next = p; diff --git a/queue-3.4/net-bridge-convert-mldv2-query-mrc-into-msecs_to_jiffies-for-max_delay.patch b/queue-3.4/net-bridge-convert-mldv2-query-mrc-into-msecs_to_jiffies-for-max_delay.patch new file mode 100644 index 00000000000..df0aed0b3f4 --- /dev/null +++ b/queue-3.4/net-bridge-convert-mldv2-query-mrc-into-msecs_to_jiffies-for-max_delay.patch @@ -0,0 +1,43 @@ +From 7a46c6dcf91c3025160c9163f5b5a82400c3c078 Mon Sep 17 00:00:00 2001 +From: Daniel Borkmann +Date: Thu, 29 Aug 2013 23:55:05 +0200 +Subject: net: bridge: convert MLDv2 Query MRC into msecs_to_jiffies for max_delay + +From: Daniel Borkmann + +[ Upstream commit 2d98c29b6fb3de44d9eaa73c09f9cf7209346383 ] + +While looking into MLDv1/v2 code, I noticed that bridging code does +not convert it's max delay into jiffies for MLDv2 messages as we do +in core IPv6' multicast code. + +RFC3810, 5.1.3. Maximum Response Code says: + + The Maximum Response Code field specifies the maximum time allowed + before sending a responding Report. 
The actual time allowed, called + the Maximum Response Delay, is represented in units of milliseconds, + and is derived from the Maximum Response Code as follows: [...] + +As we update timers that work with jiffies, we need to convert it. + +Signed-off-by: Daniel Borkmann +Cc: Linus Lüssing +Cc: Hannes Frederic Sowa +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/bridge/br_multicast.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +--- a/net/bridge/br_multicast.c ++++ b/net/bridge/br_multicast.c +@@ -1155,7 +1155,8 @@ static int br_ip6_multicast_query(struct + mld2q = (struct mld2_query *)icmp6_hdr(skb); + if (!mld2q->mld2q_nsrcs) + group = &mld2q->mld2q_mca; +- max_delay = mld2q->mld2q_mrc ? MLDV2_MRC(mld2q->mld2q_mrc) : 1; ++ ++ max_delay = max(msecs_to_jiffies(MLDV2_MRC(ntohs(mld2q->mld2q_mrc))), 1UL); + } + + if (!group) diff --git a/queue-3.4/net-check-net.core.somaxconn-sysctl-values.patch b/queue-3.4/net-check-net.core.somaxconn-sysctl-values.patch new file mode 100644 index 00000000000..6e0205a261a --- /dev/null +++ b/queue-3.4/net-check-net.core.somaxconn-sysctl-values.patch @@ -0,0 +1,70 @@ +From 52ab602137b1f0948243d36c6e393fd0643aaa96 Mon Sep 17 00:00:00 2001 +From: Roman Gushchin +Date: Fri, 2 Aug 2013 18:36:40 +0400 +Subject: net: check net.core.somaxconn sysctl values + +From: Roman Gushchin + +[ Upstream commit 5f671d6b4ec3e6d66c2a868738af2cdea09e7509 ] + +It's possible to assign an invalid value to the net.core.somaxconn +sysctl variable, because there is no checks at all. + +The sk_max_ack_backlog field of the sock structure is defined as +unsigned short. Therefore, the backlog argument in inet_listen() +shouldn't exceed USHRT_MAX. The backlog argument in the listen() syscall +is truncated to the somaxconn value. So, the somaxconn value shouldn't +exceed 65535 (USHRT_MAX). +Also, negative values of somaxconn are meaningless. 
+ +before: +$ sysctl -w net.core.somaxconn=256 +net.core.somaxconn = 256 +$ sysctl -w net.core.somaxconn=65536 +net.core.somaxconn = 65536 +$ sysctl -w net.core.somaxconn=-100 +net.core.somaxconn = -100 + +after: +$ sysctl -w net.core.somaxconn=256 +net.core.somaxconn = 256 +$ sysctl -w net.core.somaxconn=65536 +error: "Invalid argument" setting key "net.core.somaxconn" +$ sysctl -w net.core.somaxconn=-100 +error: "Invalid argument" setting key "net.core.somaxconn" + +Based on a prior patch from Changli Gao. + +Signed-off-by: Roman Gushchin +Reported-by: Changli Gao +Suggested-by: Eric Dumazet +Acked-by: Eric Dumazet +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/core/sysctl_net_core.c | 7 ++++++- + 1 file changed, 6 insertions(+), 1 deletion(-) + +--- a/net/core/sysctl_net_core.c ++++ b/net/core/sysctl_net_core.c +@@ -19,6 +19,9 @@ + #include + #include + ++static int zero = 0; ++static int ushort_max = USHRT_MAX; ++ + #ifdef CONFIG_RPS + static int rps_sock_flow_sysctl(ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +@@ -197,7 +200,9 @@ static struct ctl_table netns_core_table + .data = &init_net.core.sysctl_somaxconn, + .maxlen = sizeof(int), + .mode = 0644, +- .proc_handler = proc_dointvec ++ .extra1 = &zero, ++ .extra2 = &ushort_max, ++ .proc_handler = proc_dointvec_minmax + }, + { } + }; diff --git a/queue-3.4/net-ipv6-tcp-fix-potential-use-after-free-in-tcp_v6_do_rcv.patch b/queue-3.4/net-ipv6-tcp-fix-potential-use-after-free-in-tcp_v6_do_rcv.patch new file mode 100644 index 00000000000..984eaf7625b --- /dev/null +++ b/queue-3.4/net-ipv6-tcp-fix-potential-use-after-free-in-tcp_v6_do_rcv.patch @@ -0,0 +1,45 @@ +From c596c9f2530e3d305d2e4d9e3491df8fab08c97f Mon Sep 17 00:00:00 2001 +From: Daniel Borkmann +Date: Tue, 3 Sep 2013 19:29:12 +0200 +Subject: net: ipv6: tcp: fix potential use after free in tcp_v6_do_rcv + +From: Daniel Borkmann + +[ Upstream commit 3a1c756590633c0e86df606e5c618c190926a0df 
] + +In tcp_v6_do_rcv() code, when processing pkt options, we solely work +on our skb clone opt_skb that we've created earlier before entering +tcp_rcv_established() on our way. However, only in condition ... + + if (np->rxopt.bits.rxtclass) + np->rcv_tclass = ipv6_get_dsfield(ipv6_hdr(skb)); + +... we work on skb itself. As we extract every other information out +of opt_skb in ipv6_pktoptions path, this seems wrong, since skb can +already be released by tcp_rcv_established() earlier on. When we try +to access it in ipv6_hdr(), we will dereference freed skb. + +[ Bug added by commit 4c507d2897bd9b ("net: implement IP_RECVTOS for + IP_PKTOPTIONS") ] + +Signed-off-by: Daniel Borkmann +Cc: Eric Dumazet +Acked-by: Eric Dumazet +Acked-by: Jiri Benc +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv6/tcp_ipv6.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/net/ipv6/tcp_ipv6.c ++++ b/net/ipv6/tcp_ipv6.c +@@ -1571,7 +1571,7 @@ ipv6_pktoptions: + if (np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) + np->mcast_hops = ipv6_hdr(opt_skb)->hop_limit; + if (np->rxopt.bits.rxtclass) +- np->rcv_tclass = ipv6_tclass(ipv6_hdr(skb)); ++ np->rcv_tclass = ipv6_tclass(ipv6_hdr(opt_skb)); + if (ipv6_opt_accepted(sk, opt_skb)) { + skb_set_owner_r(opt_skb, sk); + opt_skb = xchg(&np->pktoptions, opt_skb); diff --git a/queue-3.4/series b/queue-3.4/series new file mode 100644 index 00000000000..1251e643cec --- /dev/null +++ b/queue-3.4/series @@ -0,0 +1,20 @@ +htb-fix-sign-extension-bug.patch +net-check-net.core.somaxconn-sysctl-values.patch +neighbour-populate-neigh_parms-on-alloc-before-calling-ndo_neigh_setup.patch +bonding-modify-only-neigh_parms-owned-by-us.patch +fib_trie-remove-potential-out-of-bound-access.patch +tcp-cubic-fix-overflow-error-in-bictcp_update.patch +tcp-cubic-fix-bug-in-bictcp_acked.patch +ipv6-don-t-stop-backtracking-in-fib6_lookup_1-if-subtree-does-not-match.patch
+8139cp-fix-skb-leak-in-rx_status_loop-failure-path.patch +tun-signedness-bug-in-tun_get_user.patch +ipv6-remove-max_addresses-check-from-ipv6_create_tempaddr.patch +ipv6-drop-packets-with-multiple-fragmentation-headers.patch +ipv6-don-t-depend-on-per-socket-memory-for-neighbour-discovery-messages.patch +net-bridge-convert-mldv2-query-mrc-into-msecs_to_jiffies-for-max_delay.patch +icmpv6-treat-dest-unreachable-codes-5-and-6-as-eacces-not-eproto.patch +net-ipv6-tcp-fix-potential-use-after-free-in-tcp_v6_do_rcv.patch +vhost-zerocopy-poll-vq-in-zerocopy-callback.patch +macvtap-do-not-zerocopy-if-iov-needs-more-pages-than-max_skb_frags.patch +tipc-fix-lockdep-warning-during-bearer-initialization.patch +af_key-initialize-satype-in-key_notify_policy_flush.patch diff --git a/queue-3.4/tcp-cubic-fix-bug-in-bictcp_acked.patch b/queue-3.4/tcp-cubic-fix-bug-in-bictcp_acked.patch new file mode 100644 index 00000000000..bd89c19a7c5 --- /dev/null +++ b/queue-3.4/tcp-cubic-fix-bug-in-bictcp_acked.patch @@ -0,0 +1,46 @@ +From 6c9eced920a991497673accec4df3a17ca3ee1a4 Mon Sep 17 00:00:00 2001 +From: Eric Dumazet +Date: Mon, 5 Aug 2013 20:05:12 -0700 +Subject: tcp: cubic: fix bug in bictcp_acked() + +From: Eric Dumazet + +[ Upstream commit cd6b423afd3c08b27e1fed52db828ade0addbc6b ] + +While investigating a strange increase in retransmit rates +on hosts ~24 days after boot, Van found hystart was disabled +if ca->epoch_start was 0, as the following condition is true +when the tcp_time_stamp high order bit is set. + +(s32)(tcp_time_stamp - ca->epoch_start) < HZ + +Quoting Van : + + At initialization & after every loss ca->epoch_start is set to zero so + I believe that the above line will turn off hystart as soon as the 2^31 + bit is set in tcp_time_stamp & hystart will stay off for 24 days. + I think we've observed that cubic's restart is too aggressive without + hystart so this might account for the higher drop rate we observe. 
+ +Diagnosed-by: Van Jacobson +Signed-off-by: Eric Dumazet +Cc: Neal Cardwell +Cc: Yuchung Cheng +Acked-by: Neal Cardwell +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv4/tcp_cubic.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/net/ipv4/tcp_cubic.c ++++ b/net/ipv4/tcp_cubic.c +@@ -416,7 +416,7 @@ static void bictcp_acked(struct sock *sk + return; + + /* Discard delay samples right after fast recovery */ +- if ((s32)(tcp_time_stamp - ca->epoch_start) < HZ) ++ if (ca->epoch_start && (s32)(tcp_time_stamp - ca->epoch_start) < HZ) + return; + + delay = (rtt_us << 3) / USEC_PER_MSEC; diff --git a/queue-3.4/tcp-cubic-fix-overflow-error-in-bictcp_update.patch b/queue-3.4/tcp-cubic-fix-overflow-error-in-bictcp_update.patch new file mode 100644 index 00000000000..a13b7c7c416 --- /dev/null +++ b/queue-3.4/tcp-cubic-fix-overflow-error-in-bictcp_update.patch @@ -0,0 +1,68 @@ +From 9b5d5463fad24e4487187d9bb64f03921f108aeb Mon Sep 17 00:00:00 2001 +From: Eric Dumazet +Date: Mon, 5 Aug 2013 17:10:15 -0700 +Subject: tcp: cubic: fix overflow error in bictcp_update() + +From: Eric Dumazet + +[ Upstream commit 2ed0edf9090bf4afa2c6fc4f38575a85a80d4b20 ] + +commit 17a6e9f1aa9 ("tcp_cubic: fix clock dependency") added an +overflow error in bictcp_update() in the following code: + +/* change the unit from HZ to bictcp_HZ */ +t = ((tcp_time_stamp + msecs_to_jiffies(ca->delay_min>>3) - + ca->epoch_start) << BICTCP_HZ) / HZ; + +Because msecs_to_jiffies() returns unsigned long, the compiler does +implicit type promotion. + +We really want to constrain (tcp_time_stamp - ca->epoch_start) +to a signed 32bit value, or else 't' has unexpected high values. + +This bug triggers an increase in retransmit rates ~24 days after +boot [1], as the high order bit of tcp_time_stamp flips. + +[1] for hosts with HZ=1000 + +Big thanks to Van Jacobson for spotting this problem. 
+ +Diagnosed-by: Van Jacobson +Signed-off-by: Eric Dumazet +Cc: Neal Cardwell +Cc: Yuchung Cheng +Cc: Stephen Hemminger +Acked-by: Neal Cardwell +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv4/tcp_cubic.c | 10 ++++++---- + 1 file changed, 6 insertions(+), 4 deletions(-) + +--- a/net/ipv4/tcp_cubic.c ++++ b/net/ipv4/tcp_cubic.c +@@ -206,8 +206,8 @@ static u32 cubic_root(u64 a) + */ + static inline void bictcp_update(struct bictcp *ca, u32 cwnd) + { +- u64 offs; +- u32 delta, t, bic_target, max_cnt; ++ u32 delta, bic_target, max_cnt; ++ u64 offs, t; + + ca->ack_cnt++; /* count the number of ACKs */ + +@@ -250,9 +250,11 @@ static inline void bictcp_update(struct + * if the cwnd < 1 million packets !!! + */ + ++ t = (s32)(tcp_time_stamp - ca->epoch_start); ++ t += msecs_to_jiffies(ca->delay_min >> 3); + /* change the unit from HZ to bictcp_HZ */ +- t = ((tcp_time_stamp + msecs_to_jiffies(ca->delay_min>>3) +- - ca->epoch_start) << BICTCP_HZ) / HZ; ++ t <<= BICTCP_HZ; ++ do_div(t, HZ); + + if (t < ca->bic_K) /* t - K */ + offs = ca->bic_K - t; diff --git a/queue-3.4/tipc-fix-lockdep-warning-during-bearer-initialization.patch b/queue-3.4/tipc-fix-lockdep-warning-during-bearer-initialization.patch new file mode 100644 index 00000000000..ef776a34046 --- /dev/null +++ b/queue-3.4/tipc-fix-lockdep-warning-during-bearer-initialization.patch @@ -0,0 +1,175 @@ +From 9ee9730a92ab8f0bf0e2b3994a9be5fc82380b7c Mon Sep 17 00:00:00 2001 +From: Ying Xue +Date: Thu, 16 Aug 2012 12:09:07 +0000 +Subject: tipc: fix lockdep warning during bearer initialization + +From: Ying Xue + +[ Upstream commit 4225a398c1352a7a5c14dc07277cb5cc4473983b ] + +When the lockdep validator is enabled, it will report the below +warning when we enable a TIPC bearer: + +[ INFO: possible irq lock inversion dependency detected ] +--------------------------------------------------------- +Possible interrupt unsafe locking scenario: + + CPU0 CPU1 + ---- ---- + lock(ptype_lock); + 
local_irq_disable(); + lock(tipc_net_lock); + lock(ptype_lock); + + lock(tipc_net_lock); + + *** DEADLOCK *** + +the shortest dependencies between 2nd lock and 1st lock: + -> (ptype_lock){+.+...} ops: 10 { +[...] +SOFTIRQ-ON-W at: + [] __lock_acquire+0x528/0x13e0 + [] lock_acquire+0x90/0x100 + [] _raw_spin_lock+0x38/0x50 + [] dev_add_pack+0x3a/0x60 + [] arp_init+0x1a/0x48 + [] inet_init+0x181/0x27e + [] do_one_initcall+0x34/0x170 + [] kernel_init+0x110/0x1b2 + [] kernel_thread_helper+0x6/0x10 +[...] + ... key at: [] ptype_lock+0x10/0x20 + ... acquired at: + [] lock_acquire+0x90/0x100 + [] _raw_spin_lock+0x38/0x50 + [] dev_add_pack+0x3a/0x60 + [] enable_bearer+0xf2/0x140 [tipc] + [] tipc_enable_bearer+0x1ba/0x450 [tipc] + [] tipc_cfg_do_cmd+0x5c4/0x830 [tipc] + [] handle_cmd+0x42/0xd0 [tipc] + [] genl_rcv_msg+0x232/0x280 + [] netlink_rcv_skb+0x86/0xb0 + [] genl_rcv+0x1c/0x30 + [] netlink_unicast+0x174/0x1f0 + [] netlink_sendmsg+0x1eb/0x2d0 + [] sock_aio_write+0x161/0x170 + [] do_sync_write+0xac/0xf0 + [] vfs_write+0x156/0x170 + [] sys_write+0x42/0x70 + [] sysenter_do_call+0x12/0x38 +[...] +} + -> (tipc_net_lock){+..-..} ops: 4 { +[...] + IN-SOFTIRQ-R at: + [] __lock_acquire+0x64a/0x13e0 + [] lock_acquire+0x90/0x100 + [] _raw_read_lock_bh+0x3d/0x50 + [] tipc_recv_msg+0x1d/0x830 [tipc] + [] recv_msg+0x3f/0x50 [tipc] + [] __netif_receive_skb+0x22a/0x590 + [] netif_receive_skb+0x2b/0xf0 + [] pcnet32_poll+0x292/0x780 + [] net_rx_action+0xfa/0x1e0 + [] __do_softirq+0xae/0x1e0 +[...] +} + +From the log, we can see three different call chains between +CPU0 and CPU1: + +Time 0 on CPU0: + + kernel_init()->inet_init()->dev_add_pack() + +At time 0, the ptype_lock is held by CPU0 in dev_add_pack(); + +Time 1 on CPU1: + + tipc_enable_bearer()->enable_bearer()->dev_add_pack() + +At time 1, tipc_enable_bearer() first holds tipc_net_lock, and then +wants to take ptype_lock to register TIPC protocol handler into the +networking stack. 
But the ptype_lock has been taken by dev_add_pack() +on CPU0, so at this time the dev_add_pack() running on CPU1 has to be +busy looping. + +Time 2 on CPU0: + + netif_receive_skb()->recv_msg()->tipc_recv_msg() + +At time 2, an incoming TIPC packet arrives at CPU0, hence +tipc_recv_msg() will be invoked. In tipc_recv_msg(), it first wants +to hold tipc_net_lock. At the moment, below scenario happens: + +On CPU0, below is our sequence of taking locks: + + lock(ptype_lock)->lock(tipc_net_lock) + +On CPU1, our sequence of taking locks looks like: + + lock(tipc_net_lock)->lock(ptype_lock) + +Obviously deadlock may happen in this case. + +But please note the deadlock possibly doesn't occur at all when the +first TIPC bearer is enabled. Before enable_bearer() -- running on +CPU1 does not hold ptype_lock, so the TIPC receive handler (i.e. +recv_msg()) is not registered successfully via dev_add_pack(), so +the tipc_recv_msg() cannot be called by recv_msg() even if a TIPC +message comes to CPU0. But when the second TIPC bearer is +registered, the deadlock can perhaps really happen. + +To fix it, we will push the work of registering TIPC protocol +handler into workqueue context. After the change, both paths taking +ptype_lock are always in process contexts, thus, the deadlock should +never occur. + +Signed-off-by: Ying Xue +Signed-off-by: Jon Maloy +Signed-off-by: Paul Gortmaker +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/tipc/eth_media.c | 15 ++++++++++++++- + 1 file changed, 14 insertions(+), 1 deletion(-) + +--- a/net/tipc/eth_media.c ++++ b/net/tipc/eth_media.c +@@ -53,6 +53,7 @@ struct eth_bearer { + struct tipc_bearer *bearer; + struct net_device *dev; + struct packet_type tipc_packet_type; ++ struct work_struct setup; + struct work_struct cleanup; + }; + +@@ -138,6 +139,17 @@ static int recv_msg(struct sk_buff *buf, + } + + /** ++ * setup_bearer - setup association between Ethernet bearer and interface ++ */ ++static void setup_bearer(struct work_struct *work) ++{ ++ struct eth_bearer *eb_ptr = ++ container_of(work, struct eth_bearer, setup); ++ ++ dev_add_pack(&eb_ptr->tipc_packet_type); ++} ++ ++/** + * enable_bearer - attach TIPC bearer to an Ethernet interface + */ + +@@ -181,7 +193,8 @@ static int enable_bearer(struct tipc_bea + eb_ptr->tipc_packet_type.func = recv_msg; + eb_ptr->tipc_packet_type.af_packet_priv = eb_ptr; + INIT_LIST_HEAD(&(eb_ptr->tipc_packet_type.list)); +- dev_add_pack(&eb_ptr->tipc_packet_type); ++ INIT_WORK(&eb_ptr->setup, setup_bearer); ++ schedule_work(&eb_ptr->setup); + + /* Associate TIPC bearer with Ethernet bearer */ + diff --git a/queue-3.4/tun-signedness-bug-in-tun_get_user.patch b/queue-3.4/tun-signedness-bug-in-tun_get_user.patch new file mode 100644 index 00000000000..5769fdcb8c8 --- /dev/null +++ b/queue-3.4/tun-signedness-bug-in-tun_get_user.patch @@ -0,0 +1,46 @@ +From 3cf27a163c54c80879076abb04c6ce40fb6f679b Mon Sep 17 00:00:00 2001 +From: Dan Carpenter +Date: Thu, 15 Aug 2013 15:52:57 +0300 +Subject: tun: signedness bug in tun_get_user() + +From: Dan Carpenter + +[ Upstream commit 15718ea0d844e4816dbd95d57a8a0e3e264ba90e ] + +The recent fix d9bf5f1309 "tun: compare with 0 instead of total_len" is +not totally correct. Because "len" and "sizeof()" are size_t type, that +means they are never less than zero. + +Signed-off-by: Dan Carpenter +Acked-by: Michael S. 
Tsirkin +Acked-by: Neil Horman +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/tun.c | 6 ++++-- + 1 file changed, 4 insertions(+), 2 deletions(-) + +--- a/drivers/net/tun.c ++++ b/drivers/net/tun.c +@@ -615,8 +615,9 @@ static ssize_t tun_get_user(struct tun_s + int offset = 0; + + if (!(tun->flags & TUN_NO_PI)) { +- if ((len -= sizeof(pi)) > count) ++ if (len < sizeof(pi)) + return -EINVAL; ++ len -= sizeof(pi); + + if (memcpy_fromiovecend((void *)&pi, iv, 0, sizeof(pi))) + return -EFAULT; +@@ -624,8 +625,9 @@ static ssize_t tun_get_user(struct tun_s + } + + if (tun->flags & TUN_VNET_HDR) { +- if ((len -= tun->vnet_hdr_sz) > count) ++ if (len < tun->vnet_hdr_sz) + return -EINVAL; ++ len -= tun->vnet_hdr_sz; + + if (memcpy_fromiovecend((void *)&gso, iv, offset, sizeof(gso))) + return -EFAULT; diff --git a/queue-3.4/vhost-zerocopy-poll-vq-in-zerocopy-callback.patch b/queue-3.4/vhost-zerocopy-poll-vq-in-zerocopy-callback.patch new file mode 100644 index 00000000000..b534e32f0df --- /dev/null +++ b/queue-3.4/vhost-zerocopy-poll-vq-in-zerocopy-callback.patch @@ -0,0 +1,31 @@ +From b732f7499646e4ba41eec865761de8d2d18a73dc Mon Sep 17 00:00:00 2001 +From: Jason Wang +Date: Tue, 6 Aug 2013 17:29:18 +0800 +Subject: vhost: zerocopy: poll vq in zerocopy callback + +From: Jason Wang + +commit c70aa540c7a9f67add11ad3161096fb95233aa2e upstream. + +We add used and signal guest in worker thread but did not poll the virtqueue +during the zero copy callback. This may lead the missing of adding and +signalling during zerocopy. Solve this by polling the virtqueue and let it +wakeup the worker during callback. + +Signed-off-by: Jason Wang +Signed-off-by: Michael S. 
Tsirkin +Signed-off-by: Greg Kroah-Hartman +--- + drivers/vhost/vhost.c | 1 + + 1 file changed, 1 insertion(+) + +--- a/drivers/vhost/vhost.c ++++ b/drivers/vhost/vhost.c +@@ -1603,6 +1603,7 @@ void vhost_zerocopy_callback(struct ubuf + struct vhost_ubuf_ref *ubufs = ubuf->ctx; + struct vhost_virtqueue *vq = ubufs->vq; + ++ vhost_poll_queue(&vq->poll); + /* set len = 1 to mark this desc buffers done DMA */ + vq->heads[ubuf->desc].len = VHOST_DMA_DONE_LEN; + kref_put(&ubufs->kref, vhost_zerocopy_done_signal);