From: Greg Kroah-Hartman Date: Sun, 12 Oct 2014 14:41:05 +0000 (+0200) Subject: 3.14-stable patches X-Git-Tag: v3.17.1~9 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=6b53653851b265db65bef7bc493374755756efb5;p=thirdparty%2Fkernel%2Fstable-queue.git 3.14-stable patches added patches: bonding-fix-div-by-zero-while-enslaving-and-transmitting.patch bridge-check-if-vlan-filtering-is-enabled-only-once.patch bridge-fix-br_should_learn-to-check-vlan_enabled.patch gro-fix-aggregation-for-skb-using-frag_list.patch hyperv-fix-a-bug-in-netvsc_start_xmit.patch i40e-don-t-stop-driver-probe-when-querying-dcb-config-fails.patch ip6_gre-fix-flowi6_proto-value-in-xmit-path.patch ipv6-fix-rtnl-locking-in-setsockopt-for-anycast-and-multicast.patch ipv6-restore-the-behavior-of-ipv6_sock_ac_drop.patch l2tp-fix-race-while-getting-pmtu-on-ppp-pseudo-wire.patch macvtap-fix-race-between-device-delete-and-open.patch myri10ge-check-for-dma-mapping-errors.patch net-allow-macvlans-to-move-to-net-namespace.patch net-always-untag-vlan-tagged-traffic-on-input.patch netlink-reset-network-header-before-passing-to-taps.patch openvswitch-fix-panic-with-multiple-vlan-headers.patch packet-handle-too-big-packets-for-packet_v3.patch revert-net-macb-add-pinctrl-consumer-support.patch rtnetlink-fix-vf-info-size.patch sctp-handle-association-restarts-when-the-socket-is-closed.patch sit-fix-ipip6_tunnel_lookup-device-matching-criteria.patch tcp-don-t-use-timestamp-from-repaired-skb-s-to-calculate-rtt-v2.patch tcp-fix-ssthresh-and-undo-for-consecutive-short-frto-episodes.patch tcp-fix-tcp_release_cb-to-dispatch-via-address-family-for-mtu_reduced.patch tcp-fixing-tlp-s-fin-recovery.patch team-avoid-race-condition-in-scheduling-delayed-work.patch tg3-allow-for-recieve-of-full-size-8021ad-frames.patch tg3-work-around-hw-fw-limitations-with-vlan-encapsulated-frames.patch vxlan-fix-incorrect-initializer-in-union-vxlan_addr.patch xfrm-generate-blackhole-routes-only-from-route-lookup-functions.patch 
xfrm-generate-queueing-routes-only-from-route-lookup-functions.patch --- diff --git a/queue-3.14/bonding-fix-div-by-zero-while-enslaving-and-transmitting.patch b/queue-3.14/bonding-fix-div-by-zero-while-enslaving-and-transmitting.patch new file mode 100644 index 00000000000..2299cf651fa --- /dev/null +++ b/queue-3.14/bonding-fix-div-by-zero-while-enslaving-and-transmitting.patch @@ -0,0 +1,152 @@ +From foo@baz Sun Oct 12 16:38:53 CEST 2014 +From: Nikolay Aleksandrov +Date: Fri, 12 Sep 2014 17:38:18 +0200 +Subject: bonding: fix div by zero while enslaving and transmitting + +From: Nikolay Aleksandrov + +[ Upstream commit 9a72c2da690d78e93cff24b9f616412508678dd5 ] + +The problem is that the slave is first linked and slave_cnt is +incremented afterwards leading to a div by zero in the modes that use it +as a modulus. What happens is that in bond_start_xmit() +bond_has_slaves() is used to evaluate further transmission and it becomes +true after the slave is linked in, but when slave_cnt is used in the xmit +path it is still 0, so fetch it once and transmit based on that. Since +it is used only in round-robin and XOR modes, the fix is only for them. +Thanks to Eric Dumazet for pointing out the fault in my first try to fix +this. 
+ +Call trace (took it out of net-next kernel, but it's the same with net): +[46934.330038] divide error: 0000 [#1] SMP +[46934.330041] Modules linked in: bonding(O) 9p fscache +snd_hda_codec_generic crct10dif_pclmul +[46934.330041] bond0: Enslaving eth1 as an active interface with an up +link +[46934.330051] ppdev joydev crc32_pclmul crc32c_intel 9pnet_virtio +ghash_clmulni_intel snd_hda_intel 9pnet snd_hda_controller parport_pc +serio_raw pcspkr snd_hda_codec parport virtio_balloon virtio_console +snd_hwdep snd_pcm pvpanic i2c_piix4 snd_timer i2ccore snd soundcore +virtio_blk virtio_net virtio_pci virtio_ring virtio ata_generic +pata_acpi floppy [last unloaded: bonding] +[46934.330053] CPU: 1 PID: 3382 Comm: ping Tainted: G O +3.17.0-rc4+ #27 +[46934.330053] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 +[46934.330054] task: ffff88005aebf2c0 ti: ffff88005b728000 task.ti: +ffff88005b728000 +[46934.330059] RIP: 0010:[] [] +bond_start_xmit+0x1c3/0x450 [bonding] +[46934.330060] RSP: 0018:ffff88005b72b7f8 EFLAGS: 00010246 +[46934.330060] RAX: 0000000000000679 RBX: ffff88004b077000 RCX: +000000000000002a +[46934.330061] RDX: 0000000000000000 RSI: ffff88004b3f0500 RDI: +ffff88004b077940 +[46934.330061] RBP: ffff88005b72b830 R08: 00000000000000c0 R09: +ffff88004a83e000 +[46934.330062] R10: 000000000000ffff R11: ffff88004b1f12c0 R12: +ffff88004b3f0500 +[46934.330062] R13: ffff88004b3f0500 R14: 000000000000002a R15: +ffff88004b077940 +[46934.330063] FS: 00007fbd91a4c740(0000) GS:ffff88005f080000(0000) +knlGS:0000000000000000 +[46934.330064] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 +[46934.330064] CR2: 00007f803a8bb000 CR3: 000000004b2c9000 CR4: +00000000000406e0 +[46934.330069] Stack: +[46934.330071] ffffffff811e6169 00000000e772fa05 ffff88004b077000 +ffff88004b3f0500 +[46934.330072] ffffffff81d17d18 000000000000002a 0000000000000000 +ffff88005b72b8a0 +[46934.330073] ffffffff81620108 ffffffff8161fe0e ffff88005b72b8c4 +ffff88005b302000 +[46934.330073] Call 
Trace: +[46934.330077] [] ? +__kmalloc_node_track_caller+0x119/0x300 +[46934.330084] [] dev_hard_start_xmit+0x188/0x410 +[46934.330086] [] ? harmonize_features+0x2e/0x90 +[46934.330088] [] __dev_queue_xmit+0x456/0x590 +[46934.330089] [] dev_queue_xmit+0x10/0x20 +[46934.330090] [] arp_xmit+0x22/0x60 +[46934.330091] [] arp_send.part.16+0x30/0x40 +[46934.330092] [] arp_solicit+0x115/0x2b0 +[46934.330094] [] ? copy_skb_header+0x17/0xa0 +[46934.330096] [] neigh_probe+0x4a/0x70 +[46934.330097] [] __neigh_event_send+0xac/0x230 +[46934.330098] [] neigh_resolve_output+0x13b/0x220 +[46934.330100] [] ? ip_forward_options+0x1c0/0x1c0 +[46934.330101] [] ip_finish_output+0x1f8/0x860 +[46934.330102] [] ip_output+0x58/0x90 +[46934.330103] [] ? __ip_local_out+0xa2/0xb0 +[46934.330104] [] ip_local_out_sk+0x30/0x40 +[46934.330105] [] ip_send_skb+0x16/0x50 +[46934.330106] [] ip_push_pending_frames+0x33/0x40 +[46934.330107] [] raw_sendmsg+0x88c/0xa30 +[46934.330110] [] ? skb_recv_datagram+0x41/0x60 +[46934.330111] [] ? raw_recvmsg+0xa9/0x1f0 +[46934.330113] [] inet_sendmsg+0x74/0xc0 +[46934.330114] [] ? inet_recvmsg+0x8b/0xb0 +[46934.330115] bond0: Adding slave eth2 +[46934.330116] [] sock_sendmsg+0x9c/0xe0 +[46934.330118] [] ? +move_addr_to_kernel.part.20+0x28/0x80 +[46934.330121] [] ? might_fault+0x47/0x50 +[46934.330122] [] ___sys_sendmsg+0x3a9/0x3c0 +[46934.330125] [] ? n_tty_write+0x3aa/0x530 +[46934.330127] [] ? __wake_up+0x44/0x50 +[46934.330129] [] ? 
fsnotify+0x238/0x310 +[46934.330130] [] __sys_sendmsg+0x51/0x90 +[46934.330131] [] SyS_sendmsg+0x12/0x20 +[46934.330134] [] system_call_fastpath+0x16/0x1b +[46934.330144] Code: 48 8b 10 4c 89 ee 4c 89 ff e8 aa bc ff ff 31 c0 e9 +1a ff ff ff 0f 1f 00 4c 89 ee 4c 89 ff e8 65 fb ff ff 31 d2 4c 89 ee 4c +89 ff b3 64 09 00 00 e8 02 bd ff ff 31 c0 e9 f2 fe ff ff 0f 1f 00 +[46934.330146] RIP [] bond_start_xmit+0x1c3/0x450 +[bonding] +[46934.330146] RSP + +CC: Eric Dumazet +CC: Andy Gospodarek +CC: Jay Vosburgh +CC: Veaceslav Falico +Fixes: 278b208375 ("bonding: initial RCU conversion") +Signed-off-by: Nikolay Aleksandrov +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/bonding/bond_main.c | 17 ++++++++++++++--- + 1 file changed, 14 insertions(+), 3 deletions(-) + +--- a/drivers/net/bonding/bond_main.c ++++ b/drivers/net/bonding/bond_main.c +@@ -3624,8 +3624,14 @@ static int bond_xmit_roundrobin(struct s + else + bond_xmit_slave_id(bond, skb, 0); + } else { +- slave_id = bond_rr_gen_slave_id(bond); +- bond_xmit_slave_id(bond, skb, slave_id % bond->slave_cnt); ++ int slave_cnt = ACCESS_ONCE(bond->slave_cnt); ++ ++ if (likely(slave_cnt)) { ++ slave_id = bond_rr_gen_slave_id(bond); ++ bond_xmit_slave_id(bond, skb, slave_id % slave_cnt); ++ } else { ++ dev_kfree_skb_any(skb); ++ } + } + + return NETDEV_TX_OK; +@@ -3656,8 +3662,13 @@ static int bond_xmit_activebackup(struct + static int bond_xmit_xor(struct sk_buff *skb, struct net_device *bond_dev) + { + struct bonding *bond = netdev_priv(bond_dev); ++ int slave_cnt = ACCESS_ONCE(bond->slave_cnt); + +- bond_xmit_slave_id(bond, skb, bond_xmit_hash(bond, skb, bond->slave_cnt)); ++ if (likely(slave_cnt)) ++ bond_xmit_slave_id(bond, skb, ++ bond_xmit_hash(bond, skb, bond->slave_cnt)); ++ else ++ dev_kfree_skb_any(skb); + + return NETDEV_TX_OK; + } diff --git a/queue-3.14/bridge-check-if-vlan-filtering-is-enabled-only-once.patch 
b/queue-3.14/bridge-check-if-vlan-filtering-is-enabled-only-once.patch new file mode 100644 index 00000000000..bf9ee062411 --- /dev/null +++ b/queue-3.14/bridge-check-if-vlan-filtering-is-enabled-only-once.patch @@ -0,0 +1,95 @@ +From foo@baz Sun Oct 12 16:38:53 CEST 2014 +From: Vlad Yasevich +Date: Fri, 12 Sep 2014 16:26:16 -0400 +Subject: bridge: Check if vlan filtering is enabled only once. + +From: Vlad Yasevich + +[ Upstream commit 20adfa1a81af00bf2027644507ad4fa9cd2849cf ] + +The bridge code checks if vlan filtering is enabled on both +ingress and egress. When the state flip happens, it +is possible for the bridge to currently be forwarding packets +and forwarding behavior becomes non-deterministic. Bridge +may drop packets on some interfaces, but not others. + +This patch solves this by caching the filtered state of the +packet into skb_cb on ingress. The skb_cb is guaranteed to +not be over-written between the time packet enters bridge +forwarding path and the time it leaves it. On egress, we +can then check the cached state to see if we need to +apply filtering information. + +Signed-off-by: Vladislav Yasevich +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/bridge/br_private.h | 3 +++ + net/bridge/br_vlan.c | 15 +++++++++++---- + 2 files changed, 14 insertions(+), 4 deletions(-) + +--- a/net/bridge/br_private.h ++++ b/net/bridge/br_private.h +@@ -302,6 +302,9 @@ struct br_input_skb_cb { + int igmp; + int mrouters_only; + #endif ++#ifdef CONFIG_BRIDGE_VLAN_FILTERING ++ bool vlan_filtered; ++#endif + }; + + #define BR_INPUT_SKB_CB(__skb) ((struct br_input_skb_cb *)(__skb)->cb) +--- a/net/bridge/br_vlan.c ++++ b/net/bridge/br_vlan.c +@@ -125,7 +125,8 @@ struct sk_buff *br_handle_vlan(struct ne + { + u16 vid; + +- if (!br->vlan_enabled) ++ /* If this packet was not filtered at input, let it pass */ ++ if (!BR_INPUT_SKB_CB(skb)->vlan_filtered) + goto out; + + /* Vlan filter table must be configured at this point. 
The +@@ -163,8 +164,10 @@ bool br_allowed_ingress(struct net_bridg + /* If VLAN filtering is disabled on the bridge, all packets are + * permitted. + */ +- if (!br->vlan_enabled) ++ if (!br->vlan_enabled) { ++ BR_INPUT_SKB_CB(skb)->vlan_filtered = false; + return true; ++ } + + /* If there are no vlan in the permitted list, all packets are + * rejected. +@@ -172,6 +175,8 @@ bool br_allowed_ingress(struct net_bridg + if (!v) + goto drop; + ++ BR_INPUT_SKB_CB(skb)->vlan_filtered = true; ++ + /* If vlan tx offload is disabled on bridge device and frame was + * sent from vlan device on the bridge device, it does not have + * HW accelerated vlan tag. +@@ -228,7 +233,8 @@ bool br_allowed_egress(struct net_bridge + { + u16 vid; + +- if (!br->vlan_enabled) ++ /* If this packet was not filtered at input, let it pass */ ++ if (!BR_INPUT_SKB_CB(skb)->vlan_filtered) + return true; + + if (!v) +@@ -247,7 +253,8 @@ bool br_should_learn(struct net_bridge_p + struct net_bridge *br = p->br; + struct net_port_vlans *v; + +- if (!br->vlan_enabled) ++ /* If filtering was disabled at input, let it pass. */ ++ if (!BR_INPUT_SKB_CB(skb)->vlan_filtered) + return true; + + v = rcu_dereference(p->vlan_info); diff --git a/queue-3.14/bridge-fix-br_should_learn-to-check-vlan_enabled.patch b/queue-3.14/bridge-fix-br_should_learn-to-check-vlan_enabled.patch new file mode 100644 index 00000000000..cb6e7b4183d --- /dev/null +++ b/queue-3.14/bridge-fix-br_should_learn-to-check-vlan_enabled.patch @@ -0,0 +1,35 @@ +From foo@baz Sun Oct 12 16:38:53 CEST 2014 +From: Vlad Yasevich +Date: Mon, 15 Sep 2014 15:24:26 -0400 +Subject: bridge: Fix br_should_learn to check vlan_enabled + +From: Vlad Yasevich + +[ Upstream commit c095f248e63ada504dd90c90baae673ae10ee3fe ] + +As Toshiaki Makita pointed out, the BRIDGE_INPUT_SKB_CB will +not be initialized in br_should_learn() as that function +is called only from br_handle_local_finish(). 
That is +an input handler for link-local ethernet traffic so it is perfectly +correct to check br->vlan_enabled here. + +Reported-by: Toshiaki Makita +Fixes: 20adfa1 bridge: Check if vlan filtering is enabled only once. +Signed-off-by: Vladislav Yasevich +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/bridge/br_vlan.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/net/bridge/br_vlan.c ++++ b/net/bridge/br_vlan.c +@@ -254,7 +254,7 @@ bool br_should_learn(struct net_bridge_p + struct net_port_vlans *v; + + /* If filtering was disabled at input, let it pass. */ +- if (!BR_INPUT_SKB_CB(skb)->vlan_filtered) ++ if (!br->vlan_enabled) + return true; + + v = rcu_dereference(p->vlan_info); diff --git a/queue-3.14/gro-fix-aggregation-for-skb-using-frag_list.patch b/queue-3.14/gro-fix-aggregation-for-skb-using-frag_list.patch new file mode 100644 index 00000000000..7ba0a3fdace --- /dev/null +++ b/queue-3.14/gro-fix-aggregation-for-skb-using-frag_list.patch @@ -0,0 +1,41 @@ +From foo@baz Sun Oct 12 16:38:53 CEST 2014 +From: Eric Dumazet +Date: Mon, 29 Sep 2014 10:34:29 -0700 +Subject: gro: fix aggregation for skb using frag_list + +From: Eric Dumazet + +[ Upstream commit 73d3fe6d1c6d840763ceafa9afae0aaafa18c4b5 ] + +In commit 8a29111c7ca6 ("net: gro: allow to build full sized skb") +I added a regression for linear skb that traditionally force GRO +to use the frag_list fallback. + +Erez Shitrit found that at most two segments were aggregated and +the "if (skb_gro_len(p) != pinfo->gso_size)" test was failing. + +This is because pinfo at this spot still points to the last skb in the +chain, instead of the first one, where we find the correct gso_size +information. + +Signed-off-by: Eric Dumazet +Fixes: 8a29111c7ca6 ("net: gro: allow to build full sized skb") +Reported-by: Erez Shitrit +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/core/skbuff.c | 3 +++ + 1 file changed, 3 insertions(+) + +--- a/net/core/skbuff.c ++++ b/net/core/skbuff.c +@@ -3140,6 +3140,9 @@ int skb_gro_receive(struct sk_buff **hea + NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE_STOLEN_HEAD; + goto done; + } ++ /* switch back to head shinfo */ ++ pinfo = skb_shinfo(p); ++ + if (pinfo->frag_list) + goto merge; + if (skb_gro_len(p) != pinfo->gso_size) diff --git a/queue-3.14/hyperv-fix-a-bug-in-netvsc_start_xmit.patch b/queue-3.14/hyperv-fix-a-bug-in-netvsc_start_xmit.patch new file mode 100644 index 00000000000..83239e1c034 --- /dev/null +++ b/queue-3.14/hyperv-fix-a-bug-in-netvsc_start_xmit.patch @@ -0,0 +1,44 @@ +From foo@baz Sun Oct 12 16:38:53 CEST 2014 +From: KY Srinivasan +Date: Sun, 28 Sep 2014 22:16:43 -0700 +Subject: hyperv: Fix a bug in netvsc_start_xmit() + +From: KY Srinivasan + +[ Upstream commit dedb845ded56ded1c62f5398a94ffa8615d4592d ] + +After the packet is successfully sent, we should not touch the skb +as it may have been freed. This patch is based on the work done by +Long Li . + +In this version of the patch I have fixed issues pointed out by David. +David, please queue this up for stable. + +Signed-off-by: K. Y. Srinivasan +Tested-by: Long Li +Tested-by: Sitsofe Wheeler +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/hyperv/netvsc_drv.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +--- a/drivers/net/hyperv/netvsc_drv.c ++++ b/drivers/net/hyperv/netvsc_drv.c +@@ -146,6 +146,7 @@ static int netvsc_start_xmit(struct sk_b + struct hv_netvsc_packet *packet; + int ret; + unsigned int i, num_pages, npg_data; ++ u32 skb_length = skb->len; + + /* Add multipages for skb->data and additional 2 for RNDIS */ + npg_data = (((unsigned long)skb->data + skb_headlen(skb) - 1) +@@ -216,7 +217,7 @@ static int netvsc_start_xmit(struct sk_b + ret = rndis_filter_send(net_device_ctx->device_ctx, + packet); + if (ret == 0) { +- net->stats.tx_bytes += skb->len; ++ net->stats.tx_bytes += skb_length; + net->stats.tx_packets++; + } else { + kfree(packet); diff --git a/queue-3.14/i40e-don-t-stop-driver-probe-when-querying-dcb-config-fails.patch b/queue-3.14/i40e-don-t-stop-driver-probe-when-querying-dcb-config-fails.patch new file mode 100644 index 00000000000..87d3117c2eb --- /dev/null +++ b/queue-3.14/i40e-don-t-stop-driver-probe-when-querying-dcb-config-fails.patch @@ -0,0 +1,60 @@ +From foo@baz Sun Oct 12 16:38:53 CEST 2014 +From: Neerav Parikh +Date: Wed, 13 Aug 2014 04:30:55 -0700 +Subject: i40e: Don't stop driver probe when querying DCB config fails + +From: Neerav Parikh + +Commit id: 014269ff376f552363ecdab78d3d947fbe2237d9 in Linus's tree +should be queued up for stable 3.14 & 3.15 since the i40e driver will +not load when DCB is enabled, unless this patch is applied. + +In case of any AQ command to query port's DCB configuration fails +during driver's probe time; the probe fails and returns an error. + +This patch prevents this issue by continuing the driver probe even +when an error is returned. + +Also, added an error message to dump the AQ error status to show what +error caused the failure to get the DCB configuration from firmware. 
+ +Change-ID: Ifd5663512588bca684069bb7d4fb586dd72221af +Signed-off-by: Neerav Parikh +Signed-off-by: Catherine Sullivan +Signed-off-by: Jeff Kirsher +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/intel/i40e/i40e_main.c | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +--- a/drivers/net/ethernet/intel/i40e/i40e_main.c ++++ b/drivers/net/ethernet/intel/i40e/i40e_main.c +@@ -4024,6 +4024,9 @@ static int i40e_init_pf_dcb(struct i40e_ + DCB_CAP_DCBX_VER_IEEE; + pf->flags |= I40E_FLAG_DCB_ENABLED; + } ++ } else { ++ dev_info(&pf->pdev->dev, "AQ Querying DCB configuration failed: %d\n", ++ pf->hw.aq.asq_last_status); + } + + out: +@@ -8003,7 +8006,7 @@ static int i40e_probe(struct pci_dev *pd + if (err) { + dev_info(&pdev->dev, "init_pf_dcb failed: %d\n", err); + pf->flags &= ~I40E_FLAG_DCB_ENABLED; +- goto err_init_dcb; ++ /* Continue without DCB enabled */ + } + #endif /* CONFIG_I40E_DCB */ + +@@ -8119,9 +8122,6 @@ err_vsis: + err_switch_setup: + i40e_reset_interrupt_capability(pf); + del_timer_sync(&pf->service_timer); +-#ifdef CONFIG_I40E_DCB +-err_init_dcb: +-#endif /* CONFIG_I40E_DCB */ + err_mac_addr: + err_configure_lan_hmc: + (void)i40e_shutdown_lan_hmc(hw); diff --git a/queue-3.14/ip6_gre-fix-flowi6_proto-value-in-xmit-path.patch b/queue-3.14/ip6_gre-fix-flowi6_proto-value-in-xmit-path.patch new file mode 100644 index 00000000000..c001023f923 --- /dev/null +++ b/queue-3.14/ip6_gre-fix-flowi6_proto-value-in-xmit-path.patch @@ -0,0 +1,42 @@ +From foo@baz Sun Oct 12 16:38:53 CEST 2014 +From: Nicolas Dichtel +Date: Thu, 2 Oct 2014 18:26:49 +0200 +Subject: ip6_gre: fix flowi6_proto value in xmit path + +From: Nicolas Dichtel + +[ Upstream commit 3be07244b7337760a3269d56b2f4a63e72218648 ] + +In xmit path, we build a flowi6 which will be used for the output route lookup. +We are sending a GRE packet, neither IPv4 nor IPv6 encapsulated packet, thus the +protocol should be IPPROTO_GRE. 
+ +Fixes: c12b395a4664 ("gre: Support GRE over IPv6") +Reported-by: Matthieu Ternisien d'Ouville +Signed-off-by: Nicolas Dichtel +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv6/ip6_gre.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/net/ipv6/ip6_gre.c ++++ b/net/ipv6/ip6_gre.c +@@ -787,7 +787,7 @@ static inline int ip6gre_xmit_ipv4(struc + encap_limit = t->parms.encap_limit; + + memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6)); +- fl6.flowi6_proto = IPPROTO_IPIP; ++ fl6.flowi6_proto = IPPROTO_GRE; + + dsfield = ipv4_get_dsfield(iph); + +@@ -837,7 +837,7 @@ static inline int ip6gre_xmit_ipv6(struc + encap_limit = t->parms.encap_limit; + + memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6)); +- fl6.flowi6_proto = IPPROTO_IPV6; ++ fl6.flowi6_proto = IPPROTO_GRE; + + dsfield = ipv6_get_dsfield(ipv6h); + if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) diff --git a/queue-3.14/ipv6-fix-rtnl-locking-in-setsockopt-for-anycast-and-multicast.patch b/queue-3.14/ipv6-fix-rtnl-locking-in-setsockopt-for-anycast-and-multicast.patch new file mode 100644 index 00000000000..d33e0d337bc --- /dev/null +++ b/queue-3.14/ipv6-fix-rtnl-locking-in-setsockopt-for-anycast-and-multicast.patch @@ -0,0 +1,254 @@ +From foo@baz Sun Oct 12 16:38:53 CEST 2014 +From: Sabrina Dubroca +Date: Tue, 2 Sep 2014 10:29:29 +0200 +Subject: ipv6: fix rtnl locking in setsockopt for anycast and multicast + +From: Sabrina Dubroca + +[ Upstream commit a9ed4a2986e13011fcf4ed2d1a1647c53112f55b ] + +Calling setsockopt with IPV6_JOIN_ANYCAST or IPV6_LEAVE_ANYCAST +triggers the assertion in addrconf_join_solict()/addrconf_leave_solict() + +ipv6_sock_ac_join(), ipv6_sock_ac_drop(), ipv6_sock_ac_close() need to +take RTNL before calling ipv6_dev_ac_inc/dec. Same thing with +ipv6_sock_mc_join(), ipv6_sock_mc_drop(), ipv6_sock_mc_close() before +calling ipv6_dev_mc_inc/dec. + +This patch moves ASSERT_RTNL() up a level in the call stack. 
+ +Signed-off-by: Cong Wang +Signed-off-by: Sabrina Dubroca +Reported-by: Tommi Rantala +Acked-by: Hannes Frederic Sowa +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv6/addrconf.c | 15 +++++---------- + net/ipv6/anycast.c | 12 ++++++++++++ + net/ipv6/mcast.c | 14 ++++++++++++++ + 3 files changed, 31 insertions(+), 10 deletions(-) + +--- a/net/ipv6/addrconf.c ++++ b/net/ipv6/addrconf.c +@@ -1684,14 +1684,12 @@ void addrconf_dad_failure(struct inet6_i + addrconf_mod_dad_work(ifp, 0); + } + +-/* Join to solicited addr multicast group. */ +- ++/* Join to solicited addr multicast group. ++ * caller must hold RTNL */ + void addrconf_join_solict(struct net_device *dev, const struct in6_addr *addr) + { + struct in6_addr maddr; + +- ASSERT_RTNL(); +- + if (dev->flags&(IFF_LOOPBACK|IFF_NOARP)) + return; + +@@ -1699,12 +1697,11 @@ void addrconf_join_solict(struct net_dev + ipv6_dev_mc_inc(dev, &maddr); + } + ++/* caller must hold RTNL */ + void addrconf_leave_solict(struct inet6_dev *idev, const struct in6_addr *addr) + { + struct in6_addr maddr; + +- ASSERT_RTNL(); +- + if (idev->dev->flags&(IFF_LOOPBACK|IFF_NOARP)) + return; + +@@ -1712,12 +1709,11 @@ void addrconf_leave_solict(struct inet6_ + __ipv6_dev_mc_dec(idev, &maddr); + } + ++/* caller must hold RTNL */ + static void addrconf_join_anycast(struct inet6_ifaddr *ifp) + { + struct in6_addr addr; + +- ASSERT_RTNL(); +- + if (ifp->prefix_len >= 127) /* RFC 6164 */ + return; + ipv6_addr_prefix(&addr, &ifp->addr, ifp->prefix_len); +@@ -1726,12 +1722,11 @@ static void addrconf_join_anycast(struct + ipv6_dev_ac_inc(ifp->idev->dev, &addr); + } + ++/* caller must hold RTNL */ + static void addrconf_leave_anycast(struct inet6_ifaddr *ifp) + { + struct in6_addr addr; + +- ASSERT_RTNL(); +- + if (ifp->prefix_len >= 127) /* RFC 6164 */ + return; + ipv6_addr_prefix(&addr, &ifp->addr, ifp->prefix_len); +--- a/net/ipv6/anycast.c ++++ b/net/ipv6/anycast.c +@@ -77,6 +77,7 @@ int ipv6_sock_ac_join(struct 
sock *sk, i + pac->acl_next = NULL; + pac->acl_addr = *addr; + ++ rtnl_lock(); + rcu_read_lock(); + if (ifindex == 0) { + struct rt6_info *rt; +@@ -137,6 +138,7 @@ int ipv6_sock_ac_join(struct sock *sk, i + + error: + rcu_read_unlock(); ++ rtnl_unlock(); + if (pac) + sock_kfree_s(sk, pac, sizeof(*pac)); + return err; +@@ -171,13 +173,17 @@ int ipv6_sock_ac_drop(struct sock *sk, i + + spin_unlock_bh(&ipv6_sk_ac_lock); + ++ rtnl_lock(); + rcu_read_lock(); + dev = dev_get_by_index_rcu(net, pac->acl_ifindex); + if (dev) + ipv6_dev_ac_dec(dev, &pac->acl_addr); + rcu_read_unlock(); ++ rtnl_unlock(); + + sock_kfree_s(sk, pac, sizeof(*pac)); ++ if (!dev) ++ return -ENODEV; + return 0; + } + +@@ -198,6 +204,7 @@ void ipv6_sock_ac_close(struct sock *sk) + spin_unlock_bh(&ipv6_sk_ac_lock); + + prev_index = 0; ++ rtnl_lock(); + rcu_read_lock(); + while (pac) { + struct ipv6_ac_socklist *next = pac->acl_next; +@@ -212,6 +219,7 @@ void ipv6_sock_ac_close(struct sock *sk) + pac = next; + } + rcu_read_unlock(); ++ rtnl_unlock(); + } + + static void aca_put(struct ifacaddr6 *ac) +@@ -233,6 +241,8 @@ int ipv6_dev_ac_inc(struct net_device *d + struct rt6_info *rt; + int err; + ++ ASSERT_RTNL(); ++ + idev = in6_dev_get(dev); + + if (idev == NULL) +@@ -302,6 +312,8 @@ int __ipv6_dev_ac_dec(struct inet6_dev * + { + struct ifacaddr6 *aca, *prev_aca; + ++ ASSERT_RTNL(); ++ + write_lock_bh(&idev->lock); + prev_aca = NULL; + for (aca = idev->ac_list; aca; aca = aca->aca_next) { +--- a/net/ipv6/mcast.c ++++ b/net/ipv6/mcast.c +@@ -172,6 +172,7 @@ int ipv6_sock_mc_join(struct sock *sk, i + mc_lst->next = NULL; + mc_lst->addr = *addr; + ++ rtnl_lock(); + rcu_read_lock(); + if (ifindex == 0) { + struct rt6_info *rt; +@@ -185,6 +186,7 @@ int ipv6_sock_mc_join(struct sock *sk, i + + if (dev == NULL) { + rcu_read_unlock(); ++ rtnl_unlock(); + sock_kfree_s(sk, mc_lst, sizeof(*mc_lst)); + return -ENODEV; + } +@@ -202,6 +204,7 @@ int ipv6_sock_mc_join(struct sock *sk, i + + if (err) { + 
rcu_read_unlock(); ++ rtnl_unlock(); + sock_kfree_s(sk, mc_lst, sizeof(*mc_lst)); + return err; + } +@@ -212,6 +215,7 @@ int ipv6_sock_mc_join(struct sock *sk, i + spin_unlock(&ipv6_sk_mc_lock); + + rcu_read_unlock(); ++ rtnl_unlock(); + + return 0; + } +@@ -229,6 +233,7 @@ int ipv6_sock_mc_drop(struct sock *sk, i + if (!ipv6_addr_is_multicast(addr)) + return -EINVAL; + ++ rtnl_lock(); + spin_lock(&ipv6_sk_mc_lock); + for (lnk = &np->ipv6_mc_list; + (mc_lst = rcu_dereference_protected(*lnk, +@@ -252,12 +257,15 @@ int ipv6_sock_mc_drop(struct sock *sk, i + } else + (void) ip6_mc_leave_src(sk, mc_lst, NULL); + rcu_read_unlock(); ++ rtnl_unlock(); ++ + atomic_sub(sizeof(*mc_lst), &sk->sk_omem_alloc); + kfree_rcu(mc_lst, rcu); + return 0; + } + } + spin_unlock(&ipv6_sk_mc_lock); ++ rtnl_unlock(); + + return -EADDRNOTAVAIL; + } +@@ -302,6 +310,7 @@ void ipv6_sock_mc_close(struct sock *sk) + if (!rcu_access_pointer(np->ipv6_mc_list)) + return; + ++ rtnl_lock(); + spin_lock(&ipv6_sk_mc_lock); + while ((mc_lst = rcu_dereference_protected(np->ipv6_mc_list, + lockdep_is_held(&ipv6_sk_mc_lock))) != NULL) { +@@ -328,6 +337,7 @@ void ipv6_sock_mc_close(struct sock *sk) + spin_lock(&ipv6_sk_mc_lock); + } + spin_unlock(&ipv6_sk_mc_lock); ++ rtnl_unlock(); + } + + int ip6_mc_source(int add, int omode, struct sock *sk, +@@ -845,6 +855,8 @@ int ipv6_dev_mc_inc(struct net_device *d + struct ifmcaddr6 *mc; + struct inet6_dev *idev; + ++ ASSERT_RTNL(); ++ + /* we need to take a reference on idev */ + idev = in6_dev_get(dev); + +@@ -916,6 +928,8 @@ int __ipv6_dev_mc_dec(struct inet6_dev * + { + struct ifmcaddr6 *ma, **map; + ++ ASSERT_RTNL(); ++ + write_lock_bh(&idev->lock); + for (map = &idev->mc_list; (ma=*map) != NULL; map = &ma->next) { + if (ipv6_addr_equal(&ma->mca_addr, addr)) { diff --git a/queue-3.14/ipv6-restore-the-behavior-of-ipv6_sock_ac_drop.patch b/queue-3.14/ipv6-restore-the-behavior-of-ipv6_sock_ac_drop.patch new file mode 100644 index 00000000000..795d540b3a7 --- 
/dev/null +++ b/queue-3.14/ipv6-restore-the-behavior-of-ipv6_sock_ac_drop.patch @@ -0,0 +1,38 @@ +From foo@baz Sun Oct 12 16:38:53 CEST 2014 +From: WANG Cong +Date: Fri, 5 Sep 2014 14:33:00 -0700 +Subject: ipv6: restore the behavior of ipv6_sock_ac_drop() + +From: WANG Cong + +[ Upstream commit de185ab46cb02df9738b0d898b0c3a89181c5526 ] + +It is possible that the interface is already gone after joining +the list of anycast on this interface as we don't hold a refcount +for the device, in this case we are safe to ignore the error. + +What's more important, for API compatibility we should not +change this behavior for applications even if it were correct. + +Fixes: commit a9ed4a2986e13011 ("ipv6: fix rtnl locking in setsockopt for anycast and multicast") +Cc: Sabrina Dubroca +Cc: David S. Miller +Signed-off-by: Cong Wang +Acked-by: Hannes Frederic Sowa +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv6/anycast.c | 2 -- + 1 file changed, 2 deletions(-) + +--- a/net/ipv6/anycast.c ++++ b/net/ipv6/anycast.c +@@ -182,8 +182,6 @@ int ipv6_sock_ac_drop(struct sock *sk, i + rtnl_unlock(); + + sock_kfree_s(sk, pac, sizeof(*pac)); +- if (!dev) +- return -ENODEV; + return 0; + } + diff --git a/queue-3.14/l2tp-fix-race-while-getting-pmtu-on-ppp-pseudo-wire.patch b/queue-3.14/l2tp-fix-race-while-getting-pmtu-on-ppp-pseudo-wire.patch new file mode 100644 index 00000000000..a789dcf784d --- /dev/null +++ b/queue-3.14/l2tp-fix-race-while-getting-pmtu-on-ppp-pseudo-wire.patch @@ -0,0 +1,76 @@ +From foo@baz Sun Oct 12 16:38:53 CEST 2014 +From: Guillaume Nault +Date: Wed, 3 Sep 2014 14:12:55 +0200 +Subject: l2tp: fix race while getting PMTU on PPP pseudo-wire + +From: Guillaume Nault + +[ Upstream commit eed4d839b0cdf9d84b0a9bc63de90fd5e1e886fb ] + +Use dst_entry held by sk_dst_get() to retrieve tunnel's PMTU. + +The dst_mtu(__sk_dst_get(tunnel->sock)) call was racy. 
__sk_dst_get() +could return NULL if tunnel->sock->sk_dst_cache was reset just before the +call, thus making dst_mtu() dereference a NULL pointer: + +[ 1937.661598] BUG: unable to handle kernel NULL pointer dereference at 0000000000000020 +[ 1937.664005] IP: [] pppol2tp_connect+0x33d/0x41e [l2tp_ppp] +[ 1937.664005] PGD daf0c067 PUD d9f93067 PMD 0 +[ 1937.664005] Oops: 0000 [#1] SMP +[ 1937.664005] Modules linked in: l2tp_ppp l2tp_netlink l2tp_core ip6table_filter ip6_tables iptable_filter ip_tables ebtable_nat ebtables x_tables udp_tunnel pppoe pppox ppp_generic slhc deflate ctr twofish_generic twofish_x86_64_3way xts lrw gf128mul glue_helper twofish_x86_64 twofish_common blowfish_generic blowfish_x86_64 blowfish_common des_generic cbc xcbc rmd160 sha512_generic hmac crypto_null af_key xfrm_algo 8021q garp bridge stp llc tun atmtcp clip atm ext3 mbcache jbd iTCO_wdt coretemp kvm_intel iTCO_vendor_support kvm pcspkr evdev ehci_pci lpc_ich mfd_core i5400_edac edac_core i5k_amb shpchp button processor thermal_sys xfs crc32c_generic libcrc32c dm_mod usbhid sg hid sr_mod sd_mod cdrom crc_t10dif crct10dif_common ata_generic ahci ata_piix tg3 libahci libata uhci_hcd ptp ehci_hcd pps_core usbcore scsi_mod libphy usb_common [last unloaded: l2tp_core] +[ 1937.664005] CPU: 0 PID: 10022 Comm: l2tpstress Tainted: G O 3.17.0-rc1 #1 +[ 1937.664005] Hardware name: HP ProLiant DL160 G5, BIOS O12 08/22/2008 +[ 1937.664005] task: ffff8800d8fda790 ti: ffff8800c43c4000 task.ti: ffff8800c43c4000 +[ 1937.664005] RIP: 0010:[] [] pppol2tp_connect+0x33d/0x41e [l2tp_ppp] +[ 1937.664005] RSP: 0018:ffff8800c43c7de8 EFLAGS: 00010282 +[ 1937.664005] RAX: ffff8800da8a7240 RBX: ffff8800d8c64600 RCX: 000001c325a137b5 +[ 1937.664005] RDX: 8c6318c6318c6320 RSI: 000000000000010c RDI: 0000000000000000 +[ 1937.664005] RBP: ffff8800c43c7ea8 R08: 0000000000000000 R09: 0000000000000000 +[ 1937.664005] R10: ffffffffa048e2c0 R11: ffff8800d8c64600 R12: ffff8800ca7a5000 +[ 1937.664005] R13: ffff8800c439bf40 
R14: 000000000000000c R15: 0000000000000009 +[ 1937.664005] FS: 00007fd7f610f700(0000) GS:ffff88011a600000(0000) knlGS:0000000000000000 +[ 1937.664005] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b +[ 1937.664005] CR2: 0000000000000020 CR3: 00000000d9d75000 CR4: 00000000000027e0 +[ 1937.664005] Stack: +[ 1937.664005] ffffffffa049da80 ffff8800d8fda790 000000000000005b ffff880000000009 +[ 1937.664005] ffff8800daf3f200 0000000000000003 ffff8800c43c7e48 ffffffff81109b57 +[ 1937.664005] ffffffff81109b0e ffffffff8114c566 0000000000000000 0000000000000000 +[ 1937.664005] Call Trace: +[ 1937.664005] [] ? pppol2tp_connect+0x235/0x41e [l2tp_ppp] +[ 1937.664005] [] ? might_fault+0x9e/0xa5 +[ 1937.664005] [] ? might_fault+0x55/0xa5 +[ 1937.664005] [] ? rcu_read_unlock+0x1c/0x26 +[ 1937.664005] [] SYSC_connect+0x87/0xb1 +[ 1937.664005] [] ? sysret_check+0x1b/0x56 +[ 1937.664005] [] ? trace_hardirqs_on_caller+0x145/0x1a1 +[ 1937.664005] [] ? trace_hardirqs_on_thunk+0x3a/0x3f +[ 1937.664005] [] ? spin_lock+0x9/0xb +[ 1937.664005] [] SyS_connect+0x9/0xb +[ 1937.664005] [] system_call_fastpath+0x16/0x1b +[ 1937.664005] Code: 10 2a 84 81 e8 65 76 bd e0 65 ff 0c 25 10 bb 00 00 4d 85 ed 74 37 48 8b 85 60 ff ff ff 48 8b 80 88 01 00 00 48 8b b8 10 02 00 00 <48> 8b 47 20 ff 50 20 85 c0 74 0f 83 e8 28 89 83 10 01 00 00 89 +[ 1937.664005] RIP [] pppol2tp_connect+0x33d/0x41e [l2tp_ppp] +[ 1937.664005] RSP +[ 1937.664005] CR2: 0000000000000020 +[ 1939.559375] ---[ end trace 82d44500f28f8708 ]--- + +Fixes: f34c4a35d879 ("l2tp: take PMTU from tunnel UDP socket") +Signed-off-by: Guillaume Nault +Acked-by: Eric Dumazet +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/l2tp/l2tp_ppp.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +--- a/net/l2tp/l2tp_ppp.c ++++ b/net/l2tp/l2tp_ppp.c +@@ -758,7 +758,8 @@ static int pppol2tp_connect(struct socke + /* If PMTU discovery was enabled, use the MTU that was discovered */ + dst = sk_dst_get(tunnel->sock); + if (dst != NULL) { +- u32 pmtu = dst_mtu(__sk_dst_get(tunnel->sock)); ++ u32 pmtu = dst_mtu(dst); ++ + if (pmtu != 0) + session->mtu = session->mru = pmtu - + PPPOL2TP_HEADER_OVERHEAD; diff --git a/queue-3.14/macvtap-fix-race-between-device-delete-and-open.patch b/queue-3.14/macvtap-fix-race-between-device-delete-and-open.patch new file mode 100644 index 00000000000..eb2491a8007 --- /dev/null +++ b/queue-3.14/macvtap-fix-race-between-device-delete-and-open.patch @@ -0,0 +1,95 @@ +From foo@baz Sun Oct 12 16:38:53 CEST 2014 +From: Vlad Yasevich +Date: Mon, 22 Sep 2014 16:34:17 -0400 +Subject: macvtap: Fix race between device delete and open. + +From: Vlad Yasevich + +[ Upstream commit 40b8fe45d1f094e3babe7b2dc2b71557ab71401d ] + +In macvtap device delete and open calls can race and +this causes a list corruption of the vlan queue_list. + +The race itself is triggered by the idr accessors +that located the vlan device. The device is stored +into and removed from the idr under both an rtnl and +a mutex. However, when attempting to locate the device +in idr, only a mutex is taken. As a result, one cpu +performing a delete may take an rtnl and wait for the mutex, +while another cpu doing an open() will take the idr +mutex first to fetch the device pointer and later take +an rtnl to add a queue for the device which may have +just gotten deleted. + +With this patch, we now hold the rtnl for the duration +of the macvtap_open() call thus making sure that +open will not race with delete. + +CC: Michael S. Tsirkin +CC: Jason Wang +Signed-off-by: Vladislav Yasevich +Acked-by: Jason Wang +Acked-by: Michael S. 
Tsirkin +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/macvtap.c | 18 ++++++++---------- + 1 file changed, 8 insertions(+), 10 deletions(-) + +--- a/drivers/net/macvtap.c ++++ b/drivers/net/macvtap.c +@@ -112,17 +112,15 @@ out: + return err; + } + ++/* Requires RTNL */ + static int macvtap_set_queue(struct net_device *dev, struct file *file, + struct macvtap_queue *q) + { + struct macvlan_dev *vlan = netdev_priv(dev); +- int err = -EBUSY; + +- rtnl_lock(); + if (vlan->numqueues == MAX_MACVTAP_QUEUES) +- goto out; ++ return -EBUSY; + +- err = 0; + rcu_assign_pointer(q->vlan, vlan); + rcu_assign_pointer(vlan->taps[vlan->numvtaps], q); + sock_hold(&q->sk); +@@ -136,9 +134,7 @@ static int macvtap_set_queue(struct net_ + vlan->numvtaps++; + vlan->numqueues++; + +-out: +- rtnl_unlock(); +- return err; ++ return 0; + } + + static int macvtap_disable_queue(struct macvtap_queue *q) +@@ -454,11 +450,12 @@ static void macvtap_sock_destruct(struct + static int macvtap_open(struct inode *inode, struct file *file) + { + struct net *net = current->nsproxy->net_ns; +- struct net_device *dev = dev_get_by_macvtap_minor(iminor(inode)); ++ struct net_device *dev; + struct macvtap_queue *q; +- int err; ++ int err = -ENODEV; + +- err = -ENODEV; ++ rtnl_lock(); ++ dev = dev_get_by_macvtap_minor(iminor(inode)); + if (!dev) + goto out; + +@@ -498,6 +495,7 @@ out: + if (dev) + dev_put(dev); + ++ rtnl_unlock(); + return err; + } + diff --git a/queue-3.14/myri10ge-check-for-dma-mapping-errors.patch b/queue-3.14/myri10ge-check-for-dma-mapping-errors.patch new file mode 100644 index 00000000000..45147a11c0b --- /dev/null +++ b/queue-3.14/myri10ge-check-for-dma-mapping-errors.patch @@ -0,0 +1,175 @@ +From foo@baz Sun Oct 12 16:38:53 CEST 2014 +From: Stanislaw Gruszka +Date: Tue, 12 Aug 2014 10:35:19 +0200 +Subject: myri10ge: check for DMA mapping errors + +From: Stanislaw Gruszka + +[ Upstream commit 10545937e866ccdbb7ab583031dbdcc6b14e4eb4 ] + +On IOMMU 
systems DMA mapping can fail, we need to check for +that possibility. + +Signed-off-by: Stanislaw Gruszka +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/myricom/myri10ge/myri10ge.c | 88 +++++++++++++++-------- + 1 file changed, 58 insertions(+), 30 deletions(-) + +--- a/drivers/net/ethernet/myricom/myri10ge/myri10ge.c ++++ b/drivers/net/ethernet/myricom/myri10ge/myri10ge.c +@@ -872,6 +872,10 @@ static int myri10ge_dma_test(struct myri + return -ENOMEM; + dmatest_bus = pci_map_page(mgp->pdev, dmatest_page, 0, PAGE_SIZE, + DMA_BIDIRECTIONAL); ++ if (unlikely(pci_dma_mapping_error(mgp->pdev, dmatest_bus))) { ++ __free_page(dmatest_page); ++ return -ENOMEM; ++ } + + /* Run a small DMA test. + * The magic multipliers to the length tell the firmware +@@ -1293,6 +1297,7 @@ myri10ge_alloc_rx_pages(struct myri10ge_ + int bytes, int watchdog) + { + struct page *page; ++ dma_addr_t bus; + int idx; + #if MYRI10GE_ALLOC_SIZE > 4096 + int end_offset; +@@ -1317,11 +1322,21 @@ myri10ge_alloc_rx_pages(struct myri10ge_ + rx->watchdog_needed = 1; + return; + } ++ ++ bus = pci_map_page(mgp->pdev, page, 0, ++ MYRI10GE_ALLOC_SIZE, ++ PCI_DMA_FROMDEVICE); ++ if (unlikely(pci_dma_mapping_error(mgp->pdev, bus))) { ++ __free_pages(page, MYRI10GE_ALLOC_ORDER); ++ if (rx->fill_cnt - rx->cnt < 16) ++ rx->watchdog_needed = 1; ++ return; ++ } ++ + rx->page = page; + rx->page_offset = 0; +- rx->bus = pci_map_page(mgp->pdev, page, 0, +- MYRI10GE_ALLOC_SIZE, +- PCI_DMA_FROMDEVICE); ++ rx->bus = bus; ++ + } + rx->info[idx].page = rx->page; + rx->info[idx].page_offset = rx->page_offset; +@@ -2765,6 +2780,35 @@ myri10ge_submit_req(struct myri10ge_tx_b + mb(); + } + ++static void myri10ge_unmap_tx_dma(struct myri10ge_priv *mgp, ++ struct myri10ge_tx_buf *tx, int idx) ++{ ++ unsigned int len; ++ int last_idx; ++ ++ /* Free any DMA resources we've alloced and clear out the skb slot */ ++ last_idx = (idx + 1) & tx->mask; ++ idx = tx->req & tx->mask; ++ do { 
++ len = dma_unmap_len(&tx->info[idx], len); ++ if (len) { ++ if (tx->info[idx].skb != NULL) ++ pci_unmap_single(mgp->pdev, ++ dma_unmap_addr(&tx->info[idx], ++ bus), len, ++ PCI_DMA_TODEVICE); ++ else ++ pci_unmap_page(mgp->pdev, ++ dma_unmap_addr(&tx->info[idx], ++ bus), len, ++ PCI_DMA_TODEVICE); ++ dma_unmap_len_set(&tx->info[idx], len, 0); ++ tx->info[idx].skb = NULL; ++ } ++ idx = (idx + 1) & tx->mask; ++ } while (idx != last_idx); ++} ++ + /* + * Transmit a packet. We need to split the packet so that a single + * segment does not cross myri10ge->tx_boundary, so this makes segment +@@ -2788,7 +2832,7 @@ static netdev_tx_t myri10ge_xmit(struct + u32 low; + __be32 high_swapped; + unsigned int len; +- int idx, last_idx, avail, frag_cnt, frag_idx, count, mss, max_segments; ++ int idx, avail, frag_cnt, frag_idx, count, mss, max_segments; + u16 pseudo_hdr_offset, cksum_offset, queue; + int cum_len, seglen, boundary, rdma_count; + u8 flags, odd_flag; +@@ -2885,9 +2929,12 @@ again: + + /* map the skb for DMA */ + len = skb_headlen(skb); ++ bus = pci_map_single(mgp->pdev, skb->data, len, PCI_DMA_TODEVICE); ++ if (unlikely(pci_dma_mapping_error(mgp->pdev, bus))) ++ goto drop; ++ + idx = tx->req & tx->mask; + tx->info[idx].skb = skb; +- bus = pci_map_single(mgp->pdev, skb->data, len, PCI_DMA_TODEVICE); + dma_unmap_addr_set(&tx->info[idx], bus, bus); + dma_unmap_len_set(&tx->info[idx], len, len); + +@@ -2986,12 +3033,16 @@ again: + break; + + /* map next fragment for DMA */ +- idx = (count + tx->req) & tx->mask; + frag = &skb_shinfo(skb)->frags[frag_idx]; + frag_idx++; + len = skb_frag_size(frag); + bus = skb_frag_dma_map(&mgp->pdev->dev, frag, 0, len, + DMA_TO_DEVICE); ++ if (unlikely(pci_dma_mapping_error(mgp->pdev, bus))) { ++ myri10ge_unmap_tx_dma(mgp, tx, idx); ++ goto drop; ++ } ++ idx = (count + tx->req) & tx->mask; + dma_unmap_addr_set(&tx->info[idx], bus, bus); + dma_unmap_len_set(&tx->info[idx], len, len); + } +@@ -3022,31 +3073,8 @@ again: + return 
NETDEV_TX_OK; + + abort_linearize: +- /* Free any DMA resources we've alloced and clear out the skb +- * slot so as to not trip up assertions, and to avoid a +- * double-free if linearizing fails */ ++ myri10ge_unmap_tx_dma(mgp, tx, idx); + +- last_idx = (idx + 1) & tx->mask; +- idx = tx->req & tx->mask; +- tx->info[idx].skb = NULL; +- do { +- len = dma_unmap_len(&tx->info[idx], len); +- if (len) { +- if (tx->info[idx].skb != NULL) +- pci_unmap_single(mgp->pdev, +- dma_unmap_addr(&tx->info[idx], +- bus), len, +- PCI_DMA_TODEVICE); +- else +- pci_unmap_page(mgp->pdev, +- dma_unmap_addr(&tx->info[idx], +- bus), len, +- PCI_DMA_TODEVICE); +- dma_unmap_len_set(&tx->info[idx], len, 0); +- tx->info[idx].skb = NULL; +- } +- idx = (idx + 1) & tx->mask; +- } while (idx != last_idx); + if (skb_is_gso(skb)) { + netdev_err(mgp->dev, "TSO but wanted to linearize?!?!?\n"); + goto drop; diff --git a/queue-3.14/net-allow-macvlans-to-move-to-net-namespace.patch b/queue-3.14/net-allow-macvlans-to-move-to-net-namespace.patch new file mode 100644 index 00000000000..283c3cfd1f8 --- /dev/null +++ b/queue-3.14/net-allow-macvlans-to-move-to-net-namespace.patch @@ -0,0 +1,46 @@ +From foo@baz Sun Oct 12 16:38:53 CEST 2014 +From: Francesco Ruggeri +Date: Wed, 17 Sep 2014 10:40:44 -0700 +Subject: net: allow macvlans to move to net namespace + +From: Francesco Ruggeri + +[ Upstream commit 0d0162e7a33d3710b9604e7c68c0f31f5c457428 ] + +I cannot move a macvlan interface created on top of a bonding interface +to a different namespace: + +% ip netns add dummy0 +% ip link add link bond0 mac0 type macvlan +% ip link set mac0 netns dummy0 +RTNETLINK answers: Invalid argument +% + +The problem seems to be that commit f9399814927a ("bonding: Don't allow +bond devices to change network namespaces.") sets NETIF_F_NETNS_LOCAL +on bonding interfaces, and commit 797f87f83b60 ("macvlan: fix netdev +feature propagation from lower device") causes macvlan interfaces +to inherit its features from the lower 
device. + +NETIF_F_NETNS_LOCAL should not be inherited from the lower device +by a macvlan. +Patch tested on 3.16. + +Signed-off-by: Francesco Ruggeri +Acked-by: Cong Wang +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/macvlan.c | 1 + + 1 file changed, 1 insertion(+) + +--- a/drivers/net/macvlan.c ++++ b/drivers/net/macvlan.c +@@ -709,6 +709,7 @@ static netdev_features_t macvlan_fix_fea + features, + mask); + features |= ALWAYS_ON_FEATURES; ++ features &= ~NETIF_F_NETNS_LOCAL; + + return features; + } diff --git a/queue-3.14/net-always-untag-vlan-tagged-traffic-on-input.patch b/queue-3.14/net-always-untag-vlan-tagged-traffic-on-input.patch new file mode 100644 index 00000000000..6e77a30fa56 --- /dev/null +++ b/queue-3.14/net-always-untag-vlan-tagged-traffic-on-input.patch @@ -0,0 +1,266 @@ +From foo@baz Sun Oct 12 16:38:53 CEST 2014 +From: Vlad Yasevich +Date: Fri, 8 Aug 2014 14:42:13 -0400 +Subject: net: Always untag vlan-tagged traffic on input. + +From: Vlad Yasevich + +[ Upstream commit 0d5501c1c828fb97d02af50aa9d2b1a5498b94e4 ] + +Currently the functionality to untag traffic on input resides +as part of the vlan module and is build only when VLAN support +is enabled in the kernel. When VLAN is disabled, the function +vlan_untag() turns into a stub and doesn't really untag the +packets. This seems to create an interesting interaction +between VMs supporting checksum offloading and some network drivers. + +There are some drivers that do not allow the user to change +tx-vlan-offload feature of the driver. These drivers also seem +to assume that any VLAN-tagged traffic they transmit will +have the vlan information in the vlan_tci and not in the vlan +header already in the skb. When transmitting skbs that already +have tagged data with partial checksum set, the checksum doesn't +appear to be updated correctly by the card thus resulting in a +failure to establish TCP connections. 
+ +The following is a packet trace taken on the receiver where a +sender is a VM with a VLAN configured. The host the VM is running on +does not have VLAN support and the outgoing interface on the +host is tg3: +10:12:43.503055 52:54:00:ae:42:3f > 28:d2:44:7d:c2:de, ethertype 802.1Q +(0x8100), length 78: vlan 100, p 0, ethertype IPv4, (tos 0x0, ttl 64, id 27243, +offset 0, flags [DF], proto TCP (6), length 60) + 10.0.100.1.58545 > 10.0.100.10.ircu-2: Flags [S], cksum 0xdc39 (incorrect +-> 0x48d9), seq 1069378582, win 29200, options [mss 1460,sackOK,TS val +4294837885 ecr 0,nop,wscale 7], length 0 +10:12:44.505556 52:54:00:ae:42:3f > 28:d2:44:7d:c2:de, ethertype 802.1Q +(0x8100), length 78: vlan 100, p 0, ethertype IPv4, (tos 0x0, ttl 64, id 27244, +offset 0, flags [DF], proto TCP (6), length 60) + 10.0.100.1.58545 > 10.0.100.10.ircu-2: Flags [S], cksum 0xdc39 (incorrect +-> 0x44ee), seq 1069378582, win 29200, options [mss 1460,sackOK,TS val +4294838888 ecr 0,nop,wscale 7], length 0 + +This connection finally times out. + +I've only access to the TG3 hardware in this configuration thus have +only tested this with TG3 driver. There are a lot of other drivers +that do not permit user changes to vlan acceleration features, and +I don't know if they all suffer from a similar issue. + +The patch attempts to fix this another way. It moves the vlan header +stripping code out of the vlan module and always builds it into the +kernel network core. This way, even if vlan is not supported on +a virtualization host, the virtual machines running on top of such +host will still work with VLANs enabled. + +CC: Patrick McHardy +CC: Nithin Nayak Sujir +CC: Michael Chan +CC: Jiri Pirko +Signed-off-by: Vladislav Yasevich +Acked-by: Jiri Pirko +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + include/linux/if_vlan.h | 6 ----- + include/linux/skbuff.h | 1 + net/8021q/vlan_core.c | 53 ------------------------------------------------ + net/bridge/br_vlan.c | 2 - + net/core/dev.c | 2 - + net/core/netpoll.c | 2 - + net/core/skbuff.c | 53 ++++++++++++++++++++++++++++++++++++++++++++++++ + 7 files changed, 57 insertions(+), 62 deletions(-) + +--- a/include/linux/if_vlan.h ++++ b/include/linux/if_vlan.h +@@ -186,7 +186,6 @@ vlan_dev_get_egress_qos_mask(struct net_ + } + + extern bool vlan_do_receive(struct sk_buff **skb); +-extern struct sk_buff *vlan_untag(struct sk_buff *skb); + + extern int vlan_vid_add(struct net_device *dev, __be16 proto, u16 vid); + extern void vlan_vid_del(struct net_device *dev, __be16 proto, u16 vid); +@@ -228,11 +227,6 @@ static inline bool vlan_do_receive(struc + return false; + } + +-static inline struct sk_buff *vlan_untag(struct sk_buff *skb) +-{ +- return skb; +-} +- + static inline int vlan_vid_add(struct net_device *dev, __be16 proto, u16 vid) + { + return 0; +--- a/include/linux/skbuff.h ++++ b/include/linux/skbuff.h +@@ -2458,6 +2458,7 @@ int skb_shift(struct sk_buff *tgt, struc + void skb_scrub_packet(struct sk_buff *skb, bool xnet); + unsigned int skb_gso_transport_seglen(const struct sk_buff *skb); + struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features); ++struct sk_buff *skb_vlan_untag(struct sk_buff *skb); + + struct skb_checksum_ops { + __wsum (*update)(const void *mem, int len, __wsum wsum); +--- a/net/8021q/vlan_core.c ++++ b/net/8021q/vlan_core.c +@@ -106,59 +106,6 @@ u16 vlan_dev_vlan_id(const struct net_de + } + EXPORT_SYMBOL(vlan_dev_vlan_id); + +-static struct sk_buff *vlan_reorder_header(struct sk_buff *skb) +-{ +- if (skb_cow(skb, skb_headroom(skb)) < 0) { +- kfree_skb(skb); +- return NULL; +- } +- +- memmove(skb->data - ETH_HLEN, skb->data - VLAN_ETH_HLEN, 2 * ETH_ALEN); +- skb->mac_header += VLAN_HLEN; +- return skb; +-} +- +-struct 
sk_buff *vlan_untag(struct sk_buff *skb) +-{ +- struct vlan_hdr *vhdr; +- u16 vlan_tci; +- +- if (unlikely(vlan_tx_tag_present(skb))) { +- /* vlan_tci is already set-up so leave this for another time */ +- return skb; +- } +- +- skb = skb_share_check(skb, GFP_ATOMIC); +- if (unlikely(!skb)) +- goto err_free; +- +- if (unlikely(!pskb_may_pull(skb, VLAN_HLEN))) +- goto err_free; +- +- vhdr = (struct vlan_hdr *) skb->data; +- vlan_tci = ntohs(vhdr->h_vlan_TCI); +- __vlan_hwaccel_put_tag(skb, skb->protocol, vlan_tci); +- +- skb_pull_rcsum(skb, VLAN_HLEN); +- vlan_set_encap_proto(skb, vhdr); +- +- skb = vlan_reorder_header(skb); +- if (unlikely(!skb)) +- goto err_free; +- +- skb_reset_network_header(skb); +- skb_reset_transport_header(skb); +- skb_reset_mac_len(skb); +- +- return skb; +- +-err_free: +- kfree_skb(skb); +- return NULL; +-} +-EXPORT_SYMBOL(vlan_untag); +- +- + /* + * vlan info and vid list + */ +--- a/net/bridge/br_vlan.c ++++ b/net/bridge/br_vlan.c +@@ -179,7 +179,7 @@ bool br_allowed_ingress(struct net_bridg + if (unlikely(!vlan_tx_tag_present(skb) && + (skb->protocol == htons(ETH_P_8021Q) || + skb->protocol == htons(ETH_P_8021AD)))) { +- skb = vlan_untag(skb); ++ skb = skb_vlan_untag(skb); + if (unlikely(!skb)) + return false; + } +--- a/net/core/dev.c ++++ b/net/core/dev.c +@@ -3554,7 +3554,7 @@ another_round: + + if (skb->protocol == cpu_to_be16(ETH_P_8021Q) || + skb->protocol == cpu_to_be16(ETH_P_8021AD)) { +- skb = vlan_untag(skb); ++ skb = skb_vlan_untag(skb); + if (unlikely(!skb)) + goto unlock; + } +--- a/net/core/netpoll.c ++++ b/net/core/netpoll.c +@@ -788,7 +788,7 @@ int __netpoll_rx(struct sk_buff *skb, st + } + + if (skb->protocol == cpu_to_be16(ETH_P_8021Q)) { +- skb = vlan_untag(skb); ++ skb = skb_vlan_untag(skb); + if (unlikely(!skb)) + goto out; + } +--- a/net/core/skbuff.c ++++ b/net/core/skbuff.c +@@ -62,6 +62,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -3963,3 +3964,55 @@ unsigned int 
skb_gso_transport_seglen(co + return shinfo->gso_size; + } + EXPORT_SYMBOL_GPL(skb_gso_transport_seglen); ++ ++static struct sk_buff *skb_reorder_vlan_header(struct sk_buff *skb) ++{ ++ if (skb_cow(skb, skb_headroom(skb)) < 0) { ++ kfree_skb(skb); ++ return NULL; ++ } ++ ++ memmove(skb->data - ETH_HLEN, skb->data - VLAN_ETH_HLEN, 2 * ETH_ALEN); ++ skb->mac_header += VLAN_HLEN; ++ return skb; ++} ++ ++struct sk_buff *skb_vlan_untag(struct sk_buff *skb) ++{ ++ struct vlan_hdr *vhdr; ++ u16 vlan_tci; ++ ++ if (unlikely(vlan_tx_tag_present(skb))) { ++ /* vlan_tci is already set-up so leave this for another time */ ++ return skb; ++ } ++ ++ skb = skb_share_check(skb, GFP_ATOMIC); ++ if (unlikely(!skb)) ++ goto err_free; ++ ++ if (unlikely(!pskb_may_pull(skb, VLAN_HLEN))) ++ goto err_free; ++ ++ vhdr = (struct vlan_hdr *)skb->data; ++ vlan_tci = ntohs(vhdr->h_vlan_TCI); ++ __vlan_hwaccel_put_tag(skb, skb->protocol, vlan_tci); ++ ++ skb_pull_rcsum(skb, VLAN_HLEN); ++ vlan_set_encap_proto(skb, vhdr); ++ ++ skb = skb_reorder_vlan_header(skb); ++ if (unlikely(!skb)) ++ goto err_free; ++ ++ skb_reset_network_header(skb); ++ skb_reset_transport_header(skb); ++ skb_reset_mac_len(skb); ++ ++ return skb; ++ ++err_free: ++ kfree_skb(skb); ++ return NULL; ++} ++EXPORT_SYMBOL(skb_vlan_untag); diff --git a/queue-3.14/netlink-reset-network-header-before-passing-to-taps.patch b/queue-3.14/netlink-reset-network-header-before-passing-to-taps.patch new file mode 100644 index 00000000000..3fca1ca6a3f --- /dev/null +++ b/queue-3.14/netlink-reset-network-header-before-passing-to-taps.patch @@ -0,0 +1,41 @@ +From foo@baz Sun Oct 12 16:38:53 CEST 2014 +From: Daniel Borkmann +Date: Thu, 7 Aug 2014 22:22:47 +0200 +Subject: netlink: reset network header before passing to taps + +From: Daniel Borkmann + +[ Upstream commit 4e48ed883c72e78c5a910f8831ffe90c9b18f0ec ] + +netlink doesn't set any network header offset thus when the skb is +being passed to tap devices via dev_queue_xmit_nit(), it emits 
klog +false positives due to it being unset like: + + ... + [ 124.990397] protocol 0000 is buggy, dev nlmon0 + [ 124.990411] protocol 0000 is buggy, dev nlmon0 + ... + +So just reset the network header before passing to the device; for +packet sockets that just means nothing will change - mac and net +offset hold the same value just as before. + +Reported-by: Marcel Holtmann +Signed-off-by: Daniel Borkmann +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/netlink/af_netlink.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/net/netlink/af_netlink.c ++++ b/net/netlink/af_netlink.c +@@ -205,7 +205,7 @@ static int __netlink_deliver_tap_skb(str + nskb->protocol = htons((u16) sk->sk_protocol); + nskb->pkt_type = netlink_is_kernel(sk) ? + PACKET_KERNEL : PACKET_USER; +- ++ skb_reset_network_header(nskb); + ret = dev_queue_xmit(nskb); + if (unlikely(ret > 0)) + ret = net_xmit_errno(ret); diff --git a/queue-3.14/openvswitch-fix-panic-with-multiple-vlan-headers.patch b/queue-3.14/openvswitch-fix-panic-with-multiple-vlan-headers.patch new file mode 100644 index 00000000000..1b602dd7e46 --- /dev/null +++ b/queue-3.14/openvswitch-fix-panic-with-multiple-vlan-headers.patch @@ -0,0 +1,55 @@ +From foo@baz Sun Oct 12 16:38:53 CEST 2014 +From: Jiri Benc +Date: Thu, 21 Aug 2014 21:33:44 +0200 +Subject: openvswitch: fix panic with multiple vlan headers + +From: Jiri Benc + +[ Upstream commit 2ba5af42a7b59ef01f9081234d8855140738defd ] + +When there are multiple vlan headers present in a received frame, the first +one is put into vlan_tci and protocol is set to ETH_P_8021Q. Anything in the +skb beyond the VLAN TPID may be still non-linear, including the inner TCI +and ethertype. While ovs_flow_extract takes care of IP and IPv6 headers, it +does nothing with ETH_P_8021Q. Later, if OVS_ACTION_ATTR_POP_VLAN is +executed, __pop_vlan_tci pulls the next vlan header into vlan_tci. + +This leads to two things: + +1. 
Part of the resulting ethernet header is in the non-linear part of the + skb. When eth_type_trans is called later as the result of + OVS_ACTION_ATTR_OUTPUT, kernel BUGs in __skb_pull. Also, __pop_vlan_tci + is in fact accessing random data when it reads past the TPID. + +2. network_header points into the ethernet header instead of behind it. + mac_len is set to a wrong value (10), too. + +Reported-by: Yulong Pei +Signed-off-by: Jiri Benc +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/openvswitch/actions.c | 5 +++++ + 1 file changed, 5 insertions(+) + +--- a/net/openvswitch/actions.c ++++ b/net/openvswitch/actions.c +@@ -42,6 +42,9 @@ static int do_execute_actions(struct dat + + static int make_writable(struct sk_buff *skb, int write_len) + { ++ if (!pskb_may_pull(skb, write_len)) ++ return -ENOMEM; ++ + if (!skb_cloned(skb) || skb_clone_writable(skb, write_len)) + return 0; + +@@ -70,6 +73,8 @@ static int __pop_vlan_tci(struct sk_buff + + vlan_set_encap_proto(skb, vhdr); + skb->mac_header += VLAN_HLEN; ++ if (skb_network_offset(skb) < ETH_HLEN) ++ skb_set_network_header(skb, ETH_HLEN); + skb_reset_mac_len(skb); + + return 0; diff --git a/queue-3.14/packet-handle-too-big-packets-for-packet_v3.patch b/queue-3.14/packet-handle-too-big-packets-for-packet_v3.patch new file mode 100644 index 00000000000..8d31c5f5afe --- /dev/null +++ b/queue-3.14/packet-handle-too-big-packets-for-packet_v3.patch @@ -0,0 +1,83 @@ +From foo@baz Sun Oct 12 16:38:53 CEST 2014 +From: Eric Dumazet +Date: Fri, 15 Aug 2014 09:16:04 -0700 +Subject: packet: handle too big packets for PACKET_V3 + +From: Eric Dumazet + +[ Upstream commit dc808110bb62b64a448696ecac3938902c92e1ab ] + +af_packet can currently overwrite kernel memory by out of bound +accesses, because it assumed a [new] block can always hold one frame. + +This is not generally the case, even if most existing tools do it right. 
+ +This patch clamps too long frames as API permits, and issue a one time +error on syslog. + +[ 394.357639] tpacket_rcv: packet too big, clamped from 5042 to 3966. macoff=82 + +In this example, packet header tp_snaplen was set to 3966, +and tp_len was set to 5042 (skb->len) + +Signed-off-by: Eric Dumazet +Fixes: f6fb8f100b80 ("af-packet: TPACKET_V3 flexible buffer implementation.") +Acked-by: Daniel Borkmann +Acked-by: Neil Horman +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/packet/af_packet.c | 17 +++++++++++++++++ + net/packet/internal.h | 1 + + 2 files changed, 18 insertions(+) + +--- a/net/packet/af_packet.c ++++ b/net/packet/af_packet.c +@@ -635,6 +635,7 @@ static void init_prb_bdqc(struct packet_ + p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov); + p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv; + ++ p1->max_frame_len = p1->kblk_size - BLK_PLUS_PRIV(p1->blk_sizeof_priv); + prb_init_ft_ops(p1, req_u); + prb_setup_retire_blk_timer(po, tx_ring); + prb_open_block(p1, pbd); +@@ -1946,6 +1947,18 @@ static int tpacket_rcv(struct sk_buff *s + if ((int)snaplen < 0) + snaplen = 0; + } ++ } else if (unlikely(macoff + snaplen > ++ GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len)) { ++ u32 nval; ++ ++ nval = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len - macoff; ++ pr_err_once("tpacket_rcv: packet too big, clamped from %u to %u. 
macoff=%u\n", ++ snaplen, nval, macoff); ++ snaplen = nval; ++ if (unlikely((int)snaplen < 0)) { ++ snaplen = 0; ++ macoff = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len; ++ } + } + spin_lock(&sk->sk_receive_queue.lock); + h.raw = packet_current_rx_frame(po, skb, +@@ -3779,6 +3792,10 @@ static int packet_set_ring(struct sock * + goto out; + if (unlikely(req->tp_block_size & (PAGE_SIZE - 1))) + goto out; ++ if (po->tp_version >= TPACKET_V3 && ++ (int)(req->tp_block_size - ++ BLK_PLUS_PRIV(req_u->req3.tp_sizeof_priv)) <= 0) ++ goto out; + if (unlikely(req->tp_frame_size < po->tp_hdrlen + + po->tp_reserve)) + goto out; +--- a/net/packet/internal.h ++++ b/net/packet/internal.h +@@ -29,6 +29,7 @@ struct tpacket_kbdq_core { + char *pkblk_start; + char *pkblk_end; + int kblk_size; ++ unsigned int max_frame_len; + unsigned int knum_blocks; + uint64_t knxt_seq_num; + char *prev; diff --git a/queue-3.14/revert-net-macb-add-pinctrl-consumer-support.patch b/queue-3.14/revert-net-macb-add-pinctrl-consumer-support.patch new file mode 100644 index 00000000000..d04d63ba843 --- /dev/null +++ b/queue-3.14/revert-net-macb-add-pinctrl-consumer-support.patch @@ -0,0 +1,55 @@ +From foo@baz Sun Oct 12 16:38:53 CEST 2014 +From: Soren Brinkmann +Date: Mon, 22 Sep 2014 16:49:08 -0700 +Subject: Revert "net/macb: add pinctrl consumer support" + +From: Soren Brinkmann + +[ Upstream commit 9026968abe7ad102f4ac5c6d96d733643f75399c ] + +This reverts commit 8ef29f8aae524bd51298fb10ac6a5ce6c4c5a3d8. +The driver core already calls pinctrl_get() and claims the default +state. There is no need to replicate this in the driver. +Acked-by: Nicolas Ferre + +Acked-by: Nicolas Ferre +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/cadence/macb.c | 11 ----------- + 1 file changed, 11 deletions(-) + +--- a/drivers/net/ethernet/cadence/macb.c ++++ b/drivers/net/ethernet/cadence/macb.c +@@ -30,7 +30,6 @@ + #include + #include + #include +-#include + + #include "macb.h" + +@@ -1810,7 +1809,6 @@ static int __init macb_probe(struct plat + struct phy_device *phydev; + u32 config; + int err = -ENXIO; +- struct pinctrl *pinctrl; + const char *mac; + + regs = platform_get_resource(pdev, IORESOURCE_MEM, 0); +@@ -1819,15 +1817,6 @@ static int __init macb_probe(struct plat + goto err_out; + } + +- pinctrl = devm_pinctrl_get_select_default(&pdev->dev); +- if (IS_ERR(pinctrl)) { +- err = PTR_ERR(pinctrl); +- if (err == -EPROBE_DEFER) +- goto err_out; +- +- dev_warn(&pdev->dev, "No pinctrl provided\n"); +- } +- + err = -ENOMEM; + dev = alloc_etherdev(sizeof(*bp)); + if (!dev) diff --git a/queue-3.14/rtnetlink-fix-vf-info-size.patch b/queue-3.14/rtnetlink-fix-vf-info-size.patch new file mode 100644 index 00000000000..b6ea0f54123 --- /dev/null +++ b/queue-3.14/rtnetlink-fix-vf-info-size.patch @@ -0,0 +1,36 @@ +From foo@baz Sun Oct 12 16:38:53 CEST 2014 +From: Jiri Benc +Date: Fri, 8 Aug 2014 16:44:32 +0200 +Subject: rtnetlink: fix VF info size + +From: Jiri Benc + +[ Upstream commit 945a36761fd7877660f630bbdeb4ff9ff80d1935 ] + +Commit 1d8faf48c74b8 ("net/core: Add VF link state control") added new +attribute to IFLA_VF_INFO group in rtnl_fill_ifinfo but did not adjust size +of the allocated memory in if_nlmsg_size/rtnl_vfinfo_size. As the result, we +may trigger warnings in rtnl_getlink and similar functions when many VF +links are enabled, as the information does not fit into the allocated skb. + +Fixes: 1d8faf48c74b8 ("net/core: Add VF link state control") +Reported-by: Yulong Pei +Signed-off-by: Jiri Benc +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/core/rtnetlink.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +--- a/net/core/rtnetlink.c ++++ b/net/core/rtnetlink.c +@@ -799,7 +799,8 @@ static inline int rtnl_vfinfo_size(const + (nla_total_size(sizeof(struct ifla_vf_mac)) + + nla_total_size(sizeof(struct ifla_vf_vlan)) + + nla_total_size(sizeof(struct ifla_vf_tx_rate)) + +- nla_total_size(sizeof(struct ifla_vf_spoofchk))); ++ nla_total_size(sizeof(struct ifla_vf_spoofchk)) + ++ nla_total_size(sizeof(struct ifla_vf_link_state))); + return size; + } else + return 0; diff --git a/queue-3.14/sctp-handle-association-restarts-when-the-socket-is-closed.patch b/queue-3.14/sctp-handle-association-restarts-when-the-socket-is-closed.patch new file mode 100644 index 00000000000..5db42af7058 --- /dev/null +++ b/queue-3.14/sctp-handle-association-restarts-when-the-socket-is-closed.patch @@ -0,0 +1,79 @@ +From foo@baz Sun Oct 12 16:38:53 CEST 2014 +From: Vlad Yasevich +Date: Fri, 3 Oct 2014 18:16:20 -0400 +Subject: sctp: handle association restarts when the socket is closed. + +From: Vlad Yasevich + +[ Upstream commit bdf6fa52f01b941d4a80372d56de465bdbbd1d23 ] + +Currently association restarts do not take into consideration the +state of the socket. When a restart happens, the current association +simply transitions into established state. This creates a condition +where a remote system, through the restart procedure, may create a +local association that is in no way reachable by the user. The conditions +to trigger this are as follows: + 1) Remote does not acknowledge some data causing data to remain + outstanding. + 2) Local application calls close() on the socket. Since data + is still outstanding, the association is placed in SHUTDOWN_PENDING + state. However, the socket is closed. + 3) The remote tries to create a new association, triggering a restart + on the local system. The association moves from SHUTDOWN_PENDING + to ESTABLISHED. 
At this point, it is no longer reachable by + any socket on the local system. + +This patch addresses the above situation by moving the newly ESTABLISHED +association into SHUTDOWN-SENT state and bundling a SHUTDOWN after +the COOKIE-ACK chunk. This way, the restarted association immediately +enters the shutdown procedure and forces the termination of the +unreachable association. + +Reported-by: David Laight +Signed-off-by: Vlad Yasevich +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + include/net/sctp/command.h | 2 +- + net/sctp/sm_statefuns.c | 19 ++++++++++++++++--- + 2 files changed, 17 insertions(+), 4 deletions(-) + +--- a/include/net/sctp/command.h ++++ b/include/net/sctp/command.h +@@ -115,7 +115,7 @@ typedef enum { + * analysis of the state functions, but in reality just taken from + * thin air in the hopes othat we don't trigger a kernel panic. + */ +-#define SCTP_MAX_NUM_COMMANDS 14 ++#define SCTP_MAX_NUM_COMMANDS 20 + + typedef union { + __s32 i32; +--- a/net/sctp/sm_statefuns.c ++++ b/net/sctp/sm_statefuns.c +@@ -1775,9 +1775,22 @@ static sctp_disposition_t sctp_sf_do_dup + /* Update the content of current association. */ + sctp_add_cmd_sf(commands, SCTP_CMD_UPDATE_ASSOC, SCTP_ASOC(new_asoc)); + sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP, SCTP_ULPEVENT(ev)); +- sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE, +- SCTP_STATE(SCTP_STATE_ESTABLISHED)); +- sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(repl)); ++ if (sctp_state(asoc, SHUTDOWN_PENDING) && ++ (sctp_sstate(asoc->base.sk, CLOSING) || ++ sock_flag(asoc->base.sk, SOCK_DEAD))) { ++ /* if were currently in SHUTDOWN_PENDING, but the socket ++ * has been closed by user, don't transition to ESTABLISHED. ++ * Instead trigger SHUTDOWN bundled with COOKIE_ACK. 
++ */ ++ sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(repl)); ++ return sctp_sf_do_9_2_start_shutdown(net, ep, asoc, ++ SCTP_ST_CHUNK(0), NULL, ++ commands); ++ } else { ++ sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE, ++ SCTP_STATE(SCTP_STATE_ESTABLISHED)); ++ sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(repl)); ++ } + return SCTP_DISPOSITION_CONSUME; + + nomem_ev: diff --git a/queue-3.14/series b/queue-3.14/series new file mode 100644 index 00000000000..e9235f8b9af --- /dev/null +++ b/queue-3.14/series @@ -0,0 +1,31 @@ +netlink-reset-network-header-before-passing-to-taps.patch +rtnetlink-fix-vf-info-size.patch +net-always-untag-vlan-tagged-traffic-on-input.patch +myri10ge-check-for-dma-mapping-errors.patch +i40e-don-t-stop-driver-probe-when-querying-dcb-config-fails.patch +tcp-don-t-use-timestamp-from-repaired-skb-s-to-calculate-rtt-v2.patch +sit-fix-ipip6_tunnel_lookup-device-matching-criteria.patch +tcp-fix-tcp_release_cb-to-dispatch-via-address-family-for-mtu_reduced.patch +tcp-fix-ssthresh-and-undo-for-consecutive-short-frto-episodes.patch +packet-handle-too-big-packets-for-packet_v3.patch +openvswitch-fix-panic-with-multiple-vlan-headers.patch +vxlan-fix-incorrect-initializer-in-union-vxlan_addr.patch +l2tp-fix-race-while-getting-pmtu-on-ppp-pseudo-wire.patch +ipv6-fix-rtnl-locking-in-setsockopt-for-anycast-and-multicast.patch +bonding-fix-div-by-zero-while-enslaving-and-transmitting.patch +ipv6-restore-the-behavior-of-ipv6_sock_ac_drop.patch +bridge-check-if-vlan-filtering-is-enabled-only-once.patch +bridge-fix-br_should_learn-to-check-vlan_enabled.patch +net-allow-macvlans-to-move-to-net-namespace.patch +tg3-work-around-hw-fw-limitations-with-vlan-encapsulated-frames.patch +tg3-allow-for-recieve-of-full-size-8021ad-frames.patch +xfrm-generate-blackhole-routes-only-from-route-lookup-functions.patch +xfrm-generate-queueing-routes-only-from-route-lookup-functions.patch +macvtap-fix-race-between-device-delete-and-open.patch 
+revert-net-macb-add-pinctrl-consumer-support.patch +gro-fix-aggregation-for-skb-using-frag_list.patch +hyperv-fix-a-bug-in-netvsc_start_xmit.patch +ip6_gre-fix-flowi6_proto-value-in-xmit-path.patch +team-avoid-race-condition-in-scheduling-delayed-work.patch +sctp-handle-association-restarts-when-the-socket-is-closed.patch +tcp-fixing-tlp-s-fin-recovery.patch diff --git a/queue-3.14/sit-fix-ipip6_tunnel_lookup-device-matching-criteria.patch b/queue-3.14/sit-fix-ipip6_tunnel_lookup-device-matching-criteria.patch new file mode 100644 index 00000000000..1c379c00d61 --- /dev/null +++ b/queue-3.14/sit-fix-ipip6_tunnel_lookup-device-matching-criteria.patch @@ -0,0 +1,55 @@ +From foo@baz Sun Oct 12 16:38:53 CEST 2014 +From: Shmulik Ladkani +Date: Thu, 14 Aug 2014 15:27:20 +0300 +Subject: sit: Fix ipip6_tunnel_lookup device matching criteria + +From: Shmulik Ladkani + +[ Upstream commit bc8fc7b8f825ef17a0fb9e68c18ce94fa66ab337 ] + +As of 4fddbf5d78 ("sit: strictly restrict incoming traffic to tunnel link device"), +when looking up a tunnel, tunnel's underlying interface (t->parms.link) +is verified to match incoming traffic's ingress device. + +However the comparison was incorrectly based on skb->dev->iflink. + +Instead, dev->ifindex should be used, which correctly represents the +interface from which the IP stack hands the ipip6 packets. + +This allows setting up sit tunnels bound to vlan interfaces (otherwise +incoming ipip6 traffic on the vlan interface was dropped due to +ipip6_tunnel_lookup match failure). + +Signed-off-by: Shmulik Ladkani +Acked-by: Nicolas Dichtel +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv6/sit.c | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +--- a/net/ipv6/sit.c ++++ b/net/ipv6/sit.c +@@ -101,19 +101,19 @@ static struct ip_tunnel *ipip6_tunnel_lo + for_each_ip_tunnel_rcu(t, sitn->tunnels_r_l[h0 ^ h1]) { + if (local == t->parms.iph.saddr && + remote == t->parms.iph.daddr && +- (!dev || !t->parms.link || dev->iflink == t->parms.link) && ++ (!dev || !t->parms.link || dev->ifindex == t->parms.link) && + (t->dev->flags & IFF_UP)) + return t; + } + for_each_ip_tunnel_rcu(t, sitn->tunnels_r[h0]) { + if (remote == t->parms.iph.daddr && +- (!dev || !t->parms.link || dev->iflink == t->parms.link) && ++ (!dev || !t->parms.link || dev->ifindex == t->parms.link) && + (t->dev->flags & IFF_UP)) + return t; + } + for_each_ip_tunnel_rcu(t, sitn->tunnels_l[h1]) { + if (local == t->parms.iph.saddr && +- (!dev || !t->parms.link || dev->iflink == t->parms.link) && ++ (!dev || !t->parms.link || dev->ifindex == t->parms.link) && + (t->dev->flags & IFF_UP)) + return t; + } diff --git a/queue-3.14/tcp-don-t-use-timestamp-from-repaired-skb-s-to-calculate-rtt-v2.patch b/queue-3.14/tcp-don-t-use-timestamp-from-repaired-skb-s-to-calculate-rtt-v2.patch new file mode 100644 index 00000000000..521db648b48 --- /dev/null +++ b/queue-3.14/tcp-don-t-use-timestamp-from-repaired-skb-s-to-calculate-rtt-v2.patch @@ -0,0 +1,116 @@ +From foo@baz Sun Oct 12 16:38:53 CEST 2014 +From: Andrey Vagin +Date: Wed, 13 Aug 2014 16:03:10 +0400 +Subject: tcp: don't use timestamp from repaired skb-s to calculate RTT (v2) + +From: Andrey Vagin + +[ Upstream commit 9d186cac7ffb1831e9f34cb4a3a8b22abb9dd9d4 ] + +We don't know right timestamp for repaired skb-s. Wrong RTT estimations +isn't good, because some congestion modules heavily depends on it. + +This patch adds the TCPCB_REPAIRED flag, which is included in +TCPCB_RETRANS. + +Thanks to Eric for the advice how to fix this issue. 
+ +This patch fixes the warning: +[ 879.562947] WARNING: CPU: 0 PID: 2825 at net/ipv4/tcp_input.c:3078 tcp_ack+0x11f5/0x1380() +[ 879.567253] CPU: 0 PID: 2825 Comm: socket-tcpbuf-l Not tainted 3.16.0-next-20140811 #1 +[ 879.567829] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 +[ 879.568177] 0000000000000000 00000000c532680c ffff880039643d00 ffffffff817aa2d2 +[ 879.568776] 0000000000000000 ffff880039643d38 ffffffff8109afbd ffff880039d6ba80 +[ 879.569386] ffff88003a449800 000000002983d6bd 0000000000000000 000000002983d6bc +[ 879.569982] Call Trace: +[ 879.570264] [] dump_stack+0x4d/0x66 +[ 879.570599] [] warn_slowpath_common+0x7d/0xa0 +[ 879.570935] [] warn_slowpath_null+0x1a/0x20 +[ 879.571292] [] tcp_ack+0x11f5/0x1380 +[ 879.571614] [] tcp_rcv_established+0x1ed/0x710 +[ 879.571958] [] tcp_v4_do_rcv+0x10a/0x370 +[ 879.572315] [] release_sock+0x89/0x1d0 +[ 879.572642] [] do_tcp_setsockopt.isra.36+0x120/0x860 +[ 879.573000] [] ? rcu_read_lock_held+0x6e/0x80 +[ 879.573352] [] tcp_setsockopt+0x32/0x40 +[ 879.573678] [] sock_common_setsockopt+0x14/0x20 +[ 879.574031] [] SyS_setsockopt+0x80/0xf0 +[ 879.574393] [] system_call_fastpath+0x16/0x1b +[ 879.574730] ---[ end trace a17cbc38eb8c5c00 ]--- + +v2: moving setting of skb->when for repaired skb-s in tcp_write_xmit, + where it's set for other skb-s. + +Fixes: 431a91242d8d ("tcp: timestamp SYN+DATA messages") +Fixes: 740b0f1841f6 ("tcp: switch rtt estimations to usec resolution") +Cc: Eric Dumazet +Cc: Pavel Emelyanov +Cc: "David S. Miller" +Signed-off-by: Andrey Vagin +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + include/net/tcp.h | 4 +++- + net/ipv4/tcp.c | 14 +++++++------- + net/ipv4/tcp_output.c | 5 ++++- + 3 files changed, 14 insertions(+), 9 deletions(-) + +--- a/include/net/tcp.h ++++ b/include/net/tcp.h +@@ -720,8 +720,10 @@ struct tcp_skb_cb { + #define TCPCB_SACKED_RETRANS 0x02 /* SKB retransmitted */ + #define TCPCB_LOST 0x04 /* SKB is lost */ + #define TCPCB_TAGBITS 0x07 /* All tag bits */ ++#define TCPCB_REPAIRED 0x10 /* SKB repaired (no skb_mstamp) */ + #define TCPCB_EVER_RETRANS 0x80 /* Ever retransmitted frame */ +-#define TCPCB_RETRANS (TCPCB_SACKED_RETRANS|TCPCB_EVER_RETRANS) ++#define TCPCB_RETRANS (TCPCB_SACKED_RETRANS|TCPCB_EVER_RETRANS| \ ++ TCPCB_REPAIRED) + + __u8 ip_dsfield; /* IPv4 tos or IPv6 dsfield */ + /* 1 byte hole */ +--- a/net/ipv4/tcp.c ++++ b/net/ipv4/tcp.c +@@ -1175,13 +1175,6 @@ new_segment: + goto wait_for_memory; + + /* +- * All packets are restored as if they have +- * already been sent. +- */ +- if (tp->repair) +- TCP_SKB_CB(skb)->when = tcp_time_stamp; +- +- /* + * Check whether we can use HW checksum. + */ + if (sk->sk_route_caps & NETIF_F_ALL_CSUM) +@@ -1190,6 +1183,13 @@ new_segment: + skb_entail(sk, skb); + copy = size_goal; + max = size_goal; ++ ++ /* All packets are restored as if they have ++ * already been sent. skb_mstamp isn't set to ++ * avoid wrong rtt estimation. ++ */ ++ if (tp->repair) ++ TCP_SKB_CB(skb)->sacked |= TCPCB_REPAIRED; + } + + /* Try to append data to the end of skb. 
*/ +--- a/net/ipv4/tcp_output.c ++++ b/net/ipv4/tcp_output.c +@@ -1876,8 +1876,11 @@ static bool tcp_write_xmit(struct sock * + tso_segs = tcp_init_tso_segs(sk, skb, mss_now); + BUG_ON(!tso_segs); + +- if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) ++ if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) { ++ /* "when" is used as a start point for the retransmit timer */ ++ TCP_SKB_CB(skb)->when = tcp_time_stamp; + goto repair; /* Skip network transmission */ ++ } + + cwnd_quota = tcp_cwnd_test(tp, skb); + if (!cwnd_quota) { diff --git a/queue-3.14/tcp-fix-ssthresh-and-undo-for-consecutive-short-frto-episodes.patch b/queue-3.14/tcp-fix-ssthresh-and-undo-for-consecutive-short-frto-episodes.patch new file mode 100644 index 00000000000..689900ed671 --- /dev/null +++ b/queue-3.14/tcp-fix-ssthresh-and-undo-for-consecutive-short-frto-episodes.patch @@ -0,0 +1,81 @@ +From foo@baz Sun Oct 12 16:38:53 CEST 2014 +From: Neal Cardwell +Date: Thu, 14 Aug 2014 16:13:07 -0400 +Subject: tcp: fix ssthresh and undo for consecutive short FRTO episodes + +From: Neal Cardwell + +[ Upstream commit 0c9ab09223fe9922baeb22546c9a90d774a4bde6 ] + +Fix TCP FRTO logic so that it always notices when snd_una advances, +indicating that any RTO after that point will be a new and distinct +loss episode. + +Previously there was a very specific sequence that could cause FRTO to +fail to notice a new loss episode had started: + +(1) RTO timer fires, enter FRTO and retransmit packet 1 in write queue +(2) receiver ACKs packet 1 +(3) FRTO sends 2 more packets +(4) RTO timer fires again (should start a new loss episode) + +The problem was in step (3) above, where tcp_process_loss() returned +early (in the spot marked "Step 2.b"), so that it never got to the +logic to clear icsk_retransmits. Thus icsk_retransmits stayed +non-zero. 
Thus in step (4) tcp_enter_loss() would see the non-zero +icsk_retransmits, decide that this RTO is not a new episode, and +decide not to cut ssthresh and remember the current cwnd and ssthresh +for undo. + +There were two main consequences to the bug that we have +observed. First, ssthresh was not decreased in step (4). Second, when +there was a series of such FRTO (1-4) sequences that happened to be +followed by an FRTO undo, we would restore the cwnd and ssthresh from +before the entire series started (instead of the cwnd and ssthresh +from before the most recent RTO). This could result in cwnd and +ssthresh being restored to values much bigger than the proper values. + +Signed-off-by: Neal Cardwell +Signed-off-by: Yuchung Cheng +Fixes: e33099f96d99c ("tcp: implement RFC5682 F-RTO") +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv4/tcp_input.c | 8 +++----- + 1 file changed, 3 insertions(+), 5 deletions(-) + +--- a/net/ipv4/tcp_input.c ++++ b/net/ipv4/tcp_input.c +@@ -2678,7 +2678,6 @@ static void tcp_enter_recovery(struct so + */ + static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack) + { +- struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk); + bool recovered = !before(tp->snd_una, tp->high_seq); + +@@ -2704,12 +2703,9 @@ static void tcp_process_loss(struct sock + + if (recovered) { + /* F-RTO RFC5682 sec 3.1 step 2.a and 1st part of step 3.a */ +- icsk->icsk_retransmits = 0; + tcp_try_undo_recovery(sk); + return; + } +- if (flag & FLAG_DATA_ACKED) +- icsk->icsk_retransmits = 0; + if (tcp_is_reno(tp)) { + /* A Reno DUPACK means new data in F-RTO step 2.b above are + * delivered. Lower inflight to clock out (re)tranmissions. 
+@@ -3398,8 +3394,10 @@ static int tcp_ack(struct sock *sk, cons + icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) + tcp_rearm_rto(sk); + +- if (after(ack, prior_snd_una)) ++ if (after(ack, prior_snd_una)) { + flag |= FLAG_SND_UNA_ADVANCED; ++ icsk->icsk_retransmits = 0; ++ } + + prior_fackets = tp->fackets_out; + prior_in_flight = tcp_packets_in_flight(tp); diff --git a/queue-3.14/tcp-fix-tcp_release_cb-to-dispatch-via-address-family-for-mtu_reduced.patch b/queue-3.14/tcp-fix-tcp_release_cb-to-dispatch-via-address-family-for-mtu_reduced.patch new file mode 100644 index 00000000000..40dfc410c17 --- /dev/null +++ b/queue-3.14/tcp-fix-tcp_release_cb-to-dispatch-via-address-family-for-mtu_reduced.patch @@ -0,0 +1,134 @@ +From foo@baz Sun Oct 12 16:38:53 CEST 2014 +From: Neal Cardwell +Date: Thu, 14 Aug 2014 12:40:05 -0400 +Subject: tcp: fix tcp_release_cb() to dispatch via address family for mtu_reduced() + +From: Neal Cardwell + +[ Upstream commit 4fab9071950c2021d846e18351e0f46a1cffd67b ] + +Make sure we use the correct address-family-specific function for +handling MTU reductions from within tcp_release_cb(). + +Previously AF_INET6 sockets were incorrectly always using the IPv6 +code path when sometimes they were handling IPv4 traffic and thus had +an IPv4 dst. + +Signed-off-by: Neal Cardwell +Signed-off-by: Eric Dumazet +Diagnosed-by: Willem de Bruijn +Fixes: 563d34d057862 ("tcp: dont drop MTU reduction indications") +Reviewed-by: Hannes Frederic Sowa +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + include/net/inet_connection_sock.h | 1 + + include/net/sock.h | 1 - + include/net/tcp.h | 1 + + net/ipv4/tcp_ipv4.c | 5 +++-- + net/ipv4/tcp_output.c | 2 +- + net/ipv6/tcp_ipv6.c | 3 ++- + 6 files changed, 8 insertions(+), 5 deletions(-) + +--- a/include/net/inet_connection_sock.h ++++ b/include/net/inet_connection_sock.h +@@ -62,6 +62,7 @@ struct inet_connection_sock_af_ops { + void (*addr2sockaddr)(struct sock *sk, struct sockaddr *); + int (*bind_conflict)(const struct sock *sk, + const struct inet_bind_bucket *tb, bool relax); ++ void (*mtu_reduced)(struct sock *sk); + }; + + /** inet_connection_sock - INET connection oriented sock +--- a/include/net/sock.h ++++ b/include/net/sock.h +@@ -969,7 +969,6 @@ struct proto { + struct sk_buff *skb); + + void (*release_cb)(struct sock *sk); +- void (*mtu_reduced)(struct sock *sk); + + /* Keeping track of sk's, looking them up, and port selection methods. */ + void (*hash)(struct sock *sk); +--- a/include/net/tcp.h ++++ b/include/net/tcp.h +@@ -453,6 +453,7 @@ const u8 *tcp_parse_md5sig_option(const + */ + + void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb); ++void tcp_v4_mtu_reduced(struct sock *sk); + int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb); + struct sock *tcp_create_openreq_child(struct sock *sk, + struct request_sock *req, +--- a/net/ipv4/tcp_ipv4.c ++++ b/net/ipv4/tcp_ipv4.c +@@ -269,7 +269,7 @@ EXPORT_SYMBOL(tcp_v4_connect); + * It can be called through tcp_release_cb() if socket was owned by user + * at the time tcp_v4_err() was called to handle ICMP message. 
+ */ +-static void tcp_v4_mtu_reduced(struct sock *sk) ++void tcp_v4_mtu_reduced(struct sock *sk) + { + struct dst_entry *dst; + struct inet_sock *inet = inet_sk(sk); +@@ -300,6 +300,7 @@ static void tcp_v4_mtu_reduced(struct so + tcp_simple_retransmit(sk); + } /* else let the usual retransmit timer handle it */ + } ++EXPORT_SYMBOL(tcp_v4_mtu_reduced); + + static void do_redirect(struct sk_buff *skb, struct sock *sk) + { +@@ -2117,6 +2118,7 @@ const struct inet_connection_sock_af_ops + .compat_setsockopt = compat_ip_setsockopt, + .compat_getsockopt = compat_ip_getsockopt, + #endif ++ .mtu_reduced = tcp_v4_mtu_reduced, + }; + EXPORT_SYMBOL(ipv4_specific); + +@@ -2736,7 +2738,6 @@ struct proto tcp_prot = { + .sendpage = tcp_sendpage, + .backlog_rcv = tcp_v4_do_rcv, + .release_cb = tcp_release_cb, +- .mtu_reduced = tcp_v4_mtu_reduced, + .hash = inet_hash, + .unhash = inet_unhash, + .get_port = inet_csk_get_port, +--- a/net/ipv4/tcp_output.c ++++ b/net/ipv4/tcp_output.c +@@ -787,7 +787,7 @@ void tcp_release_cb(struct sock *sk) + __sock_put(sk); + } + if (flags & (1UL << TCP_MTU_REDUCED_DEFERRED)) { +- sk->sk_prot->mtu_reduced(sk); ++ inet_csk(sk)->icsk_af_ops->mtu_reduced(sk); + __sock_put(sk); + } + } +--- a/net/ipv6/tcp_ipv6.c ++++ b/net/ipv6/tcp_ipv6.c +@@ -1668,6 +1668,7 @@ static const struct inet_connection_sock + .compat_setsockopt = compat_ipv6_setsockopt, + .compat_getsockopt = compat_ipv6_getsockopt, + #endif ++ .mtu_reduced = tcp_v6_mtu_reduced, + }; + + #ifdef CONFIG_TCP_MD5SIG +@@ -1699,6 +1700,7 @@ static const struct inet_connection_sock + .compat_setsockopt = compat_ipv6_setsockopt, + .compat_getsockopt = compat_ipv6_getsockopt, + #endif ++ .mtu_reduced = tcp_v4_mtu_reduced, + }; + + #ifdef CONFIG_TCP_MD5SIG +@@ -1935,7 +1937,6 @@ struct proto tcpv6_prot = { + .sendpage = tcp_sendpage, + .backlog_rcv = tcp_v6_do_rcv, + .release_cb = tcp_release_cb, +- .mtu_reduced = tcp_v6_mtu_reduced, + .hash = tcp_v6_hash, + .unhash = inet_unhash, + .get_port = 
inet_csk_get_port, diff --git a/queue-3.14/tcp-fixing-tlp-s-fin-recovery.patch b/queue-3.14/tcp-fixing-tlp-s-fin-recovery.patch new file mode 100644 index 00000000000..5b65b3e38f1 --- /dev/null +++ b/queue-3.14/tcp-fixing-tlp-s-fin-recovery.patch @@ -0,0 +1,37 @@ +From foo@baz Sun Oct 12 16:38:53 CEST 2014 +From: Per Hurtig +Date: Thu, 12 Jun 2014 17:08:32 +0200 +Subject: tcp: fixing TLP's FIN recovery + +From: Per Hurtig + +[ Upstream commit bef1909ee3ed1ca39231b260a8d3b4544ecd0c8f ] + +Fix to a problem observed when losing a FIN segment that does not +contain data. In such situations, TLP is unable to recover from +*any* tail loss and instead adds at least PTO ms to the +retransmission process, i.e., RTO = RTO + PTO. + +Signed-off-by: Per Hurtig +Signed-off-by: Eric Dumazet +Acked-by: Nandita Dukkipati +Acked-by: Neal Cardwell +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv4/tcp_output.c | 4 +--- + 1 file changed, 1 insertion(+), 3 deletions(-) + +--- a/net/ipv4/tcp_output.c ++++ b/net/ipv4/tcp_output.c +@@ -2069,9 +2069,7 @@ void tcp_send_loss_probe(struct sock *sk + if (WARN_ON(!skb || !tcp_skb_pcount(skb))) + goto rearm_timer; + +- /* Probe with zero data doesn't trigger fast recovery. */ +- if (skb->len > 0) +- err = __tcp_retransmit_skb(sk, skb); ++ err = __tcp_retransmit_skb(sk, skb); + + /* Record snd_nxt for loss detection. 
*/ + if (likely(!err)) diff --git a/queue-3.14/team-avoid-race-condition-in-scheduling-delayed-work.patch b/queue-3.14/team-avoid-race-condition-in-scheduling-delayed-work.patch new file mode 100644 index 00000000000..fa7d2d9202a --- /dev/null +++ b/queue-3.14/team-avoid-race-condition-in-scheduling-delayed-work.patch @@ -0,0 +1,68 @@ +From foo@baz Sun Oct 12 16:38:53 CEST 2014 +From: Joe Lawrence +Date: Fri, 3 Oct 2014 09:58:34 -0400 +Subject: team: avoid race condition in scheduling delayed work + +From: Joe Lawrence + +[ Upstream commit 47549650abd13d873fd2e5fc218db19e21031074 ] + +When team_notify_peers and team_mcast_rejoin are called, they both reset +their respective .count_pending atomic variable. Then when the actual +worker function is executed, the variable is atomically decremented. +This pattern introduces a potential race condition where the +.count_pending rolls over and the worker function keeps rescheduling +until .count_pending decrements to zero again: + +THREAD 1 THREAD 2 + +======== ======== +team_notify_peers(teamX) + atomic_set count_pending = 1 + schedule_delayed_work + team_notify_peers(teamX) + atomic_set count_pending = 1 +team_notify_peers_work + atomic_dec_and_test + count_pending = 0 + (return) + schedule_delayed_work + team_notify_peers_work + atomic_dec_and_test + count_pending = -1 + schedule_delayed_work + (repeat until count_pending = 0) + +Instead of assigning a new value to .count_pending, use atomic_add to +tack-on the additional desired worker function invocations. + +Signed-off-by: Joe Lawrence +Acked-by: Jiri Pirko +Fixes: fc423ff00df3a19554414ee ("team: add peer notification") +Fixes: 492b200efdd20b8fcfdac87 ("team: add support for sending multicast rejoins") +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/team/team.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/drivers/net/team/team.c ++++ b/drivers/net/team/team.c +@@ -647,7 +647,7 @@ static void team_notify_peers(struct tea + { + if (!team->notify_peers.count || !netif_running(team->dev)) + return; +- atomic_set(&team->notify_peers.count_pending, team->notify_peers.count); ++ atomic_add(team->notify_peers.count, &team->notify_peers.count_pending); + schedule_delayed_work(&team->notify_peers.dw, 0); + } + +@@ -687,7 +687,7 @@ static void team_mcast_rejoin(struct tea + { + if (!team->mcast_rejoin.count || !netif_running(team->dev)) + return; +- atomic_set(&team->mcast_rejoin.count_pending, team->mcast_rejoin.count); ++ atomic_add(team->mcast_rejoin.count, &team->mcast_rejoin.count_pending); + schedule_delayed_work(&team->mcast_rejoin.dw, 0); + } + diff --git a/queue-3.14/tg3-allow-for-recieve-of-full-size-8021ad-frames.patch b/queue-3.14/tg3-allow-for-recieve-of-full-size-8021ad-frames.patch new file mode 100644 index 00000000000..db6f3974388 --- /dev/null +++ b/queue-3.14/tg3-allow-for-recieve-of-full-size-8021ad-frames.patch @@ -0,0 +1,41 @@ +From foo@baz Sun Oct 12 16:38:53 CEST 2014 +From: Vlad Yasevich +Date: Tue, 30 Sep 2014 19:39:36 -0400 +Subject: tg3: Allow for recieve of full-size 8021AD frames + +From: Vlad Yasevich + +[ Upstream commit 7d3083ee36b51e425b6abd76778a2046906b0fd3 ] + +When receiving a vlan-tagged frame that still contains +a vlan header, the length of the packet will be greater +than MTU+ETH_HLEN since it will account for the extra +vlan header. TG3 checks this for the case for 802.1Q, +but not for 802.1ad. As a result, full sized 802.1ad +frames get dropped by the card. + +Add a check for 802.1ad protocol when receiving full +sized frames. + +Suggested-by: Prashant Sreedharan +CC: Prashant Sreedharan +CC: Michael Chan +Signed-off-by: Vladislav Yasevich +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/broadcom/tg3.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +--- a/drivers/net/ethernet/broadcom/tg3.c ++++ b/drivers/net/ethernet/broadcom/tg3.c +@@ -6923,7 +6923,8 @@ static int tg3_rx(struct tg3_napi *tnapi + skb->protocol = eth_type_trans(skb, tp->dev); + + if (len > (tp->dev->mtu + ETH_HLEN) && +- skb->protocol != htons(ETH_P_8021Q)) { ++ skb->protocol != htons(ETH_P_8021Q) && ++ skb->protocol != htons(ETH_P_8021AD)) { + dev_kfree_skb(skb); + goto drop_it_no_recycle; + } diff --git a/queue-3.14/tg3-work-around-hw-fw-limitations-with-vlan-encapsulated-frames.patch b/queue-3.14/tg3-work-around-hw-fw-limitations-with-vlan-encapsulated-frames.patch new file mode 100644 index 00000000000..24482c95101 --- /dev/null +++ b/queue-3.14/tg3-work-around-hw-fw-limitations-with-vlan-encapsulated-frames.patch @@ -0,0 +1,68 @@ +From foo@baz Sun Oct 12 16:38:53 CEST 2014 +From: Vlad Yasevich +Date: Thu, 18 Sep 2014 10:31:17 -0400 +Subject: tg3: Work around HW/FW limitations with vlan encapsulated frames + +From: Vlad Yasevich + +[ Upstream commit 476c18850c6cbaa3f2bb661ae9710645081563b9 ] + +TG3 appears to have an issue performing TSO and checksum offloading +correctly when the frame has been vlan encapsulated (non-accelerated). +In these cases, tcp checksum is not correctly updated. + +This patch attempts to work around this issue. After the patch, +802.1ad vlans start working correctly over tg3 devices. + +CC: Prashant Sreedharan +CC: Michael Chan +Signed-off-by: Vladislav Yasevich +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/broadcom/tg3.c | 20 ++++++++++++++++++-- + 1 file changed, 18 insertions(+), 2 deletions(-) + +--- a/drivers/net/ethernet/broadcom/tg3.c ++++ b/drivers/net/ethernet/broadcom/tg3.c +@@ -7915,8 +7915,6 @@ static netdev_tx_t tg3_start_xmit(struct + + entry = tnapi->tx_prod; + base_flags = 0; +- if (skb->ip_summed == CHECKSUM_PARTIAL) +- base_flags |= TXD_FLAG_TCPUDP_CSUM; + + mss = skb_shinfo(skb)->gso_size; + if (mss) { +@@ -7932,6 +7930,13 @@ static netdev_tx_t tg3_start_xmit(struct + + hdr_len = skb_transport_offset(skb) + tcp_hdrlen(skb) - ETH_HLEN; + ++ /* HW/FW can not correctly segment packets that have been ++ * vlan encapsulated. ++ */ ++ if (skb->protocol == htons(ETH_P_8021Q) || ++ skb->protocol == htons(ETH_P_8021AD)) ++ return tg3_tso_bug(tp, skb); ++ + if (!skb_is_gso_v6(skb)) { + iph->check = 0; + iph->tot_len = htons(mss + hdr_len); +@@ -7978,6 +7983,17 @@ static netdev_tx_t tg3_start_xmit(struct + base_flags |= tsflags << 12; + } + } ++ } else if (skb->ip_summed == CHECKSUM_PARTIAL) { ++ /* HW/FW can not correctly checksum packets that have been ++ * vlan encapsulated. 
++ */ ++ if (skb->protocol == htons(ETH_P_8021Q) || ++ skb->protocol == htons(ETH_P_8021AD)) { ++ if (skb_checksum_help(skb)) ++ goto drop; ++ } else { ++ base_flags |= TXD_FLAG_TCPUDP_CSUM; ++ } + } + + if (tg3_flag(tp, USE_JUMBO_BDFLAG) && diff --git a/queue-3.14/vxlan-fix-incorrect-initializer-in-union-vxlan_addr.patch b/queue-3.14/vxlan-fix-incorrect-initializer-in-union-vxlan_addr.patch new file mode 100644 index 00000000000..be539e74325 --- /dev/null +++ b/queue-3.14/vxlan-fix-incorrect-initializer-in-union-vxlan_addr.patch @@ -0,0 +1,74 @@ +From foo@baz Sun Oct 12 16:38:53 CEST 2014 +From: Gerhard Stenzel +Date: Fri, 22 Aug 2014 21:34:16 +0200 +Subject: vxlan: fix incorrect initializer in union vxlan_addr + +From: Gerhard Stenzel + +[ Upstream commit a45e92a599e77ee6a850eabdd0141633fde03915 ] + +The first initializer in the following + + union vxlan_addr ipa = { + .sin.sin_addr.s_addr = tip, + .sa.sa_family = AF_INET, + }; + +is optimised away by the compiler, due to the second initializer, +therefore initialising .sin.sin_addr.s_addr always to 0. +This results in netlink messages indicating a L3 miss never contain the +missed IP address. This was observed with GCC 4.8 and 4.9. I do not know about previous versions. +The problem affects user space programs relying on an IP address being +sent as part of a netlink message indicating a L3 miss. + +Changing + .sa.sa_family = AF_INET, +to + .sin.sin_family = AF_INET, +fixes the problem. + +Signed-off-by: Gerhard Stenzel +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/vxlan.c | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +--- a/drivers/net/vxlan.c ++++ b/drivers/net/vxlan.c +@@ -1334,7 +1334,7 @@ static int arp_reduce(struct net_device + } else if (vxlan->flags & VXLAN_F_L3MISS) { + union vxlan_addr ipa = { + .sin.sin_addr.s_addr = tip, +- .sa.sa_family = AF_INET, ++ .sin.sin_family = AF_INET, + }; + + vxlan_ip_miss(dev, &ipa); +@@ -1495,7 +1495,7 @@ static int neigh_reduce(struct net_devic + } else if (vxlan->flags & VXLAN_F_L3MISS) { + union vxlan_addr ipa = { + .sin6.sin6_addr = msg->target, +- .sa.sa_family = AF_INET6, ++ .sin6.sin6_family = AF_INET6, + }; + + vxlan_ip_miss(dev, &ipa); +@@ -1528,7 +1528,7 @@ static bool route_shortcircuit(struct ne + if (!n && (vxlan->flags & VXLAN_F_L3MISS)) { + union vxlan_addr ipa = { + .sin.sin_addr.s_addr = pip->daddr, +- .sa.sa_family = AF_INET, ++ .sin.sin_family = AF_INET, + }; + + vxlan_ip_miss(dev, &ipa); +@@ -1549,7 +1549,7 @@ static bool route_shortcircuit(struct ne + if (!n && (vxlan->flags & VXLAN_F_L3MISS)) { + union vxlan_addr ipa = { + .sin6.sin6_addr = pip6->daddr, +- .sa.sa_family = AF_INET6, ++ .sin6.sin6_family = AF_INET6, + }; + + vxlan_ip_miss(dev, &ipa); diff --git a/queue-3.14/xfrm-generate-blackhole-routes-only-from-route-lookup-functions.patch b/queue-3.14/xfrm-generate-blackhole-routes-only-from-route-lookup-functions.patch new file mode 100644 index 00000000000..b9644d889f0 --- /dev/null +++ b/queue-3.14/xfrm-generate-blackhole-routes-only-from-route-lookup-functions.patch @@ -0,0 +1,130 @@ +From foo@baz Sun Oct 12 16:38:53 CEST 2014 +From: Steffen Klassert +Date: Tue, 16 Sep 2014 10:08:40 +0200 +Subject: xfrm: Generate blackhole routes only from route lookup functions + +From: Steffen Klassert + +[ Upstream commit f92ee61982d6da15a9e49664ecd6405a15a2ee56 ] + +Currently we generate a blackhole route whenever we have +matching policies but can not resolve the states. 
Here we assume +that dst_output() is called to kill the blackholed packets. +Unfortunately this assumption is not true in all cases, so +it is possible that these packets leave the system unwanted. + +We fix this by generating blackhole routes only from the +route lookup functions, here we can guarantee a call to +dst_output() afterwards. + +Fixes: 2774c131b1d ("xfrm: Handle blackhole route creation via afinfo.") +Reported-by: Konstantinos Kolelis +Signed-off-by: Steffen Klassert +Signed-off-by: Greg Kroah-Hartman +--- + include/net/dst.h | 15 ++++++++++++++- + net/ipv4/route.c | 6 +++--- + net/ipv6/ip6_output.c | 4 ++-- + net/xfrm/xfrm_policy.c | 18 +++++++++++++++++- + 4 files changed, 36 insertions(+), 7 deletions(-) + +--- a/include/net/dst.h ++++ b/include/net/dst.h +@@ -476,7 +476,16 @@ static inline struct dst_entry *xfrm_loo + int flags) + { + return dst_orig; +-} ++} ++ ++static inline struct dst_entry *xfrm_lookup_route(struct net *net, ++ struct dst_entry *dst_orig, ++ const struct flowi *fl, ++ struct sock *sk, ++ int flags) ++{ ++ return dst_orig; ++} + + static inline struct xfrm_state *dst_xfrm(const struct dst_entry *dst) + { +@@ -488,6 +497,10 @@ struct dst_entry *xfrm_lookup(struct net + const struct flowi *fl, struct sock *sk, + int flags); + ++struct dst_entry *xfrm_lookup_route(struct net *net, struct dst_entry *dst_orig, ++ const struct flowi *fl, struct sock *sk, ++ int flags); ++ + /* skb attached with this dst needs transformation if dst->xfrm is valid */ + static inline struct xfrm_state *dst_xfrm(const struct dst_entry *dst) + { +--- a/net/ipv4/route.c ++++ b/net/ipv4/route.c +@@ -2268,9 +2268,9 @@ struct rtable *ip_route_output_flow(stru + return rt; + + if (flp4->flowi4_proto) +- rt = (struct rtable *) xfrm_lookup(net, &rt->dst, +- flowi4_to_flowi(flp4), +- sk, 0); ++ rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst, ++ flowi4_to_flowi(flp4), ++ sk, 0); + + return rt; + } +--- a/net/ipv6/ip6_output.c ++++ b/net/ipv6/ip6_output.c 
+@@ -1008,7 +1008,7 @@ struct dst_entry *ip6_dst_lookup_flow(st + if (final_dst) + fl6->daddr = *final_dst; + +- return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0); ++ return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0); + } + EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow); + +@@ -1040,7 +1040,7 @@ struct dst_entry *ip6_sk_dst_lookup_flow + if (final_dst) + fl6->daddr = *final_dst; + +- return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0); ++ return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0); + } + EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow); + +--- a/net/xfrm/xfrm_policy.c ++++ b/net/xfrm/xfrm_policy.c +@@ -2155,7 +2155,7 @@ struct dst_entry *xfrm_lookup(struct net + xfrm_pols_put(pols, drop_pols); + XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTNOSTATES); + +- return make_blackhole(net, family, dst_orig); ++ return ERR_PTR(-EREMOTE); + } + + err = -EAGAIN; +@@ -2212,6 +2212,22 @@ dropdst: + } + EXPORT_SYMBOL(xfrm_lookup); + ++/* Callers of xfrm_lookup_route() must ensure a call to dst_output(). ++ * Otherwise we may send out blackholed packets. 
++ */ ++struct dst_entry *xfrm_lookup_route(struct net *net, struct dst_entry *dst_orig, ++ const struct flowi *fl, ++ struct sock *sk, int flags) ++{ ++ struct dst_entry *dst = xfrm_lookup(net, dst_orig, fl, sk, flags); ++ ++ if (IS_ERR(dst) && PTR_ERR(dst) == -EREMOTE) ++ return make_blackhole(net, dst_orig->ops->family, dst_orig); ++ ++ return dst; ++} ++EXPORT_SYMBOL(xfrm_lookup_route); ++ + static inline int + xfrm_secpath_reject(int idx, struct sk_buff *skb, const struct flowi *fl) + { diff --git a/queue-3.14/xfrm-generate-queueing-routes-only-from-route-lookup-functions.patch b/queue-3.14/xfrm-generate-queueing-routes-only-from-route-lookup-functions.patch new file mode 100644 index 00000000000..36677d38910 --- /dev/null +++ b/queue-3.14/xfrm-generate-queueing-routes-only-from-route-lookup-functions.patch @@ -0,0 +1,149 @@ +From foo@baz Sun Oct 12 16:38:53 CEST 2014 +From: Steffen Klassert +Date: Tue, 16 Sep 2014 10:08:49 +0200 +Subject: xfrm: Generate queueing routes only from route lookup functions + +From: Steffen Klassert + +[ Upstream commit b8c203b2d2fc961bafd53b41d5396bbcdec55998 ] + +Currently we generate a queueing route if we have matching policies +but can not resolve the states and the sysctl xfrm_larval_drop is +disabled. Here we assume that dst_output() is called to kill the +queued packets. Unfortunately this assumption is not true in all +cases, so it is possible that these packets leave the system unwanted. + +We fix this by generating queueing routes only from the +route lookup functions, here we can guarantee a call to +dst_output() afterwards. 
+ +Fixes: a0073fe18e71 ("xfrm: Add a state resolution packet queue") +Reported-by: Konstantinos Kolelis +Signed-off-by: Steffen Klassert +Signed-off-by: Greg Kroah-Hartman +--- + include/net/dst.h | 1 + + net/xfrm/xfrm_policy.c | 32 ++++++++++++++++++++++++-------- + 2 files changed, 25 insertions(+), 8 deletions(-) + +--- a/include/net/dst.h ++++ b/include/net/dst.h +@@ -466,6 +466,7 @@ void dst_init(void); + /* Flags for xfrm_lookup flags argument. */ + enum { + XFRM_LOOKUP_ICMP = 1 << 0, ++ XFRM_LOOKUP_QUEUE = 1 << 1, + }; + + struct flowi; +--- a/net/xfrm/xfrm_policy.c ++++ b/net/xfrm/xfrm_policy.c +@@ -41,6 +41,11 @@ + + static struct dst_entry *xfrm_policy_sk_bundles; + ++struct xfrm_flo { ++ struct dst_entry *dst_orig; ++ u8 flags; ++}; ++ + static DEFINE_SPINLOCK(xfrm_policy_afinfo_lock); + static struct xfrm_policy_afinfo __rcu *xfrm_policy_afinfo[NPROTO] + __read_mostly; +@@ -1889,13 +1894,14 @@ static int xdst_queue_output(struct sk_b + } + + static struct xfrm_dst *xfrm_create_dummy_bundle(struct net *net, +- struct dst_entry *dst, ++ struct xfrm_flo *xflo, + const struct flowi *fl, + int num_xfrms, + u16 family) + { + int err; + struct net_device *dev; ++ struct dst_entry *dst; + struct dst_entry *dst1; + struct xfrm_dst *xdst; + +@@ -1903,9 +1909,12 @@ static struct xfrm_dst *xfrm_create_dumm + if (IS_ERR(xdst)) + return xdst; + +- if (net->xfrm.sysctl_larval_drop || num_xfrms <= 0) ++ if (!(xflo->flags & XFRM_LOOKUP_QUEUE) || ++ net->xfrm.sysctl_larval_drop || ++ num_xfrms <= 0) + return xdst; + ++ dst = xflo->dst_orig; + dst1 = &xdst->u.dst; + dst_hold(dst); + xdst->route = dst; +@@ -1947,7 +1956,7 @@ static struct flow_cache_object * + xfrm_bundle_lookup(struct net *net, const struct flowi *fl, u16 family, u8 dir, + struct flow_cache_object *oldflo, void *ctx) + { +- struct dst_entry *dst_orig = (struct dst_entry *)ctx; ++ struct xfrm_flo *xflo = (struct xfrm_flo *)ctx; + struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX]; + struct xfrm_dst *xdst, 
*new_xdst; + int num_pols = 0, num_xfrms = 0, i, err, pol_dead; +@@ -1988,7 +1997,8 @@ xfrm_bundle_lookup(struct net *net, cons + goto make_dummy_bundle; + } + +- new_xdst = xfrm_resolve_and_create_bundle(pols, num_pols, fl, family, dst_orig); ++ new_xdst = xfrm_resolve_and_create_bundle(pols, num_pols, fl, family, ++ xflo->dst_orig); + if (IS_ERR(new_xdst)) { + err = PTR_ERR(new_xdst); + if (err != -EAGAIN) +@@ -2022,7 +2032,7 @@ make_dummy_bundle: + /* We found policies, but there's no bundles to instantiate: + * either because the policy blocks, has no transformations or + * we could not build template (no xfrm_states).*/ +- xdst = xfrm_create_dummy_bundle(net, dst_orig, fl, num_xfrms, family); ++ xdst = xfrm_create_dummy_bundle(net, xflo, fl, num_xfrms, family); + if (IS_ERR(xdst)) { + xfrm_pols_put(pols, num_pols); + return ERR_CAST(xdst); +@@ -2121,13 +2131,18 @@ struct dst_entry *xfrm_lookup(struct net + } + + if (xdst == NULL) { ++ struct xfrm_flo xflo; ++ ++ xflo.dst_orig = dst_orig; ++ xflo.flags = flags; ++ + /* To accelerate a bit... */ + if ((dst_orig->flags & DST_NOXFRM) || + !net->xfrm.policy_count[XFRM_POLICY_OUT]) + goto nopol; + + flo = flow_cache_lookup(net, fl, family, dir, +- xfrm_bundle_lookup, dst_orig); ++ xfrm_bundle_lookup, &xflo); + if (flo == NULL) + goto nopol; + if (IS_ERR(flo)) { +@@ -2219,7 +2234,8 @@ struct dst_entry *xfrm_lookup_route(stru + const struct flowi *fl, + struct sock *sk, int flags) + { +- struct dst_entry *dst = xfrm_lookup(net, dst_orig, fl, sk, flags); ++ struct dst_entry *dst = xfrm_lookup(net, dst_orig, fl, sk, ++ flags | XFRM_LOOKUP_QUEUE); + + if (IS_ERR(dst) && PTR_ERR(dst) == -EREMOTE) + return make_blackhole(net, dst_orig->ops->family, dst_orig); +@@ -2493,7 +2509,7 @@ int __xfrm_route_forward(struct sk_buff + + skb_dst_force(skb); + +- dst = xfrm_lookup(net, skb_dst(skb), &fl, NULL, 0); ++ dst = xfrm_lookup(net, skb_dst(skb), &fl, NULL, XFRM_LOOKUP_QUEUE); + if (IS_ERR(dst)) { + res = 0; + dst = NULL;