From 1737c20766f85901da2e8456aba8f7da121fa9dd Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Sat, 17 Jan 2015 18:19:31 -0800 Subject: [PATCH] 3.18-stable patches added patches: alx-fix-alx_poll.patch batman-adv-avoid-null-dereferences-and-fix-if-check.patch batman-adv-calculate-extra-tail-size-based-on-queued-fragments.patch batman-adv-unify-fragment-size-calculation.patch enic-fix-rx-skb-checksum.patch geneve-fix-races-between-socket-add-and-release.patch geneve-remove-socket-and-offload-handlers-at-destruction.patch gre-fix-the-inner-mac-header-in-nbma-tunnel-xmit-path.patch in6-fix-conflict-with-glibc.patch net-core-handle-csum-for-checksum_complete-vxlan-forwarding.patch net-drop-the-packet-when-fails-to-do-software-segmentation-or-header-check.patch net-fix-stacked-vlan-offload-features-computation.patch net-generalize-ndo_gso_check-to-ndo_features_check.patch net-mlx4-cache-line-cqe-eqe-stride-fixes.patch net-mlx4_core-correcly-update-the-mtt-s-offset-in-the-mr-re-reg-flow.patch net-mlx4_en-doorbell-is-byteswapped-in-little-endian-archs.patch net-reset-secmark-when-scrubbing-packet.patch netlink-always-copy-on-mmap-tx.patch netlink-don-t-reorder-loads-stores-before-marking-mmap-netlink-frame-as-available.patch tcp-do-not-apply-tso-segment-limit-to-non-tso-packets.patch tcp6-don-t-move-ip6cb-before-xfrm6_policy_check.patch team-avoid-possible-underflow-of-count_pending-value-for-notify_peers-and-mcast_rejoin.patch tg3-tg3_disable_ints-using-uninitialized-mailbox-value-to-disable-interrupts.patch xen-netback-fixing-the-propagation-of-the-transmit-shaper-timeout.patch xen-netback-support-frontends-without-feature-rx-notify-again.patch --- queue-3.18/alx-fix-alx_poll.patch | 110 +++++++ ...d-null-dereferences-and-fix-if-check.patch | 43 +++ ...-tail-size-based-on-queued-fragments.patch | 61 ++++ ...-adv-unify-fragment-size-calculation.patch | 43 +++ queue-3.18/enic-fix-rx-skb-checksum.patch | 73 +++++ ...races-between-socket-add-and-release.patch | 61 ++++ ...-and-offload-handlers-at-destruction.patch | 59 ++++ ...-mac-header-in-nbma-tunnel-xmit-path.patch | 69 +++++ queue-3.18/in6-fix-conflict-with-glibc.patch | 70 +++++ ...r-checksum_complete-vxlan-forwarding.patch | 63 ++++ ...oftware-segmentation-or-header-check.patch | 34 +++ ...ed-vlan-offload-features-computation.patch | 47 +++ ...-ndo_gso_check-to-ndo_features_check.patch | 285 ++++++++++++++++++ ...mlx4-cache-line-cqe-eqe-stride-fixes.patch | 59 ++++ ...e-mtt-s-offset-in-the-mr-re-reg-flow.patch | 55 ++++ ...s-byteswapped-in-little-endian-archs.patch | 46 +++ ...-reset-secmark-when-scrubbing-packet.patch | 37 +++ .../netlink-always-copy-on-mmap-tx.patch | 127 ++++++++ ...king-mmap-netlink-frame-as-available.patch | 50 +++ ...tso-segment-limit-to-non-tso-packets.patch | 54 ++++ ...move-ip6cb-before-xfrm6_policy_check.patch | 109 +++++++ ...ue-for-notify_peers-and-mcast_rejoin.patch | 94 ++++++ ...-mailbox-value-to-disable-interrupts.patch | 87 ++++++ ...ation-of-the-transmit-shaper-timeout.patch | 34 +++ ...ends-without-feature-rx-notify-again.patch | 180 +++++++++++ 25 files changed, 1950 insertions(+) create mode 100644 queue-3.18/alx-fix-alx_poll.patch create mode 100644 queue-3.18/batman-adv-avoid-null-dereferences-and-fix-if-check.patch create mode 100644 queue-3.18/batman-adv-calculate-extra-tail-size-based-on-queued-fragments.patch create mode 100644 queue-3.18/batman-adv-unify-fragment-size-calculation.patch create mode 100644 queue-3.18/enic-fix-rx-skb-checksum.patch create mode 100644 
queue-3.18/geneve-fix-races-between-socket-add-and-release.patch create mode 100644 queue-3.18/geneve-remove-socket-and-offload-handlers-at-destruction.patch create mode 100644 queue-3.18/gre-fix-the-inner-mac-header-in-nbma-tunnel-xmit-path.patch create mode 100644 queue-3.18/in6-fix-conflict-with-glibc.patch create mode 100644 queue-3.18/net-core-handle-csum-for-checksum_complete-vxlan-forwarding.patch create mode 100644 queue-3.18/net-drop-the-packet-when-fails-to-do-software-segmentation-or-header-check.patch create mode 100644 queue-3.18/net-fix-stacked-vlan-offload-features-computation.patch create mode 100644 queue-3.18/net-generalize-ndo_gso_check-to-ndo_features_check.patch create mode 100644 queue-3.18/net-mlx4-cache-line-cqe-eqe-stride-fixes.patch create mode 100644 queue-3.18/net-mlx4_core-correcly-update-the-mtt-s-offset-in-the-mr-re-reg-flow.patch create mode 100644 queue-3.18/net-mlx4_en-doorbell-is-byteswapped-in-little-endian-archs.patch create mode 100644 queue-3.18/net-reset-secmark-when-scrubbing-packet.patch create mode 100644 queue-3.18/netlink-always-copy-on-mmap-tx.patch create mode 100644 queue-3.18/netlink-don-t-reorder-loads-stores-before-marking-mmap-netlink-frame-as-available.patch create mode 100644 queue-3.18/tcp-do-not-apply-tso-segment-limit-to-non-tso-packets.patch create mode 100644 queue-3.18/tcp6-don-t-move-ip6cb-before-xfrm6_policy_check.patch create mode 100644 queue-3.18/team-avoid-possible-underflow-of-count_pending-value-for-notify_peers-and-mcast_rejoin.patch create mode 100644 queue-3.18/tg3-tg3_disable_ints-using-uninitialized-mailbox-value-to-disable-interrupts.patch create mode 100644 queue-3.18/xen-netback-fixing-the-propagation-of-the-transmit-shaper-timeout.patch create mode 100644 queue-3.18/xen-netback-support-frontends-without-feature-rx-notify-again.patch diff --git a/queue-3.18/alx-fix-alx_poll.patch b/queue-3.18/alx-fix-alx_poll.patch new file mode 100644 index 00000000000..9524923878c --- /dev/null +++ b/queue-3.18/alx-fix-alx_poll.patch @@ -0,0 +1,110 @@ +From foo@baz Sat Jan 17 18:12:21 PST 2015 +From: Eric Dumazet +Date: Sun, 11 Jan 2015 10:32:18 -0800 +Subject: alx: fix alx_poll() + +From: Eric Dumazet + +[ Upstream commit 7a05dc64e2e4c611d89007b125b20c0d2a4d31a5 ] + +Commit d75b1ade567f ("net: less interrupt masking in NAPI") uncovered +wrong alx_poll() behavior. + +A NAPI poll() handler is supposed to return exactly the budget when/if +napi_complete() has not been called. + +It is also supposed to return number of frames that were received, so +that netdev_budget can have a meaning. + +Also, in case of TX pressure, we still have to dequeue received +packets : alx_clean_rx_irq() has to be called even if +alx_clean_tx_irq(alx) returns false, otherwise device is half duplex. + +Signed-off-by: Eric Dumazet +Fixes: d75b1ade567f ("net: less interrupt masking in NAPI") +Reported-by: Oded Gabbay +Bisected-by: Oded Gabbay +Tested-by: Oded Gabbay +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/atheros/alx/main.c | 24 +++++++++++++----------- + 1 file changed, 13 insertions(+), 11 deletions(-) + +--- a/drivers/net/ethernet/atheros/alx/main.c ++++ b/drivers/net/ethernet/atheros/alx/main.c +@@ -184,15 +184,16 @@ static void alx_schedule_reset(struct al + schedule_work(&alx->reset_wk); + } + +-static bool alx_clean_rx_irq(struct alx_priv *alx, int budget) ++static int alx_clean_rx_irq(struct alx_priv *alx, int budget) + { + struct alx_rx_queue *rxq = &alx->rxq; + struct alx_rrd *rrd; + struct alx_buffer *rxb; + struct sk_buff *skb; + u16 length, rfd_cleaned = 0; ++ int work = 0; + +- while (budget > 0) { ++ while (work < budget) { + rrd = &rxq->rrd[rxq->rrd_read_idx]; + if (!(rrd->word3 & cpu_to_le32(1 << RRD_UPDATED_SHIFT))) + break; +@@ -203,7 +204,7 @@ static bool alx_clean_rx_irq(struct alx_ + ALX_GET_FIELD(le32_to_cpu(rrd->word0), + RRD_NOR) != 1) { + alx_schedule_reset(alx); +- return 0; ++ return work; + } + + rxb = &rxq->bufs[rxq->read_idx]; +@@ -243,7 +244,7 @@ static bool alx_clean_rx_irq(struct alx_ + } + + napi_gro_receive(&alx->napi, skb); +- budget--; ++ work++; + + next_pkt: + if (++rxq->read_idx == alx->rx_ringsz) +@@ -258,21 +259,22 @@ next_pkt: + if (rfd_cleaned) + alx_refill_rx_ring(alx, GFP_ATOMIC); + +- return budget > 0; ++ return work; + } + + static int alx_poll(struct napi_struct *napi, int budget) + { + struct alx_priv *alx = container_of(napi, struct alx_priv, napi); + struct alx_hw *hw = &alx->hw; +- bool complete = true; + unsigned long flags; ++ bool tx_complete; ++ int work; + +- complete = alx_clean_tx_irq(alx) && +- alx_clean_rx_irq(alx, budget); ++ tx_complete = alx_clean_tx_irq(alx); ++ work = alx_clean_rx_irq(alx, budget); + +- if (!complete) +- return 1; ++ if (!tx_complete || work == budget) ++ return budget; + + napi_complete(&alx->napi); + +@@ -284,7 +286,7 @@ static int alx_poll(struct napi_struct * + + alx_post_write(hw); + +- return 0; ++ return work; + } + + static irqreturn_t alx_intr_handle(struct alx_priv *alx, u32 intr) diff --git a/queue-3.18/batman-adv-avoid-null-dereferences-and-fix-if-check.patch b/queue-3.18/batman-adv-avoid-null-dereferences-and-fix-if-check.patch new file mode 100644 index 00000000000..ce55508a881 --- /dev/null +++ b/queue-3.18/batman-adv-avoid-null-dereferences-and-fix-if-check.patch @@ -0,0 +1,43 @@ +From foo@baz Sat Jan 17 18:12:21 PST 2015 +From: Antonio Quartulli +Date: Sat, 20 Dec 2014 13:48:57 +0100 +Subject: batman-adv: avoid NULL dereferences and fix if check + +From: Antonio Quartulli + +[ Upstream commit 0d1644919578db525b9a7b6c8197ce02adbfce26 ] + +Gateway having bandwidth_down equal to zero are not accepted +at all and so never added to the Gateway list. +For this reason checking the bandwidth_down member in +batadv_gw_out_of_range() is useless. + +This is probably a copy/paste error and this check was supposed +to be "!gw_node" only. Moreover, the way the check is written +now may also lead to a NULL dereference. + +Fix this by rewriting the if-condition properly. + +Introduced by 414254e342a0d58144de40c3da777521ebaeeb07 +("batman-adv: tvlv - gateway download/upload bandwidth container") + +Signed-off-by: Antonio Quartulli +Reported-by: David Binderman +Signed-off-by: Marek Lindner +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/batman-adv/gateway_client.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/net/batman-adv/gateway_client.c ++++ b/net/batman-adv/gateway_client.c +@@ -810,7 +810,7 @@ bool batadv_gw_out_of_range(struct batad + goto out; + + gw_node = batadv_gw_node_get(bat_priv, orig_dst_node); +- if (!gw_node->bandwidth_down == 0) ++ if (!gw_node) + goto out; + + switch (atomic_read(&bat_priv->gw_mode)) { diff --git a/queue-3.18/batman-adv-calculate-extra-tail-size-based-on-queued-fragments.patch b/queue-3.18/batman-adv-calculate-extra-tail-size-based-on-queued-fragments.patch new file mode 100644 index 00000000000..4e57a2ac617 --- /dev/null +++ b/queue-3.18/batman-adv-calculate-extra-tail-size-based-on-queued-fragments.patch @@ -0,0 +1,61 @@ +From foo@baz Sat Jan 17 18:12:21 PST 2015 +From: Sven Eckelmann +Date: Sat, 20 Dec 2014 13:48:55 +0100 +Subject: batman-adv: Calculate extra tail size based on queued fragments +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Sven Eckelmann + +[ Upstream commit 5b6698b0e4a37053de35cc24ee695b98a7eb712b ] + +The fragmentation code was replaced in 610bfc6bc99bc83680d190ebc69359a05fc7f605 +("batman-adv: Receive fragmented packets and merge"). The new code provided a +mostly unused parameter skb for the merging function. It is used inside the +function to calculate the additionally needed skb tailroom. But instead of +increasing its own tailroom, it is only increasing the tailroom of the first +queued skb. This is not correct in some situations because the first queued +entry can be a different one than the parameter. + +An observed problem was: + +1. packet with size 104, total_size 1464, fragno 1 was received + - packet is queued +2. packet with size 1400, total_size 1464, fragno 0 was received + - packet is queued at the end of the list +3. enough data was received and can be given to the merge function + (1464 == (1400 - 20) + (104 - 20)) + - merge functions gets 1400 byte large packet as skb argument +4. merge function gets first entry in queue (104 byte) + - stored as skb_out +5. merge function calculates the required extra tail as total_size - skb->len + - pskb_expand_head tail of skb_out with 64 bytes +6. merge function tries to squeeze the extra 1380 bytes from the second queued + skb (1400 byte aka skb parameter) in the 64 extra tail bytes of skb_out + +Instead calculate the extra required tail bytes for skb_out also using skb_out +instead of using the parameter skb. The skb parameter is only used to get the +total_size from the last received packet. This is also the total_size used to +decide that all fragments were received. + +Reported-by: Philipp Psurek +Signed-off-by: Sven Eckelmann +Acked-by: Martin Hundebøll +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/batman-adv/fragmentation.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/net/batman-adv/fragmentation.c ++++ b/net/batman-adv/fragmentation.c +@@ -251,7 +251,7 @@ batadv_frag_merge_packets(struct hlist_h + kfree(entry); + + /* Make room for the rest of the fragments. 
*/ +- if (pskb_expand_head(skb_out, 0, size - skb->len, GFP_ATOMIC) < 0) { ++ if (pskb_expand_head(skb_out, 0, size - skb_out->len, GFP_ATOMIC) < 0) { + kfree_skb(skb_out); + skb_out = NULL; + goto free; diff --git a/queue-3.18/batman-adv-unify-fragment-size-calculation.patch b/queue-3.18/batman-adv-unify-fragment-size-calculation.patch new file mode 100644 index 00000000000..7b7f59bad4a --- /dev/null +++ b/queue-3.18/batman-adv-unify-fragment-size-calculation.patch @@ -0,0 +1,43 @@ +From foo@baz Sat Jan 17 18:12:21 PST 2015 +From: Sven Eckelmann +Date: Sat, 20 Dec 2014 13:48:56 +0100 +Subject: batman-adv: Unify fragment size calculation + +From: Sven Eckelmann + +[ Upstream commit 0402e444cd199389b7fe47be68a67b817e09e097 ] + +The fragmentation code was replaced in 610bfc6bc99bc83680d190ebc69359a05fc7f605 +("batman-adv: Receive fragmented packets and merge") by an implementation which +can handle up to 16 fragments of a packet. The packet is prepared for the split +in fragments by the function batadv_frag_send_packet and the actual split is +done by batadv_frag_create. + +Both functions calculate the size of a fragment themself. But their calculation +differs because batadv_frag_send_packet also subtracts ETH_HLEN. Therefore, +the check in batadv_frag_send_packet "can a full fragment can be created?" may +return true even when batadv_frag_create cannot create a full fragment. + +The function batadv_frag_create doesn't check the size of the skb before +splitting it and therefore might try to create a larger fragment than the +remaining buffer. This creates an integer underflow and an invalid len is given +to skb_split. + +Signed-off-by: Sven Eckelmann +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/batman-adv/fragmentation.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/net/batman-adv/fragmentation.c ++++ b/net/batman-adv/fragmentation.c +@@ -434,7 +434,7 @@ bool batadv_frag_send_packet(struct sk_b + * fragments larger than BATADV_FRAG_MAX_FRAG_SIZE + */ + mtu = min_t(unsigned, mtu, BATADV_FRAG_MAX_FRAG_SIZE); +- max_fragment_size = (mtu - header_size - ETH_HLEN); ++ max_fragment_size = mtu - header_size; + max_packet_size = max_fragment_size * BATADV_FRAG_MAX_FRAGMENTS; + + /* Don't even try to fragment, if we need more than 16 fragments */ diff --git a/queue-3.18/enic-fix-rx-skb-checksum.patch b/queue-3.18/enic-fix-rx-skb-checksum.patch new file mode 100644 index 00000000000..856fdfd1d45 --- /dev/null +++ b/queue-3.18/enic-fix-rx-skb-checksum.patch @@ -0,0 +1,73 @@ +From foo@baz Sat Jan 17 18:12:21 PST 2015 +From: Govindarajulu Varadarajan <_govind@gmx.com> +Date: Thu, 18 Dec 2014 15:58:42 +0530 +Subject: enic: fix rx skb checksum + +From: Govindarajulu Varadarajan <_govind@gmx.com> + +[ Upstream commit 17e96834fd35997ca7cdfbf15413bcd5a36ad448 ] + +Hardware always provides compliment of IP pseudo checksum. Stack expects +whole packet checksum without pseudo checksum if CHECKSUM_COMPLETE is set. + +This causes checksum error in nf & ovs. 
+ +kernel: qg-19546f09-f2: hw csum failure +kernel: CPU: 9 PID: 0 Comm: swapper/9 Tainted: GF O-------------- 3.10.0-123.8.1.el7.x86_64 #1 +kernel: Hardware name: Cisco Systems Inc UCSB-B200-M3/UCSB-B200-M3, BIOS B200M3.2.2.3.0.080820141339 08/08/2014 +kernel: ffff881218f40000 df68243feb35e3a8 ffff881237a43ab8 ffffffff815e237b +kernel: ffff881237a43ad0 ffffffff814cd4ca ffff8829ec71eb00 ffff881237a43af0 +kernel: ffffffff814c6232 0000000000000286 ffff8829ec71eb00 ffff881237a43b00 +kernel: Call Trace: +kernel: [] dump_stack+0x19/0x1b +kernel: [] netdev_rx_csum_fault+0x3a/0x40 +kernel: [] __skb_checksum_complete_head+0x62/0x70 +kernel: [] __skb_checksum_complete+0x11/0x20 +kernel: [] nf_ip_checksum+0xcc/0x100 +kernel: [] icmp_error+0x1f7/0x35c [nf_conntrack_ipv4] +kernel: [] ? netif_rx+0xb9/0x1d0 +kernel: [] ? internal_dev_recv+0xdb/0x130 [openvswitch] +kernel: [] nf_conntrack_in+0xf0/0xa80 [nf_conntrack] +kernel: [] ? inet_del_offload+0x40/0x40 +kernel: [] ipv4_conntrack_in+0x22/0x30 [nf_conntrack_ipv4] +kernel: [] nf_iterate+0xaa/0xc0 +kernel: [] ? inet_del_offload+0x40/0x40 +kernel: [] nf_hook_slow+0x84/0x140 +kernel: [] ? inet_del_offload+0x40/0x40 +kernel: [] ip_rcv+0x344/0x380 + +Hardware verifies IP & tcp/udp header checksum but does not provide payload +checksum, use CHECKSUM_UNNECESSARY. Set it only if its valid IP tcp/udp packet. + +Cc: Jiri Benc +Cc: Stefan Assmann +Reported-by: Sunil Choudhary +Signed-off-by: Govindarajulu Varadarajan <_govind@gmx.com> +Reviewed-by: Jiri Benc +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/cisco/enic/enic_main.c | 12 ++++++++---- + 1 file changed, 8 insertions(+), 4 deletions(-) + +--- a/drivers/net/ethernet/cisco/enic/enic_main.c ++++ b/drivers/net/ethernet/cisco/enic/enic_main.c +@@ -1059,10 +1059,14 @@ static void enic_rq_indicate_buf(struct + PKT_HASH_TYPE_L4 : PKT_HASH_TYPE_L3); + } + +- if ((netdev->features & NETIF_F_RXCSUM) && !csum_not_calc) { +- skb->csum = htons(checksum); +- skb->ip_summed = CHECKSUM_COMPLETE; +- } ++ /* Hardware does not provide whole packet checksum. It only ++ * provides pseudo checksum. Since hw validates the packet ++ * checksum but not provide us the checksum value. use ++ * CHECSUM_UNNECESSARY. ++ */ ++ if ((netdev->features & NETIF_F_RXCSUM) && tcp_udp_csum_ok && ++ ipv4_csum_ok) ++ skb->ip_summed = CHECKSUM_UNNECESSARY; + + if (vlan_stripped) + __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), vlan_tci); diff --git a/queue-3.18/geneve-fix-races-between-socket-add-and-release.patch b/queue-3.18/geneve-fix-races-between-socket-add-and-release.patch new file mode 100644 index 00000000000..07cc4fa4da7 --- /dev/null +++ b/queue-3.18/geneve-fix-races-between-socket-add-and-release.patch @@ -0,0 +1,61 @@ +From foo@baz Sat Jan 17 18:12:21 PST 2015 +From: Jesse Gross +Date: Tue, 16 Dec 2014 18:25:32 -0800 +Subject: geneve: Fix races between socket add and release. + +From: Jesse Gross + +[ Upstream commit 12069401d895ff84076a50189ca842c0696b84b2 ] + +Currently, searching for a socket to add a reference to is not +synchronized with deletion of sockets. This can result in use +after free if there is another operation that is removing a +socket at the same time. Solving this requires both holding the +appropriate lock and checking the refcount to ensure that it +has not already hit zero. + +Inspired by a related (but not exactly the same) issue in the +VXLAN driver. 
+ +Fixes: 0b5e8b8e ("net: Add Geneve tunneling protocol driver") +CC: Andy Zhou +Signed-off-by: Jesse Gross +Acked-by: Thomas Graf +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv4/geneve.c | 13 +++++++------ + 1 file changed, 7 insertions(+), 6 deletions(-) + +--- a/net/ipv4/geneve.c ++++ b/net/ipv4/geneve.c +@@ -302,6 +302,7 @@ struct geneve_sock *geneve_sock_add(stru + geneve_rcv_t *rcv, void *data, + bool no_share, bool ipv6) + { ++ struct geneve_net *gn = net_generic(net, geneve_net_id); + struct geneve_sock *gs; + + gs = geneve_socket_create(net, port, rcv, data, ipv6); +@@ -311,15 +312,15 @@ struct geneve_sock *geneve_sock_add(stru + if (no_share) /* Return error if sharing is not allowed. */ + return ERR_PTR(-EINVAL); + ++ spin_lock(&gn->sock_lock); + gs = geneve_find_sock(net, port); +- if (gs) { +- if (gs->rcv == rcv) +- atomic_inc(&gs->refcnt); +- else ++ if (gs && ((gs->rcv != rcv) || ++ !atomic_add_unless(&gs->refcnt, 1, 0))) + gs = ERR_PTR(-EBUSY); +- } else { ++ spin_unlock(&gn->sock_lock); ++ ++ if (!gs) + gs = ERR_PTR(-EINVAL); +- } + + return gs; + } diff --git a/queue-3.18/geneve-remove-socket-and-offload-handlers-at-destruction.patch b/queue-3.18/geneve-remove-socket-and-offload-handlers-at-destruction.patch new file mode 100644 index 00000000000..91a384ef976 --- /dev/null +++ b/queue-3.18/geneve-remove-socket-and-offload-handlers-at-destruction.patch @@ -0,0 +1,59 @@ +From foo@baz Sat Jan 17 18:12:21 PST 2015 +From: Jesse Gross +Date: Tue, 16 Dec 2014 18:25:31 -0800 +Subject: geneve: Remove socket and offload handlers at destruction. + +From: Jesse Gross + +[ Upstream commit 7ed767f73192d6daf673c6d885cd02d5f280ac1f ] + +Sockets aren't currently removed from the the global list when +they are destroyed. In addition, offload handlers need to be cleaned +up as well. + +Fixes: 0b5e8b8e ("net: Add Geneve tunneling protocol driver") +CC: Andy Zhou +Signed-off-by: Jesse Gross +Acked-by: Thomas Graf +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv4/geneve.c | 17 +++++++++++++++++ + 1 file changed, 17 insertions(+) + +--- a/net/ipv4/geneve.c ++++ b/net/ipv4/geneve.c +@@ -165,6 +165,15 @@ static void geneve_notify_add_rx_port(st + } + } + ++static void geneve_notify_del_rx_port(struct geneve_sock *gs) ++{ ++ struct sock *sk = gs->sock->sk; ++ sa_family_t sa_family = sk->sk_family; ++ ++ if (sa_family == AF_INET) ++ udp_del_offload(&gs->udp_offloads); ++} ++ + /* Callback from net/ipv4/udp.c to receive packets */ + static int geneve_udp_encap_recv(struct sock *sk, struct sk_buff *skb) + { +@@ -318,9 +327,17 @@ EXPORT_SYMBOL_GPL(geneve_sock_add); + + void geneve_sock_release(struct geneve_sock *gs) + { ++ struct net *net = sock_net(gs->sock->sk); ++ struct geneve_net *gn = net_generic(net, geneve_net_id); ++ + if (!atomic_dec_and_test(&gs->refcnt)) + return; + ++ spin_lock(&gn->sock_lock); ++ hlist_del_rcu(&gs->hlist); ++ geneve_notify_del_rx_port(gs); ++ spin_unlock(&gn->sock_lock); ++ + queue_work(geneve_wq, &gs->del_work); + } + EXPORT_SYMBOL_GPL(geneve_sock_release); diff --git a/queue-3.18/gre-fix-the-inner-mac-header-in-nbma-tunnel-xmit-path.patch b/queue-3.18/gre-fix-the-inner-mac-header-in-nbma-tunnel-xmit-path.patch new file mode 100644 index 00000000000..87062f01080 --- /dev/null +++ b/queue-3.18/gre-fix-the-inner-mac-header-in-nbma-tunnel-xmit-path.patch @@ -0,0 +1,69 @@ +From foo@baz Sat Jan 17 18:12:21 PST 2015 +From: =?UTF-8?q?Timo=20Ter=C3=A4s?= +Date: Mon, 15 Dec 2014 09:24:13 +0200 +Subject: gre: fix the inner mac header in nbma tunnel xmit path +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: =?UTF-8?q?Timo=20Ter=C3=A4s?= + +[ Upstream commit 8a0033a947403569caeca45fa5e6f7ba60d51974 ] + +The NBMA GRE tunnels temporarily push GRE header that contain the +per-packet NBMA destination on the skb via header ops early in xmit +path. It is the later pulled before the real GRE header is constructed. + +The inner mac was thus set differently in nbma case: the GRE header +has been pushed by neighbor layer, and mac header points to beginning +of the temporary gre header (set by dev_queue_xmit). + +Now that the offloads expect mac header to point to the gre payload, +fix the xmit patch to: + - pull first the temporary gre header away + - and reset mac header to point to gre payload + +This fixes tso to work again with nbma tunnels. + +Fixes: 14051f0452a2 ("gre: Use inner mac length when computing tunnel length") +Signed-off-by: Timo Teräs +Cc: Tom Herbert +Cc: Alexander Duyck +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv4/ip_gre.c | 9 +++++---- + 1 file changed, 5 insertions(+), 4 deletions(-) + +--- a/net/ipv4/ip_gre.c ++++ b/net/ipv4/ip_gre.c +@@ -252,10 +252,6 @@ static netdev_tx_t ipgre_xmit(struct sk_ + struct ip_tunnel *tunnel = netdev_priv(dev); + const struct iphdr *tnl_params; + +- skb = gre_handle_offloads(skb, !!(tunnel->parms.o_flags&TUNNEL_CSUM)); +- if (IS_ERR(skb)) +- goto out; +- + if (dev->header_ops) { + /* Need space for new headers */ + if (skb_cow_head(skb, dev->needed_headroom - +@@ -268,6 +264,7 @@ static netdev_tx_t ipgre_xmit(struct sk_ + * to gre header. 
+ */ + skb_pull(skb, tunnel->hlen + sizeof(struct iphdr)); ++ skb_reset_mac_header(skb); + } else { + if (skb_cow_head(skb, dev->needed_headroom)) + goto free_skb; +@@ -275,6 +272,10 @@ static netdev_tx_t ipgre_xmit(struct sk_ + tnl_params = &tunnel->parms.iph; + } + ++ skb = gre_handle_offloads(skb, !!(tunnel->parms.o_flags&TUNNEL_CSUM)); ++ if (IS_ERR(skb)) ++ goto out; ++ + __gre_xmit(skb, dev, tnl_params, skb->protocol); + + return NETDEV_TX_OK; diff --git a/queue-3.18/in6-fix-conflict-with-glibc.patch b/queue-3.18/in6-fix-conflict-with-glibc.patch new file mode 100644 index 00000000000..711a8954501 --- /dev/null +++ b/queue-3.18/in6-fix-conflict-with-glibc.patch @@ -0,0 +1,70 @@ +From foo@baz Sat Jan 17 18:12:21 PST 2015 +From: stephen hemminger +Date: Sat, 20 Dec 2014 12:15:49 -0800 +Subject: in6: fix conflict with glibc + +From: stephen hemminger + +[ Upstream commit 6d08acd2d32e3e877579315dc3202d7a5f336d98 ] + +Resolve conflicts between glibc definition of IPV6 socket options +and those defined in Linux headers. Looks like earlier efforts to +solve this did not cover all the definitions. + +It resolves warnings during iproute2 build. +Please consider for stable as well. + +Signed-off-by: Stephen Hemminger +Acked-by: Hannes Frederic Sowa +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + include/uapi/linux/in6.h | 3 ++- + include/uapi/linux/libc-compat.h | 3 +++ + 2 files changed, 5 insertions(+), 1 deletion(-) + +--- a/include/uapi/linux/in6.h ++++ b/include/uapi/linux/in6.h +@@ -149,7 +149,7 @@ struct in6_flowlabel_req { + /* + * IPV6 socket options + */ +- ++#if __UAPI_DEF_IPV6_OPTIONS + #define IPV6_ADDRFORM 1 + #define IPV6_2292PKTINFO 2 + #define IPV6_2292HOPOPTS 3 +@@ -196,6 +196,7 @@ struct in6_flowlabel_req { + + #define IPV6_IPSEC_POLICY 34 + #define IPV6_XFRM_POLICY 35 ++#endif + + /* + * Multicast: +--- a/include/uapi/linux/libc-compat.h ++++ b/include/uapi/linux/libc-compat.h +@@ -69,6 +69,7 @@ + #define __UAPI_DEF_SOCKADDR_IN6 0 + #define __UAPI_DEF_IPV6_MREQ 0 + #define __UAPI_DEF_IPPROTO_V6 0 ++#define __UAPI_DEF_IPV6_OPTIONS 0 + + #else + +@@ -82,6 +83,7 @@ + #define __UAPI_DEF_SOCKADDR_IN6 1 + #define __UAPI_DEF_IPV6_MREQ 1 + #define __UAPI_DEF_IPPROTO_V6 1 ++#define __UAPI_DEF_IPV6_OPTIONS 1 + + #endif /* _NETINET_IN_H */ + +@@ -103,6 +105,7 @@ + #define __UAPI_DEF_SOCKADDR_IN6 1 + #define __UAPI_DEF_IPV6_MREQ 1 + #define __UAPI_DEF_IPPROTO_V6 1 ++#define __UAPI_DEF_IPV6_OPTIONS 1 + + /* Definitions for xattr.h */ + #define __UAPI_DEF_XATTR 1 diff --git a/queue-3.18/net-core-handle-csum-for-checksum_complete-vxlan-forwarding.patch b/queue-3.18/net-core-handle-csum-for-checksum_complete-vxlan-forwarding.patch new file mode 100644 index 00000000000..62fb06d709e --- /dev/null +++ b/queue-3.18/net-core-handle-csum-for-checksum_complete-vxlan-forwarding.patch @@ -0,0 +1,63 @@ +From foo@baz Sat Jan 17 18:12:21 PST 2015 +From: Jay Vosburgh +Date: Fri, 19 Dec 2014 15:32:00 -0800 +Subject: net/core: Handle csum for CHECKSUM_COMPLETE VXLAN forwarding + +From: Jay Vosburgh + +[ Upstream commit 2c26d34bbcc0b3f30385d5587aa232289e2eed8e ] + +When using VXLAN tunnels and a sky2 device, I have experienced +checksum failures of the following type: + +[ 4297.761899] eth0: hw csum failure +[...] +[ 4297.765223] Call Trace: +[ 4297.765224] [] dump_stack+0x46/0x58 +[ 4297.765235] [] netdev_rx_csum_fault+0x42/0x50 +[ 4297.765238] [] ? 
skb_push+0x40/0x40 +[ 4297.765240] [] __skb_checksum_complete+0xbc/0xd0 +[ 4297.765243] [] tcp_v4_rcv+0x2e2/0x950 +[ 4297.765246] [] ? ip_rcv_finish+0x360/0x360 + + These are reliably reproduced in a network topology of: + +container:eth0 == host(OVS VXLAN on VLAN) == bond0 == eth0 (sky2) -> switch + + When VXLAN encapsulated traffic is received from a similarly +configured peer, the above warning is generated in the receive +processing of the encapsulated packet. Note that the warning is +associated with the container eth0. + + The skbs from sky2 have ip_summed set to CHECKSUM_COMPLETE, and +because the packet is an encapsulated Ethernet frame, the checksum +generated by the hardware includes the inner protocol and Ethernet +headers. + + The receive code is careful to update the skb->csum, except in +__dev_forward_skb, as called by dev_forward_skb. __dev_forward_skb +calls eth_type_trans, which in turn calls skb_pull_inline(skb, ETH_HLEN) +to skip over the Ethernet header, but does not update skb->csum when +doing so. + + This patch resolves the problem by adding a call to +skb_postpull_rcsum to update the skb->csum after the call to +eth_type_trans. + +Signed-off-by: Jay Vosburgh +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/core/dev.c | 1 + + 1 file changed, 1 insertion(+) + +--- a/net/core/dev.c ++++ b/net/core/dev.c +@@ -1697,6 +1697,7 @@ int __dev_forward_skb(struct net_device + + skb_scrub_packet(skb, true); + skb->protocol = eth_type_trans(skb, dev); ++ skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); + + return 0; + } diff --git a/queue-3.18/net-drop-the-packet-when-fails-to-do-software-segmentation-or-header-check.patch b/queue-3.18/net-drop-the-packet-when-fails-to-do-software-segmentation-or-header-check.patch new file mode 100644 index 00000000000..34befa564f8 --- /dev/null +++ b/queue-3.18/net-drop-the-packet-when-fails-to-do-software-segmentation-or-header-check.patch @@ -0,0 +1,34 @@ +From foo@baz Sat Jan 17 18:12:21 PST 2015 +From: Jason Wang +Date: Fri, 19 Dec 2014 11:09:13 +0800 +Subject: net: drop the packet when fails to do software segmentation or header check + +From: Jason Wang + +[ Upstream commit af6dabc9c70ae3f307685b1f32f52d60b1bf0527 ] + +Commit cecda693a969816bac5e470e1d9c9c0ef5567bca ("net: keep original skb +which only needs header checking during software GSO") keeps the original +skb for packets that only needs header check, but it doesn't drop the +packet if software segmentation or header check were failed. + +Fixes cecda693a9 ("net: keep original skb which only needs header checking during software GSO") +Cc: Eric Dumazet +Signed-off-by: Jason Wang +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/core/dev.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/net/core/dev.c ++++ b/net/core/dev.c +@@ -2680,7 +2680,7 @@ static struct sk_buff *validate_xmit_skb + + segs = skb_gso_segment(skb, features); + if (IS_ERR(segs)) { +- segs = NULL; ++ goto out_kfree_skb; + } else if (segs) { + consume_skb(skb); + skb = segs; diff --git a/queue-3.18/net-fix-stacked-vlan-offload-features-computation.patch b/queue-3.18/net-fix-stacked-vlan-offload-features-computation.patch new file mode 100644 index 00000000000..e9b30e2577e --- /dev/null +++ b/queue-3.18/net-fix-stacked-vlan-offload-features-computation.patch @@ -0,0 +1,47 @@ +From foo@baz Sat Jan 17 18:12:21 PST 2015 +From: Toshiaki Makita +Date: Mon, 22 Dec 2014 19:04:14 +0900 +Subject: net: Fix stacked vlan offload features computation + +From: Toshiaki Makita + +[ Upstream commit 796f2da81bead71ffc91ef70912cd8d1827bf756 ] + +When vlan tags are stacked, it is very likely that the outer tag is stored +in skb->vlan_tci and skb->protocol shows the inner tag's vlan_proto. +Currently netif_skb_features() first looks at skb->protocol even if there +is the outer tag in vlan_tci, thus it incorrectly retrieves the protocol +encapsulated by the inner vlan instead of the inner vlan protocol. +This allows GSO packets to be passed to HW and they end up being +corrupted. + +Fixes: 58e998c6d239 ("offloading: Force software GSO for multiple vlan tags.") +Signed-off-by: Toshiaki Makita +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/core/dev.c | 13 ++++++++----- + 1 file changed, 8 insertions(+), 5 deletions(-) + +--- a/net/core/dev.c ++++ b/net/core/dev.c +@@ -2573,11 +2573,14 @@ netdev_features_t netif_skb_features(str + if (gso_segs > dev->gso_max_segs || gso_segs < dev->gso_min_segs) + features &= ~NETIF_F_GSO_MASK; + +- if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD)) { +- struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data; +- protocol = veh->h_vlan_encapsulated_proto; +- } else if (!vlan_tx_tag_present(skb)) { +- return harmonize_features(skb, features); ++ if (!vlan_tx_tag_present(skb)) { ++ if (unlikely(protocol == htons(ETH_P_8021Q) || ++ protocol == htons(ETH_P_8021AD))) { ++ struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data; ++ protocol = veh->h_vlan_encapsulated_proto; ++ } else { ++ return harmonize_features(skb, features); ++ } + } + + features = netdev_intersect_features(features, diff --git a/queue-3.18/net-generalize-ndo_gso_check-to-ndo_features_check.patch b/queue-3.18/net-generalize-ndo_gso_check-to-ndo_features_check.patch new file mode 100644 index 00000000000..9941c5c98cc --- /dev/null +++ b/queue-3.18/net-generalize-ndo_gso_check-to-ndo_features_check.patch @@ -0,0 +1,285 @@ +From foo@baz Sat Jan 17 18:12:21 PST 2015 +From: Jesse Gross +Date: Tue, 23 Dec 2014 22:37:26 -0800 +Subject: net: Generalize ndo_gso_check to ndo_features_check + +From: Jesse Gross + +[ Upstream commit 5f35227ea34bb616c436d9da47fc325866c428f3 ] + +GSO isn't the only offload feature with restrictions that +potentially can't be expressed with the current features mechanism. +Checksum is another although it's a general issue that could in +theory apply to anything. Even if it may be possible to +implement these restrictions in other ways, it can result in +duplicate code or inefficient per-packet behavior. 
+ +This generalizes ndo_gso_check so that drivers can remove any +features that don't make sense for a given packet, similar to +netif_skb_features(). It also converts existing driver +restrictions to the new format, completing the work that was +done to support tunnel protocols since the issues apply to +checksums as well. + +By actually removing features from the set that are used to do +offloading, it solves another problem with the existing +interface. In these cases, GSO would run with the original set +of features and not do anything because it appears that +segmentation is not required. + +CC: Tom Herbert +CC: Joe Stringer +CC: Eric Dumazet +CC: Hayes Wang +Signed-off-by: Jesse Gross +Acked-by: Tom Herbert +Fixes: 04ffcb255f22 ("net: Add ndo_gso_check") +Tested-by: Hayes Wang +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/emulex/benet/be_main.c | 8 ++++-- + drivers/net/ethernet/mellanox/mlx4/en_netdev.c | 10 ++++---- + drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c | 8 ++++-- + include/linux/netdevice.h | 20 +++++++++------- + include/net/vxlan.h | 28 +++++++++++++++++++---- + net/core/dev.c | 23 +++++++++++------- + 6 files changed, 65 insertions(+), 32 deletions(-) + +--- a/drivers/net/ethernet/emulex/benet/be_main.c ++++ b/drivers/net/ethernet/emulex/benet/be_main.c +@@ -4427,9 +4427,11 @@ static void be_del_vxlan_port(struct net + be16_to_cpu(port)); + } + +-static bool be_gso_check(struct sk_buff *skb, struct net_device *dev) ++static netdev_features_t be_features_check(struct sk_buff *skb, ++ struct net_device *dev, ++ netdev_features_t features) + { +- return vxlan_gso_check(skb); ++ return vxlan_features_check(skb, features); + } + #endif + +@@ -4460,7 +4462,7 @@ static const struct net_device_ops be_ne + #ifdef CONFIG_BE2NET_VXLAN + .ndo_add_vxlan_port = be_add_vxlan_port, + .ndo_del_vxlan_port = be_del_vxlan_port, +- .ndo_gso_check = be_gso_check, ++ .ndo_features_check = be_features_check, + #endif + }; + +--- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c ++++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c +@@ -2363,9 +2363,11 @@ static void mlx4_en_del_vxlan_port(struc + queue_work(priv->mdev->workqueue, &priv->vxlan_del_task); + } + +-static bool mlx4_en_gso_check(struct sk_buff *skb, struct net_device *dev) ++static netdev_features_t mlx4_en_features_check(struct sk_buff *skb, ++ struct net_device *dev, ++ netdev_features_t features) + { +- return vxlan_gso_check(skb); ++ return vxlan_features_check(skb, features); + } + #endif + +@@ -2398,7 +2400,7 @@ static const struct net_device_ops mlx4_ + #ifdef CONFIG_MLX4_EN_VXLAN + .ndo_add_vxlan_port = mlx4_en_add_vxlan_port, + .ndo_del_vxlan_port = mlx4_en_del_vxlan_port, +- .ndo_gso_check = mlx4_en_gso_check, ++ .ndo_features_check = mlx4_en_features_check, + #endif + }; + +@@ -2432,7 +2434,7 @@ static const struct net_device_ops mlx4_ + #ifdef CONFIG_MLX4_EN_VXLAN + .ndo_add_vxlan_port = mlx4_en_add_vxlan_port, + .ndo_del_vxlan_port = mlx4_en_del_vxlan_port, +- .ndo_gso_check = mlx4_en_gso_check, ++ .ndo_features_check = mlx4_en_features_check, + #endif + }; + +--- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c ++++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c +@@ -504,9 +504,11 @@ static void qlcnic_del_vxlan_port(struct + adapter->flags |= QLCNIC_DEL_VXLAN_PORT; + } + +-static bool qlcnic_gso_check(struct sk_buff *skb, struct net_device *dev) ++static netdev_features_t qlcnic_features_check(struct sk_buff *skb, ++ struct net_device *dev, ++ 
netdev_features_t features) + { +- return vxlan_gso_check(skb); ++ return vxlan_features_check(skb, features); + } + #endif + +@@ -531,7 +533,7 @@ static const struct net_device_ops qlcni + #ifdef CONFIG_QLCNIC_VXLAN + .ndo_add_vxlan_port = qlcnic_add_vxlan_port, + .ndo_del_vxlan_port = qlcnic_del_vxlan_port, +- .ndo_gso_check = qlcnic_gso_check, ++ .ndo_features_check = qlcnic_features_check, + #endif + #ifdef CONFIG_NET_POLL_CONTROLLER + .ndo_poll_controller = qlcnic_poll_controller, +--- a/include/linux/netdevice.h ++++ b/include/linux/netdevice.h +@@ -998,12 +998,15 @@ typedef u16 (*select_queue_fallback_t)(s + * Callback to use for xmit over the accelerated station. This + * is used in place of ndo_start_xmit on accelerated net + * devices. +- * bool (*ndo_gso_check) (struct sk_buff *skb, +- * struct net_device *dev); ++ * netdev_features_t (*ndo_features_check) (struct sk_buff *skb, ++ * struct net_device *dev ++ * netdev_features_t features); + * Called by core transmit path to determine if device is capable of +- * performing GSO on a packet. The device returns true if it is +- * able to GSO the packet, false otherwise. If the return value is +- * false the stack will do software GSO. ++ * performing offload operations on a given packet. This is to give ++ * the device an opportunity to implement any restrictions that cannot ++ * be otherwise expressed by feature flags. The check is called with ++ * the set of features that the stack has calculated and it returns ++ * those the driver believes to be appropriate. + */ + struct net_device_ops { + int (*ndo_init)(struct net_device *dev); +@@ -1153,8 +1156,9 @@ struct net_device_ops { + struct net_device *dev, + void *priv); + int (*ndo_get_lock_subclass)(struct net_device *dev); +- bool (*ndo_gso_check) (struct sk_buff *skb, +- struct net_device *dev); ++ netdev_features_t (*ndo_features_check) (struct sk_buff *skb, ++ struct net_device *dev, ++ netdev_features_t features); + }; + + /** +@@ -3584,8 +3588,6 @@ static inline bool netif_needs_gso(struc + netdev_features_t features) + { + return skb_is_gso(skb) && (!skb_gso_ok(skb, features) || +- (dev->netdev_ops->ndo_gso_check && +- !dev->netdev_ops->ndo_gso_check(skb, dev)) || + unlikely((skb->ip_summed != CHECKSUM_PARTIAL) && + (skb->ip_summed != CHECKSUM_UNNECESSARY))); + } +--- a/include/net/vxlan.h ++++ b/include/net/vxlan.h +@@ -1,6 +1,9 @@ + #ifndef __NET_VXLAN_H + #define __NET_VXLAN_H 1 + ++#include ++#include ++#include + #include + #include + #include +@@ -51,16 +54,33 @@ int vxlan_xmit_skb(struct vxlan_sock *vs + __be32 src, __be32 dst, __u8 tos, __u8 ttl, __be16 df, + __be16 src_port, __be16 dst_port, __be32 vni, bool xnet); + +-static inline bool vxlan_gso_check(struct sk_buff *skb) ++static inline netdev_features_t vxlan_features_check(struct sk_buff *skb, ++ netdev_features_t features) + { +- if ((skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL) && ++ u8 l4_hdr = 0; ++ ++ if (!skb->encapsulation) ++ return features; ++ ++ switch (vlan_get_protocol(skb)) { ++ case htons(ETH_P_IP): ++ l4_hdr = ip_hdr(skb)->protocol; ++ break; ++ case htons(ETH_P_IPV6): ++ l4_hdr = ipv6_hdr(skb)->nexthdr; ++ break; ++ default: ++ return features;; ++ } ++ ++ if ((l4_hdr == IPPROTO_UDP) && + (skb->inner_protocol_type != ENCAP_TYPE_ETHER || + skb->inner_protocol != htons(ETH_P_TEB) || + (skb_inner_mac_header(skb) - skb_transport_header(skb) != + sizeof(struct udphdr) + sizeof(struct vxlanhdr)))) +- return false; ++ return features & ~(NETIF_F_ALL_CSUM | NETIF_F_GSO_MASK); + +- return true; ++ 
return features; + } + + /* IP header + UDP + VXLAN + Ethernet header */ +--- a/net/core/dev.c ++++ b/net/core/dev.c +@@ -2566,7 +2566,7 @@ static netdev_features_t harmonize_featu + + netdev_features_t netif_skb_features(struct sk_buff *skb) + { +- const struct net_device *dev = skb->dev; ++ struct net_device *dev = skb->dev; + netdev_features_t features = dev->features; + u16 gso_segs = skb_shinfo(skb)->gso_segs; + __be16 protocol = skb->protocol; +@@ -2574,13 +2574,20 @@ netdev_features_t netif_skb_features(str + if (gso_segs > dev->gso_max_segs || gso_segs < dev->gso_min_segs) + features &= ~NETIF_F_GSO_MASK; + ++ /* If encapsulation offload request, verify we are testing ++ * hardware encapsulation features instead of standard ++ * features for the netdev ++ */ ++ if (skb->encapsulation) ++ features &= dev->hw_enc_features; ++ + if (!vlan_tx_tag_present(skb)) { + if (unlikely(protocol == htons(ETH_P_8021Q) || + protocol == htons(ETH_P_8021AD))) { + struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data; + protocol = veh->h_vlan_encapsulated_proto; + } else { +- return harmonize_features(skb, features); ++ goto finalize; + } + } + +@@ -2598,6 +2605,11 @@ netdev_features_t netif_skb_features(str + NETIF_F_HW_VLAN_CTAG_TX | + NETIF_F_HW_VLAN_STAG_TX); + ++finalize: ++ if (dev->netdev_ops->ndo_features_check) ++ features &= dev->netdev_ops->ndo_features_check(skb, dev, ++ features); ++ + return harmonize_features(skb, features); + } + EXPORT_SYMBOL(netif_skb_features); +@@ -2672,13 +2684,6 @@ static struct sk_buff *validate_xmit_skb + if (unlikely(!skb)) + goto out_null; + +- /* If encapsulation offload request, verify we are testing +- * hardware encapsulation features instead of standard +- * features for the netdev +- */ +- if (skb->encapsulation) +- features &= dev->hw_enc_features; +- + if (netif_needs_gso(dev, skb, features)) { + struct sk_buff *segs; + diff --git a/queue-3.18/net-mlx4-cache-line-cqe-eqe-stride-fixes.patch b/queue-3.18/net-mlx4-cache-line-cqe-eqe-stride-fixes.patch new file mode 100644 index 00000000000..3163e152ca3 --- /dev/null +++ b/queue-3.18/net-mlx4-cache-line-cqe-eqe-stride-fixes.patch @@ -0,0 +1,59 @@ +From foo@baz Sat Jan 17 18:12:21 PST 2015 +From: Ido Shamay +Date: Tue, 16 Dec 2014 13:28:54 +0200 +Subject: net/mlx4: Cache line CQE/EQE stride fixes + +From: Ido Shamay + +[ Upstream commit c3f2511feac088030055012cc8f64ebd84c87dbc ] + +This commit contains 2 fixes for the 128B CQE/EQE stride feaure. +Wei found that mlx4_QUERY_HCA function marked the wrong capability +in flags (64B CQE/EQE), when CQE/EQE stride feature was enabled. +Also added small fix in initial CQE ownership bit assignment, when CQE +is size is not default 32B. + +Fixes: 77507aa24 (net/mlx4: Enable CQE/EQE stride support) +Signed-off-by: Wei Yang +Signed-off-by: Ido Shamay +Signed-off-by: Amir Vadai +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/mellanox/mlx4/en_netdev.c | 11 +++++++++-- + drivers/net/ethernet/mellanox/mlx4/fw.c | 4 ++-- + 2 files changed, 11 insertions(+), 4 deletions(-) + +--- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c ++++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c +@@ -1569,8 +1569,15 @@ int mlx4_en_start_port(struct net_device + mlx4_en_free_affinity_hint(priv, i); + goto cq_err; + } +- for (j = 0; j < cq->size; j++) +- cq->buf[j].owner_sr_opcode = MLX4_CQE_OWNER_MASK; ++ ++ for (j = 0; j < cq->size; j++) { ++ struct mlx4_cqe *cqe = NULL; ++ ++ cqe = mlx4_en_get_cqe(cq->buf, j, priv->cqe_size) + ++ priv->cqe_factor; ++ cqe->owner_sr_opcode = MLX4_CQE_OWNER_MASK; ++ } ++ + err = mlx4_en_set_cq_moder(priv, cq); + if (err) { + en_err(priv, "Failed setting cq moderation parameters\n"); +--- a/drivers/net/ethernet/mellanox/mlx4/fw.c ++++ b/drivers/net/ethernet/mellanox/mlx4/fw.c +@@ -1647,8 +1647,8 @@ int mlx4_QUERY_HCA(struct mlx4_dev *dev, + /* CX3 is capable of extending CQEs\EQEs to strides larger than 64B */ + MLX4_GET(byte_field, outbox, INIT_HCA_EQE_CQE_STRIDE_OFFSET); + if (byte_field) { +- param->dev_cap_enabled |= MLX4_DEV_CAP_64B_EQE_ENABLED; +- param->dev_cap_enabled |= MLX4_DEV_CAP_64B_CQE_ENABLED; ++ param->dev_cap_enabled |= MLX4_DEV_CAP_EQE_STRIDE_ENABLED; ++ param->dev_cap_enabled |= MLX4_DEV_CAP_CQE_STRIDE_ENABLED; + param->cqe_size = 1 << ((byte_field & + MLX4_CQE_SIZE_MASK_STRIDE) + 5); + param->eqe_size = 1 << (((byte_field & diff --git a/queue-3.18/net-mlx4_core-correcly-update-the-mtt-s-offset-in-the-mr-re-reg-flow.patch b/queue-3.18/net-mlx4_core-correcly-update-the-mtt-s-offset-in-the-mr-re-reg-flow.patch new file mode 100644 index 00000000000..1726e8e875d --- /dev/null +++ b/queue-3.18/net-mlx4_core-correcly-update-the-mtt-s-offset-in-the-mr-re-reg-flow.patch @@ -0,0 +1,55 @@ +From foo@baz Sat Jan 17 18:12:21 PST 2015 +From: Maor Gottlieb +Date: Tue, 30 Dec 2014 11:59:49 +0200 +Subject: net/mlx4_core: Correcly update the mtt's offset in the MR re-reg flow + +From: Maor Gottlieb + +[ Upstream commit a51e0df4c1e06afd7aba84496c14238e6b363caa ] + +Previously, mlx4_mt_rereg_write filled the MPT's entity_size with the +old MTT's page shift, which could result in using an incorrect offset. +Fix the initialization to be after we calculate the new MTT offset. + +In addition, assign mtt order to -1 after calling mlx4_mtt_cleanup. This +is necessary in order to mark the MTT as invalid and avoid freeing it later. + +Fixes: e630664 ('mlx4_core: Add helper functions to support MR re-registration') +Signed-off-by: Maor Gottlieb +Signed-off-by: Matan Barak +Signed-off-by: Or Gerlitz +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/mellanox/mlx4/mr.c | 9 +++++---- + 1 file changed, 5 insertions(+), 4 deletions(-) + +--- a/drivers/net/ethernet/mellanox/mlx4/mr.c ++++ b/drivers/net/ethernet/mellanox/mlx4/mr.c +@@ -590,6 +590,7 @@ EXPORT_SYMBOL_GPL(mlx4_mr_free); + void mlx4_mr_rereg_mem_cleanup(struct mlx4_dev *dev, struct mlx4_mr *mr) + { + mlx4_mtt_cleanup(dev, &mr->mtt); ++ mr->mtt.order = -1; + } + EXPORT_SYMBOL_GPL(mlx4_mr_rereg_mem_cleanup); + +@@ -599,14 +600,14 @@ int mlx4_mr_rereg_mem_write(struct mlx4_ + { + int err; + +- mpt_entry->start = cpu_to_be64(iova); +- mpt_entry->length = cpu_to_be64(size); +- mpt_entry->entity_size = cpu_to_be32(page_shift); +- + err = mlx4_mtt_init(dev, npages, page_shift, &mr->mtt); + if (err) + return err; + ++ mpt_entry->start = cpu_to_be64(mr->iova); ++ mpt_entry->length = cpu_to_be64(mr->size); ++ mpt_entry->entity_size = cpu_to_be32(mr->mtt.page_shift); ++ + mpt_entry->pd_flags &= cpu_to_be32(MLX4_MPT_PD_MASK | + MLX4_MPT_PD_FLAG_EN_INV); + mpt_entry->flags &= cpu_to_be32(MLX4_MPT_FLAG_FREE | diff --git a/queue-3.18/net-mlx4_en-doorbell-is-byteswapped-in-little-endian-archs.patch b/queue-3.18/net-mlx4_en-doorbell-is-byteswapped-in-little-endian-archs.patch new file mode 100644 index 00000000000..ab648a46c92 --- /dev/null +++ b/queue-3.18/net-mlx4_en-doorbell-is-byteswapped-in-little-endian-archs.patch @@ -0,0 +1,46 @@ +From foo@baz Sat Jan 17 18:12:21 PST 2015 +From: Amir Vadai +Date: Mon, 22 Dec 2014 10:21:57 +0200 +Subject: net/mlx4_en: Doorbell is byteswapped in Little Endian archs + +From: Amir Vadai + +[ Upstream commit 492f5add4be84652bbe13da8a250d60c6856a5c5 ] + +iowrite32() will byteswap it's argument on big endian archs. +iowrite32be() will byteswap on little endian archs. +Since we don't want to do this unnecessary byteswap on the fast path, +doorbell is stored in the NIC's native endianness. Using the right +iowrite() according to the arch endianness. + +CC: Wei Yang +CC: David Laight +Fixes: 6a4e812 ("net/mlx4_en: Avoid calling bswap in tx fast path") +Signed-off-by: Amir Vadai +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/mellanox/mlx4/en_tx.c | 12 +++++++++++- + 1 file changed, 11 insertions(+), 1 deletion(-) + +--- a/drivers/net/ethernet/mellanox/mlx4/en_tx.c ++++ b/drivers/net/ethernet/mellanox/mlx4/en_tx.c +@@ -954,7 +954,17 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff + tx_desc->ctrl.owner_opcode = op_own; + if (send_doorbell) { + wmb(); +- iowrite32(ring->doorbell_qpn, ++ /* Since there is no iowrite*_native() that writes the ++ * value as is, without byteswapping - using the one ++ * the doesn't do byteswapping in the relevant arch ++ * endianness. 
++ */ ++#if defined(__LITTLE_ENDIAN) ++ iowrite32( ++#else ++ iowrite32be( ++#endif ++ ring->doorbell_qpn, + ring->bf.uar->map + MLX4_SEND_DOORBELL); + } else { + ring->xmit_more++; diff --git a/queue-3.18/net-reset-secmark-when-scrubbing-packet.patch b/queue-3.18/net-reset-secmark-when-scrubbing-packet.patch new file mode 100644 index 00000000000..fdf511cff39 --- /dev/null +++ b/queue-3.18/net-reset-secmark-when-scrubbing-packet.patch @@ -0,0 +1,37 @@ +From foo@baz Sat Jan 17 18:12:21 PST 2015 +From: Thomas Graf +Date: Tue, 23 Dec 2014 01:13:18 +0100 +Subject: net: Reset secmark when scrubbing packet + +From: Thomas Graf + +[ Upstream commit b8fb4e0648a2ab3734140342002f68fb0c7d1602 ] + +skb_scrub_packet() is called when a packet switches between a context +such as between underlay and overlay, between namespaces, or between +L3 subnets. + +While we already scrub the packet mark, connection tracking entry, +and cached destination, the security mark/context is left intact. + +It seems wrong to inherit the security context of a packet when going +from overlay to underlay or across forwarding paths. + +Signed-off-by: Thomas Graf +Acked-by: Flavio Leitner +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/core/skbuff.c | 1 + + 1 file changed, 1 insertion(+) + +--- a/net/core/skbuff.c ++++ b/net/core/skbuff.c +@@ -4040,6 +4040,7 @@ void skb_scrub_packet(struct sk_buff *sk + skb->ignore_df = 0; + skb_dst_drop(skb); + skb->mark = 0; ++ skb_init_secmark(skb); + secpath_reset(skb); + nf_reset(skb); + nf_reset_trace(skb); diff --git a/queue-3.18/netlink-always-copy-on-mmap-tx.patch b/queue-3.18/netlink-always-copy-on-mmap-tx.patch new file mode 100644 index 00000000000..33292f614f8 --- /dev/null +++ b/queue-3.18/netlink-always-copy-on-mmap-tx.patch @@ -0,0 +1,127 @@ +From foo@baz Sat Jan 17 18:12:21 PST 2015 +From: David Miller +Date: Tue, 16 Dec 2014 17:58:17 -0500 +Subject: netlink: Always copy on mmap TX. + +From: David Miller + +[ Upstream commit 4682a0358639b29cf69437ed909c6221f8c89847 ] + +Checking the file f_count and the nlk->mapped count is not completely +sufficient to prevent the mmap'd area contents from changing from +under us during netlink mmap sendmsg() operations. + +Be careful to sample the header's length field only once, because this +could change from under us as well. + +Fixes: 5fd96123ee19 ("netlink: implement memory mapped sendmsg()") +Signed-off-by: David S. 
Miller +Acked-by: Daniel Borkmann +Acked-by: Thomas Graf +Signed-off-by: Greg Kroah-Hartman +--- + net/netlink/af_netlink.c | 52 ++++++++++++++--------------------------------- + 1 file changed, 16 insertions(+), 36 deletions(-) + +--- a/net/netlink/af_netlink.c ++++ b/net/netlink/af_netlink.c +@@ -526,14 +526,14 @@ out: + return err; + } + +-static void netlink_frame_flush_dcache(const struct nl_mmap_hdr *hdr) ++static void netlink_frame_flush_dcache(const struct nl_mmap_hdr *hdr, unsigned int nm_len) + { + #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1 + struct page *p_start, *p_end; + + /* First page is flushed through netlink_{get,set}_status */ + p_start = pgvec_to_page(hdr + PAGE_SIZE); +- p_end = pgvec_to_page((void *)hdr + NL_MMAP_HDRLEN + hdr->nm_len - 1); ++ p_end = pgvec_to_page((void *)hdr + NL_MMAP_HDRLEN + nm_len - 1); + while (p_start <= p_end) { + flush_dcache_page(p_start); + p_start++; +@@ -715,24 +715,16 @@ static int netlink_mmap_sendmsg(struct s + struct nl_mmap_hdr *hdr; + struct sk_buff *skb; + unsigned int maxlen; +- bool excl = true; + int err = 0, len = 0; + +- /* Netlink messages are validated by the receiver before processing. +- * In order to avoid userspace changing the contents of the message +- * after validation, the socket and the ring may only be used by a +- * single process, otherwise we fall back to copying. +- */ +- if (atomic_long_read(&sk->sk_socket->file->f_count) > 1 || +- atomic_read(&nlk->mapped) > 1) +- excl = false; +- + mutex_lock(&nlk->pg_vec_lock); + + ring = &nlk->tx_ring; + maxlen = ring->frame_size - NL_MMAP_HDRLEN; + + do { ++ unsigned int nm_len; ++ + hdr = netlink_current_frame(ring, NL_MMAP_STATUS_VALID); + if (hdr == NULL) { + if (!(msg->msg_flags & MSG_DONTWAIT) && +@@ -740,35 +732,23 @@ static int netlink_mmap_sendmsg(struct s + schedule(); + continue; + } +- if (hdr->nm_len > maxlen) { ++ ++ nm_len = ACCESS_ONCE(hdr->nm_len); ++ if (nm_len > maxlen) { + err = -EINVAL; + goto out; + } + +- netlink_frame_flush_dcache(hdr); ++ netlink_frame_flush_dcache(hdr, nm_len); + +- if (likely(dst_portid == 0 && dst_group == 0 && excl)) { +- skb = alloc_skb_head(GFP_KERNEL); +- if (skb == NULL) { +- err = -ENOBUFS; +- goto out; +- } +- sock_hold(sk); +- netlink_ring_setup_skb(skb, sk, ring, hdr); +- NETLINK_CB(skb).flags |= NETLINK_SKB_TX; +- __skb_put(skb, hdr->nm_len); +- netlink_set_status(hdr, NL_MMAP_STATUS_RESERVED); +- atomic_inc(&ring->pending); +- } else { +- skb = alloc_skb(hdr->nm_len, GFP_KERNEL); +- if (skb == NULL) { +- err = -ENOBUFS; +- goto out; +- } +- __skb_put(skb, hdr->nm_len); +- memcpy(skb->data, (void *)hdr + NL_MMAP_HDRLEN, hdr->nm_len); +- netlink_set_status(hdr, NL_MMAP_STATUS_UNUSED); ++ skb = alloc_skb(nm_len, GFP_KERNEL); ++ if (skb == NULL) { ++ err = -ENOBUFS; ++ goto out; + } ++ __skb_put(skb, nm_len); ++ memcpy(skb->data, (void *)hdr + NL_MMAP_HDRLEN, nm_len); ++ netlink_set_status(hdr, NL_MMAP_STATUS_UNUSED); + + netlink_increment_head(ring); + +@@ -814,7 +794,7 @@ static void netlink_queue_mmaped_skb(str + hdr->nm_pid = NETLINK_CB(skb).creds.pid; + hdr->nm_uid = from_kuid(sk_user_ns(sk), NETLINK_CB(skb).creds.uid); + hdr->nm_gid = from_kgid(sk_user_ns(sk), NETLINK_CB(skb).creds.gid); +- netlink_frame_flush_dcache(hdr); ++ netlink_frame_flush_dcache(hdr, hdr->nm_len); + netlink_set_status(hdr, NL_MMAP_STATUS_VALID); + + NETLINK_CB(skb).flags |= NETLINK_SKB_DELIVERED; diff --git a/queue-3.18/netlink-don-t-reorder-loads-stores-before-marking-mmap-netlink-frame-as-available.patch 
b/queue-3.18/netlink-don-t-reorder-loads-stores-before-marking-mmap-netlink-frame-as-available.patch new file mode 100644 index 00000000000..817d4636ae0 --- /dev/null +++ b/queue-3.18/netlink-don-t-reorder-loads-stores-before-marking-mmap-netlink-frame-as-available.patch @@ -0,0 +1,50 @@ +From foo@baz Sat Jan 17 18:12:21 PST 2015 +From: Thomas Graf +Date: Thu, 18 Dec 2014 10:30:26 +0000 +Subject: netlink: Don't reorder loads/stores before marking mmap netlink frame as available + +From: Thomas Graf + +[ Upstream commit a18e6a186f53af06937a2c268c72443336f4ab56 ] + +Each mmap Netlink frame contains a status field which indicates +whether the frame is unused, reserved, contains data or needs to +be skipped. Both loads and stores may not be reordeded and must +complete before the status field is changed and another CPU might +pick up the frame for use. Use an smp_mb() to cover needs of both +types of callers to netlink_set_status(), callers which have been +reading data frame from the frame, and callers which have been +filling or releasing and thus writing to the frame. + +- Example code path requiring a smp_rmb(): + memcpy(skb->data, (void *)hdr + NL_MMAP_HDRLEN, hdr->nm_len); + netlink_set_status(hdr, NL_MMAP_STATUS_UNUSED); + +- Example code path requiring a smp_wmb(): + hdr->nm_uid = from_kuid(sk_user_ns(sk), NETLINK_CB(skb).creds.uid); + hdr->nm_gid = from_kgid(sk_user_ns(sk), NETLINK_CB(skb).creds.gid); + netlink_frame_flush_dcache(hdr); + netlink_set_status(hdr, NL_MMAP_STATUS_VALID); + +Fixes: f9c228 ("netlink: implement memory mapped recvmsg()") +Reported-by: Eric Dumazet +Signed-off-by: Thomas Graf +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/netlink/af_netlink.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/net/netlink/af_netlink.c ++++ b/net/netlink/af_netlink.c +@@ -551,9 +551,9 @@ static enum nl_mmap_status netlink_get_s + static void netlink_set_status(struct nl_mmap_hdr *hdr, + enum nl_mmap_status status) + { ++ smp_mb(); + hdr->nm_status = status; + flush_dcache_page(pgvec_to_page(hdr)); +- smp_wmb(); + } + + static struct nl_mmap_hdr * diff --git a/queue-3.18/tcp-do-not-apply-tso-segment-limit-to-non-tso-packets.patch b/queue-3.18/tcp-do-not-apply-tso-segment-limit-to-non-tso-packets.patch new file mode 100644 index 00000000000..46e69d38576 --- /dev/null +++ b/queue-3.18/tcp-do-not-apply-tso-segment-limit-to-non-tso-packets.patch @@ -0,0 +1,54 @@ +From foo@baz Sat Jan 17 18:12:21 PST 2015 +From: Herbert Xu +Date: Thu, 1 Jan 2015 00:39:23 +1100 +Subject: tcp: Do not apply TSO segment limit to non-TSO packets + +From: Herbert Xu + +[ Upstream commit 843925f33fcc293d80acf2c5c8a78adf3344d49b ] + +Thomas Jarosch reported IPsec TCP stalls when a PMTU event occurs. + +In fact the problem was completely unrelated to IPsec. The bug is +also reproducible if you just disable TSO/GSO. + +The problem is that when the MSS goes down, existing queued packet +on the TX queue that have not been transmitted yet all look like +TSO packets and get treated as such. + +This then triggers a bug where tcp_mss_split_point tells us to +generate a zero-sized packet on the TX queue. Once that happens +we're screwed because the zero-sized packet can never be removed +by ACKs. + +Fixes: 1485348d242 ("tcp: Apply device TSO segment limit earlier") +Reported-by: Thomas Jarosch +Signed-off-by: Herbert Xu + +Cheers, +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv4/tcp_output.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/net/ipv4/tcp_output.c ++++ b/net/ipv4/tcp_output.c +@@ -1984,7 +1984,7 @@ static bool tcp_write_xmit(struct sock * + if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now))) + break; + +- if (tso_segs == 1) { ++ if (tso_segs == 1 || !sk->sk_gso_max_segs) { + if (unlikely(!tcp_nagle_test(tp, skb, mss_now, + (tcp_skb_is_last(sk, skb) ? + nonagle : TCP_NAGLE_PUSH)))) +@@ -2020,7 +2020,7 @@ static bool tcp_write_xmit(struct sock * + } + + limit = mss_now; +- if (tso_segs > 1 && !tcp_urg_mode(tp)) ++ if (tso_segs > 1 && sk->sk_gso_max_segs && !tcp_urg_mode(tp)) + limit = tcp_mss_split_point(sk, skb, mss_now, + min_t(unsigned int, + cwnd_quota, diff --git a/queue-3.18/tcp6-don-t-move-ip6cb-before-xfrm6_policy_check.patch b/queue-3.18/tcp6-don-t-move-ip6cb-before-xfrm6_policy_check.patch new file mode 100644 index 00000000000..73470ded591 --- /dev/null +++ b/queue-3.18/tcp6-don-t-move-ip6cb-before-xfrm6_policy_check.patch @@ -0,0 +1,109 @@ +From foo@baz Sat Jan 17 18:12:21 PST 2015 +From: Nicolas Dichtel +Date: Mon, 22 Dec 2014 18:22:48 +0100 +Subject: tcp6: don't move IP6CB before xfrm6_policy_check() + +From: Nicolas Dichtel + +[ Upstream commit 2dc49d1680b534877fd20cce52557ea542bb06b6 ] + +When xfrm6_policy_check() is used, _decode_session6() is called after some +intermediate functions. This function uses IP6CB(), thus TCP_SKB_CB() must be +prepared after the call of xfrm6_policy_check(). + +Before this patch, scenarii with IPv6 + TCP + IPsec Transport are broken. + +Fixes: 971f10eca186 ("tcp: better TCP_SKB_CB layout to reduce cache line misses") +Reported-by: Huaibin Wang +Suggested-by: Eric Dumazet +Signed-off-by: Nicolas Dichtel +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv6/tcp_ipv6.c | 45 +++++++++++++++++++++++++++++---------------- + 1 file changed, 29 insertions(+), 16 deletions(-) + +--- a/net/ipv6/tcp_ipv6.c ++++ b/net/ipv6/tcp_ipv6.c +@@ -1385,6 +1385,28 @@ ipv6_pktoptions: + return 0; + } + ++static void tcp_v6_fill_cb(struct sk_buff *skb, const struct ipv6hdr *hdr, ++ const struct tcphdr *th) ++{ ++ /* This is tricky: we move IP6CB at its correct location into ++ * TCP_SKB_CB(). It must be done after xfrm6_policy_check(), because ++ * _decode_session6() uses IP6CB(). ++ * barrier() makes sure compiler won't play aliasing games. ++ */ ++ memmove(&TCP_SKB_CB(skb)->header.h6, IP6CB(skb), ++ sizeof(struct inet6_skb_parm)); ++ barrier(); ++ ++ TCP_SKB_CB(skb)->seq = ntohl(th->seq); ++ TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + ++ skb->len - th->doff*4); ++ TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); ++ TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th); ++ TCP_SKB_CB(skb)->tcp_tw_isn = 0; ++ TCP_SKB_CB(skb)->ip_dsfield = ipv6_get_dsfield(hdr); ++ TCP_SKB_CB(skb)->sacked = 0; ++} ++ + static int tcp_v6_rcv(struct sk_buff *skb) + { + const struct tcphdr *th; +@@ -1416,24 +1438,9 @@ static int tcp_v6_rcv(struct sk_buff *sk + + th = tcp_hdr(skb); + hdr = ipv6_hdr(skb); +- /* This is tricky : We move IPCB at its correct location into TCP_SKB_CB() +- * barrier() makes sure compiler wont play fool^Waliasing games. 
+- */ +- memmove(&TCP_SKB_CB(skb)->header.h6, IP6CB(skb), +- sizeof(struct inet6_skb_parm)); +- barrier(); +- +- TCP_SKB_CB(skb)->seq = ntohl(th->seq); +- TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + +- skb->len - th->doff*4); +- TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); +- TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th); +- TCP_SKB_CB(skb)->tcp_tw_isn = 0; +- TCP_SKB_CB(skb)->ip_dsfield = ipv6_get_dsfield(hdr); +- TCP_SKB_CB(skb)->sacked = 0; + + sk = __inet6_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest, +- tcp_v6_iif(skb)); ++ inet6_iif(skb)); + if (!sk) + goto no_tcp_socket; + +@@ -1449,6 +1456,8 @@ process: + if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) + goto discard_and_relse; + ++ tcp_v6_fill_cb(skb, hdr, th); ++ + #ifdef CONFIG_TCP_MD5SIG + if (tcp_v6_inbound_md5_hash(sk, skb)) + goto discard_and_relse; +@@ -1480,6 +1489,8 @@ no_tcp_socket: + if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) + goto discard_it; + ++ tcp_v6_fill_cb(skb, hdr, th); ++ + if (skb->len < (th->doff<<2) || tcp_checksum_complete(skb)) { + csum_error: + TCP_INC_STATS_BH(net, TCP_MIB_CSUMERRORS); +@@ -1503,6 +1514,8 @@ do_time_wait: + goto discard_it; + } + ++ tcp_v6_fill_cb(skb, hdr, th); ++ + if (skb->len < (th->doff<<2)) { + inet_twsk_put(inet_twsk(sk)); + goto bad_packet; diff --git a/queue-3.18/team-avoid-possible-underflow-of-count_pending-value-for-notify_peers-and-mcast_rejoin.patch b/queue-3.18/team-avoid-possible-underflow-of-count_pending-value-for-notify_peers-and-mcast_rejoin.patch new file mode 100644 index 00000000000..a87f152860c --- /dev/null +++ b/queue-3.18/team-avoid-possible-underflow-of-count_pending-value-for-notify_peers-and-mcast_rejoin.patch @@ -0,0 +1,94 @@ +From foo@baz Sat Jan 17 18:12:21 PST 2015 +From: Jiri Pirko +Date: Wed, 14 Jan 2015 18:15:30 +0100 +Subject: team: avoid possible underflow of count_pending value for notify_peers and mcast_rejoin + +From: Jiri Pirko + +[ Upstream commit b0d11b42785b70e19bc6a3122eead3f7969a7589 ] + +This patch is fixing a race condition that may cause setting +count_pending to -1, which results in unwanted big bulk of arp messages +(in case of "notify peers"). + +Consider following scenario: + +count_pending == 2 + CPU0 CPU1 + team_notify_peers_work + atomic_dec_and_test (dec count_pending to 1) + schedule_delayed_work + team_notify_peers + atomic_add (adding 1 to count_pending) + team_notify_peers_work + atomic_dec_and_test (dec count_pending to 1) + schedule_delayed_work + team_notify_peers_work + atomic_dec_and_test (dec count_pending to 0) + schedule_delayed_work + team_notify_peers_work + atomic_dec_and_test (dec count_pending to -1) + +Fix this race by using atomic_dec_if_positive - that will prevent +count_pending running under 0. + +Fixes: fc423ff00df3a1955441 ("team: add peer notification") +Fixes: 492b200efdd20b8fcfd ("team: add support for sending multicast rejoins") +Signed-off-by: Jiri Pirko +Signed-off-by: Jiri Benc +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/team/team.c | 16 ++++++++++++++-- + 1 file changed, 14 insertions(+), 2 deletions(-) + +--- a/drivers/net/team/team.c ++++ b/drivers/net/team/team.c +@@ -629,6 +629,7 @@ static int team_change_mode(struct team + static void team_notify_peers_work(struct work_struct *work) + { + struct team *team; ++ int val; + + team = container_of(work, struct team, notify_peers.dw.work); + +@@ -636,9 +637,14 @@ static void team_notify_peers_work(struc + schedule_delayed_work(&team->notify_peers.dw, 0); + return; + } ++ val = atomic_dec_if_positive(&team->notify_peers.count_pending); ++ if (val < 0) { ++ rtnl_unlock(); ++ return; ++ } + call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, team->dev); + rtnl_unlock(); +- if (!atomic_dec_and_test(&team->notify_peers.count_pending)) ++ if (val) + schedule_delayed_work(&team->notify_peers.dw, + msecs_to_jiffies(team->notify_peers.interval)); + } +@@ -669,6 +675,7 @@ static void team_notify_peers_fini(struc + static void team_mcast_rejoin_work(struct work_struct *work) + { + struct team *team; ++ int val; + + team = container_of(work, struct team, mcast_rejoin.dw.work); + +@@ -676,9 +683,14 @@ static void team_mcast_rejoin_work(struc + schedule_delayed_work(&team->mcast_rejoin.dw, 0); + return; + } ++ val = atomic_dec_if_positive(&team->mcast_rejoin.count_pending); ++ if (val < 0) { ++ rtnl_unlock(); ++ return; ++ } + call_netdevice_notifiers(NETDEV_RESEND_IGMP, team->dev); + rtnl_unlock(); +- if (!atomic_dec_and_test(&team->mcast_rejoin.count_pending)) ++ if (val) + schedule_delayed_work(&team->mcast_rejoin.dw, + msecs_to_jiffies(team->mcast_rejoin.interval)); + } diff --git a/queue-3.18/tg3-tg3_disable_ints-using-uninitialized-mailbox-value-to-disable-interrupts.patch b/queue-3.18/tg3-tg3_disable_ints-using-uninitialized-mailbox-value-to-disable-interrupts.patch new file mode 100644 index 00000000000..601080cd3d7 --- /dev/null +++ b/queue-3.18/tg3-tg3_disable_ints-using-uninitialized-mailbox-value-to-disable-interrupts.patch @@ -0,0 +1,87 @@ +From foo@baz Sat Jan 17 18:12:21 PST 2015 +From: Prashant Sreedharan +Date: Sat, 20 Dec 2014 12:16:17 -0800 +Subject: tg3: tg3_disable_ints using uninitialized mailbox value to disable interrupts +
+From: Prashant Sreedharan +
+[ Upstream commit 05b0aa579397b734f127af58e401a30784a1e315 ] +
+During driver load in tg3_init_one, if the driver detects DMA activity before
+initializing the chip, tg3_halt is called. As part of tg3_halt interrupts are
+disabled using routine tg3_disable_ints. This routine was using mailbox value
+which was not initialized (default value is 0). As a result driver was writing
+0x00000001 to pci config space register 0, which is the vendor id / device id. +
+This driver bug was exposed because of the commit a7877b17a667 (PCI: Check only
+the Vendor ID to identify Configuration Request Retry). Also this issue is only
+seen in older generation chipsets like 5722 because config space write to offset
+0 from driver is possible. The newer generation chips ignore writes to offset 0.
+Also without commit a7877b17a667, for these older chips when a GRC reset is
+issued the Bootcode would reprogram the vendor id/device id, which is the reason
+this bug was masked earlier. +
+Fixed by initializing the interrupt mailbox registers before calling tg3_halt. +
+Please queue for -stable. +
+Reported-by: Nils Holland +Reported-by: Marcelo Ricardo Leitner +Signed-off-by: Prashant Sreedharan +Signed-off-by: Michael Chan +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/broadcom/tg3.c | 34 +++++++++++++++++----------------- + 1 file changed, 17 insertions(+), 17 deletions(-) + +--- a/drivers/net/ethernet/broadcom/tg3.c ++++ b/drivers/net/ethernet/broadcom/tg3.c +@@ -17789,23 +17789,6 @@ static int tg3_init_one(struct pci_dev * + goto err_out_apeunmap; + } + +- /* +- * Reset chip in case UNDI or EFI driver did not shutdown +- * DMA self test will enable WDMAC and we'll see (spurious) +- * pending DMA on the PCI bus at that point. +- */ +- if ((tr32(HOSTCC_MODE) & HOSTCC_MODE_ENABLE) || +- (tr32(WDMAC_MODE) & WDMAC_MODE_ENABLE)) { +- tw32(MEMARB_MODE, MEMARB_MODE_ENABLE); +- tg3_halt(tp, RESET_KIND_SHUTDOWN, 1); +- } +- +- err = tg3_test_dma(tp); +- if (err) { +- dev_err(&pdev->dev, "DMA engine test failed, aborting\n"); +- goto err_out_apeunmap; +- } +- + intmbx = MAILBOX_INTERRUPT_0 + TG3_64BIT_REG_LOW; + rcvmbx = MAILBOX_RCVRET_CON_IDX_0 + TG3_64BIT_REG_LOW; + sndmbx = MAILBOX_SNDHOST_PROD_IDX_0 + TG3_64BIT_REG_LOW; +@@ -17850,6 +17833,23 @@ static int tg3_init_one(struct pci_dev * + sndmbx += 0xc; + } + ++ /* ++ * Reset chip in case UNDI or EFI driver did not shutdown ++ * DMA self test will enable WDMAC and we'll see (spurious) ++ * pending DMA on the PCI bus at that point. ++ */ ++ if ((tr32(HOSTCC_MODE) & HOSTCC_MODE_ENABLE) || ++ (tr32(WDMAC_MODE) & WDMAC_MODE_ENABLE)) { ++ tw32(MEMARB_MODE, MEMARB_MODE_ENABLE); ++ tg3_halt(tp, RESET_KIND_SHUTDOWN, 1); ++ } ++ ++ err = tg3_test_dma(tp); ++ if (err) { ++ dev_err(&pdev->dev, "DMA engine test failed, aborting\n"); ++ goto err_out_apeunmap; ++ } ++ + tg3_init_coal(tp); + + pci_set_drvdata(pdev, dev); diff --git a/queue-3.18/xen-netback-fixing-the-propagation-of-the-transmit-shaper-timeout.patch b/queue-3.18/xen-netback-fixing-the-propagation-of-the-transmit-shaper-timeout.patch new file mode 100644 index 00000000000..4e6122880e5 --- /dev/null +++ b/queue-3.18/xen-netback-fixing-the-propagation-of-the-transmit-shaper-timeout.patch @@ -0,0 +1,34 @@ +From foo@baz Sat Jan 17 18:12:21 PST 2015 +From: "Palik, Imre" +Date: Tue, 6 Jan 2015 16:44:44 +0100 +Subject: xen-netback: fixing the propagation of the transmit shaper timeout +
+From: "Palik, Imre" +
+[ Upstream commit 07ff890daeda31cf23173865edf50bcb03e100c3 ] +
+Since e9ce7cb6b107 ("xen-netback: Factor queue-specific data into queue struct"),
+the transmit shaper timeout is always set to 0. The value the user sets via
+xenbus is never propagated to the transmit shaper. +
+This patch fixes the issue. +
+Cc: Anthony Liguori +Signed-off-by: Imre Palik +Acked-by: Ian Campbell +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/xen-netback/xenbus.c | 1 + + 1 file changed, 1 insertion(+) + +--- a/drivers/net/xen-netback/xenbus.c ++++ b/drivers/net/xen-netback/xenbus.c +@@ -736,6 +736,7 @@ static void connect(struct backend_info + } + + queue->remaining_credit = credit_bytes; ++ queue->credit_usec = credit_usec; + + err = connect_rings(be, queue); + if (err) { diff --git a/queue-3.18/xen-netback-support-frontends-without-feature-rx-notify-again.patch b/queue-3.18/xen-netback-support-frontends-without-feature-rx-notify-again.patch new file mode 100644 index 00000000000..37833e88e96 --- /dev/null +++ b/queue-3.18/xen-netback-support-frontends-without-feature-rx-notify-again.patch @@ -0,0 +1,180 @@ +From foo@baz Sat Jan 17 18:12:21 PST 2015 +From: David Vrabel +Date: Thu, 18 Dec 2014 11:13:06 +0000 +Subject: xen-netback: support frontends without feature-rx-notify again +
+From: David Vrabel +
+[ Upstream commit 26c0e102585d5a4d311f5d6eb7f524d288e7f6b7 ] +
+Commit bc96f648df1bbc2729abbb84513cf4f64273a1f1 (xen-netback: make
+feature-rx-notify mandatory) incorrectly assumed that there were no
+frontends in use that did not support this feature. But the frontend
+driver in MiniOS does not and since this is used by (qemu) stubdoms,
+these stopped working. +
+Netback sort of works as-is in this mode except: +
+- If there are no Rx requests and the internal Rx queue fills, only
+  the drain timeout will wake the thread. The default drain timeout
+  of 10 s would give unacceptable pauses. +
+- If an Rx stall was detected and the internal Rx queue is drained,
+  then the Rx thread would never wake. +
+Handle these two cases (when feature-rx-notify is disabled) by: +
+- Reducing the drain timeout to 30 ms. +
+- Disabling Rx stall detection. +
+Reported-by: John +Tested-by: John +Signed-off-by: David Vrabel +Reviewed-by: Wei Liu +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/xen-netback/common.h | 4 +++- + drivers/net/xen-netback/interface.c | 4 +++- + drivers/net/xen-netback/netback.c | 27 ++++++++++++++------------- + drivers/net/xen-netback/xenbus.c | 12 +++++++++--- + 4 files changed, 29 insertions(+), 18 deletions(-) + +--- a/drivers/net/xen-netback/common.h ++++ b/drivers/net/xen-netback/common.h +@@ -230,6 +230,8 @@ struct xenvif { + */ + bool disabled; + unsigned long status; ++ unsigned long drain_timeout; ++ unsigned long stall_timeout; + + /* Queues */ + struct xenvif_queue *queues; +@@ -328,7 +330,7 @@ irqreturn_t xenvif_interrupt(int irq, vo + extern bool separate_tx_rx_irq; + + extern unsigned int rx_drain_timeout_msecs; +-extern unsigned int rx_drain_timeout_jiffies; ++extern unsigned int rx_stall_timeout_msecs; + extern unsigned int xenvif_max_queues; + + #ifdef CONFIG_DEBUG_FS +--- a/drivers/net/xen-netback/interface.c ++++ b/drivers/net/xen-netback/interface.c +@@ -166,7 +166,7 @@ static int xenvif_start_xmit(struct sk_b + goto drop; + + cb = XENVIF_RX_CB(skb); +- cb->expires = jiffies + rx_drain_timeout_jiffies; ++ cb->expires = jiffies + vif->drain_timeout; + + xenvif_rx_queue_tail(queue, skb); + xenvif_kick_thread(queue); +@@ -414,6 +414,8 @@ struct xenvif *xenvif_alloc(struct devic + vif->ip_csum = 1; + vif->dev = dev; + vif->disabled = false; ++ vif->drain_timeout = msecs_to_jiffies(rx_drain_timeout_msecs); ++ vif->stall_timeout = msecs_to_jiffies(rx_stall_timeout_msecs); + + /* Start out with no queues. 
*/ + vif->queues = NULL; +--- a/drivers/net/xen-netback/netback.c ++++ b/drivers/net/xen-netback/netback.c +@@ -60,14 +60,12 @@ module_param(separate_tx_rx_irq, bool, 0 + */ + unsigned int rx_drain_timeout_msecs = 10000; + module_param(rx_drain_timeout_msecs, uint, 0444); +-unsigned int rx_drain_timeout_jiffies; + + /* The length of time before the frontend is considered unresponsive + * because it isn't providing Rx slots. + */ +-static unsigned int rx_stall_timeout_msecs = 60000; ++unsigned int rx_stall_timeout_msecs = 60000; + module_param(rx_stall_timeout_msecs, uint, 0444); +-static unsigned int rx_stall_timeout_jiffies; + + unsigned int xenvif_max_queues; + module_param_named(max_queues, xenvif_max_queues, uint, 0644); +@@ -2022,7 +2020,7 @@ static bool xenvif_rx_queue_stalled(stru + return !queue->stalled + && prod - cons < XEN_NETBK_RX_SLOTS_MAX + && time_after(jiffies, +- queue->last_rx_time + rx_stall_timeout_jiffies); ++ queue->last_rx_time + queue->vif->stall_timeout); + } + + static bool xenvif_rx_queue_ready(struct xenvif_queue *queue) +@@ -2040,8 +2038,9 @@ static bool xenvif_have_rx_work(struct x + { + return (!skb_queue_empty(&queue->rx_queue) + && xenvif_rx_ring_slots_available(queue, XEN_NETBK_RX_SLOTS_MAX)) +- || xenvif_rx_queue_stalled(queue) +- || xenvif_rx_queue_ready(queue) ++ || (queue->vif->stall_timeout && ++ (xenvif_rx_queue_stalled(queue) ++ || xenvif_rx_queue_ready(queue))) + || kthread_should_stop() + || queue->vif->disabled; + } +@@ -2094,6 +2093,9 @@ int xenvif_kthread_guest_rx(void *data) + struct xenvif_queue *queue = data; + struct xenvif *vif = queue->vif; + ++ if (!vif->stall_timeout) ++ xenvif_queue_carrier_on(queue); ++ + for (;;) { + xenvif_wait_for_rx_work(queue); + +@@ -2120,10 +2122,12 @@ int xenvif_kthread_guest_rx(void *data) + * while it's probably not responsive, drop the + * carrier so packets are dropped earlier. + */ +- if (xenvif_rx_queue_stalled(queue)) +- xenvif_queue_carrier_off(queue); +- else if (xenvif_rx_queue_ready(queue)) +- xenvif_queue_carrier_on(queue); ++ if (vif->stall_timeout) { ++ if (xenvif_rx_queue_stalled(queue)) ++ xenvif_queue_carrier_off(queue); ++ else if (xenvif_rx_queue_ready(queue)) ++ xenvif_queue_carrier_on(queue); ++ } + + /* Queued packets may have foreign pages from other + * domains. These cannot be queued indefinitely as +@@ -2194,9 +2198,6 @@ static int __init netback_init(void) + if (rc) + goto failed_init; + +- rx_drain_timeout_jiffies = msecs_to_jiffies(rx_drain_timeout_msecs); +- rx_stall_timeout_jiffies = msecs_to_jiffies(rx_stall_timeout_msecs); +- + #ifdef CONFIG_DEBUG_FS + xen_netback_dbg_root = debugfs_create_dir("xen-netback", NULL); + if (IS_ERR_OR_NULL(xen_netback_dbg_root)) +--- a/drivers/net/xen-netback/xenbus.c ++++ b/drivers/net/xen-netback/xenbus.c +@@ -886,9 +886,15 @@ static int read_xenbus_vif_flags(struct + return -EOPNOTSUPP; + + if (xenbus_scanf(XBT_NIL, dev->otherend, +- "feature-rx-notify", "%d", &val) < 0 || val == 0) { +- xenbus_dev_fatal(dev, -EINVAL, "feature-rx-notify is mandatory"); +- return -EINVAL; ++ "feature-rx-notify", "%d", &val) < 0) ++ val = 0; ++ if (!val) { ++ /* - Reduce drain timeout to poll more frequently for ++ * Rx requests. ++ * - Disable Rx stall detection. ++ */ ++ be->vif->drain_timeout = msecs_to_jiffies(30); ++ be->vif->stall_timeout = 0; + } + + if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-sg", -- 2.47.3