From: Greg Kroah-Hartman Date: Thu, 10 Nov 2016 15:47:16 +0000 (+0100) Subject: 4.8-stable patches X-Git-Tag: v4.4.32~5 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=8cf12f76679941d3a395680f9762a8e8da6ae2b6;p=thirdparty%2Fkernel%2Fstable-queue.git 4.8-stable patches added patches: arch-powerpc-update-parameters-for-csum_tcpudp_magic-csum_tcpudp_nofold.patch bridge-multicast-restore-perm-router-ports-on-multicast-enable.patch drivers-ptp-fix-kernel-memory-disclosure.patch ib-ipoib-move-back-ib-ll-address-into-the-hard-header.patch ip6_tunnel-fix-ip6_tnl_lookup.patch ip6_tunnel-update-skb-protocol-to-eth_p_ipv6-in-ip6_tnl_xmit.patch ipv4-disable-bh-in-set_ping_group_range.patch ipv4-use-the-right-lock-for-ping_group_range.patch ipv6-correctly-add-local-routes-when-lo-goes-up.patch ipv6-tcp-restore-ip6cb-for-pktoptions-skbs.patch macsec-fix-header-length-if-sci-is-added-if-explicitly-disabled.patch net-add-netdev-all_adj_list-refcnt-propagation-to-fix-panic.patch net-add-recursion-limit-to-gro.patch net-core-correctly-iterate-over-lower-adjacency-list.patch net-fec-call-swap_buffer-prior-to-ip-header-alignment.patch net-fec-set-mac-address-unconditionally.patch net-ipv6-do-not-consider-link-state-for-nexthop-validation.patch net-mlx4_en-fixup-xdp-tx-irq-to-match-rx.patch net-phy-trigger-state-machine-on-state-change-and-not-polling.patch net-pktgen-fix-pkt_size.patch net-pktgen-remove-rcu-locking-in-pktgen_change_name.patch net-sched-act_vlan-push-skb-data-to-mac_header-prior-calling-skb_vlan_-functions.patch net-sched-filters-fix-notification-of-filter-delete-with-proper-handle.patch net-sctp-forbid-negative-length.patch net_sched-reorder-pernet-ops-and-act-ops-registrations.patch netlink-do-not-enter-direct-reclaim-from-netlink_dump.patch netvsc-fix-incorrect-receive-checksum-offloading.patch packet-call-fanout_release-while-unregistering-a-netdev.patch packet-on-direct_xmit-limit-tso-and-csum-to-supported-devices.patch rtnetlink-add-rtnexthop-offload-flag-to-compare-mask.patch sctp-fix-the-panic-caused-by-route-update.patch sctp-validate-chunk-len-before-actually-using-it.patch switchdev-execute-bridge-ndos-only-for-bridge-ports.patch udp-fix-ip_checksum-handling.patch --- diff --git a/queue-4.8/arch-powerpc-update-parameters-for-csum_tcpudp_magic-csum_tcpudp_nofold.patch b/queue-4.8/arch-powerpc-update-parameters-for-csum_tcpudp_magic-csum_tcpudp_nofold.patch new file mode 100644 index 00000000000..329510c6cb8 --- /dev/null +++ b/queue-4.8/arch-powerpc-update-parameters-for-csum_tcpudp_magic-csum_tcpudp_nofold.patch @@ -0,0 +1,52 @@ +From foo@baz Thu Nov 10 16:43:03 CET 2016 +From: Ivan Vecera +Date: Thu, 27 Oct 2016 16:30:06 +0200 +Subject: arch/powerpc: Update parameters for csum_tcpudp_magic & csum_tcpudp_nofold + +From: Ivan Vecera + + +[ Upstream commit f9d4286b9516b02e795214412d36885f572b57ad ] + +Commit 01cfbad "ipv4: Update parameters for csum_tcpudp_magic to their +original types" changed parameters for csum_tcpudp_magic and +csum_tcpudp_nofold for many platforms but not for PowerPC. + +Fixes: 01cfbad "ipv4: Update parameters for csum_tcpudp_magic to their original types" +Cc: Alexander Duyck +Signed-off-by: Ivan Vecera +Acked-by: Alexander Duyck +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + arch/powerpc/include/asm/checksum.h | 12 ++++-------- + 1 file changed, 4 insertions(+), 8 deletions(-) + +--- a/arch/powerpc/include/asm/checksum.h ++++ b/arch/powerpc/include/asm/checksum.h +@@ -53,10 +53,8 @@ static inline __sum16 csum_fold(__wsum s + return (__force __sum16)(~((__force u32)sum + tmp) >> 16); + } + +-static inline __wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr, +- unsigned short len, +- unsigned short proto, +- __wsum sum) ++static inline __wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr, __u32 len, ++ __u8 proto, __wsum sum) + { + #ifdef __powerpc64__ + unsigned long s = (__force u32)sum; +@@ -83,10 +81,8 @@ static inline __wsum csum_tcpudp_nofold( + * computes the checksum of the TCP/UDP pseudo-header + * returns a 16-bit checksum, already complemented + */ +-static inline __sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr, +- unsigned short len, +- unsigned short proto, +- __wsum sum) ++static inline __sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr, __u32 len, ++ __u8 proto, __wsum sum) + { + return csum_fold(csum_tcpudp_nofold(saddr, daddr, len, proto, sum)); + } diff --git a/queue-4.8/bridge-multicast-restore-perm-router-ports-on-multicast-enable.patch b/queue-4.8/bridge-multicast-restore-perm-router-ports-on-multicast-enable.patch new file mode 100644 index 00000000000..8fca7cc1623 --- /dev/null +++ b/queue-4.8/bridge-multicast-restore-perm-router-ports-on-multicast-enable.patch @@ -0,0 +1,115 @@ +From foo@baz Thu Nov 10 16:43:03 CET 2016 +From: Nikolay Aleksandrov +Date: Tue, 18 Oct 2016 18:09:48 +0200 +Subject: bridge: multicast: restore perm router ports on multicast enable + +From: Nikolay Aleksandrov + + +[ Upstream commit 7cb3f9214dfa443c1ccc2be637dcc6344cc203f0 ] + +Satish reported a problem with the perm multicast router ports not getting +reenabled after some series of events, in particular if it happens that the +multicast snooping has been disabled and the port goes to disabled state +then it will be deleted from the router port list, but if it moves into +non-disabled state it will not be re-added because the mcast snooping is +still disabled, and enabling snooping later does nothing. + +Here are the steps to reproduce, setup br0 with snooping enabled and eth1 +added as a perm router (multicast_router = 2): +1. $ echo 0 > /sys/class/net/br0/bridge/multicast_snooping +2. $ ip l set eth1 down +^ This step deletes the interface from the router list +3. $ ip l set eth1 up +^ This step does not add it again because mcast snooping is disabled +4. $ echo 1 > /sys/class/net/br0/bridge/multicast_snooping +5. $ bridge -d -s mdb show + + +At this point we have mcast enabled and eth1 as a perm router (value = 2) +but it is not in the router list which is incorrect. + +After this change: +1. $ echo 0 > /sys/class/net/br0/bridge/multicast_snooping +2. $ ip l set eth1 down +^ This step deletes the interface from the router list +3. $ ip l set eth1 up +^ This step does not add it again because mcast snooping is disabled +4. $ echo 1 > /sys/class/net/br0/bridge/multicast_snooping +5. $ bridge -d -s mdb show +router ports on br0: eth1 + +Note: we can directly do br_multicast_enable_port for all because the +querier timer already has checks for the port state and will simply +expire if it's in blocking/disabled. 
See the comment added by +commit 9aa66382163e7 ("bridge: multicast: add a comment to +br_port_state_selection about blocking state") + +Fixes: 561f1103a2b7 ("bridge: Add multicast_snooping sysfs toggle") +Reported-by: Satish Ashok +Signed-off-by: Nikolay Aleksandrov +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/bridge/br_multicast.c | 23 ++++++++++++++--------- + 1 file changed, 14 insertions(+), 9 deletions(-) + +--- a/net/bridge/br_multicast.c ++++ b/net/bridge/br_multicast.c +@@ -972,13 +972,12 @@ static void br_multicast_enable(struct b + mod_timer(&query->timer, jiffies); + } + +-void br_multicast_enable_port(struct net_bridge_port *port) ++static void __br_multicast_enable_port(struct net_bridge_port *port) + { + struct net_bridge *br = port->br; + +- spin_lock(&br->multicast_lock); + if (br->multicast_disabled || !netif_running(br->dev)) +- goto out; ++ return; + + br_multicast_enable(&port->ip4_own_query); + #if IS_ENABLED(CONFIG_IPV6) +@@ -987,8 +986,14 @@ void br_multicast_enable_port(struct net + if (port->multicast_router == MDB_RTR_TYPE_PERM && + hlist_unhashed(&port->rlist)) + br_multicast_add_router(br, port); ++} + +-out: ++void br_multicast_enable_port(struct net_bridge_port *port) ++{ ++ struct net_bridge *br = port->br; ++ ++ spin_lock(&br->multicast_lock); ++ __br_multicast_enable_port(port); + spin_unlock(&br->multicast_lock); + } + +@@ -1994,8 +1999,9 @@ static void br_multicast_start_querier(s + + int br_multicast_toggle(struct net_bridge *br, unsigned long val) + { +- int err = 0; + struct net_bridge_mdb_htable *mdb; ++ struct net_bridge_port *port; ++ int err = 0; + + spin_lock_bh(&br->multicast_lock); + if (br->multicast_disabled == !val) +@@ -2023,10 +2029,9 @@ rollback: + goto rollback; + } + +- br_multicast_start_querier(br, &br->ip4_own_query); +-#if IS_ENABLED(CONFIG_IPV6) +- br_multicast_start_querier(br, &br->ip6_own_query); +-#endif ++ br_multicast_open(br); ++ list_for_each_entry(port, &br->port_list, list) ++ __br_multicast_enable_port(port); + + unlock: + spin_unlock_bh(&br->multicast_lock); diff --git a/queue-4.8/drivers-ptp-fix-kernel-memory-disclosure.patch b/queue-4.8/drivers-ptp-fix-kernel-memory-disclosure.patch new file mode 100644 index 00000000000..3214f066058 --- /dev/null +++ b/queue-4.8/drivers-ptp-fix-kernel-memory-disclosure.patch @@ -0,0 +1,32 @@ +From foo@baz Thu Nov 10 16:43:03 CET 2016 +From: Vlad Tsyrklevich +Date: Tue, 11 Oct 2016 15:02:47 +0200 +Subject: drivers/ptp: Fix kernel memory disclosure + +From: Vlad Tsyrklevich + + +[ Upstream commit 02a9079c66341836c4914c33c06a73245060df2e ] + +The reserved field precise_offset->rsv is not cleared before being +copied to user space, leaking kernel stack memory. Clear the struct +before it's copied. + +Signed-off-by: Vlad Tsyrklevich +Acked-by: Richard Cochran +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/ptp/ptp_chardev.c | 1 + + 1 file changed, 1 insertion(+) + +--- a/drivers/ptp/ptp_chardev.c ++++ b/drivers/ptp/ptp_chardev.c +@@ -193,6 +193,7 @@ long ptp_ioctl(struct posix_clock *pc, u + if (err) + break; + ++ memset(&precise_offset, 0, sizeof(precise_offset)); + ts = ktime_to_timespec64(xtstamp.device); + precise_offset.device.sec = ts.tv_sec; + precise_offset.device.nsec = ts.tv_nsec; diff --git a/queue-4.8/ib-ipoib-move-back-ib-ll-address-into-the-hard-header.patch b/queue-4.8/ib-ipoib-move-back-ib-ll-address-into-the-hard-header.patch new file mode 100644 index 00000000000..889e65a10ba --- /dev/null +++ b/queue-4.8/ib-ipoib-move-back-ib-ll-address-into-the-hard-header.patch @@ -0,0 +1,353 @@ +From foo@baz Thu Nov 10 16:43:03 CET 2016 +From: Paolo Abeni +Date: Thu, 13 Oct 2016 18:26:56 +0200 +Subject: IB/ipoib: move back IB LL address into the hard header + +From: Paolo Abeni + + +[ Upstream commit fc791b6335152c5278dc4a4991bcb2d329f806f9 ] + +After the commit 9207f9d45b0a ("net: preserve IP control block +during GSO segmentation"), the GSO CB and the IPoIB CB conflict. +That destroy the IPoIB address information cached there, +causing a severe performance regression, as better described here: + +http://marc.info/?l=linux-kernel&m=146787279825501&w=2 + +This change moves the data cached by the IPoIB driver from the +skb control lock into the IPoIB hard header, as done before +the commit 936d7de3d736 ("IPoIB: Stop lying about hard_header_len +and use skb->cb to stash LL addresses"). +In order to avoid GRO issue, on packet reception, the IPoIB driver +stash into the skb a dummy pseudo header, so that the received +packets have actually a hard header matching the declared length. +To avoid changing the connected mode maximum mtu, the allocated +head buffer size is increased by the pseudo header length. + +After this commit, IPoIB performances are back to pre-regression +value. + +v2 -> v3: rebased +v1 -> v2: avoid changing the max mtu, increasing the head buf size + +Fixes: 9207f9d45b0a ("net: preserve IP control block during GSO segmentation") +Signed-off-by: Paolo Abeni +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/infiniband/ulp/ipoib/ipoib.h | 20 ++++++--- + drivers/infiniband/ulp/ipoib/ipoib_cm.c | 15 +++--- + drivers/infiniband/ulp/ipoib/ipoib_ib.c | 12 ++--- + drivers/infiniband/ulp/ipoib/ipoib_main.c | 54 +++++++++++++++---------- + drivers/infiniband/ulp/ipoib/ipoib_multicast.c | 6 +- + 5 files changed, 64 insertions(+), 43 deletions(-) + +--- a/drivers/infiniband/ulp/ipoib/ipoib.h ++++ b/drivers/infiniband/ulp/ipoib/ipoib.h +@@ -63,6 +63,8 @@ enum ipoib_flush_level { + + enum { + IPOIB_ENCAP_LEN = 4, ++ IPOIB_PSEUDO_LEN = 20, ++ IPOIB_HARD_LEN = IPOIB_ENCAP_LEN + IPOIB_PSEUDO_LEN, + + IPOIB_UD_HEAD_SIZE = IB_GRH_BYTES + IPOIB_ENCAP_LEN, + IPOIB_UD_RX_SG = 2, /* max buffer needed for 4K mtu */ +@@ -134,15 +136,21 @@ struct ipoib_header { + u16 reserved; + }; + +-struct ipoib_cb { +- struct qdisc_skb_cb qdisc_cb; +- u8 hwaddr[INFINIBAND_ALEN]; ++struct ipoib_pseudo_header { ++ u8 hwaddr[INFINIBAND_ALEN]; + }; + +-static inline struct ipoib_cb *ipoib_skb_cb(const struct sk_buff *skb) ++static inline void skb_add_pseudo_hdr(struct sk_buff *skb) + { +- BUILD_BUG_ON(sizeof(skb->cb) < sizeof(struct ipoib_cb)); +- return (struct ipoib_cb *)skb->cb; ++ char *data = skb_push(skb, IPOIB_PSEUDO_LEN); ++ ++ /* ++ * only the ipoib header is present now, make room for a dummy ++ * pseudo header and set skb field accordingly ++ */ ++ memset(data, 0, IPOIB_PSEUDO_LEN); ++ skb_reset_mac_header(skb); ++ skb_pull(skb, IPOIB_HARD_LEN); + } + + /* Used for all multicast joins (broadcast, IPv4 mcast and IPv6 mcast) */ +--- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c ++++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c +@@ -63,6 +63,8 @@ MODULE_PARM_DESC(cm_data_debug_level, + #define IPOIB_CM_RX_DELAY (3 * 256 * HZ) + #define IPOIB_CM_RX_UPDATE_MASK (0x3) + ++#define IPOIB_CM_RX_RESERVE (ALIGN(IPOIB_HARD_LEN, 16) - IPOIB_ENCAP_LEN) ++ + static struct ib_qp_attr ipoib_cm_err_attr = { + .qp_state = IB_QPS_ERR + }; +@@ -146,15 +148,15 @@ static struct sk_buff *ipoib_cm_alloc_rx + struct sk_buff *skb; + int i; + +- skb = dev_alloc_skb(IPOIB_CM_HEAD_SIZE + 12); ++ skb = dev_alloc_skb(ALIGN(IPOIB_CM_HEAD_SIZE + IPOIB_PSEUDO_LEN, 16)); + if (unlikely(!skb)) + return NULL; + + /* +- * IPoIB adds a 4 byte header. So we need 12 more bytes to align the ++ * IPoIB adds a IPOIB_ENCAP_LEN byte header, this will align the + * IP header to a multiple of 16. 
+ */ +- skb_reserve(skb, 12); ++ skb_reserve(skb, IPOIB_CM_RX_RESERVE); + + mapping[0] = ib_dma_map_single(priv->ca, skb->data, IPOIB_CM_HEAD_SIZE, + DMA_FROM_DEVICE); +@@ -624,9 +626,9 @@ void ipoib_cm_handle_rx_wc(struct net_de + if (wc->byte_len < IPOIB_CM_COPYBREAK) { + int dlen = wc->byte_len; + +- small_skb = dev_alloc_skb(dlen + 12); ++ small_skb = dev_alloc_skb(dlen + IPOIB_CM_RX_RESERVE); + if (small_skb) { +- skb_reserve(small_skb, 12); ++ skb_reserve(small_skb, IPOIB_CM_RX_RESERVE); + ib_dma_sync_single_for_cpu(priv->ca, rx_ring[wr_id].mapping[0], + dlen, DMA_FROM_DEVICE); + skb_copy_from_linear_data(skb, small_skb->data, dlen); +@@ -663,8 +665,7 @@ void ipoib_cm_handle_rx_wc(struct net_de + + copied: + skb->protocol = ((struct ipoib_header *) skb->data)->proto; +- skb_reset_mac_header(skb); +- skb_pull(skb, IPOIB_ENCAP_LEN); ++ skb_add_pseudo_hdr(skb); + + ++dev->stats.rx_packets; + dev->stats.rx_bytes += skb->len; +--- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c ++++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c +@@ -128,16 +128,15 @@ static struct sk_buff *ipoib_alloc_rx_sk + + buf_size = IPOIB_UD_BUF_SIZE(priv->max_ib_mtu); + +- skb = dev_alloc_skb(buf_size + IPOIB_ENCAP_LEN); ++ skb = dev_alloc_skb(buf_size + IPOIB_HARD_LEN); + if (unlikely(!skb)) + return NULL; + + /* +- * IB will leave a 40 byte gap for a GRH and IPoIB adds a 4 byte +- * header. So we need 4 more bytes to get to 48 and align the +- * IP header to a multiple of 16. ++ * the IP header will be at IPOIP_HARD_LEN + IB_GRH_BYTES, that is ++ * 64 bytes aligned + */ +- skb_reserve(skb, 4); ++ skb_reserve(skb, sizeof(struct ipoib_pseudo_header)); + + mapping = priv->rx_ring[id].mapping; + mapping[0] = ib_dma_map_single(priv->ca, skb->data, buf_size, +@@ -253,8 +252,7 @@ static void ipoib_ib_handle_rx_wc(struct + skb_pull(skb, IB_GRH_BYTES); + + skb->protocol = ((struct ipoib_header *) skb->data)->proto; +- skb_reset_mac_header(skb); +- skb_pull(skb, IPOIB_ENCAP_LEN); ++ skb_add_pseudo_hdr(skb); + + ++dev->stats.rx_packets; + dev->stats.rx_bytes += skb->len; +--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c ++++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c +@@ -925,9 +925,12 @@ static void neigh_add_path(struct sk_buf + ipoib_neigh_free(neigh); + goto err_drop; + } +- if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) ++ if (skb_queue_len(&neigh->queue) < ++ IPOIB_MAX_PATH_REC_QUEUE) { ++ /* put pseudoheader back on for next time */ ++ skb_push(skb, IPOIB_PSEUDO_LEN); + __skb_queue_tail(&neigh->queue, skb); +- else { ++ } else { + ipoib_warn(priv, "queue length limit %d. 
Packet drop.\n", + skb_queue_len(&neigh->queue)); + goto err_drop; +@@ -964,7 +967,7 @@ err_drop: + } + + static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev, +- struct ipoib_cb *cb) ++ struct ipoib_pseudo_header *phdr) + { + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_path *path; +@@ -972,16 +975,18 @@ static void unicast_arp_send(struct sk_b + + spin_lock_irqsave(&priv->lock, flags); + +- path = __path_find(dev, cb->hwaddr + 4); ++ path = __path_find(dev, phdr->hwaddr + 4); + if (!path || !path->valid) { + int new_path = 0; + + if (!path) { +- path = path_rec_create(dev, cb->hwaddr + 4); ++ path = path_rec_create(dev, phdr->hwaddr + 4); + new_path = 1; + } + if (path) { + if (skb_queue_len(&path->queue) < IPOIB_MAX_PATH_REC_QUEUE) { ++ /* put pseudoheader back on for next time */ ++ skb_push(skb, IPOIB_PSEUDO_LEN); + __skb_queue_tail(&path->queue, skb); + } else { + ++dev->stats.tx_dropped; +@@ -1009,10 +1014,12 @@ static void unicast_arp_send(struct sk_b + be16_to_cpu(path->pathrec.dlid)); + + spin_unlock_irqrestore(&priv->lock, flags); +- ipoib_send(dev, skb, path->ah, IPOIB_QPN(cb->hwaddr)); ++ ipoib_send(dev, skb, path->ah, IPOIB_QPN(phdr->hwaddr)); + return; + } else if ((path->query || !path_rec_start(dev, path)) && + skb_queue_len(&path->queue) < IPOIB_MAX_PATH_REC_QUEUE) { ++ /* put pseudoheader back on for next time */ ++ skb_push(skb, IPOIB_PSEUDO_LEN); + __skb_queue_tail(&path->queue, skb); + } else { + ++dev->stats.tx_dropped; +@@ -1026,13 +1033,15 @@ static int ipoib_start_xmit(struct sk_bu + { + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_neigh *neigh; +- struct ipoib_cb *cb = ipoib_skb_cb(skb); ++ struct ipoib_pseudo_header *phdr; + struct ipoib_header *header; + unsigned long flags; + ++ phdr = (struct ipoib_pseudo_header *) skb->data; ++ skb_pull(skb, sizeof(*phdr)); + header = (struct ipoib_header *) skb->data; + +- if (unlikely(cb->hwaddr[4] == 0xff)) { ++ if (unlikely(phdr->hwaddr[4] == 0xff)) { + /* multicast, arrange "if" according to probability */ + if ((header->proto != htons(ETH_P_IP)) && + (header->proto != htons(ETH_P_IPV6)) && +@@ -1045,13 +1054,13 @@ static int ipoib_start_xmit(struct sk_bu + return NETDEV_TX_OK; + } + /* Add in the P_Key for multicast*/ +- cb->hwaddr[8] = (priv->pkey >> 8) & 0xff; +- cb->hwaddr[9] = priv->pkey & 0xff; ++ phdr->hwaddr[8] = (priv->pkey >> 8) & 0xff; ++ phdr->hwaddr[9] = priv->pkey & 0xff; + +- neigh = ipoib_neigh_get(dev, cb->hwaddr); ++ neigh = ipoib_neigh_get(dev, phdr->hwaddr); + if (likely(neigh)) + goto send_using_neigh; +- ipoib_mcast_send(dev, cb->hwaddr, skb); ++ ipoib_mcast_send(dev, phdr->hwaddr, skb); + return NETDEV_TX_OK; + } + +@@ -1060,16 +1069,16 @@ static int ipoib_start_xmit(struct sk_bu + case htons(ETH_P_IP): + case htons(ETH_P_IPV6): + case htons(ETH_P_TIPC): +- neigh = ipoib_neigh_get(dev, cb->hwaddr); ++ neigh = ipoib_neigh_get(dev, phdr->hwaddr); + if (unlikely(!neigh)) { +- neigh_add_path(skb, cb->hwaddr, dev); ++ neigh_add_path(skb, phdr->hwaddr, dev); + return NETDEV_TX_OK; + } + break; + case htons(ETH_P_ARP): + case htons(ETH_P_RARP): + /* for unicast ARP and RARP should always perform path find */ +- unicast_arp_send(skb, dev, cb); ++ unicast_arp_send(skb, dev, phdr); + return NETDEV_TX_OK; + default: + /* ethertype not supported by IPoIB */ +@@ -1086,11 +1095,13 @@ send_using_neigh: + goto unref; + } + } else if (neigh->ah) { +- ipoib_send(dev, skb, neigh->ah, IPOIB_QPN(cb->hwaddr)); ++ ipoib_send(dev, skb, neigh->ah, 
IPOIB_QPN(phdr->hwaddr)); + goto unref; + } + + if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) { ++ /* put pseudoheader back on for next time */ ++ skb_push(skb, sizeof(*phdr)); + spin_lock_irqsave(&priv->lock, flags); + __skb_queue_tail(&neigh->queue, skb); + spin_unlock_irqrestore(&priv->lock, flags); +@@ -1122,8 +1133,8 @@ static int ipoib_hard_header(struct sk_b + unsigned short type, + const void *daddr, const void *saddr, unsigned len) + { ++ struct ipoib_pseudo_header *phdr; + struct ipoib_header *header; +- struct ipoib_cb *cb = ipoib_skb_cb(skb); + + header = (struct ipoib_header *) skb_push(skb, sizeof *header); + +@@ -1132,12 +1143,13 @@ static int ipoib_hard_header(struct sk_b + + /* + * we don't rely on dst_entry structure, always stuff the +- * destination address into skb->cb so we can figure out where ++ * destination address into skb hard header so we can figure out where + * to send the packet later. + */ +- memcpy(cb->hwaddr, daddr, INFINIBAND_ALEN); ++ phdr = (struct ipoib_pseudo_header *) skb_push(skb, sizeof(*phdr)); ++ memcpy(phdr->hwaddr, daddr, INFINIBAND_ALEN); + +- return sizeof *header; ++ return IPOIB_HARD_LEN; + } + + static void ipoib_set_mcast_list(struct net_device *dev) +@@ -1759,7 +1771,7 @@ void ipoib_setup(struct net_device *dev) + + dev->flags |= IFF_BROADCAST | IFF_MULTICAST; + +- dev->hard_header_len = IPOIB_ENCAP_LEN; ++ dev->hard_header_len = IPOIB_HARD_LEN; + dev->addr_len = INFINIBAND_ALEN; + dev->type = ARPHRD_INFINIBAND; + dev->tx_queue_len = ipoib_sendq_size * 2; +--- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c ++++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c +@@ -796,9 +796,11 @@ void ipoib_mcast_send(struct net_device + __ipoib_mcast_add(dev, mcast); + list_add_tail(&mcast->list, &priv->multicast_list); + } +- if (skb_queue_len(&mcast->pkt_queue) < IPOIB_MAX_MCAST_QUEUE) ++ if (skb_queue_len(&mcast->pkt_queue) < IPOIB_MAX_MCAST_QUEUE) { ++ /* put pseudoheader back on for next time */ ++ skb_push(skb, sizeof(struct ipoib_pseudo_header)); + skb_queue_tail(&mcast->pkt_queue, skb); +- else { ++ } else { + ++dev->stats.tx_dropped; + dev_kfree_skb_any(skb); + } diff --git a/queue-4.8/ip6_tunnel-fix-ip6_tnl_lookup.patch b/queue-4.8/ip6_tunnel-fix-ip6_tnl_lookup.patch new file mode 100644 index 00000000000..0012d523056 --- /dev/null +++ b/queue-4.8/ip6_tunnel-fix-ip6_tnl_lookup.patch @@ -0,0 +1,47 @@ +From foo@baz Thu Nov 10 16:43:03 CET 2016 +From: Vadim Fedorenko +Date: Tue, 11 Oct 2016 22:47:20 +0300 +Subject: ip6_tunnel: fix ip6_tnl_lookup + +From: Vadim Fedorenko + + +[ Upstream commit 68d00f332e0ba7f60f212be74ede290c9f873bc5 ] + +The commit ea3dc9601bda ("ip6_tunnel: Add support for wildcard tunnel +endpoints.") introduces support for wildcards in tunnels endpoints, +but in some rare circumstances ip6_tnl_lookup selects wrong tunnel +interface relying only on source or destination address of the packet +and not checking presence of wildcard in tunnels endpoints. Later in +ip6_tnl_rcv this packets can be dicarded because of difference in +ipproto even if fallback device have proper ipproto configuration. + +This patch adds checks of wildcard endpoint in tunnel avoiding such +behavior + +Fixes: ea3dc9601bda ("ip6_tunnel: Add support for wildcard tunnel endpoints.") +Signed-off-by: Vadim Fedorenko +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv6/ip6_tunnel.c | 2 ++ + 1 file changed, 2 insertions(+) + +--- a/net/ipv6/ip6_tunnel.c ++++ b/net/ipv6/ip6_tunnel.c +@@ -155,6 +155,7 @@ ip6_tnl_lookup(struct net *net, const st + hash = HASH(&any, local); + for_each_ip6_tunnel_rcu(ip6n->tnls_r_l[hash]) { + if (ipv6_addr_equal(local, &t->parms.laddr) && ++ ipv6_addr_any(&t->parms.raddr) && + (t->dev->flags & IFF_UP)) + return t; + } +@@ -162,6 +163,7 @@ ip6_tnl_lookup(struct net *net, const st + hash = HASH(remote, &any); + for_each_ip6_tunnel_rcu(ip6n->tnls_r_l[hash]) { + if (ipv6_addr_equal(remote, &t->parms.raddr) && ++ ipv6_addr_any(&t->parms.laddr) && + (t->dev->flags & IFF_UP)) + return t; + } diff --git a/queue-4.8/ip6_tunnel-update-skb-protocol-to-eth_p_ipv6-in-ip6_tnl_xmit.patch b/queue-4.8/ip6_tunnel-update-skb-protocol-to-eth_p_ipv6-in-ip6_tnl_xmit.patch new file mode 100644 index 00000000000..cb5dad4130e --- /dev/null +++ b/queue-4.8/ip6_tunnel-update-skb-protocol-to-eth_p_ipv6-in-ip6_tnl_xmit.patch @@ -0,0 +1,36 @@ +From foo@baz Thu Nov 10 16:43:03 CET 2016 +From: Eli Cooper +Date: Wed, 26 Oct 2016 10:11:09 +0800 +Subject: ip6_tunnel: Update skb->protocol to ETH_P_IPV6 in ip6_tnl_xmit() + +From: Eli Cooper + + +[ Upstream commit ae148b085876fa771d9ef2c05f85d4b4bf09ce0d ] + +This patch updates skb->protocol to ETH_P_IPV6 in ip6_tnl_xmit() when an +IPv6 header is installed to a socket buffer. + +This is not a cosmetic change. Without updating this value, GSO packets +transmitted through an ipip6 tunnel have the protocol of ETH_P_IP and +skb_mac_gso_segment() will attempt to call gso_segment() for IPv4, +which results in the packets being dropped. + +Fixes: b8921ca83eed ("ip4ip6: Support for GSO/GRO") +Signed-off-by: Eli Cooper +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv6/ip6_tunnel.c | 1 + + 1 file changed, 1 insertion(+) + +--- a/net/ipv6/ip6_tunnel.c ++++ b/net/ipv6/ip6_tunnel.c +@@ -1134,6 +1134,7 @@ int ip6_tnl_xmit(struct sk_buff *skb, st + if (err) + return err; + ++ skb->protocol = htons(ETH_P_IPV6); + skb_push(skb, sizeof(struct ipv6hdr)); + skb_reset_network_header(skb); + ipv6h = ipv6_hdr(skb); diff --git a/queue-4.8/ipv4-disable-bh-in-set_ping_group_range.patch b/queue-4.8/ipv4-disable-bh-in-set_ping_group_range.patch new file mode 100644 index 00000000000..cee136da2bd --- /dev/null +++ b/queue-4.8/ipv4-disable-bh-in-set_ping_group_range.patch @@ -0,0 +1,38 @@ +From foo@baz Thu Nov 10 16:43:03 CET 2016 +From: Eric Dumazet +Date: Thu, 20 Oct 2016 10:26:48 -0700 +Subject: ipv4: disable BH in set_ping_group_range() + +From: Eric Dumazet + + +[ Upstream commit a681574c99be23e4d20b769bf0e543239c364af5 ] + +In commit 4ee3bd4a8c746 ("ipv4: disable BH when changing ip local port +range") Cong added BH protection in set_local_port_range() but missed +that same fix was needed in set_ping_group_range() + +Fixes: b8f1a55639e6 ("udp: Add function to make source port for UDP tunnels") +Signed-off-by: Eric Dumazet +Reported-by: Eric Salo +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv4/sysctl_net_ipv4.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/net/ipv4/sysctl_net_ipv4.c ++++ b/net/ipv4/sysctl_net_ipv4.c +@@ -109,10 +109,10 @@ static void set_ping_group_range(struct + kgid_t *data = table->data; + struct net *net = + container_of(table->data, struct net, ipv4.ping_group_range.range); +- write_seqlock(&net->ipv4.ip_local_ports.lock); ++ write_seqlock_bh(&net->ipv4.ip_local_ports.lock); + data[0] = low; + data[1] = high; +- write_sequnlock(&net->ipv4.ip_local_ports.lock); ++ write_sequnlock_bh(&net->ipv4.ip_local_ports.lock); + } + + /* Validate changes from /proc interface. */ diff --git a/queue-4.8/ipv4-use-the-right-lock-for-ping_group_range.patch b/queue-4.8/ipv4-use-the-right-lock-for-ping_group_range.patch new file mode 100644 index 00000000000..74cb8edaaa5 --- /dev/null +++ b/queue-4.8/ipv4-use-the-right-lock-for-ping_group_range.patch @@ -0,0 +1,61 @@ +From foo@baz Thu Nov 10 16:43:03 CET 2016 +From: WANG Cong +Date: Thu, 20 Oct 2016 14:19:46 -0700 +Subject: ipv4: use the right lock for ping_group_range + +From: WANG Cong + + +[ Upstream commit 396a30cce15d084b2b1a395aa6d515c3d559c674 ] + +This reverts commit a681574c99be23e4d20b769bf0e543239c364af5 +("ipv4: disable BH in set_ping_group_range()") because we never +read ping_group_range in BH context (unlike local_port_range). + +Then, since we already have a lock for ping_group_range, those +using ip_local_ports.lock for ping_group_range are clearly typos. + +We might consider to share a same lock for both ping_group_range +and local_port_range w.r.t. space saving, but that should be for +net-next. + +Fixes: a681574c99be ("ipv4: disable BH in set_ping_group_range()") +Fixes: ba6b918ab234 ("ping: move ping_group_range out of CONFIG_SYSCTL") +Cc: Eric Dumazet +Cc: Eric Salo +Signed-off-by: Cong Wang +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv4/sysctl_net_ipv4.c | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +--- a/net/ipv4/sysctl_net_ipv4.c ++++ b/net/ipv4/sysctl_net_ipv4.c +@@ -96,11 +96,11 @@ static void inet_get_ping_group_range_ta + container_of(table->data, struct net, ipv4.ping_group_range.range); + unsigned int seq; + do { +- seq = read_seqbegin(&net->ipv4.ip_local_ports.lock); ++ seq = read_seqbegin(&net->ipv4.ping_group_range.lock); + + *low = data[0]; + *high = data[1]; +- } while (read_seqretry(&net->ipv4.ip_local_ports.lock, seq)); ++ } while (read_seqretry(&net->ipv4.ping_group_range.lock, seq)); + } + + /* Update system visible IP port range */ +@@ -109,10 +109,10 @@ static void set_ping_group_range(struct + kgid_t *data = table->data; + struct net *net = + container_of(table->data, struct net, ipv4.ping_group_range.range); +- write_seqlock_bh(&net->ipv4.ip_local_ports.lock); ++ write_seqlock(&net->ipv4.ping_group_range.lock); + data[0] = low; + data[1] = high; +- write_sequnlock_bh(&net->ipv4.ip_local_ports.lock); ++ write_sequnlock(&net->ipv4.ping_group_range.lock); + } + + /* Validate changes from /proc interface. 
*/ diff --git a/queue-4.8/ipv6-correctly-add-local-routes-when-lo-goes-up.patch b/queue-4.8/ipv6-correctly-add-local-routes-when-lo-goes-up.patch new file mode 100644 index 00000000000..8517d9c0399 --- /dev/null +++ b/queue-4.8/ipv6-correctly-add-local-routes-when-lo-goes-up.patch @@ -0,0 +1,57 @@ +From foo@baz Thu Nov 10 16:43:03 CET 2016 +From: Nicolas Dichtel +Date: Wed, 12 Oct 2016 10:10:40 +0200 +Subject: ipv6: correctly add local routes when lo goes up + +From: Nicolas Dichtel + + +[ Upstream commit a220445f9f4382c36a53d8ef3e08165fa27f7e2c ] + +The goal of the patch is to fix this scenario: + ip link add dummy1 type dummy + ip link set dummy1 up + ip link set lo down ; ip link set lo up + +After that sequence, the local route to the link layer address of dummy1 is +not there anymore. + +When the loopback is set down, all local routes are deleted by +addrconf_ifdown()/rt6_ifdown(). At this time, the rt6_info entry still +exists, because the corresponding idev has a reference on it. After the rcu +grace period, dst_rcu_free() is called, and thus ___dst_free(), which will +set obsolete to DST_OBSOLETE_DEAD. + +In this case, init_loopback() is called before dst_rcu_free(), thus +obsolete is still sets to something <= 0. So, the function doesn't add the +route again. To avoid that race, let's check the rt6 refcnt instead. + +Fixes: 25fb6ca4ed9c ("net IPv6 : Fix broken IPv6 routing table after loopback down-up") +Fixes: a881ae1f625c ("ipv6: don't call addrconf_dst_alloc again when enable lo") +Fixes: 33d99113b110 ("ipv6: reallocate addrconf router for ipv6 address when lo device up") +Reported-by: Francesco Santoro +Reported-by: Samuel Gauthier +CC: Balakumaran Kannan +CC: Maruthi Thotad +CC: Sabrina Dubroca +CC: Hannes Frederic Sowa +CC: Weilong Chen +CC: Gao feng +Signed-off-by: Nicolas Dichtel +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv6/addrconf.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/net/ipv6/addrconf.c ++++ b/net/ipv6/addrconf.c +@@ -2995,7 +2995,7 @@ static void init_loopback(struct net_dev + * lo device down, release this obsolete dst and + * reallocate a new router for ifa. 
+ */ +- if (sp_ifa->rt->dst.obsolete > 0) { ++ if (!atomic_read(&sp_ifa->rt->rt6i_ref)) { + ip6_rt_put(sp_ifa->rt); + sp_ifa->rt = NULL; + } else { diff --git a/queue-4.8/ipv6-tcp-restore-ip6cb-for-pktoptions-skbs.patch b/queue-4.8/ipv6-tcp-restore-ip6cb-for-pktoptions-skbs.patch new file mode 100644 index 00000000000..3337ecaffdc --- /dev/null +++ b/queue-4.8/ipv6-tcp-restore-ip6cb-for-pktoptions-skbs.patch @@ -0,0 +1,97 @@ +From foo@baz Thu Nov 10 16:43:03 CET 2016 +From: Eric Dumazet +Date: Wed, 12 Oct 2016 19:01:45 +0200 +Subject: ipv6: tcp: restore IP6CB for pktoptions skbs + +From: Eric Dumazet + + +[ Upstream commit 8ce48623f0cf3d632e32448411feddccb693d351 ] + +Baozeng Ding reported following KASAN splat : + +BUG: KASAN: use-after-free in ip6_datagram_recv_specific_ctl+0x13f1/0x15c0 at addr ffff880029c84ec8 +Read of size 1 by task poc/25548 +Call Trace: + [] dump_stack+0x12e/0x185 /lib/dump_stack.c:15 + [< inline >] print_address_description /mm/kasan/report.c:204 + [] kasan_report_error+0x48b/0x4b0 /mm/kasan/report.c:283 + [< inline >] kasan_report /mm/kasan/report.c:303 + [] __asan_report_load1_noabort+0x3e/0x40 /mm/kasan/report.c:321 + [] ip6_datagram_recv_specific_ctl+0x13f1/0x15c0 /net/ipv6/datagram.c:687 + [] ip6_datagram_recv_ctl+0x33/0x40 + [] do_ipv6_getsockopt.isra.4+0xaec/0x2150 + [] ipv6_getsockopt+0x116/0x230 + [] tcp_getsockopt+0x82/0xd0 /net/ipv4/tcp.c:3035 + [] sock_common_getsockopt+0x95/0xd0 /net/core/sock.c:2647 + [< inline >] SYSC_getsockopt /net/socket.c:1776 + [] SyS_getsockopt+0x142/0x230 /net/socket.c:1758 + [] entry_SYSCALL_64_fastpath+0x23/0xc6 +Memory state around the buggy address: + ffff880029c84d80: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff + ffff880029c84e00: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff +> ffff880029c84e80: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff + ^ + ffff880029c84f00: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff + ffff880029c84f80: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff + +He also provided a syzkaller reproducer. + +Issue is that ip6_datagram_recv_specific_ctl() expects to find IP6CB +data that was moved at a different place in tcp_v6_rcv() + +This patch moves tcp_v6_restore_cb() up and calls it from +tcp_v6_do_rcv() when np->pktoptions is set. + +Fixes: 971f10eca186 ("tcp: better TCP_SKB_CB layout to reduce cache line misses") +Signed-off-by: Eric Dumazet +Reported-by: Baozeng Ding +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv6/tcp_ipv6.c | 20 +++++++++++--------- + 1 file changed, 11 insertions(+), 9 deletions(-) + +--- a/net/ipv6/tcp_ipv6.c ++++ b/net/ipv6/tcp_ipv6.c +@@ -1193,6 +1193,16 @@ out: + return NULL; + } + ++static void tcp_v6_restore_cb(struct sk_buff *skb) ++{ ++ /* We need to move header back to the beginning if xfrm6_policy_check() ++ * and tcp_v6_fill_cb() are going to be called again. ++ * ip6_datagram_recv_specific_ctl() also expects IP6CB to be there. ++ */ ++ memmove(IP6CB(skb), &TCP_SKB_CB(skb)->header.h6, ++ sizeof(struct inet6_skb_parm)); ++} ++ + /* The socket must have it's spinlock held when we get + * here, unless it is a TCP_LISTEN socket. 
+ * +@@ -1322,6 +1332,7 @@ ipv6_pktoptions: + np->flow_label = ip6_flowlabel(ipv6_hdr(opt_skb)); + if (ipv6_opt_accepted(sk, opt_skb, &TCP_SKB_CB(opt_skb)->header.h6)) { + skb_set_owner_r(opt_skb, sk); ++ tcp_v6_restore_cb(opt_skb); + opt_skb = xchg(&np->pktoptions, opt_skb); + } else { + __kfree_skb(opt_skb); +@@ -1355,15 +1366,6 @@ static void tcp_v6_fill_cb(struct sk_buf + TCP_SKB_CB(skb)->sacked = 0; + } + +-static void tcp_v6_restore_cb(struct sk_buff *skb) +-{ +- /* We need to move header back to the beginning if xfrm6_policy_check() +- * and tcp_v6_fill_cb() are going to be called again. +- */ +- memmove(IP6CB(skb), &TCP_SKB_CB(skb)->header.h6, +- sizeof(struct inet6_skb_parm)); +-} +- + static int tcp_v6_rcv(struct sk_buff *skb) + { + const struct tcphdr *th; diff --git a/queue-4.8/macsec-fix-header-length-if-sci-is-added-if-explicitly-disabled.patch b/queue-4.8/macsec-fix-header-length-if-sci-is-added-if-explicitly-disabled.patch new file mode 100644 index 00000000000..d0151ab125c --- /dev/null +++ b/queue-4.8/macsec-fix-header-length-if-sci-is-added-if-explicitly-disabled.patch @@ -0,0 +1,105 @@ +From foo@baz Thu Nov 10 16:43:03 CET 2016 +From: Tobias Brunner +Date: Mon, 24 Oct 2016 15:44:26 +0200 +Subject: macsec: Fix header length if SCI is added if explicitly disabled + +From: Tobias Brunner + + +[ Upstream commit e0f841f5cbf2a195c63f3441f3d8ef1cd2bdeeed ] + +Even if sending SCIs is explicitly disabled, the code that creates the +Security Tag might still decide to add it (e.g. if multiple RX SCs are +defined on the MACsec interface). +But because the header length so far only depended on the configuration +option the SCI overwrote the original frame's contents (EtherType and +e.g. the beginning of the IP header) and if encrypted did not visibly +end up in the packet, while the SC flag in the TCI field of the Security +Tag was still set, resulting in invalid MACsec frames. + +Fixes: c09440f7dcb3 ("macsec: introduce IEEE 802.1AE driver") +Signed-off-by: Tobias Brunner +Acked-by: Sabrina Dubroca +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/macsec.c | 26 ++++++++++++++++++-------- + 1 file changed, 18 insertions(+), 8 deletions(-) + +--- a/drivers/net/macsec.c ++++ b/drivers/net/macsec.c +@@ -397,6 +397,14 @@ static struct macsec_cb *macsec_skb_cb(s + #define DEFAULT_ENCRYPT false + #define DEFAULT_ENCODING_SA 0 + ++static bool send_sci(const struct macsec_secy *secy) ++{ ++ const struct macsec_tx_sc *tx_sc = &secy->tx_sc; ++ ++ return tx_sc->send_sci || ++ (secy->n_rx_sc > 1 && !tx_sc->end_station && !tx_sc->scb); ++} ++ + static sci_t make_sci(u8 *addr, __be16 port) + { + sci_t sci; +@@ -437,15 +445,15 @@ static unsigned int macsec_extra_len(boo + + /* Fill SecTAG according to IEEE 802.1AE-2006 10.5.3 */ + static void macsec_fill_sectag(struct macsec_eth_header *h, +- const struct macsec_secy *secy, u32 pn) ++ const struct macsec_secy *secy, u32 pn, ++ bool sci_present) + { + const struct macsec_tx_sc *tx_sc = &secy->tx_sc; + +- memset(&h->tci_an, 0, macsec_sectag_len(tx_sc->send_sci)); ++ memset(&h->tci_an, 0, macsec_sectag_len(sci_present)); + h->eth.h_proto = htons(ETH_P_MACSEC); + +- if (tx_sc->send_sci || +- (secy->n_rx_sc > 1 && !tx_sc->end_station && !tx_sc->scb)) { ++ if (sci_present) { + h->tci_an |= MACSEC_TCI_SC; + memcpy(&h->secure_channel_id, &secy->sci, + sizeof(h->secure_channel_id)); +@@ -650,6 +658,7 @@ static struct sk_buff *macsec_encrypt(st + struct macsec_tx_sc *tx_sc; + struct macsec_tx_sa *tx_sa; + struct macsec_dev *macsec = macsec_priv(dev); ++ bool sci_present; + u32 pn; + + secy = &macsec->secy; +@@ -687,7 +696,8 @@ static struct sk_buff *macsec_encrypt(st + + unprotected_len = skb->len; + eth = eth_hdr(skb); +- hh = (struct macsec_eth_header *)skb_push(skb, macsec_extra_len(tx_sc->send_sci)); ++ sci_present = send_sci(secy); ++ hh = (struct macsec_eth_header *)skb_push(skb, macsec_extra_len(sci_present)); + memmove(hh, eth, 2 * ETH_ALEN); + + pn = tx_sa_update_pn(tx_sa, secy); +@@ -696,7 +706,7 @@ static struct sk_buff *macsec_encrypt(st + kfree_skb(skb); + return ERR_PTR(-ENOLINK); + } +- macsec_fill_sectag(hh, secy, pn); ++ macsec_fill_sectag(hh, secy, pn, sci_present); + macsec_set_shortlen(hh, unprotected_len - 2 * ETH_ALEN); + + skb_put(skb, secy->icv_len); +@@ -726,10 +736,10 @@ static struct sk_buff *macsec_encrypt(st + skb_to_sgvec(skb, sg, 0, skb->len); + + if (tx_sc->encrypt) { +- int len = skb->len - macsec_hdr_len(tx_sc->send_sci) - ++ int len = skb->len - macsec_hdr_len(sci_present) - + secy->icv_len; + aead_request_set_crypt(req, sg, sg, len, iv); +- aead_request_set_ad(req, macsec_hdr_len(tx_sc->send_sci)); ++ aead_request_set_ad(req, macsec_hdr_len(sci_present)); + } else { + aead_request_set_crypt(req, sg, sg, 0, iv); + aead_request_set_ad(req, skb->len - secy->icv_len); diff --git a/queue-4.8/net-add-netdev-all_adj_list-refcnt-propagation-to-fix-panic.patch b/queue-4.8/net-add-netdev-all_adj_list-refcnt-propagation-to-fix-panic.patch new file mode 100644 index 00000000000..cb284712c5c --- /dev/null +++ b/queue-4.8/net-add-netdev-all_adj_list-refcnt-propagation-to-fix-panic.patch @@ -0,0 +1,271 @@ +From foo@baz Thu Nov 10 16:43:03 CET 2016 +From: Andrew Collins +Date: Mon, 3 Oct 2016 13:43:02 -0600 +Subject: net: Add netdev all_adj_list refcnt propagation to fix panic + +From: Andrew Collins + + +[ Upstream commit 93409033ae653f1c9a949202fb537ab095b2092f ] + +This is a respin of a patch to fix a relatively easily reproducible kernel +panic related to the all_adj_list handling for netdevs in recent kernels. 
+ +The following sequence of commands will reproduce the issue: + +ip link add link eth0 name eth0.100 type vlan id 100 +ip link add link eth0 name eth0.200 type vlan id 200 +ip link add name testbr type bridge +ip link set eth0.100 master testbr +ip link set eth0.200 master testbr +ip link add link testbr mac0 type macvlan +ip link delete dev testbr + +This creates an upper/lower tree of (excuse the poor ASCII art): + + /---eth0.100-eth0 +mac0-testbr- + \---eth0.200-eth0 + +When testbr is deleted, the all_adj_lists are walked, and eth0 is deleted twice from +the mac0 list. Unfortunately, during setup in __netdev_upper_dev_link, only one +reference to eth0 is added, so this results in a panic. + +This change adds reference count propagation so things are handled properly. + +Matthias Schiffer reported a similar crash in batman-adv: + +https://github.com/freifunk-gluon/gluon/issues/680 +https://www.open-mesh.org/issues/247 + +which this patch also seems to resolve. + +Signed-off-by: Andrew Collins +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/core/dev.c | 68 +++++++++++++++++++++++++++++++-------------------------- + 1 file changed, 37 insertions(+), 31 deletions(-) + +--- a/net/core/dev.c ++++ b/net/core/dev.c +@@ -5578,6 +5578,7 @@ static inline bool netdev_adjacent_is_ne + + static int __netdev_adjacent_dev_insert(struct net_device *dev, + struct net_device *adj_dev, ++ u16 ref_nr, + struct list_head *dev_list, + void *private, bool master) + { +@@ -5587,7 +5588,7 @@ static int __netdev_adjacent_dev_insert( + adj = __netdev_find_adj(adj_dev, dev_list); + + if (adj) { +- adj->ref_nr++; ++ adj->ref_nr += ref_nr; + return 0; + } + +@@ -5597,7 +5598,7 @@ static int __netdev_adjacent_dev_insert( + + adj->dev = adj_dev; + adj->master = master; +- adj->ref_nr = 1; ++ adj->ref_nr = ref_nr; + adj->private = private; + dev_hold(adj_dev); + +@@ -5636,6 +5637,7 @@ free_adj: + + static void __netdev_adjacent_dev_remove(struct net_device *dev, + struct net_device *adj_dev, ++ u16 ref_nr, + struct list_head *dev_list) + { + struct netdev_adjacent *adj; +@@ -5648,10 +5650,10 @@ static void __netdev_adjacent_dev_remove + BUG(); + } + +- if (adj->ref_nr > 1) { +- pr_debug("%s to %s ref_nr-- = %d\n", dev->name, adj_dev->name, +- adj->ref_nr-1); +- adj->ref_nr--; ++ if (adj->ref_nr > ref_nr) { ++ pr_debug("%s to %s ref_nr-%d = %d\n", dev->name, adj_dev->name, ++ ref_nr, adj->ref_nr-ref_nr); ++ adj->ref_nr -= ref_nr; + return; + } + +@@ -5670,21 +5672,22 @@ static void __netdev_adjacent_dev_remove + + static int __netdev_adjacent_dev_link_lists(struct net_device *dev, + struct net_device *upper_dev, ++ u16 ref_nr, + struct list_head *up_list, + struct list_head *down_list, + void *private, bool master) + { + int ret; + +- ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list, private, +- master); ++ ret = __netdev_adjacent_dev_insert(dev, upper_dev, ref_nr, up_list, ++ private, master); + if (ret) + return ret; + +- ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list, private, +- false); ++ ret = __netdev_adjacent_dev_insert(upper_dev, dev, ref_nr, down_list, ++ private, false); + if (ret) { +- __netdev_adjacent_dev_remove(dev, upper_dev, up_list); ++ __netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list); + return ret; + } + +@@ -5692,9 +5695,10 @@ static int __netdev_adjacent_dev_link_li + } + + static int __netdev_adjacent_dev_link(struct net_device *dev, +- struct net_device *upper_dev) ++ struct net_device *upper_dev, ++ u16 ref_nr) + { +- return 
__netdev_adjacent_dev_link_lists(dev, upper_dev, ++ return __netdev_adjacent_dev_link_lists(dev, upper_dev, ref_nr, + &dev->all_adj_list.upper, + &upper_dev->all_adj_list.lower, + NULL, false); +@@ -5702,17 +5706,19 @@ static int __netdev_adjacent_dev_link(st + + static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev, + struct net_device *upper_dev, ++ u16 ref_nr, + struct list_head *up_list, + struct list_head *down_list) + { +- __netdev_adjacent_dev_remove(dev, upper_dev, up_list); +- __netdev_adjacent_dev_remove(upper_dev, dev, down_list); ++ __netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list); ++ __netdev_adjacent_dev_remove(upper_dev, dev, ref_nr, down_list); + } + + static void __netdev_adjacent_dev_unlink(struct net_device *dev, +- struct net_device *upper_dev) ++ struct net_device *upper_dev, ++ u16 ref_nr) + { +- __netdev_adjacent_dev_unlink_lists(dev, upper_dev, ++ __netdev_adjacent_dev_unlink_lists(dev, upper_dev, ref_nr, + &dev->all_adj_list.upper, + &upper_dev->all_adj_list.lower); + } +@@ -5721,17 +5727,17 @@ static int __netdev_adjacent_dev_link_ne + struct net_device *upper_dev, + void *private, bool master) + { +- int ret = __netdev_adjacent_dev_link(dev, upper_dev); ++ int ret = __netdev_adjacent_dev_link(dev, upper_dev, 1); + + if (ret) + return ret; + +- ret = __netdev_adjacent_dev_link_lists(dev, upper_dev, ++ ret = __netdev_adjacent_dev_link_lists(dev, upper_dev, 1, + &dev->adj_list.upper, + &upper_dev->adj_list.lower, + private, master); + if (ret) { +- __netdev_adjacent_dev_unlink(dev, upper_dev); ++ __netdev_adjacent_dev_unlink(dev, upper_dev, 1); + return ret; + } + +@@ -5741,8 +5747,8 @@ static int __netdev_adjacent_dev_link_ne + static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev, + struct net_device *upper_dev) + { +- __netdev_adjacent_dev_unlink(dev, upper_dev); +- __netdev_adjacent_dev_unlink_lists(dev, upper_dev, ++ __netdev_adjacent_dev_unlink(dev, upper_dev, 1); ++ __netdev_adjacent_dev_unlink_lists(dev, upper_dev, 1, + &dev->adj_list.upper, + &upper_dev->adj_list.lower); + } +@@ -5795,7 +5801,7 @@ static int __netdev_upper_dev_link(struc + list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) { + pr_debug("Interlinking %s with %s, non-neighbour\n", + i->dev->name, j->dev->name); +- ret = __netdev_adjacent_dev_link(i->dev, j->dev); ++ ret = __netdev_adjacent_dev_link(i->dev, j->dev, i->ref_nr); + if (ret) + goto rollback_mesh; + } +@@ -5805,7 +5811,7 @@ static int __netdev_upper_dev_link(struc + list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) { + pr_debug("linking %s's upper device %s with %s\n", + upper_dev->name, i->dev->name, dev->name); +- ret = __netdev_adjacent_dev_link(dev, i->dev); ++ ret = __netdev_adjacent_dev_link(dev, i->dev, i->ref_nr); + if (ret) + goto rollback_upper_mesh; + } +@@ -5814,7 +5820,7 @@ static int __netdev_upper_dev_link(struc + list_for_each_entry(i, &dev->all_adj_list.lower, list) { + pr_debug("linking %s's lower device %s with %s\n", dev->name, + i->dev->name, upper_dev->name); +- ret = __netdev_adjacent_dev_link(i->dev, upper_dev); ++ ret = __netdev_adjacent_dev_link(i->dev, upper_dev, i->ref_nr); + if (ret) + goto rollback_lower_mesh; + } +@@ -5832,7 +5838,7 @@ rollback_lower_mesh: + list_for_each_entry(i, &dev->all_adj_list.lower, list) { + if (i == to_i) + break; +- __netdev_adjacent_dev_unlink(i->dev, upper_dev); ++ __netdev_adjacent_dev_unlink(i->dev, upper_dev, i->ref_nr); + } + + i = NULL; +@@ -5842,7 +5848,7 @@ rollback_upper_mesh: + 
list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) { + if (i == to_i) + break; +- __netdev_adjacent_dev_unlink(dev, i->dev); ++ __netdev_adjacent_dev_unlink(dev, i->dev, i->ref_nr); + } + + i = j = NULL; +@@ -5854,7 +5860,7 @@ rollback_mesh: + list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) { + if (i == to_i && j == to_j) + break; +- __netdev_adjacent_dev_unlink(i->dev, j->dev); ++ __netdev_adjacent_dev_unlink(i->dev, j->dev, i->ref_nr); + } + if (i == to_i) + break; +@@ -5934,16 +5940,16 @@ void netdev_upper_dev_unlink(struct net_ + */ + list_for_each_entry(i, &dev->all_adj_list.lower, list) + list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) +- __netdev_adjacent_dev_unlink(i->dev, j->dev); ++ __netdev_adjacent_dev_unlink(i->dev, j->dev, i->ref_nr); + + /* remove also the devices itself from lower/upper device + * list + */ + list_for_each_entry(i, &dev->all_adj_list.lower, list) +- __netdev_adjacent_dev_unlink(i->dev, upper_dev); ++ __netdev_adjacent_dev_unlink(i->dev, upper_dev, i->ref_nr); + + list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) +- __netdev_adjacent_dev_unlink(dev, i->dev); ++ __netdev_adjacent_dev_unlink(dev, i->dev, i->ref_nr); + + call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev, + &changeupper_info.info); diff --git a/queue-4.8/net-add-recursion-limit-to-gro.patch b/queue-4.8/net-add-recursion-limit-to-gro.patch new file mode 100644 index 00000000000..152c56ccac1 --- /dev/null +++ b/queue-4.8/net-add-recursion-limit-to-gro.patch @@ -0,0 +1,220 @@ +From foo@baz Thu Nov 10 16:43:03 CET 2016 +From: Sabrina Dubroca +Date: Thu, 20 Oct 2016 15:58:02 +0200 +Subject: net: add recursion limit to GRO + +From: Sabrina Dubroca + + +[ Upstream commit fcd91dd449867c6bfe56a81cabba76b829fd05cd ] + +Currently, GRO can do unlimited recursion through the gro_receive +handlers. This was fixed for tunneling protocols by limiting tunnel GRO +to one level with encap_mark, but both VLAN and TEB still have this +problem. Thus, the kernel is vulnerable to a stack overflow, if we +receive a packet composed entirely of VLAN headers. + +This patch adds a recursion counter to the GRO layer to prevent stack +overflow. When a gro_receive function hits the recursion limit, GRO is +aborted for this skb and it is processed normally. This recursion +counter is put in the GRO CB, but could be turned into a percpu counter +if we run out of space in the CB. + +Thanks to Vladimír Beneš for the initial bug report. + +Fixes: CVE-2016-7039 +Fixes: 9b174d88c257 ("net: Add Transparent Ethernet Bridging GRO support.") +Fixes: 66e5133f19e9 ("vlan: Add GRO support for non hardware accelerated vlan") +Signed-off-by: Sabrina Dubroca +Reviewed-by: Jiri Benc +Acked-by: Hannes Frederic Sowa +Acked-by: Tom Herbert +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/geneve.c | 2 +- + drivers/net/vxlan.c | 2 +- + include/linux/netdevice.h | 39 ++++++++++++++++++++++++++++++++++++++- + net/8021q/vlan.c | 2 +- + net/core/dev.c | 1 + + net/ethernet/eth.c | 2 +- + net/ipv4/af_inet.c | 2 +- + net/ipv4/fou.c | 4 ++-- + net/ipv4/gre_offload.c | 2 +- + net/ipv4/udp_offload.c | 2 +- + net/ipv6/ip6_offload.c | 2 +- + 11 files changed, 49 insertions(+), 11 deletions(-) + +--- a/drivers/net/geneve.c ++++ b/drivers/net/geneve.c +@@ -453,7 +453,7 @@ static struct sk_buff **geneve_gro_recei + + skb_gro_pull(skb, gh_len); + skb_gro_postpull_rcsum(skb, gh, gh_len); +- pp = ptype->callbacks.gro_receive(head, skb); ++ pp = call_gro_receive(ptype->callbacks.gro_receive, head, skb); + flush = 0; + + out_unlock: +--- a/drivers/net/vxlan.c ++++ b/drivers/net/vxlan.c +@@ -584,7 +584,7 @@ static struct sk_buff **vxlan_gro_receiv + } + } + +- pp = eth_gro_receive(head, skb); ++ pp = call_gro_receive(eth_gro_receive, head, skb); + flush = 0; + + out: +--- a/include/linux/netdevice.h ++++ b/include/linux/netdevice.h +@@ -2154,7 +2154,10 @@ struct napi_gro_cb { + /* Used to determine if flush_id can be ignored */ + u8 is_atomic:1; + +- /* 5 bit hole */ ++ /* Number of gro_receive callbacks this packet already went through */ ++ u8 recursion_counter:4; ++ ++ /* 1 bit hole */ + + /* used to support CHECKSUM_COMPLETE for tunneling protocols */ + __wsum csum; +@@ -2165,6 +2168,40 @@ struct napi_gro_cb { + + #define NAPI_GRO_CB(skb) ((struct napi_gro_cb *)(skb)->cb) + ++#define GRO_RECURSION_LIMIT 15 ++static inline int gro_recursion_inc_test(struct sk_buff *skb) ++{ ++ return ++NAPI_GRO_CB(skb)->recursion_counter == GRO_RECURSION_LIMIT; ++} ++ ++typedef struct sk_buff **(*gro_receive_t)(struct sk_buff **, struct sk_buff *); ++static inline struct sk_buff **call_gro_receive(gro_receive_t cb, ++ struct sk_buff **head, ++ struct sk_buff *skb) ++{ ++ if (unlikely(gro_recursion_inc_test(skb))) { ++ NAPI_GRO_CB(skb)->flush |= 1; ++ return NULL; ++ } ++ ++ return cb(head, skb); ++} ++ ++typedef struct sk_buff **(*gro_receive_sk_t)(struct sock *, struct sk_buff **, ++ struct sk_buff *); ++static inline struct sk_buff **call_gro_receive_sk(gro_receive_sk_t cb, ++ struct sock *sk, ++ struct sk_buff **head, ++ struct sk_buff *skb) ++{ ++ if (unlikely(gro_recursion_inc_test(skb))) { ++ NAPI_GRO_CB(skb)->flush |= 1; ++ return NULL; ++ } ++ ++ return cb(sk, head, skb); ++} ++ + struct packet_type { + __be16 type; /* This is really htons(ether_type). 
*/ + struct net_device *dev; /* NULL is wildcarded here */ +--- a/net/8021q/vlan.c ++++ b/net/8021q/vlan.c +@@ -664,7 +664,7 @@ static struct sk_buff **vlan_gro_receive + + skb_gro_pull(skb, sizeof(*vhdr)); + skb_gro_postpull_rcsum(skb, vhdr, sizeof(*vhdr)); +- pp = ptype->callbacks.gro_receive(head, skb); ++ pp = call_gro_receive(ptype->callbacks.gro_receive, head, skb); + + out_unlock: + rcu_read_unlock(); +--- a/net/core/dev.c ++++ b/net/core/dev.c +@@ -4496,6 +4496,7 @@ static enum gro_result dev_gro_receive(s + NAPI_GRO_CB(skb)->flush = 0; + NAPI_GRO_CB(skb)->free = 0; + NAPI_GRO_CB(skb)->encap_mark = 0; ++ NAPI_GRO_CB(skb)->recursion_counter = 0; + NAPI_GRO_CB(skb)->is_fou = 0; + NAPI_GRO_CB(skb)->is_atomic = 1; + NAPI_GRO_CB(skb)->gro_remcsum_start = 0; +--- a/net/ethernet/eth.c ++++ b/net/ethernet/eth.c +@@ -439,7 +439,7 @@ struct sk_buff **eth_gro_receive(struct + + skb_gro_pull(skb, sizeof(*eh)); + skb_gro_postpull_rcsum(skb, eh, sizeof(*eh)); +- pp = ptype->callbacks.gro_receive(head, skb); ++ pp = call_gro_receive(ptype->callbacks.gro_receive, head, skb); + + out_unlock: + rcu_read_unlock(); +--- a/net/ipv4/af_inet.c ++++ b/net/ipv4/af_inet.c +@@ -1388,7 +1388,7 @@ struct sk_buff **inet_gro_receive(struct + skb_gro_pull(skb, sizeof(*iph)); + skb_set_transport_header(skb, skb_gro_offset(skb)); + +- pp = ops->callbacks.gro_receive(head, skb); ++ pp = call_gro_receive(ops->callbacks.gro_receive, head, skb); + + out_unlock: + rcu_read_unlock(); +--- a/net/ipv4/fou.c ++++ b/net/ipv4/fou.c +@@ -249,7 +249,7 @@ static struct sk_buff **fou_gro_receive( + if (!ops || !ops->callbacks.gro_receive) + goto out_unlock; + +- pp = ops->callbacks.gro_receive(head, skb); ++ pp = call_gro_receive(ops->callbacks.gro_receive, head, skb); + + out_unlock: + rcu_read_unlock(); +@@ -441,7 +441,7 @@ next_proto: + if (WARN_ON_ONCE(!ops || !ops->callbacks.gro_receive)) + goto out_unlock; + +- pp = ops->callbacks.gro_receive(head, skb); ++ pp = call_gro_receive(ops->callbacks.gro_receive, head, skb); + flush = 0; + + out_unlock: +--- a/net/ipv4/gre_offload.c ++++ b/net/ipv4/gre_offload.c +@@ -227,7 +227,7 @@ static struct sk_buff **gre_gro_receive( + /* Adjusted NAPI_GRO_CB(skb)->csum after skb_gro_pull()*/ + skb_gro_postpull_rcsum(skb, greh, grehlen); + +- pp = ptype->callbacks.gro_receive(head, skb); ++ pp = call_gro_receive(ptype->callbacks.gro_receive, head, skb); + flush = 0; + + out_unlock: +--- a/net/ipv4/udp_offload.c ++++ b/net/ipv4/udp_offload.c +@@ -293,7 +293,7 @@ unflush: + + skb_gro_pull(skb, sizeof(struct udphdr)); /* pull encapsulating udp header */ + skb_gro_postpull_rcsum(skb, uh, sizeof(struct udphdr)); +- pp = udp_sk(sk)->gro_receive(sk, head, skb); ++ pp = call_gro_receive_sk(udp_sk(sk)->gro_receive, sk, head, skb); + + out_unlock: + rcu_read_unlock(); +--- a/net/ipv6/ip6_offload.c ++++ b/net/ipv6/ip6_offload.c +@@ -243,7 +243,7 @@ static struct sk_buff **ipv6_gro_receive + + skb_gro_postpull_rcsum(skb, iph, nlen); + +- pp = ops->callbacks.gro_receive(head, skb); ++ pp = call_gro_receive(ops->callbacks.gro_receive, head, skb); + + out_unlock: + rcu_read_unlock(); diff --git a/queue-4.8/net-core-correctly-iterate-over-lower-adjacency-list.patch b/queue-4.8/net-core-correctly-iterate-over-lower-adjacency-list.patch new file mode 100644 index 00000000000..5b7089edf7f --- /dev/null +++ b/queue-4.8/net-core-correctly-iterate-over-lower-adjacency-list.patch @@ -0,0 +1,110 @@ +From foo@baz Thu Nov 10 16:43:03 CET 2016 +From: Ido Schimmel +Date: Wed, 19 Oct 2016 16:57:08 +0300 +Subject: net: 
core: Correctly iterate over lower adjacency list + +From: Ido Schimmel + + +[ Upstream commit e4961b0768852d9eb7383e1a5df178eacb714656 ] + +Tamir reported the following trace when processing ARP requests received +via a vlan device on top of a VLAN-aware bridge: + + NMI watchdog: BUG: soft lockup - CPU#1 stuck for 22s! [swapper/1:0] +[...] + CPU: 1 PID: 0 Comm: swapper/1 Tainted: G W 4.8.0-rc7 #1 + Hardware name: Mellanox Technologies Ltd. "MSN2100-CB2F"/"SA001017", BIOS 5.6.5 06/07/2016 + task: ffff88017edfea40 task.stack: ffff88017ee10000 + RIP: 0010:[] [] netdev_all_lower_get_next_rcu+0x33/0x60 +[...] + Call Trace: + + [] mlxsw_sp_port_lower_dev_hold+0x5a/0xa0 [mlxsw_spectrum] + [] mlxsw_sp_router_netevent_event+0x80/0x150 [mlxsw_spectrum] + [] notifier_call_chain+0x4a/0x70 + [] atomic_notifier_call_chain+0x1a/0x20 + [] call_netevent_notifiers+0x1b/0x20 + [] neigh_update+0x306/0x740 + [] neigh_event_ns+0x4e/0xb0 + [] arp_process+0x66f/0x700 + [] ? common_interrupt+0x8c/0x8c + [] arp_rcv+0x139/0x1d0 + [] ? vlan_do_receive+0xda/0x320 + [] __netif_receive_skb_core+0x524/0xab0 + [] ? dev_queue_xmit+0x10/0x20 + [] ? br_forward_finish+0x3d/0xc0 [bridge] + [] ? br_handle_vlan+0xf6/0x1b0 [bridge] + [] __netif_receive_skb+0x18/0x60 + [] netif_receive_skb_internal+0x40/0xb0 + [] netif_receive_skb+0x1c/0x70 + [] br_pass_frame_up+0xc6/0x160 [bridge] + [] ? deliver_clone+0x37/0x50 [bridge] + [] ? br_flood+0xcc/0x160 [bridge] + [] br_handle_frame_finish+0x224/0x4f0 [bridge] + [] br_handle_frame+0x174/0x300 [bridge] + [] __netif_receive_skb_core+0x329/0xab0 + [] ? find_next_bit+0x15/0x20 + [] ? cpumask_next_and+0x32/0x50 + [] ? load_balance+0x178/0x9b0 + [] __netif_receive_skb+0x18/0x60 + [] netif_receive_skb_internal+0x40/0xb0 + [] netif_receive_skb+0x1c/0x70 + [] mlxsw_sp_rx_listener_func+0x61/0xb0 [mlxsw_spectrum] + [] mlxsw_core_skb_receive+0x187/0x200 [mlxsw_core] + [] mlxsw_pci_cq_tasklet+0x63a/0x9b0 [mlxsw_pci] + [] tasklet_action+0xf6/0x110 + [] __do_softirq+0xf6/0x280 + [] irq_exit+0xdf/0xf0 + [] do_IRQ+0x54/0xd0 + [] common_interrupt+0x8c/0x8c + +The problem is that netdev_all_lower_get_next_rcu() never advances the +iterator, thereby causing the loop over the lower adjacency list to run +forever. + +Fix this by advancing the iterator and avoid the infinite loop. + +Fixes: 7ce856aaaf13 ("mlxsw: spectrum: Add couple of lower device helper functions") +Signed-off-by: Ido Schimmel +Reported-by: Tamir Winetroub +Reviewed-by: Jiri Pirko +Acked-by: David Ahern +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + include/linux/netdevice.h | 2 +- + net/core/dev.c | 10 +++++++--- + 2 files changed, 8 insertions(+), 4 deletions(-) + +--- a/include/linux/netdevice.h ++++ b/include/linux/netdevice.h +@@ -3862,7 +3862,7 @@ struct net_device *netdev_all_lower_get_ + ldev = netdev_all_lower_get_next(dev, &(iter))) + + #define netdev_for_each_all_lower_dev_rcu(dev, ldev, iter) \ +- for (iter = (dev)->all_adj_list.lower.next, \ ++ for (iter = &(dev)->all_adj_list.lower, \ + ldev = netdev_all_lower_get_next_rcu(dev, &(iter)); \ + ldev; \ + ldev = netdev_all_lower_get_next_rcu(dev, &(iter))) +--- a/net/core/dev.c ++++ b/net/core/dev.c +@@ -5500,10 +5500,14 @@ struct net_device *netdev_all_lower_get_ + { + struct netdev_adjacent *lower; + +- lower = list_first_or_null_rcu(&dev->all_adj_list.lower, +- struct netdev_adjacent, list); ++ lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); + +- return lower ? 
lower->dev : NULL; ++ if (&lower->list == &dev->all_adj_list.lower) ++ return NULL; ++ ++ *iter = &lower->list; ++ ++ return lower->dev; + } + EXPORT_SYMBOL(netdev_all_lower_get_next_rcu); + diff --git a/queue-4.8/net-fec-call-swap_buffer-prior-to-ip-header-alignment.patch b/queue-4.8/net-fec-call-swap_buffer-prior-to-ip-header-alignment.patch new file mode 100644 index 00000000000..0aac366dbea --- /dev/null +++ b/queue-4.8/net-fec-call-swap_buffer-prior-to-ip-header-alignment.patch @@ -0,0 +1,49 @@ +From foo@baz Thu Nov 10 16:43:03 CET 2016 +From: Fabio Estevam +Date: Fri, 21 Oct 2016 09:34:29 -0200 +Subject: net: fec: Call swap_buffer() prior to IP header alignment + +From: Fabio Estevam + + +[ Upstream commit 235bde1ed3f0fff0f68f367ec8807b89ea151258 ] + +Commit 3ac72b7b63d5 ("net: fec: align IP header in hardware") breaks +networking on mx28. + +There is an erratum on mx28 (ENGR121613 - ENET big endian mode +not compatible with ARM little endian) that requires an additional +byte-swap operation to workaround this problem. + +So call swap_buffer() prior to performing the IP header alignment +to restore network functionality on mx28. + +Fixes: 3ac72b7b63d5 ("net: fec: align IP header in hardware") +Reported-and-tested-by: Henri Roosen +Signed-off-by: Fabio Estevam +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/freescale/fec_main.c | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +--- a/drivers/net/ethernet/freescale/fec_main.c ++++ b/drivers/net/ethernet/freescale/fec_main.c +@@ -1430,14 +1430,14 @@ fec_enet_rx_queue(struct net_device *nde + skb_put(skb, pkt_len - 4); + data = skb->data; + ++ if (!is_copybreak && need_swap) ++ swap_buffer(data, pkt_len); ++ + #if !defined(CONFIG_M5272) + if (fep->quirks & FEC_QUIRK_HAS_RACC) + data = skb_pull_inline(skb, 2); + #endif + +- if (!is_copybreak && need_swap) +- swap_buffer(data, pkt_len); +- + /* Extract the enhanced buffer descriptor */ + ebdp = NULL; + if (fep->bufdesc_ex) diff --git a/queue-4.8/net-fec-set-mac-address-unconditionally.patch b/queue-4.8/net-fec-set-mac-address-unconditionally.patch new file mode 100644 index 00000000000..3568d41e91a --- /dev/null +++ b/queue-4.8/net-fec-set-mac-address-unconditionally.patch @@ -0,0 +1,48 @@ +From foo@baz Thu Nov 10 16:43:03 CET 2016 +From: Gavin Schenk +Date: Fri, 30 Sep 2016 11:46:10 +0200 +Subject: net: fec: set mac address unconditionally + +From: Gavin Schenk + + +[ Upstream commit b82d44d78480faff7456e9e0999acb9d38666057 ] + +If the mac address origin is not dt, you can only safely assign a mac +address after "link up" of the device. If the link is off the clocks are +disabled and because of issues assigning registers when clocks are off the +new mac address cannot be written in .ndo_set_mac_address() on some soc's. +This fix sets the mac address unconditionally in fec_restart(...) and +ensures consistency between fec registers and the network layer. + +Signed-off-by: Gavin Schenk +Acked-by: Fugang Duan +Acked-by: Uwe Kleine-König +Fixes: 9638d19e4816 ("net: fec: add netif status check before set mac address") +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/freescale/fec_main.c | 12 +++++------- + 1 file changed, 5 insertions(+), 7 deletions(-) + +--- a/drivers/net/ethernet/freescale/fec_main.c ++++ b/drivers/net/ethernet/freescale/fec_main.c +@@ -913,13 +913,11 @@ fec_restart(struct net_device *ndev) + * enet-mac reset will reset mac address registers too, + * so need to reconfigure it. + */ +- if (fep->quirks & FEC_QUIRK_ENET_MAC) { +- memcpy(&temp_mac, ndev->dev_addr, ETH_ALEN); +- writel((__force u32)cpu_to_be32(temp_mac[0]), +- fep->hwp + FEC_ADDR_LOW); +- writel((__force u32)cpu_to_be32(temp_mac[1]), +- fep->hwp + FEC_ADDR_HIGH); +- } ++ memcpy(&temp_mac, ndev->dev_addr, ETH_ALEN); ++ writel((__force u32)cpu_to_be32(temp_mac[0]), ++ fep->hwp + FEC_ADDR_LOW); ++ writel((__force u32)cpu_to_be32(temp_mac[1]), ++ fep->hwp + FEC_ADDR_HIGH); + + /* Clear any outstanding interrupt. */ + writel(0xffffffff, fep->hwp + FEC_IEVENT); diff --git a/queue-4.8/net-ipv6-do-not-consider-link-state-for-nexthop-validation.patch b/queue-4.8/net-ipv6-do-not-consider-link-state-for-nexthop-validation.patch new file mode 100644 index 00000000000..da26662ad80 --- /dev/null +++ b/queue-4.8/net-ipv6-do-not-consider-link-state-for-nexthop-validation.patch @@ -0,0 +1,66 @@ +From foo@baz Thu Nov 10 16:43:03 CET 2016 +From: David Ahern +Date: Mon, 24 Oct 2016 12:27:23 -0700 +Subject: net: ipv6: Do not consider link state for nexthop validation + +From: David Ahern + + +[ Upstream commit d5d32e4b76687f4df9ad3ba8d3702b7347f51fa6 ] + +Similar to IPv4, do not consider link state when validating next hops. + +Currently, if the link is down default routes can fail to insert: + $ ip -6 ro add vrf blue default via 2100:2::64 dev eth2 + RTNETLINK answers: No route to host + +With this patch the command succeeds. + +Fixes: 8c14586fc320 ("net: ipv6: Use passed in table for nexthop lookups") +Signed-off-by: David Ahern +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + include/net/ip6_route.h | 1 + + net/ipv6/route.c | 6 ++++-- + 2 files changed, 5 insertions(+), 2 deletions(-) + +--- a/include/net/ip6_route.h ++++ b/include/net/ip6_route.h +@@ -32,6 +32,7 @@ struct route_info { + #define RT6_LOOKUP_F_SRCPREF_TMP 0x00000008 + #define RT6_LOOKUP_F_SRCPREF_PUBLIC 0x00000010 + #define RT6_LOOKUP_F_SRCPREF_COA 0x00000020 ++#define RT6_LOOKUP_F_IGNORE_LINKSTATE 0x00000040 + + /* We do not (yet ?) 
support IPv6 jumbograms (RFC 2675) + * Unlike IPv4, hdr->seg_len doesn't include the IPv6 header +--- a/net/ipv6/route.c ++++ b/net/ipv6/route.c +@@ -656,7 +656,8 @@ static struct rt6_info *find_match(struc + struct net_device *dev = rt->dst.dev; + + if (dev && !netif_carrier_ok(dev) && +- idev->cnf.ignore_routes_with_linkdown) ++ idev->cnf.ignore_routes_with_linkdown && ++ !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE)) + goto out; + + if (rt6_check_expired(rt)) +@@ -1050,6 +1051,7 @@ struct rt6_info *ip6_pol_route(struct ne + int strict = 0; + + strict |= flags & RT6_LOOKUP_F_IFACE; ++ strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE; + if (net->ipv6.devconf_all->forwarding == 0) + strict |= RT6_LOOKUP_F_REACHABLE; + +@@ -1783,7 +1785,7 @@ static struct rt6_info *ip6_nh_lookup_ta + }; + struct fib6_table *table; + struct rt6_info *rt; +- int flags = RT6_LOOKUP_F_IFACE; ++ int flags = RT6_LOOKUP_F_IFACE | RT6_LOOKUP_F_IGNORE_LINKSTATE; + + table = fib6_get_table(net, cfg->fc_table); + if (!table) diff --git a/queue-4.8/net-mlx4_en-fixup-xdp-tx-irq-to-match-rx.patch b/queue-4.8/net-mlx4_en-fixup-xdp-tx-irq-to-match-rx.patch new file mode 100644 index 00000000000..0748ce76fa7 --- /dev/null +++ b/queue-4.8/net-mlx4_en-fixup-xdp-tx-irq-to-match-rx.patch @@ -0,0 +1,51 @@ +From foo@baz Thu Nov 10 16:43:03 CET 2016 +From: Brenden Blanco +Date: Thu, 13 Oct 2016 13:13:11 -0700 +Subject: net/mlx4_en: fixup xdp tx irq to match rx + +From: Brenden Blanco + + +[ Upstream commit 958b3d396d7f80755e2c2e6a8f873a669f38de10 ] + +In cases where the number of tx rings is not a multiple of the number of +rx rings, the tx completion event will be handled on a different core +from the transmit and population of the ring. Races on the ring will +lead to a double-free of the page, and possibly other corruption. + +The rings are initialized by default with a valid multiple of rings, +based on the number of cpus, therefore an invalid configuration requires +ethtool to change the ring layout. For instance 'ethtool -L eth0 rx 9 tx +8' will cause packets received on rx0, and XDP_TX'd to tx48, to be +completed on cpu3 (48 % 9 == 3). + +Resolve this discrepancy by shifting the irq for the xdp tx queues to +start again from 0, modulo rx_ring_num. + +Fixes: 9ecc2d86171a ("net/mlx4_en: add xdp forwarding and data write support") +Reported-by: Jesper Dangaard Brouer +Signed-off-by: Brenden Blanco +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/mellanox/mlx4/en_cq.c | 8 ++++++++ + 1 file changed, 8 insertions(+) + +--- a/drivers/net/ethernet/mellanox/mlx4/en_cq.c ++++ b/drivers/net/ethernet/mellanox/mlx4/en_cq.c +@@ -127,7 +127,15 @@ int mlx4_en_activate_cq(struct mlx4_en_p + /* For TX we use the same irq per + ring we assigned for the RX */ + struct mlx4_en_cq *rx_cq; ++ int xdp_index; + ++ /* The xdp tx irq must align with the rx ring that forwards to ++ * it, so reindex these from 0. This should only happen when ++ * tx_ring_num is not a multiple of rx_ring_num. 
++ */ ++ xdp_index = (priv->xdp_ring_num - priv->tx_ring_num) + cq_idx; ++ if (xdp_index >= 0) ++ cq_idx = xdp_index; + cq_idx = cq_idx % priv->rx_ring_num; + rx_cq = priv->rx_cq[cq_idx]; + cq->vector = rx_cq->vector; diff --git a/queue-4.8/net-phy-trigger-state-machine-on-state-change-and-not-polling.patch b/queue-4.8/net-phy-trigger-state-machine-on-state-change-and-not-polling.patch new file mode 100644 index 00000000000..4ad7a87f80f --- /dev/null +++ b/queue-4.8/net-phy-trigger-state-machine-on-state-change-and-not-polling.patch @@ -0,0 +1,91 @@ +From foo@baz Thu Nov 10 16:43:03 CET 2016 +From: Andrew Lunn +Date: Wed, 12 Oct 2016 22:14:53 +0200 +Subject: net: phy: Trigger state machine on state change and not polling. + +From: Andrew Lunn + + +[ Upstream commit 3c293f4e08b58ad5b78f78d89ca1fd41f87f8729 ] + +The phy_start() is used to indicate the PHY is now ready to do its +work. The state is changed, normally to PHY_UP which means that both +the MAC and the PHY are ready. + +If the phy driver is using polling, when the next poll happens, the +state machine notices the PHY is now in PHY_UP, and kicks off +auto-negotiation, if needed. + +If however, the PHY is using interrupts, there is no polling. The phy +is stuck in PHY_UP until the next interrupt comes along. And there is +no reason for the PHY to interrupt. + +Have phy_start() schedule the state machine to run, which both speeds +up the polling use case, and makes the interrupt use case actually +work. + +This problems exists whenever there is a state change which will not +cause an interrupt. Trigger the state machine in these cases, +e.g. phy_error(). + +Signed-off-by: Andrew Lunn +Cc: Kyle Roeschley +Tested-by: Kyle Roeschley +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/phy/phy.c | 22 ++++++++++++++++++++-- + 1 file changed, 20 insertions(+), 2 deletions(-) + +--- a/drivers/net/phy/phy.c ++++ b/drivers/net/phy/phy.c +@@ -608,6 +608,21 @@ void phy_start_machine(struct phy_device + } + + /** ++ * phy_trigger_machine - trigger the state machine to run ++ * ++ * @phydev: the phy_device struct ++ * ++ * Description: There has been a change in state which requires that the ++ * state machine runs. 
++ */ ++ ++static void phy_trigger_machine(struct phy_device *phydev) ++{ ++ cancel_delayed_work_sync(&phydev->state_queue); ++ queue_delayed_work(system_power_efficient_wq, &phydev->state_queue, 0); ++} ++ ++/** + * phy_stop_machine - stop the PHY state machine tracking + * @phydev: target phy_device struct + * +@@ -639,6 +654,8 @@ static void phy_error(struct phy_device + mutex_lock(&phydev->lock); + phydev->state = PHY_HALTED; + mutex_unlock(&phydev->lock); ++ ++ phy_trigger_machine(phydev); + } + + /** +@@ -800,8 +817,7 @@ void phy_change(struct work_struct *work + } + + /* reschedule state queue work to run as soon as possible */ +- cancel_delayed_work_sync(&phydev->state_queue); +- queue_delayed_work(system_power_efficient_wq, &phydev->state_queue, 0); ++ phy_trigger_machine(phydev); + return; + + ignore: +@@ -890,6 +906,8 @@ void phy_start(struct phy_device *phydev + /* if phy was suspended, bring the physical link up again */ + if (do_resume) + phy_resume(phydev); ++ ++ phy_trigger_machine(phydev); + } + EXPORT_SYMBOL(phy_start); + diff --git a/queue-4.8/net-pktgen-fix-pkt_size.patch b/queue-4.8/net-pktgen-fix-pkt_size.patch new file mode 100644 index 00000000000..6d4320ae196 --- /dev/null +++ b/queue-4.8/net-pktgen-fix-pkt_size.patch @@ -0,0 +1,108 @@ +From foo@baz Thu Nov 10 16:43:03 CET 2016 +From: Paolo Abeni +Date: Fri, 30 Sep 2016 16:56:45 +0200 +Subject: net: pktgen: fix pkt_size + +From: Paolo Abeni + + +[ Upstream commit 63d75463c91a5b5be7c0aca11ceb45ea5a0ae81d ] + +The commit 879c7220e828 ("net: pktgen: Observe needed_headroom +of the device") increased the 'pkt_overhead' field value by +LL_RESERVED_SPACE. +As a side effect the generated packet size, computed as: + + /* Eth + IPh + UDPh + mpls */ + datalen = pkt_dev->cur_pkt_size - 14 - 20 - 8 - + pkt_dev->pkt_overhead; + +is decreased by the same value. +The above changed slightly the behavior of existing pktgen users, +and made the procfs interface somewhat inconsistent. +Fix it by restoring the previous pkt_overhead value and using +LL_RESERVED_SPACE as extralen in skb allocation. +Also, change pktgen_alloc_skb() to only partially reserve +the headroom to allow the caller to prefetch from ll header +start. + +v1 -> v2: + - fixed some typos in the comments + +Fixes: 879c7220e828 ("net: pktgen: Observe needed_headroom of the device") +Suggested-by: Ben Greear +Signed-off-by: Paolo Abeni +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/core/pktgen.c | 21 ++++++++++----------- + 1 file changed, 10 insertions(+), 11 deletions(-) + +--- a/net/core/pktgen.c ++++ b/net/core/pktgen.c +@@ -2286,7 +2286,7 @@ out: + + static inline void set_pkt_overhead(struct pktgen_dev *pkt_dev) + { +- pkt_dev->pkt_overhead = LL_RESERVED_SPACE(pkt_dev->odev); ++ pkt_dev->pkt_overhead = 0; + pkt_dev->pkt_overhead += pkt_dev->nr_labels*sizeof(u32); + pkt_dev->pkt_overhead += VLAN_TAG_SIZE(pkt_dev); + pkt_dev->pkt_overhead += SVLAN_TAG_SIZE(pkt_dev); +@@ -2777,13 +2777,13 @@ static void pktgen_finalize_skb(struct p + } + + static struct sk_buff *pktgen_alloc_skb(struct net_device *dev, +- struct pktgen_dev *pkt_dev, +- unsigned int extralen) ++ struct pktgen_dev *pkt_dev) + { ++ unsigned int extralen = LL_RESERVED_SPACE(dev); + struct sk_buff *skb = NULL; +- unsigned int size = pkt_dev->cur_pkt_size + 64 + extralen + +- pkt_dev->pkt_overhead; ++ unsigned int size; + ++ size = pkt_dev->cur_pkt_size + 64 + extralen + pkt_dev->pkt_overhead; + if (pkt_dev->flags & F_NODE) { + int node = pkt_dev->node >= 0 ? 
pkt_dev->node : numa_node_id(); + +@@ -2796,8 +2796,9 @@ static struct sk_buff *pktgen_alloc_skb( + skb = __netdev_alloc_skb(dev, size, GFP_NOWAIT); + } + ++ /* the caller pre-fetches from skb->data and reserves for the mac hdr */ + if (likely(skb)) +- skb_reserve(skb, LL_RESERVED_SPACE(dev)); ++ skb_reserve(skb, extralen - 16); + + return skb; + } +@@ -2830,16 +2831,14 @@ static struct sk_buff *fill_packet_ipv4( + mod_cur_headers(pkt_dev); + queue_map = pkt_dev->cur_queue_map; + +- datalen = (odev->hard_header_len + 16) & ~0xf; +- +- skb = pktgen_alloc_skb(odev, pkt_dev, datalen); ++ skb = pktgen_alloc_skb(odev, pkt_dev); + if (!skb) { + sprintf(pkt_dev->result, "No memory"); + return NULL; + } + + prefetchw(skb->data); +- skb_reserve(skb, datalen); ++ skb_reserve(skb, 16); + + /* Reserve for ethernet and IP header */ + eth = (__u8 *) skb_push(skb, 14); +@@ -2959,7 +2958,7 @@ static struct sk_buff *fill_packet_ipv6( + mod_cur_headers(pkt_dev); + queue_map = pkt_dev->cur_queue_map; + +- skb = pktgen_alloc_skb(odev, pkt_dev, 16); ++ skb = pktgen_alloc_skb(odev, pkt_dev); + if (!skb) { + sprintf(pkt_dev->result, "No memory"); + return NULL; diff --git a/queue-4.8/net-pktgen-remove-rcu-locking-in-pktgen_change_name.patch b/queue-4.8/net-pktgen-remove-rcu-locking-in-pktgen_change_name.patch new file mode 100644 index 00000000000..4e2098dc2f8 --- /dev/null +++ b/queue-4.8/net-pktgen-remove-rcu-locking-in-pktgen_change_name.patch @@ -0,0 +1,91 @@ +From foo@baz Thu Nov 10 16:43:03 CET 2016 +From: Eric Dumazet +Date: Sat, 15 Oct 2016 17:50:49 +0200 +Subject: net: pktgen: remove rcu locking in pktgen_change_name() + +From: Eric Dumazet + + +[ Upstream commit 9a0b1e8ba4061778897b544afc898de2163382f7 ] + +After Jesper commit back in linux-3.18, we trigger a lockdep +splat in proc_create_data() while allocating memory from +pktgen_change_name(). + +This patch converts t->if_lock to a mutex, since it is now only +used from control path, and adds proper locking to pktgen_change_name() + +1) pktgen_thread_lock to protect the outer loop (iterating threads) +2) t->if_lock to protect the inner loop (iterating devices) + +Note that before Jesper patch, pktgen_change_name() was lacking proper +protection, but lockdep was not able to detect the problem. + +Fixes: 8788370a1d4b ("pktgen: RCU-ify "if_list" to remove lock in next_to_run()") +Reported-by: John Sperbeck +Signed-off-by: Eric Dumazet +Cc: Jesper Dangaard Brouer +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/core/pktgen.c | 17 ++++++++++------- + 1 file changed, 10 insertions(+), 7 deletions(-) + +--- a/net/core/pktgen.c ++++ b/net/core/pktgen.c +@@ -216,8 +216,8 @@ + #define M_QUEUE_XMIT 2 /* Inject packet into qdisc */ + + /* If lock -- protects updating of if_list */ +-#define if_lock(t) spin_lock(&(t->if_lock)); +-#define if_unlock(t) spin_unlock(&(t->if_lock)); ++#define if_lock(t) mutex_lock(&(t->if_lock)); ++#define if_unlock(t) mutex_unlock(&(t->if_lock)); + + /* Used to help with determining the pkts on receive */ + #define PKTGEN_MAGIC 0xbe9be955 +@@ -423,7 +423,7 @@ struct pktgen_net { + }; + + struct pktgen_thread { +- spinlock_t if_lock; /* for list of devices */ ++ struct mutex if_lock; /* for list of devices */ + struct list_head if_list; /* All device here */ + struct list_head th_list; + struct task_struct *tsk; +@@ -2010,11 +2010,13 @@ static void pktgen_change_name(const str + { + struct pktgen_thread *t; + ++ mutex_lock(&pktgen_thread_lock); ++ + list_for_each_entry(t, &pn->pktgen_threads, th_list) { + struct pktgen_dev *pkt_dev; + +- rcu_read_lock(); +- list_for_each_entry_rcu(pkt_dev, &t->if_list, list) { ++ if_lock(t); ++ list_for_each_entry(pkt_dev, &t->if_list, list) { + if (pkt_dev->odev != dev) + continue; + +@@ -2029,8 +2031,9 @@ static void pktgen_change_name(const str + dev->name); + break; + } +- rcu_read_unlock(); ++ if_unlock(t); + } ++ mutex_unlock(&pktgen_thread_lock); + } + + static int pktgen_device_event(struct notifier_block *unused, +@@ -3762,7 +3765,7 @@ static int __net_init pktgen_create_thre + return -ENOMEM; + } + +- spin_lock_init(&t->if_lock); ++ mutex_init(&t->if_lock); + t->cpu = cpu; + + INIT_LIST_HEAD(&t->if_list); diff --git a/queue-4.8/net-sched-act_vlan-push-skb-data-to-mac_header-prior-calling-skb_vlan_-functions.patch b/queue-4.8/net-sched-act_vlan-push-skb-data-to-mac_header-prior-calling-skb_vlan_-functions.patch new file mode 100644 index 00000000000..274a800dda2 --- /dev/null +++ b/queue-4.8/net-sched-act_vlan-push-skb-data-to-mac_header-prior-calling-skb_vlan_-functions.patch @@ -0,0 +1,68 @@ +From foo@baz Thu Nov 10 16:43:03 CET 2016 +From: Shmulik Ladkani +Date: Thu, 29 Sep 2016 12:10:40 +0300 +Subject: net/sched: act_vlan: Push skb->data to mac_header prior calling skb_vlan_*() functions + +From: Shmulik Ladkani + + +[ Upstream commit f39acc84aad10710e89835c60d3b6694c43a8dd9 ] + +Generic skb_vlan_push/skb_vlan_pop functions don't properly handle the +case where the input skb data pointer does not point at the mac header: + +- They're doing push/pop, but fail to properly unwind data back to its + original location. + For example, in the skb_vlan_push case, any subsequent + 'skb_push(skb, skb->mac_len)' calls make the skb->data point 4 bytes + BEFORE start of frame, leading to bogus frames that may be transmitted. + +- They update rcsum per the added/removed 4 bytes tag. + Alas if data is originally after the vlan/eth headers, then these + bytes were already pulled out of the csum. + +OTOH calling skb_vlan_push/skb_vlan_pop with skb->data at mac_header +present no issues. + +act_vlan is the only caller to skb_vlan_*() that has skb->data pointing +at network header (upon ingress). +Other calles (ovs, bpf) already adjust skb->data at mac_header. + +This patch fixes act_vlan to point to the mac_header prior calling +skb_vlan_*() functions, as other callers do. + +Signed-off-by: Shmulik Ladkani +Cc: Daniel Borkmann +Cc: Pravin Shelar +Cc: Jiri Pirko +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/sched/act_vlan.c | 9 +++++++++ + 1 file changed, 9 insertions(+) + +--- a/net/sched/act_vlan.c ++++ b/net/sched/act_vlan.c +@@ -36,6 +36,12 @@ static int tcf_vlan(struct sk_buff *skb, + bstats_update(&v->tcf_bstats, skb); + action = v->tcf_action; + ++ /* Ensure 'data' points at mac_header prior calling vlan manipulating ++ * functions. ++ */ ++ if (skb_at_tc_ingress(skb)) ++ skb_push_rcsum(skb, skb->mac_len); ++ + switch (v->tcfv_action) { + case TCA_VLAN_ACT_POP: + err = skb_vlan_pop(skb); +@@ -57,6 +63,9 @@ drop: + action = TC_ACT_SHOT; + v->tcf_qstats.drops++; + unlock: ++ if (skb_at_tc_ingress(skb)) ++ skb_pull_rcsum(skb, skb->mac_len); ++ + spin_unlock(&v->tcf_lock); + return action; + } diff --git a/queue-4.8/net-sched-filters-fix-notification-of-filter-delete-with-proper-handle.patch b/queue-4.8/net-sched-filters-fix-notification-of-filter-delete-with-proper-handle.patch new file mode 100644 index 00000000000..91a8adcc87e --- /dev/null +++ b/queue-4.8/net-sched-filters-fix-notification-of-filter-delete-with-proper-handle.patch @@ -0,0 +1,80 @@ +From foo@baz Thu Nov 10 16:43:03 CET 2016 +From: Jamal Hadi Salim +Date: Mon, 24 Oct 2016 20:18:27 -0400 +Subject: net sched filters: fix notification of filter delete with proper handle + +From: Jamal Hadi Salim + + +[ Upstream commit 9ee7837449b3d6f0fcf9132c6b5e5aaa58cc67d4 ] + +Daniel says: + +While trying out [1][2], I noticed that tc monitor doesn't show the +correct handle on delete: + +$ tc monitor +qdisc clsact ffff: dev eno1 parent ffff:fff1 +filter dev eno1 ingress protocol all pref 49152 bpf handle 0x2a [...] +deleted filter dev eno1 ingress protocol all pref 49152 bpf handle 0xf3be0c80 + +some context to explain the above: +The user identity of any tc filter is represented by a 32-bit +identifier encoded in tcm->tcm_handle. Example 0x2a in the bpf filter +above. A user wishing to delete, get or even modify a specific filter +uses this handle to reference it. +Every classifier is free to provide its own semantics for the 32 bit handle. +Example: classifiers like u32 use schemes like 800:1:801 to describe +the semantics of their filters represented as hash table, bucket and +node ids etc. +Classifiers also have internal per-filter representation which is different +from this externally visible identity. Most classifiers set this +internal representation to be a pointer address (which allows fast retrieval +of said filters in their implementations). This internal representation +is referenced with the "fh" variable in the kernel control code. + +When a user successfuly deletes a specific filter, by specifying the correct +tcm->tcm_handle, an event is generated to user space which indicates +which specific filter was deleted. + +Before this patch, the "fh" value was sent to user space as the identity. +As an example what is shown in the sample bpf filter delete event above +is 0xf3be0c80. This is infact a 32-bit truncation of 0xffff8807f3be0c80 +which happens to be a 64-bit memory address of the internal filter +representation (address of the corresponding filter's struct cls_bpf_prog); + +After this patch the appropriate user identifiable handle as encoded +in the originating request tcm->tcm_handle is generated in the event. +One of the cardinal rules of netlink rules is to be able to take an +event (such as a delete in this case) and reflect it back to the +kernel and successfully delete the filter. This patch achieves that. 
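To make the handle/fh distinction above concrete, here is a minimal userspace sketch that unpacks a u32 handle into the htid/hash/node triplet using the TC_U32_* macros from include/uapi/linux/pkt_cls.h. The sample handle value is illustrative only, not taken from the patch:

#include <stdio.h>
#include <linux/types.h>
#include <linux/pkt_cls.h>

int main(void)
{
	__u32 handle = 0x80000801;	/* "800::801" in tc's notation */

	/* htid selects the hash table, hash the bucket, node the filter */
	printf("htid 0x%x hash 0x%x node 0x%x\n",
	       TC_U32_USERHTID(handle),
	       TC_U32_HASH(handle),
	       TC_U32_NODE(handle));
	return 0;
}

Whatever scheme a classifier picks, these 32 bits in tcm->tcm_handle are the only identity user space ever sees, which is why the notification must echo them rather than the kernel-internal fh.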
+ +Note, this issue has existed since the original TC action +infrastructure code patch back in 2004 as found in: +https://git.kernel.org/cgit/linux/kernel/git/history/history.git/commit/ + +[1] http://patchwork.ozlabs.org/patch/682828/ +[2] http://patchwork.ozlabs.org/patch/682829/ + +Fixes: 4e54c4816bfe ("[NET]: Add tc extensions infrastructure.") +Reported-by: Daniel Borkmann +Acked-by: Cong Wang +Signed-off-by: Jamal Hadi Salim +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/sched/cls_api.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +--- a/net/sched/cls_api.c ++++ b/net/sched/cls_api.c +@@ -344,7 +344,8 @@ replay: + if (err == 0) { + struct tcf_proto *next = rtnl_dereference(tp->next); + +- tfilter_notify(net, skb, n, tp, fh, RTM_DELTFILTER); ++ tfilter_notify(net, skb, n, tp, ++ t->tcm_handle, RTM_DELTFILTER); + if (tcf_destroy(tp, false)) + RCU_INIT_POINTER(*back, next); + } diff --git a/queue-4.8/net-sctp-forbid-negative-length.patch b/queue-4.8/net-sctp-forbid-negative-length.patch new file mode 100644 index 00000000000..9259d1b3eb8 --- /dev/null +++ b/queue-4.8/net-sctp-forbid-negative-length.patch @@ -0,0 +1,79 @@ +From foo@baz Thu Nov 10 16:43:03 CET 2016 +From: Jiri Slaby +Date: Fri, 21 Oct 2016 14:13:24 +0200 +Subject: net: sctp, forbid negative length + +From: Jiri Slaby + + +[ Upstream commit a4b8e71b05c27bae6bad3bdecddbc6b68a3ad8cf ] + +Most of getsockopt handlers in net/sctp/socket.c check len against +sizeof some structure like: + if (len < sizeof(int)) + return -EINVAL; + +On the first look, the check seems to be correct. But since len is int +and sizeof returns size_t, int gets promoted to unsigned size_t too. So +the test returns false for negative lengths. Yes, (-1 < sizeof(long)) is +false. + +Fix this in sctp by explicitly checking len < 0 before any getsockopt +handler is called. + +Note that sctp_getsockopt_events already handled the negative case. +Since we added the < 0 check elsewhere, this one can be removed. + +If not checked, this is the result: +UBSAN: Undefined behaviour in ../mm/page_alloc.c:2722:19 +shift exponent 52 is too large for 32-bit type 'int' +CPU: 1 PID: 24535 Comm: syz-executor Not tainted 4.8.1-0-syzkaller #1 +Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.9.1-0-gb3ef39f-prebuilt.qemu-project.org 04/01/2014 + 0000000000000000 ffff88006d99f2a8 ffffffffb2f7bdea 0000000041b58ab3 + ffffffffb4363c14 ffffffffb2f7bcde ffff88006d99f2d0 ffff88006d99f270 + 0000000000000000 0000000000000000 0000000000000034 ffffffffb5096422 +Call Trace: + [] ? __ubsan_handle_shift_out_of_bounds+0x29c/0x300 +... + [] ? kmalloc_order+0x24/0x90 + [] ? kmalloc_order_trace+0x24/0x220 + [] ? __kmalloc+0x330/0x540 + [] ? sctp_getsockopt_local_addrs+0x174/0xca0 [sctp] + [] ? sctp_getsockopt+0x10d/0x1b0 [sctp] + [] ? sock_common_getsockopt+0xb9/0x150 + [] ? SyS_getsockopt+0x1a5/0x270 + +Signed-off-by: Jiri Slaby +Cc: Vlad Yasevich +Cc: Neil Horman +Cc: "David S. Miller" +Cc: linux-sctp@vger.kernel.org +Cc: netdev@vger.kernel.org +Acked-by: Neil Horman +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/sctp/socket.c | 5 ++++- + 1 file changed, 4 insertions(+), 1 deletion(-) + +--- a/net/sctp/socket.c ++++ b/net/sctp/socket.c +@@ -4683,7 +4683,7 @@ static int sctp_getsockopt_disable_fragm + static int sctp_getsockopt_events(struct sock *sk, int len, char __user *optval, + int __user *optlen) + { +- if (len <= 0) ++ if (len == 0) + return -EINVAL; + if (len > sizeof(struct sctp_event_subscribe)) + len = sizeof(struct sctp_event_subscribe); +@@ -6426,6 +6426,9 @@ static int sctp_getsockopt(struct sock * + if (get_user(len, optlen)) + return -EFAULT; + ++ if (len < 0) ++ return -EINVAL; ++ + lock_sock(sk); + + switch (optname) { diff --git a/queue-4.8/net_sched-reorder-pernet-ops-and-act-ops-registrations.patch b/queue-4.8/net_sched-reorder-pernet-ops-and-act-ops-registrations.patch new file mode 100644 index 00000000000..856b0102479 --- /dev/null +++ b/queue-4.8/net_sched-reorder-pernet-ops-and-act-ops-registrations.patch @@ -0,0 +1,86 @@ +From foo@baz Thu Nov 10 16:43:03 CET 2016 +From: WANG Cong +Date: Tue, 11 Oct 2016 10:56:45 -0700 +Subject: net_sched: reorder pernet ops and act ops registrations + +From: WANG Cong + + +[ Upstream commit ab102b80cef28c20b3ef7794806c3a982c6444fc ] + +Krister reported a kernel NULL pointer dereference after +tcf_action_init_1() invokes a_o->init(), it is a race condition +where one thread calling tcf_register_action() to initialize +the netns data after putting act ops in the global list and +the other thread searching the list and then calling +a_o->init(net, ...). + +Fix this by moving the pernet ops registration before making +the action ops visible. This is fine because: a) we don't +rely on act_base in pernet ops->init(), b) in the worst case we +have a fully initialized netns but ops is still not ready so +new actions still can't be created. + +Reported-by: Krister Johansen +Tested-by: Krister Johansen +Cc: Jamal Hadi Salim +Signed-off-by: Cong Wang +Acked-by: Jamal Hadi Salim +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/sched/act_api.c | 19 +++++++++++-------- + 1 file changed, 11 insertions(+), 8 deletions(-) + +--- a/net/sched/act_api.c ++++ b/net/sched/act_api.c +@@ -341,22 +341,25 @@ int tcf_register_action(struct tc_action + if (!act->act || !act->dump || !act->init || !act->walk || !act->lookup) + return -EINVAL; + ++ /* We have to register pernet ops before making the action ops visible, ++ * otherwise tcf_action_init_1() could get a partially initialized ++ * netns. 
++ */ ++ ret = register_pernet_subsys(ops); ++ if (ret) ++ return ret; ++ + write_lock(&act_mod_lock); + list_for_each_entry(a, &act_base, head) { + if (act->type == a->type || (strcmp(act->kind, a->kind) == 0)) { + write_unlock(&act_mod_lock); ++ unregister_pernet_subsys(ops); + return -EEXIST; + } + } + list_add_tail(&act->head, &act_base); + write_unlock(&act_mod_lock); + +- ret = register_pernet_subsys(ops); +- if (ret) { +- tcf_unregister_action(act, ops); +- return ret; +- } +- + return 0; + } + EXPORT_SYMBOL(tcf_register_action); +@@ -367,8 +370,6 @@ int tcf_unregister_action(struct tc_acti + struct tc_action_ops *a; + int err = -ENOENT; + +- unregister_pernet_subsys(ops); +- + write_lock(&act_mod_lock); + list_for_each_entry(a, &act_base, head) { + if (a == act) { +@@ -378,6 +379,8 @@ int tcf_unregister_action(struct tc_acti + } + } + write_unlock(&act_mod_lock); ++ if (!err) ++ unregister_pernet_subsys(ops); + return err; + } + EXPORT_SYMBOL(tcf_unregister_action); diff --git a/queue-4.8/netlink-do-not-enter-direct-reclaim-from-netlink_dump.patch b/queue-4.8/netlink-do-not-enter-direct-reclaim-from-netlink_dump.patch new file mode 100644 index 00000000000..103c1935df8 --- /dev/null +++ b/queue-4.8/netlink-do-not-enter-direct-reclaim-from-netlink_dump.patch @@ -0,0 +1,66 @@ +From foo@baz Thu Nov 10 16:43:03 CET 2016 +From: Eric Dumazet +Date: Thu, 6 Oct 2016 04:13:18 +0900 +Subject: netlink: do not enter direct reclaim from netlink_dump() + +From: Eric Dumazet + + +[ Upstream commit d35c99ff77ecb2eb239731b799386f3b3637a31e ] + +Since linux-3.15, netlink_dump() can use up to 16384 bytes skb +allocations. + +Due to struct skb_shared_info ~320 bytes overhead, we end up using +order-3 (on x86) page allocations, that might trigger direct reclaim and +add stress. + +The intent was really to attempt a large allocation but immediately +fallback to a smaller one (order-1 on x86) in case of memory stress. + +On recent kernels (linux-4.4), we can remove __GFP_DIRECT_RECLAIM to +meet the goal. Old kernels would need to remove __GFP_WAIT + +While we are at it, since we do an order-3 allocation, allow to use +all the allocated bytes instead of 16384 to reduce syscalls during +large dumps. + +iproute2 already uses 32KB recvmsg() buffer sizes. + +Alexei provided an initial patch downsizing to SKB_WITH_OVERHEAD(16384) + +Fixes: 9063e21fb026 ("netlink: autosize skb lengthes") +Signed-off-by: Eric Dumazet +Reported-by: Alexei Starovoitov +Cc: Greg Thelen +Reviewed-by: Greg Rose +Acked-by: Alexei Starovoitov +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/netlink/af_netlink.c | 7 ++++--- + 1 file changed, 4 insertions(+), 3 deletions(-) + +--- a/net/netlink/af_netlink.c ++++ b/net/netlink/af_netlink.c +@@ -1832,7 +1832,7 @@ static int netlink_recvmsg(struct socket + /* Record the max length of recvmsg() calls for future allocations */ + nlk->max_recvmsg_len = max(nlk->max_recvmsg_len, len); + nlk->max_recvmsg_len = min_t(size_t, nlk->max_recvmsg_len, +- 16384); ++ SKB_WITH_OVERHEAD(32768)); + + copied = data_skb->len; + if (len < copied) { +@@ -2083,8 +2083,9 @@ static int netlink_dump(struct sock *sk) + + if (alloc_min_size < nlk->max_recvmsg_len) { + alloc_size = nlk->max_recvmsg_len; +- skb = alloc_skb(alloc_size, GFP_KERNEL | +- __GFP_NOWARN | __GFP_NORETRY); ++ skb = alloc_skb(alloc_size, ++ (GFP_KERNEL & ~__GFP_DIRECT_RECLAIM) | ++ __GFP_NOWARN | __GFP_NORETRY); + } + if (!skb) { + alloc_size = alloc_min_size; diff --git a/queue-4.8/netvsc-fix-incorrect-receive-checksum-offloading.patch b/queue-4.8/netvsc-fix-incorrect-receive-checksum-offloading.patch new file mode 100644 index 00000000000..e1f2569ff68 --- /dev/null +++ b/queue-4.8/netvsc-fix-incorrect-receive-checksum-offloading.patch @@ -0,0 +1,58 @@ +From foo@baz Thu Nov 10 16:43:03 CET 2016 +From: Stephen Hemminger +Date: Sun, 23 Oct 2016 21:32:47 -0700 +Subject: netvsc: fix incorrect receive checksum offloading + +From: Stephen Hemminger + + +[ Upstream commit e52fed7177f74382f742c27de2cc5314790aebb6 ] + +The Hyper-V netvsc driver was looking at the incorrect status bits +in the checksum info. It was setting the receive checksum unnecessary +flag based on the IP header checksum being correct. The checksum +flag is skb is about TCP and UDP checksum status. Because of this +bug, any packet received with bad TCP checksum would be passed +up the stack and to the application causing data corruption. +The problem is reproducible via netcat and netem. + +This had a side effect of not doing receive checksum offload +on IPv6. The driver was also also always doing checksum offload +independent of the checksum setting done via ethtool. + +Signed-off-by: Stephen Hemminger +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/hyperv/netvsc_drv.c | 19 +++++++++++-------- + 1 file changed, 11 insertions(+), 8 deletions(-) + +--- a/drivers/net/hyperv/netvsc_drv.c ++++ b/drivers/net/hyperv/netvsc_drv.c +@@ -624,15 +624,18 @@ static struct sk_buff *netvsc_alloc_recv + packet->total_data_buflen); + + skb->protocol = eth_type_trans(skb, net); +- if (csum_info) { +- /* We only look at the IP checksum here. +- * Should we be dropping the packet if checksum +- * failed? How do we deal with other checksums - TCP/UDP? +- */ +- if (csum_info->receive.ip_checksum_succeeded) ++ ++ /* skb is already created with CHECKSUM_NONE */ ++ skb_checksum_none_assert(skb); ++ ++ /* ++ * In Linux, the IP checksum is always checked. ++ * Do L4 checksum offload if enabled and present. 
++ */ ++ if (csum_info && (net->features & NETIF_F_RXCSUM)) { ++ if (csum_info->receive.tcp_checksum_succeeded || ++ csum_info->receive.udp_checksum_succeeded) + skb->ip_summed = CHECKSUM_UNNECESSARY; +- else +- skb->ip_summed = CHECKSUM_NONE; + } + + if (vlan_tci & VLAN_TAG_PRESENT) diff --git a/queue-4.8/packet-call-fanout_release-while-unregistering-a-netdev.patch b/queue-4.8/packet-call-fanout_release-while-unregistering-a-netdev.patch new file mode 100644 index 00000000000..42925a2374f --- /dev/null +++ b/queue-4.8/packet-call-fanout_release-while-unregistering-a-netdev.patch @@ -0,0 +1,36 @@ +From foo@baz Thu Nov 10 16:43:03 CET 2016 +From: Anoob Soman +Date: Wed, 5 Oct 2016 15:12:54 +0100 +Subject: packet: call fanout_release, while UNREGISTERING a netdev + +From: Anoob Soman + + +[ Upstream commit 6664498280cf17a59c3e7cf1a931444c02633ed1 ] + +If a socket has FANOUT sockopt set, a new proto_hook is registered +as part of fanout_add(). When processing a NETDEV_UNREGISTER event in +af_packet, __fanout_unlink is called for all sockets, but prot_hook which was +registered as part of fanout_add is not removed. Call fanout_release, on a +NETDEV_UNREGISTER, which removes prot_hook and removes fanout from the +fanout_list. + +This fixes BUG_ON(!list_empty(&dev->ptype_specific)) in netdev_run_todo() + +Signed-off-by: Anoob Soman +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/packet/af_packet.c | 1 + + 1 file changed, 1 insertion(+) + +--- a/net/packet/af_packet.c ++++ b/net/packet/af_packet.c +@@ -3952,6 +3952,7 @@ static int packet_notifier(struct notifi + } + if (msg == NETDEV_UNREGISTER) { + packet_cached_dev_reset(po); ++ fanout_release(sk); + po->ifindex = -1; + if (po->prot_hook.dev) + dev_put(po->prot_hook.dev); diff --git a/queue-4.8/packet-on-direct_xmit-limit-tso-and-csum-to-supported-devices.patch b/queue-4.8/packet-on-direct_xmit-limit-tso-and-csum-to-supported-devices.patch new file mode 100644 index 00000000000..e4c7534a5e5 --- /dev/null +++ b/queue-4.8/packet-on-direct_xmit-limit-tso-and-csum-to-supported-devices.patch @@ -0,0 +1,96 @@ +From foo@baz Thu Nov 10 16:43:03 CET 2016 +From: Willem de Bruijn +Date: Wed, 26 Oct 2016 11:23:07 -0400 +Subject: packet: on direct_xmit, limit tso and csum to supported devices + +From: Willem de Bruijn + + +[ Upstream commit 104ba78c98808ae837d1f63aae58c183db5505df ] + +When transmitting on a packet socket with PACKET_VNET_HDR and +PACKET_QDISC_BYPASS, validate device support for features requested +in vnet_hdr. + +Drop TSO packets sent to devices that do not support TSO or have the +feature disabled. Note that the latter currently do process those +packets correctly, regardless of not advertising the feature. + +Because of SKB_GSO_DODGY, it is not sufficient to test device features +with netif_needs_gso. Full validate_xmit_skb is needed. + +Switch to software checksum for non-TSO packets that request checksum +offload if that device feature is unsupported or disabled. Note that +similar to the TSO case, device drivers may perform checksum offload +correctly even when not advertising it. + +When switching to software checksum, packets hit skb_checksum_help, +which has two BUG_ON checksum not in linear segment. Packet sockets +always allocate at least up to csum_start + csum_off + 2 as linear. 
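The software fallback described above boils down to one pattern in the xmit path. A minimal sketch, assuming kernel context and a hypothetical helper name, loosely modeled on the fallback inside validate_xmit_skb() rather than quoting it:

#include <linux/skbuff.h>
#include <linux/netdevice.h>

/* If the skb asks for checksum offload but the device lacks (or has
 * disabled) the feature, finish the checksum in software instead;
 * skb_checksum_help() returns 0 on success.
 */
static struct sk_buff *resolve_tx_csum(struct sk_buff *skb,
				       netdev_features_t features)
{
	if (skb->ip_summed == CHECKSUM_PARTIAL &&
	    !(features & NETIF_F_CSUM_MASK) &&
	    skb_checksum_help(skb)) {
		kfree_skb(skb);	/* checksum could not be resolved */
		return NULL;
	}
	return skb;
}

skb_checksum_help() is also where the BUG_ON mentioned above lives: it insists that the bytes from csum_start onward sit in the linear area, which packet sockets guarantee by construction.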
+ +Tested by running github.com/wdebruij/kerneltools/psock_txring_vnet.c + + ethtool -K eth0 tso off tx on + psock_txring_vnet -d $dst -s $src -i eth0 -l 2000 -n 1 -q -v + psock_txring_vnet -d $dst -s $src -i eth0 -l 2000 -n 1 -q -v -N + + ethtool -K eth0 tx off + psock_txring_vnet -d $dst -s $src -i eth0 -l 1000 -n 1 -q -v -G + psock_txring_vnet -d $dst -s $src -i eth0 -l 1000 -n 1 -q -v -G -N + +v2: + - add EXPORT_SYMBOL_GPL(validate_xmit_skb_list) + +Fixes: d346a3fae3ff ("packet: introduce PACKET_QDISC_BYPASS socket option") +Signed-off-by: Willem de Bruijn +Acked-by: Eric Dumazet +Acked-by: Daniel Borkmann +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/core/dev.c | 1 + + net/packet/af_packet.c | 9 ++++----- + 2 files changed, 5 insertions(+), 5 deletions(-) + +--- a/net/core/dev.c ++++ b/net/core/dev.c +@@ -3035,6 +3035,7 @@ struct sk_buff *validate_xmit_skb_list(s + } + return head; + } ++EXPORT_SYMBOL_GPL(validate_xmit_skb_list); + + static void qdisc_pkt_len_init(struct sk_buff *skb) + { +--- a/net/packet/af_packet.c ++++ b/net/packet/af_packet.c +@@ -250,7 +250,7 @@ static void __fanout_link(struct sock *s + static int packet_direct_xmit(struct sk_buff *skb) + { + struct net_device *dev = skb->dev; +- netdev_features_t features; ++ struct sk_buff *orig_skb = skb; + struct netdev_queue *txq; + int ret = NETDEV_TX_BUSY; + +@@ -258,9 +258,8 @@ static int packet_direct_xmit(struct sk_ + !netif_carrier_ok(dev))) + goto drop; + +- features = netif_skb_features(skb); +- if (skb_needs_linearize(skb, features) && +- __skb_linearize(skb)) ++ skb = validate_xmit_skb_list(skb, dev); ++ if (skb != orig_skb) + goto drop; + + txq = skb_get_tx_queue(dev, skb); +@@ -280,7 +279,7 @@ static int packet_direct_xmit(struct sk_ + return ret; + drop: + atomic_long_inc(&dev->tx_dropped); +- kfree_skb(skb); ++ kfree_skb_list(skb); + return NET_XMIT_DROP; + } + diff --git a/queue-4.8/rtnetlink-add-rtnexthop-offload-flag-to-compare-mask.patch b/queue-4.8/rtnetlink-add-rtnexthop-offload-flag-to-compare-mask.patch new file mode 100644 index 00000000000..788ef4dfe10 --- /dev/null +++ b/queue-4.8/rtnetlink-add-rtnexthop-offload-flag-to-compare-mask.patch @@ -0,0 +1,33 @@ +From foo@baz Thu Nov 10 16:43:03 CET 2016 +From: Jiri Pirko +Date: Tue, 18 Oct 2016 18:59:34 +0200 +Subject: rtnetlink: Add rtnexthop offload flag to compare mask + +From: Jiri Pirko + + +[ Upstream commit 85dda4e5b0ee1f5b4e8cc93d39e475006bc61ccd ] + +The offload flag is a status flag and should not be used by +FIB semantics for comparison. + +Fixes: 37ed9493699c ("rtnetlink: add RTNH_F_EXTERNAL flag for fib offload") +Signed-off-by: Jiri Pirko +Reviewed-by: Andy Gospodarek +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + include/uapi/linux/rtnetlink.h | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/include/uapi/linux/rtnetlink.h ++++ b/include/uapi/linux/rtnetlink.h +@@ -350,7 +350,7 @@ struct rtnexthop { + #define RTNH_F_OFFLOAD 8 /* offloaded route */ + #define RTNH_F_LINKDOWN 16 /* carrier-down on nexthop */ + +-#define RTNH_COMPARE_MASK (RTNH_F_DEAD | RTNH_F_LINKDOWN) ++#define RTNH_COMPARE_MASK (RTNH_F_DEAD | RTNH_F_LINKDOWN | RTNH_F_OFFLOAD) + + /* Macros to handle hexthops */ + diff --git a/queue-4.8/sctp-fix-the-panic-caused-by-route-update.patch b/queue-4.8/sctp-fix-the-panic-caused-by-route-update.patch new file mode 100644 index 00000000000..b2f43f46295 --- /dev/null +++ b/queue-4.8/sctp-fix-the-panic-caused-by-route-update.patch @@ -0,0 +1,58 @@ +From foo@baz Thu Nov 10 16:43:03 CET 2016 +From: Xin Long +Date: Mon, 24 Oct 2016 01:01:09 +0800 +Subject: sctp: fix the panic caused by route update + +From: Xin Long + + +[ Upstream commit ecc515d7238f2cffac839069d56dc271141defa0 ] + +Commit 7303a1475008 ("sctp: identify chunks that need to be fragmented +at IP level") made the chunk be fragmented at IP level in the next round +if it's size exceed PMTU. + +But there still is another case, PMTU can be updated if transport's dst +expires and transport's pmtu_pending is set in sctp_packet_transmit. If +the new PMTU is less than the chunk, the same issue with that commit can +be triggered. + +So we should drop this packet and let it retransmit in another round +where it would be fragmented at IP level. + +This patch is to fix it by checking the chunk size after PMTU may be +updated and dropping this packet if it's size exceed PMTU. + +Fixes: 90017accff61 ("sctp: Add GSO support") +Signed-off-by: Xin Long +Acked-by: Neil Horman +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/sctp/output.c | 8 +++++++- + 1 file changed, 7 insertions(+), 1 deletion(-) + +--- a/net/sctp/output.c ++++ b/net/sctp/output.c +@@ -417,6 +417,7 @@ int sctp_packet_transmit(struct sctp_pac + __u8 has_data = 0; + int gso = 0; + int pktcount = 0; ++ int auth_len = 0; + struct dst_entry *dst; + unsigned char *auth = NULL; /* pointer to auth in skb data */ + +@@ -505,7 +506,12 @@ int sctp_packet_transmit(struct sctp_pac + list_for_each_entry(chunk, &packet->chunk_list, list) { + int padded = WORD_ROUND(chunk->skb->len); + +- if (pkt_size + padded > tp->pathmtu) ++ if (chunk == packet->auth) ++ auth_len = padded; ++ else if (auth_len + padded + packet->overhead > ++ tp->pathmtu) ++ goto nomem; ++ else if (pkt_size + padded > tp->pathmtu) + break; + pkt_size += padded; + } diff --git a/queue-4.8/sctp-validate-chunk-len-before-actually-using-it.patch b/queue-4.8/sctp-validate-chunk-len-before-actually-using-it.patch new file mode 100644 index 00000000000..069e3a024fc --- /dev/null +++ b/queue-4.8/sctp-validate-chunk-len-before-actually-using-it.patch @@ -0,0 +1,58 @@ +From foo@baz Thu Nov 10 16:43:03 CET 2016 +From: Marcelo Ricardo Leitner +Date: Tue, 25 Oct 2016 14:27:39 -0200 +Subject: sctp: validate chunk len before actually using it + +From: Marcelo Ricardo Leitner + + +[ Upstream commit bf911e985d6bbaa328c20c3e05f4eb03de11fdd6 ] + +Andrey Konovalov reported that KASAN detected that SCTP was using a slab +beyond the boundaries. It was caused because when handling out of the +blue packets in function sctp_sf_ootb() it was checking the chunk len +only after already processing the first chunk, validating only for the +2nd and subsequent ones. 
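The same validate-before-use discipline applies to any walk over length-prefixed records. A self-contained sketch of the corrected loop shape in plain C, with hypothetical names (PAD4 stands in for SCTP's WORD_ROUND):

#include <stdint.h>
#include <stddef.h>
#include <arpa/inet.h>

struct chunkhdr {		/* mirrors sctp_chunkhdr_t's layout */
	uint8_t  type;
	uint8_t  flags;
	uint16_t length;	/* network order, includes this header */
};

#define PAD4(len) (((len) + 3U) & ~3U)

/* Return 0 if every chunk in buf[0..len) is sane, -1 on the first
 * malformed one. Every chunk, including the first, is bounds-checked
 * before anything acts on it. */
static int walk_chunks(const uint8_t *buf, size_t len)
{
	const uint8_t *end = buf + len;
	const uint8_t *ch = buf;

	while (ch < end) {
		size_t raw, clen;

		if ((size_t)(end - ch) < sizeof(struct chunkhdr))
			return -1;	/* header itself does not fit */

		raw = ntohs(((const struct chunkhdr *)ch)->length);
		if (raw < sizeof(struct chunkhdr))
			return -1;	/* truncated or zero length */

		clen = PAD4(raw);
		if (clen > (size_t)(end - ch))
			return -1;	/* length escapes the buffer */

		/* only here is the chunk safe to process */
		ch += clen;
	}
	return 0;
}

Rejecting a raw length smaller than the header is also what keeps a zero-length chunk from spinning the loop forever.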
+ +The fix is to just move the check upwards so it's also validated for the +1st chunk. + +Reported-by: Andrey Konovalov +Tested-by: Andrey Konovalov +Signed-off-by: Marcelo Ricardo Leitner +Reviewed-by: Xin Long +Acked-by: Neil Horman +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/sctp/sm_statefuns.c | 12 ++++++------ + 1 file changed, 6 insertions(+), 6 deletions(-) + +--- a/net/sctp/sm_statefuns.c ++++ b/net/sctp/sm_statefuns.c +@@ -3422,6 +3422,12 @@ sctp_disposition_t sctp_sf_ootb(struct n + return sctp_sf_violation_chunklen(net, ep, asoc, type, arg, + commands); + ++ /* Report violation if chunk len overflows */ ++ ch_end = ((__u8 *)ch) + WORD_ROUND(ntohs(ch->length)); ++ if (ch_end > skb_tail_pointer(skb)) ++ return sctp_sf_violation_chunklen(net, ep, asoc, type, arg, ++ commands); ++ + /* Now that we know we at least have a chunk header, + * do things that are type appropriate. + */ +@@ -3453,12 +3459,6 @@ sctp_disposition_t sctp_sf_ootb(struct n + } + } + +- /* Report violation if chunk len overflows */ +- ch_end = ((__u8 *)ch) + WORD_ROUND(ntohs(ch->length)); +- if (ch_end > skb_tail_pointer(skb)) +- return sctp_sf_violation_chunklen(net, ep, asoc, type, arg, +- commands); +- + ch = (sctp_chunkhdr_t *) ch_end; + } while (ch_end < skb_tail_pointer(skb)); + diff --git a/queue-4.8/switchdev-execute-bridge-ndos-only-for-bridge-ports.patch b/queue-4.8/switchdev-execute-bridge-ndos-only-for-bridge-ports.patch new file mode 100644 index 00000000000..7f21c39bf47 --- /dev/null +++ b/queue-4.8/switchdev-execute-bridge-ndos-only-for-bridge-ports.patch @@ -0,0 +1,104 @@ +From foo@baz Thu Nov 10 16:43:03 CET 2016 +From: Ido Schimmel +Date: Tue, 18 Oct 2016 18:50:23 +0200 +Subject: switchdev: Execute bridge ndos only for bridge ports + +From: Ido Schimmel + + +[ Upstream commit 97c242902c209e7d46e365335db5202634484dcb ] + +We recently got the following warning after setting up a vlan device on +top of an offloaded bridge and executing 'bridge link': + +WARNING: CPU: 0 PID: 18566 at drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c:81 mlxsw_sp_port_orig_get.part.9+0x55/0x70 [mlxsw_spectrum] +[...] + CPU: 0 PID: 18566 Comm: bridge Not tainted 4.8.0-rc7 #1 + Hardware name: Mellanox Technologies Ltd. Mellanox switch/Mellanox switch, BIOS 4.6.5 05/21/2015 + 0000000000000286 00000000e64ab94f ffff880406e6f8f0 ffffffff8135eaa3 + 0000000000000000 0000000000000000 ffff880406e6f930 ffffffff8108c43b + 0000005106e6f988 ffff8803df398840 ffff880403c60108 ffff880406e6f990 + Call Trace: + [] dump_stack+0x63/0x90 + [] __warn+0xcb/0xf0 + [] warn_slowpath_null+0x1d/0x20 + [] mlxsw_sp_port_orig_get.part.9+0x55/0x70 [mlxsw_spectrum] + [] mlxsw_sp_port_attr_get+0xa5/0xb0 [mlxsw_spectrum] + [] switchdev_port_attr_get+0x4f/0x140 + [] switchdev_port_attr_get+0x100/0x140 + [] switchdev_port_attr_get+0x100/0x140 + [] switchdev_port_bridge_getlink+0x5b/0xc0 + [] ? switchdev_port_fdb_dump+0x90/0x90 + [] rtnl_bridge_getlink+0xe7/0x190 + [] netlink_dump+0x122/0x290 + [] __netlink_dump_start+0x15f/0x190 + [] ? rtnl_bridge_dellink+0x230/0x230 + [] rtnetlink_rcv_msg+0x1a6/0x220 + [] ? __kmalloc_node_track_caller+0x208/0x2c0 + [] ? rtnl_bridge_dellink+0x230/0x230 + [] ? rtnl_newlink+0x890/0x890 + [] netlink_rcv_skb+0xa4/0xc0 + [] rtnetlink_rcv+0x28/0x30 + [] netlink_unicast+0x18c/0x240 + [] netlink_sendmsg+0x2fb/0x3a0 + [] sock_sendmsg+0x38/0x50 + [] SYSC_sendto+0x101/0x190 + [] ? 
__sys_recvmsg+0x51/0x90 + [] SyS_sendto+0xe/0x10 + [] entry_SYSCALL_64_fastpath+0x1a/0xa4 + +The problem is that the 8021q module propagates the call to +ndo_bridge_getlink() via switchdev ops, but the switch driver doesn't +recognize the netdev, as it's not offloaded. + +While we can ignore calls being made to non-bridge ports inside the +driver, a better fix would be to push this check up to the switchdev +layer. + +Note that these ndos can be called for non-bridged netdev, but this only +happens in certain PF drivers which don't call the corresponding +switchdev functions anyway. + +Fixes: 99f44bb3527b ("mlxsw: spectrum: Enable L3 interfaces on top of bridge devices") +Signed-off-by: Ido Schimmel +Reported-by: Tamir Winetroub +Tested-by: Tamir Winetroub +Signed-off-by: Jiri Pirko +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/switchdev/switchdev.c | 9 +++++++++ + 1 file changed, 9 insertions(+) + +--- a/net/switchdev/switchdev.c ++++ b/net/switchdev/switchdev.c +@@ -774,6 +774,9 @@ int switchdev_port_bridge_getlink(struct + u32 mask = BR_LEARNING | BR_LEARNING_SYNC | BR_FLOOD; + int err; + ++ if (!netif_is_bridge_port(dev)) ++ return -EOPNOTSUPP; ++ + err = switchdev_port_attr_get(dev, &attr); + if (err && err != -EOPNOTSUPP) + return err; +@@ -929,6 +932,9 @@ int switchdev_port_bridge_setlink(struct + struct nlattr *afspec; + int err = 0; + ++ if (!netif_is_bridge_port(dev)) ++ return -EOPNOTSUPP; ++ + protinfo = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), + IFLA_PROTINFO); + if (protinfo) { +@@ -962,6 +968,9 @@ int switchdev_port_bridge_dellink(struct + { + struct nlattr *afspec; + ++ if (!netif_is_bridge_port(dev)) ++ return -EOPNOTSUPP; ++ + afspec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), + IFLA_AF_SPEC); + if (afspec) diff --git a/queue-4.8/udp-fix-ip_checksum-handling.patch b/queue-4.8/udp-fix-ip_checksum-handling.patch new file mode 100644 index 00000000000..470ac91d8ee --- /dev/null +++ b/queue-4.8/udp-fix-ip_checksum-handling.patch @@ -0,0 +1,118 @@ +From foo@baz Thu Nov 10 16:43:03 CET 2016 +From: Eric Dumazet +Date: Sun, 23 Oct 2016 18:03:06 -0700 +Subject: udp: fix IP_CHECKSUM handling + +From: Eric Dumazet + + +[ Upstream commit 10df8e6152c6c400a563a673e9956320bfce1871 ] + +First bug was added in commit ad6f939ab193 ("ip: Add offset parameter to +ip_cmsg_recv") : Tom missed that ipv4 udp messages could be received on +AF_INET6 socket. ip_cmsg_recv(msg, skb) should have been replaced by +ip_cmsg_recv_offset(msg, skb, sizeof(struct udphdr)); + +Then commit e6afc8ace6dd ("udp: remove headers from UDP packets before +queueing") forgot to adjust the offsets now UDP headers are pulled +before skb are put in receive queue. + +Fixes: ad6f939ab193 ("ip: Add offset parameter to ip_cmsg_recv") +Fixes: e6afc8ace6dd ("udp: remove headers from UDP packets before queueing") +Signed-off-by: Eric Dumazet +Cc: Sam Kumar +Cc: Willem de Bruijn +Tested-by: Willem de Bruijn +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + include/net/ip.h | 4 ++-- + net/ipv4/ip_sockglue.c | 11 ++++++----- + net/ipv4/udp.c | 2 +- + net/ipv6/udp.c | 3 ++- + 4 files changed, 11 insertions(+), 9 deletions(-) + +--- a/include/net/ip.h ++++ b/include/net/ip.h +@@ -549,7 +549,7 @@ int ip_options_rcv_srr(struct sk_buff *s + */ + + void ipv4_pktinfo_prepare(const struct sock *sk, struct sk_buff *skb); +-void ip_cmsg_recv_offset(struct msghdr *msg, struct sk_buff *skb, int offset); ++void ip_cmsg_recv_offset(struct msghdr *msg, struct sk_buff *skb, int tlen, int offset); + int ip_cmsg_send(struct sock *sk, struct msghdr *msg, + struct ipcm_cookie *ipc, bool allow_ipv6); + int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval, +@@ -571,7 +571,7 @@ void ip_local_error(struct sock *sk, int + + static inline void ip_cmsg_recv(struct msghdr *msg, struct sk_buff *skb) + { +- ip_cmsg_recv_offset(msg, skb, 0); ++ ip_cmsg_recv_offset(msg, skb, 0, 0); + } + + bool icmp_global_allow(void); +--- a/net/ipv4/ip_sockglue.c ++++ b/net/ipv4/ip_sockglue.c +@@ -98,7 +98,7 @@ static void ip_cmsg_recv_retopts(struct + } + + static void ip_cmsg_recv_checksum(struct msghdr *msg, struct sk_buff *skb, +- int offset) ++ int tlen, int offset) + { + __wsum csum = skb->csum; + +@@ -106,8 +106,9 @@ static void ip_cmsg_recv_checksum(struct + return; + + if (offset != 0) +- csum = csum_sub(csum, csum_partial(skb_transport_header(skb), +- offset, 0)); ++ csum = csum_sub(csum, ++ csum_partial(skb_transport_header(skb) + tlen, ++ offset, 0)); + + put_cmsg(msg, SOL_IP, IP_CHECKSUM, sizeof(__wsum), &csum); + } +@@ -153,7 +154,7 @@ static void ip_cmsg_recv_dstaddr(struct + } + + void ip_cmsg_recv_offset(struct msghdr *msg, struct sk_buff *skb, +- int offset) ++ int tlen, int offset) + { + struct inet_sock *inet = inet_sk(skb->sk); + unsigned int flags = inet->cmsg_flags; +@@ -216,7 +217,7 @@ void ip_cmsg_recv_offset(struct msghdr * + } + + if (flags & IP_CMSG_CHECKSUM) +- ip_cmsg_recv_checksum(msg, skb, offset); ++ ip_cmsg_recv_checksum(msg, skb, tlen, offset); + } + EXPORT_SYMBOL(ip_cmsg_recv_offset); + +--- a/net/ipv4/udp.c ++++ b/net/ipv4/udp.c +@@ -1327,7 +1327,7 @@ try_again: + *addr_len = sizeof(*sin); + } + if (inet->cmsg_flags) +- ip_cmsg_recv_offset(msg, skb, sizeof(struct udphdr) + off); ++ ip_cmsg_recv_offset(msg, skb, sizeof(struct udphdr), off); + + err = copied; + if (flags & MSG_TRUNC) +--- a/net/ipv6/udp.c ++++ b/net/ipv6/udp.c +@@ -427,7 +427,8 @@ try_again: + + if (is_udp4) { + if (inet->cmsg_flags) +- ip_cmsg_recv(msg, skb); ++ ip_cmsg_recv_offset(msg, skb, ++ sizeof(struct udphdr), off); + } else { + if (np->rxopt.all) + ip6_datagram_recv_specific_ctl(sk, msg, skb);
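To see what these offsets feed, here is a hedged sketch of a receiver that asks for the checksum ancillary data fixed above. Error handling is trimmed and the port number is arbitrary; IP_CHECKSUM comes from include/uapi/linux/in.h (a fallback define is provided for older userspace headers), and IPPROTO_IP is used as the level since it equals SOL_IP on Linux:

#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <netinet/in.h>
#include <arpa/inet.h>

#ifndef IP_CHECKSUM
#define IP_CHECKSUM 23		/* value from include/uapi/linux/in.h */
#endif

int main(void)
{
	int fd = socket(AF_INET, SOCK_DGRAM, 0);
	int one = 1;
	struct sockaddr_in sin = {
		.sin_family = AF_INET,
		.sin_port = htons(12345),
		.sin_addr.s_addr = htonl(INADDR_ANY),
	};
	char data[2048];
	char cbuf[CMSG_SPACE(sizeof(uint32_t))];
	struct iovec iov = { .iov_base = data, .iov_len = sizeof(data) };
	struct msghdr msg = {
		.msg_iov = &iov, .msg_iovlen = 1,
		.msg_control = cbuf, .msg_controllen = sizeof(cbuf),
	};
	struct cmsghdr *cm;

	setsockopt(fd, IPPROTO_IP, IP_CHECKSUM, &one, sizeof(one));
	bind(fd, (struct sockaddr *)&sin, sizeof(sin));

	if (recvmsg(fd, &msg, 0) < 0)
		return 1;

	for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm)) {
		if (cm->cmsg_level == IPPROTO_IP &&
		    cm->cmsg_type == IP_CHECKSUM) {
			uint32_t csum;	/* unfolded 32-bit sum (__wsum) */

			memcpy(&csum, CMSG_DATA(cm), sizeof(csum));
			printf("payload checksum: 0x%08x\n", csum);
		}
	}
	return 0;
}

With both fixes in place, the sum delivered here is taken from the start of the UDP payload, on AF_INET and AF_INET6 sockets alike, instead of being skewed by a header that was already pulled.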