From: Greg Kroah-Hartman Date: Mon, 27 Mar 2017 16:19:17 +0000 (+0200) Subject: 4.10-stable patches X-Git-Tag: v4.4.58~21 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=ad3041a80acc60fef462ba07c5d0014897c99a4d;p=thirdparty%2Fkernel%2Fstable-queue.git 4.10-stable patches added patches: amd-xgbe-fix-jumbo-mtu-processing-on-newer-hardware.patch amd-xgbe-fix-the-ecc-related-bit-position-definitions.patch genetlink-fix-counting-regression-on-ctrl_dumpfamily.patch ipv4-provide-stronger-user-input-validation-in-nl_fib_input.patch ipv6-make-sure-to-initialize-sockc.tsflags-before-first-use.patch net-bcmgenet-do-not-suspend-phy-if-wake-on-lan-is-enabled.patch net-bcmgenet-remove-bcmgenet_internal_phy_setup.patch net-mlx5-add-missing-entries-for-set-query-rate-limit-commands.patch net-mlx5-e-switch-don-t-allow-changing-inline-mode-when-flows-are-configured.patch net-mlx5-increase-number-of-max-qps-in-default-profile.patch net-mlx5e-avoid-supporting-udp-tunnel-port-ndo-for-vf-reps.patch net-mlx5e-change-the-tc-offload-rule-add-del-code-path-to-be-per-nic-or-e-switch.patch net-mlx5e-count-gso-packets-correctly.patch net-mlx5e-count-lro-packets-correctly.patch net-mlx5e-use-the-proper-uapi-values-when-offloading-tc-vlan-actions.patch net-openvswitch-set-the-ipv6-source-tunnel-key-address-attribute-correctly.patch net-properly-release-sk_frag.page.patch net-solve-a-napi-race.patch net-unix-properly-re-increment-inflight-counter-of-gc-discarded-candidates.patch net-vrf-reset-rt6i_idev-in-local-dst-after-put.patch openvswitch-add-missing-case-ovs_tunnel_key_attr_pad.patch qmi_wwan-add-dell-dw5811e.patch socket-bpf-fix-sk_filter-use-after-free-in-sk_clone_lock.patch tcp-initialize-icsk_ack.lrcvtime-at-session-start-time.patch --- diff --git a/queue-4.10/amd-xgbe-fix-jumbo-mtu-processing-on-newer-hardware.patch b/queue-4.10/amd-xgbe-fix-jumbo-mtu-processing-on-newer-hardware.patch new file mode 100644 index 00000000000..2ed78ba9902 --- /dev/null +++ 
b/queue-4.10/amd-xgbe-fix-jumbo-mtu-processing-on-newer-hardware.patch @@ -0,0 +1,284 @@ +From foo@baz Mon Mar 27 18:18:08 CEST 2017 +From: "Lendacky, Thomas" +Date: Wed, 15 Mar 2017 15:11:23 -0500 +Subject: amd-xgbe: Fix jumbo MTU processing on newer hardware + +From: "Lendacky, Thomas" + + +[ Upstream commit 622c36f143fc9566ba49d7cec994c2da1182d9e2 ] + +Newer hardware does not provide a cumulative payload length when multiple +descriptors are needed to handle the data. Once the MTU increases beyond +the size that can be handled by a single descriptor, the SKB does not get +built properly by the driver. + +The driver will now calculate the size of the data buffers used by the +hardware. The first buffer of the first descriptor is for packet headers +or packet headers and data when the headers can't be split. Subsequent +descriptors in a multi-descriptor chain will not use the first buffer. The +second buffer is used by all the descriptors in the chain for payload data. +Based on whether the driver is processing the first, intermediate, or last +descriptor it can calculate the buffer usage and build the SKB properly. + +Tested and verified on both old and new hardware. + +Signed-off-by: Tom Lendacky +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/amd/xgbe/xgbe-common.h | 6 + + drivers/net/ethernet/amd/xgbe/xgbe-dev.c | 20 +++-- + drivers/net/ethernet/amd/xgbe/xgbe-drv.c | 102 +++++++++++++++++----------- + 3 files changed, 78 insertions(+), 50 deletions(-) + +--- a/drivers/net/ethernet/amd/xgbe/xgbe-common.h ++++ b/drivers/net/ethernet/amd/xgbe/xgbe-common.h +@@ -1148,8 +1148,8 @@ + #define RX_PACKET_ATTRIBUTES_CSUM_DONE_WIDTH 1 + #define RX_PACKET_ATTRIBUTES_VLAN_CTAG_INDEX 1 + #define RX_PACKET_ATTRIBUTES_VLAN_CTAG_WIDTH 1 +-#define RX_PACKET_ATTRIBUTES_INCOMPLETE_INDEX 2 +-#define RX_PACKET_ATTRIBUTES_INCOMPLETE_WIDTH 1 ++#define RX_PACKET_ATTRIBUTES_LAST_INDEX 2 ++#define RX_PACKET_ATTRIBUTES_LAST_WIDTH 1 + #define RX_PACKET_ATTRIBUTES_CONTEXT_NEXT_INDEX 3 + #define RX_PACKET_ATTRIBUTES_CONTEXT_NEXT_WIDTH 1 + #define RX_PACKET_ATTRIBUTES_CONTEXT_INDEX 4 +@@ -1158,6 +1158,8 @@ + #define RX_PACKET_ATTRIBUTES_RX_TSTAMP_WIDTH 1 + #define RX_PACKET_ATTRIBUTES_RSS_HASH_INDEX 6 + #define RX_PACKET_ATTRIBUTES_RSS_HASH_WIDTH 1 ++#define RX_PACKET_ATTRIBUTES_FIRST_INDEX 7 ++#define RX_PACKET_ATTRIBUTES_FIRST_WIDTH 1 + + #define RX_NORMAL_DESC0_OVT_INDEX 0 + #define RX_NORMAL_DESC0_OVT_WIDTH 16 +--- a/drivers/net/ethernet/amd/xgbe/xgbe-dev.c ++++ b/drivers/net/ethernet/amd/xgbe/xgbe-dev.c +@@ -1896,10 +1896,15 @@ static int xgbe_dev_read(struct xgbe_cha + + /* Get the header length */ + if (XGMAC_GET_BITS_LE(rdesc->desc3, RX_NORMAL_DESC3, FD)) { ++ XGMAC_SET_BITS(packet->attributes, RX_PACKET_ATTRIBUTES, ++ FIRST, 1); + rdata->rx.hdr_len = XGMAC_GET_BITS_LE(rdesc->desc2, + RX_NORMAL_DESC2, HL); + if (rdata->rx.hdr_len) + pdata->ext_stats.rx_split_header_packets++; ++ } else { ++ XGMAC_SET_BITS(packet->attributes, RX_PACKET_ATTRIBUTES, ++ FIRST, 0); + } + + /* Get the RSS hash */ +@@ -1922,19 +1927,16 @@ static int xgbe_dev_read(struct xgbe_cha + } + } + +- /* Get the packet length */ +- rdata->rx.len = XGMAC_GET_BITS_LE(rdesc->desc3, 
RX_NORMAL_DESC3, PL); +- +- if (!XGMAC_GET_BITS_LE(rdesc->desc3, RX_NORMAL_DESC3, LD)) { +- /* Not all the data has been transferred for this packet */ +- XGMAC_SET_BITS(packet->attributes, RX_PACKET_ATTRIBUTES, +- INCOMPLETE, 1); ++ /* Not all the data has been transferred for this packet */ ++ if (!XGMAC_GET_BITS_LE(rdesc->desc3, RX_NORMAL_DESC3, LD)) + return 0; +- } + + /* This is the last of the data for this packet */ + XGMAC_SET_BITS(packet->attributes, RX_PACKET_ATTRIBUTES, +- INCOMPLETE, 0); ++ LAST, 1); ++ ++ /* Get the packet length */ ++ rdata->rx.len = XGMAC_GET_BITS_LE(rdesc->desc3, RX_NORMAL_DESC3, PL); + + /* Set checksum done indicator as appropriate */ + if (netdev->features & NETIF_F_RXCSUM) +--- a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c ++++ b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c +@@ -1973,13 +1973,12 @@ static struct sk_buff *xgbe_create_skb(s + { + struct sk_buff *skb; + u8 *packet; +- unsigned int copy_len; + + skb = napi_alloc_skb(napi, rdata->rx.hdr.dma_len); + if (!skb) + return NULL; + +- /* Start with the header buffer which may contain just the header ++ /* Pull in the header buffer which may contain just the header + * or the header plus data + */ + dma_sync_single_range_for_cpu(pdata->dev, rdata->rx.hdr.dma_base, +@@ -1988,30 +1987,49 @@ static struct sk_buff *xgbe_create_skb(s + + packet = page_address(rdata->rx.hdr.pa.pages) + + rdata->rx.hdr.pa.pages_offset; +- copy_len = (rdata->rx.hdr_len) ? 
rdata->rx.hdr_len : len; +- copy_len = min(rdata->rx.hdr.dma_len, copy_len); +- skb_copy_to_linear_data(skb, packet, copy_len); +- skb_put(skb, copy_len); +- +- len -= copy_len; +- if (len) { +- /* Add the remaining data as a frag */ +- dma_sync_single_range_for_cpu(pdata->dev, +- rdata->rx.buf.dma_base, +- rdata->rx.buf.dma_off, +- rdata->rx.buf.dma_len, +- DMA_FROM_DEVICE); +- +- skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, +- rdata->rx.buf.pa.pages, +- rdata->rx.buf.pa.pages_offset, +- len, rdata->rx.buf.dma_len); +- rdata->rx.buf.pa.pages = NULL; +- } ++ skb_copy_to_linear_data(skb, packet, len); ++ skb_put(skb, len); + + return skb; + } + ++static unsigned int xgbe_rx_buf1_len(struct xgbe_ring_data *rdata, ++ struct xgbe_packet_data *packet) ++{ ++ /* Always zero if not the first descriptor */ ++ if (!XGMAC_GET_BITS(packet->attributes, RX_PACKET_ATTRIBUTES, FIRST)) ++ return 0; ++ ++ /* First descriptor with split header, return header length */ ++ if (rdata->rx.hdr_len) ++ return rdata->rx.hdr_len; ++ ++ /* First descriptor but not the last descriptor and no split header, ++ * so the full buffer was used ++ */ ++ if (!XGMAC_GET_BITS(packet->attributes, RX_PACKET_ATTRIBUTES, LAST)) ++ return rdata->rx.hdr.dma_len; ++ ++ /* First descriptor and last descriptor and no split header, so ++ * calculate how much of the buffer was used ++ */ ++ return min_t(unsigned int, rdata->rx.hdr.dma_len, rdata->rx.len); ++} ++ ++static unsigned int xgbe_rx_buf2_len(struct xgbe_ring_data *rdata, ++ struct xgbe_packet_data *packet, ++ unsigned int len) ++{ ++ /* Always the full buffer if not the last descriptor */ ++ if (!XGMAC_GET_BITS(packet->attributes, RX_PACKET_ATTRIBUTES, LAST)) ++ return rdata->rx.buf.dma_len; ++ ++ /* Last descriptor so calculate how much of the buffer was used ++ * for the last bit of data ++ */ ++ return rdata->rx.len - len; ++} ++ + static int xgbe_tx_poll(struct xgbe_channel *channel) + { + struct xgbe_prv_data *pdata = channel->pdata; +@@ -2094,8 
+2112,8 @@ static int xgbe_rx_poll(struct xgbe_chan + struct napi_struct *napi; + struct sk_buff *skb; + struct skb_shared_hwtstamps *hwtstamps; +- unsigned int incomplete, error, context_next, context; +- unsigned int len, rdesc_len, max_len; ++ unsigned int last, error, context_next, context; ++ unsigned int len, buf1_len, buf2_len, max_len; + unsigned int received = 0; + int packet_count = 0; + +@@ -2105,7 +2123,7 @@ static int xgbe_rx_poll(struct xgbe_chan + if (!ring) + return 0; + +- incomplete = 0; ++ last = 0; + context_next = 0; + + napi = (pdata->per_channel_irq) ? &channel->napi : &pdata->napi; +@@ -2139,9 +2157,8 @@ read_again: + received++; + ring->cur++; + +- incomplete = XGMAC_GET_BITS(packet->attributes, +- RX_PACKET_ATTRIBUTES, +- INCOMPLETE); ++ last = XGMAC_GET_BITS(packet->attributes, RX_PACKET_ATTRIBUTES, ++ LAST); + context_next = XGMAC_GET_BITS(packet->attributes, + RX_PACKET_ATTRIBUTES, + CONTEXT_NEXT); +@@ -2150,7 +2167,7 @@ read_again: + CONTEXT); + + /* Earlier error, just drain the remaining data */ +- if ((incomplete || context_next) && error) ++ if ((!last || context_next) && error) + goto read_again; + + if (error || packet->errors) { +@@ -2162,16 +2179,22 @@ read_again: + } + + if (!context) { +- /* Length is cumulative, get this descriptor's length */ +- rdesc_len = rdata->rx.len - len; +- len += rdesc_len; ++ /* Get the data length in the descriptor buffers */ ++ buf1_len = xgbe_rx_buf1_len(rdata, packet); ++ len += buf1_len; ++ buf2_len = xgbe_rx_buf2_len(rdata, packet, len); ++ len += buf2_len; + +- if (rdesc_len && !skb) { ++ if (!skb) { + skb = xgbe_create_skb(pdata, napi, rdata, +- rdesc_len); +- if (!skb) ++ buf1_len); ++ if (!skb) { + error = 1; +- } else if (rdesc_len) { ++ goto skip_data; ++ } ++ } ++ ++ if (buf2_len) { + dma_sync_single_range_for_cpu(pdata->dev, + rdata->rx.buf.dma_base, + rdata->rx.buf.dma_off, +@@ -2181,13 +2204,14 @@ read_again: + skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, + 
rdata->rx.buf.pa.pages, + rdata->rx.buf.pa.pages_offset, +- rdesc_len, ++ buf2_len, + rdata->rx.buf.dma_len); + rdata->rx.buf.pa.pages = NULL; + } + } + +- if (incomplete || context_next) ++skip_data: ++ if (!last || context_next) + goto read_again; + + if (!skb) +@@ -2245,7 +2269,7 @@ next_packet: + } + + /* Check if we need to save state before leaving */ +- if (received && (incomplete || context_next)) { ++ if (received && (!last || context_next)) { + rdata = XGBE_GET_DESC_DATA(ring, ring->cur); + rdata->state_saved = 1; + rdata->state.skb = skb; diff --git a/queue-4.10/amd-xgbe-fix-the-ecc-related-bit-position-definitions.patch b/queue-4.10/amd-xgbe-fix-the-ecc-related-bit-position-definitions.patch new file mode 100644 index 00000000000..1d38b64afb8 --- /dev/null +++ b/queue-4.10/amd-xgbe-fix-the-ecc-related-bit-position-definitions.patch @@ -0,0 +1,67 @@ +From foo@baz Mon Mar 27 18:18:08 CEST 2017 +From: "Lendacky, Thomas" +Date: Wed, 22 Mar 2017 17:25:27 -0500 +Subject: amd-xgbe: Fix the ECC-related bit position definitions + +From: "Lendacky, Thomas" + + +[ Upstream commit f43feef4e6acde10857fcbfdede790d6b3f2c71d ] + +The ECC bit positions that describe whether the ECC interrupt is for +Tx, Rx or descriptor memory and whether the it is a single correctable +or double detected error were defined in incorrectly (reversed order). +Fix the bit position definitions for these settings so that the proper +ECC handling is performed. + +Signed-off-by: Tom Lendacky +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/amd/xgbe/xgbe-common.h | 24 ++++++++++++------------ + 1 file changed, 12 insertions(+), 12 deletions(-) + +--- a/drivers/net/ethernet/amd/xgbe/xgbe-common.h ++++ b/drivers/net/ethernet/amd/xgbe/xgbe-common.h +@@ -984,29 +984,29 @@ + #define XP_ECC_CNT1_DESC_DED_WIDTH 8 + #define XP_ECC_CNT1_DESC_SEC_INDEX 0 + #define XP_ECC_CNT1_DESC_SEC_WIDTH 8 +-#define XP_ECC_IER_DESC_DED_INDEX 0 ++#define XP_ECC_IER_DESC_DED_INDEX 5 + #define XP_ECC_IER_DESC_DED_WIDTH 1 +-#define XP_ECC_IER_DESC_SEC_INDEX 1 ++#define XP_ECC_IER_DESC_SEC_INDEX 4 + #define XP_ECC_IER_DESC_SEC_WIDTH 1 +-#define XP_ECC_IER_RX_DED_INDEX 2 ++#define XP_ECC_IER_RX_DED_INDEX 3 + #define XP_ECC_IER_RX_DED_WIDTH 1 +-#define XP_ECC_IER_RX_SEC_INDEX 3 ++#define XP_ECC_IER_RX_SEC_INDEX 2 + #define XP_ECC_IER_RX_SEC_WIDTH 1 +-#define XP_ECC_IER_TX_DED_INDEX 4 ++#define XP_ECC_IER_TX_DED_INDEX 1 + #define XP_ECC_IER_TX_DED_WIDTH 1 +-#define XP_ECC_IER_TX_SEC_INDEX 5 ++#define XP_ECC_IER_TX_SEC_INDEX 0 + #define XP_ECC_IER_TX_SEC_WIDTH 1 +-#define XP_ECC_ISR_DESC_DED_INDEX 0 ++#define XP_ECC_ISR_DESC_DED_INDEX 5 + #define XP_ECC_ISR_DESC_DED_WIDTH 1 +-#define XP_ECC_ISR_DESC_SEC_INDEX 1 ++#define XP_ECC_ISR_DESC_SEC_INDEX 4 + #define XP_ECC_ISR_DESC_SEC_WIDTH 1 +-#define XP_ECC_ISR_RX_DED_INDEX 2 ++#define XP_ECC_ISR_RX_DED_INDEX 3 + #define XP_ECC_ISR_RX_DED_WIDTH 1 +-#define XP_ECC_ISR_RX_SEC_INDEX 3 ++#define XP_ECC_ISR_RX_SEC_INDEX 2 + #define XP_ECC_ISR_RX_SEC_WIDTH 1 +-#define XP_ECC_ISR_TX_DED_INDEX 4 ++#define XP_ECC_ISR_TX_DED_INDEX 1 + #define XP_ECC_ISR_TX_DED_WIDTH 1 +-#define XP_ECC_ISR_TX_SEC_INDEX 5 ++#define XP_ECC_ISR_TX_SEC_INDEX 0 + #define XP_ECC_ISR_TX_SEC_WIDTH 1 + #define XP_I2C_MUTEX_BUSY_INDEX 31 + #define XP_I2C_MUTEX_BUSY_WIDTH 1 diff --git a/queue-4.10/genetlink-fix-counting-regression-on-ctrl_dumpfamily.patch b/queue-4.10/genetlink-fix-counting-regression-on-ctrl_dumpfamily.patch new file mode 100644 index 
00000000000..c8202dd556d --- /dev/null +++ b/queue-4.10/genetlink-fix-counting-regression-on-ctrl_dumpfamily.patch @@ -0,0 +1,58 @@ +From foo@baz Mon Mar 27 18:18:08 CEST 2017 +From: Stanislaw Gruszka +Date: Wed, 22 Mar 2017 16:08:33 +0100 +Subject: genetlink: fix counting regression on ctrl_dumpfamily() + +From: Stanislaw Gruszka + + +[ Upstream commit 1d2a6a5e4bf2921531071fcff8538623dce74efa ] + +Commit 2ae0f17df1cd ("genetlink: use idr to track families") replaced + + if (++n < fams_to_skip) + continue; +into: + + if (n++ < fams_to_skip) + continue; + +This subtle change cause that on retry ctrl_dumpfamily() call we omit +one family that failed to do ctrl_fill_info() on previous call, because +cb->args[0] = n number counts also family that failed to do +ctrl_fill_info(). + +Patch fixes the problem and avoid confusion in the future just decrease +n counter when ctrl_fill_info() fail. + +User visible problem caused by this bug is failure to get access to +some genetlink family i.e. nl80211. However problem is reproducible +only if number of registered genetlink families is big enough to +cause second call of ctrl_dumpfamily(). + +Cc: Xose Vazquez Perez +Cc: Larry Finger +Cc: Johannes Berg +Fixes: 2ae0f17df1cd ("genetlink: use idr to track families") +Signed-off-by: Stanislaw Gruszka +Acked-by: Johannes Berg +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/netlink/genetlink.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +--- a/net/netlink/genetlink.c ++++ b/net/netlink/genetlink.c +@@ -783,8 +783,10 @@ static int ctrl_dumpfamily(struct sk_buf + + if (ctrl_fill_info(rt, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, NLM_F_MULTI, +- skb, CTRL_CMD_NEWFAMILY) < 0) ++ skb, CTRL_CMD_NEWFAMILY) < 0) { ++ n--; + break; ++ } + } + + cb->args[0] = n; diff --git a/queue-4.10/ipv4-provide-stronger-user-input-validation-in-nl_fib_input.patch b/queue-4.10/ipv4-provide-stronger-user-input-validation-in-nl_fib_input.patch new file mode 100644 index 00000000000..7e11b749585 --- /dev/null +++ b/queue-4.10/ipv4-provide-stronger-user-input-validation-in-nl_fib_input.patch @@ -0,0 +1,39 @@ +From foo@baz Mon Mar 27 18:18:08 CEST 2017 +From: Eric Dumazet +Date: Tue, 21 Mar 2017 19:22:28 -0700 +Subject: ipv4: provide stronger user input validation in nl_fib_input() + +From: Eric Dumazet + + +[ Upstream commit c64c0b3cac4c5b8cb093727d2c19743ea3965c0b ] + +Alexander reported a KMSAN splat caused by reads of uninitialized +field (tb_id_in) from user provided struct fib_result_nl + +It turns out nl_fib_input() sanity tests on user input is a bit +wrong : + +User can pretend nlh->nlmsg_len is big enough, but provide +at sendmsg() time a too small buffer. + +Reported-by: Alexander Potapenko +Signed-off-by: Eric Dumazet +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv4/fib_frontend.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +--- a/net/ipv4/fib_frontend.c ++++ b/net/ipv4/fib_frontend.c +@@ -1082,7 +1082,8 @@ static void nl_fib_input(struct sk_buff + + net = sock_net(skb->sk); + nlh = nlmsg_hdr(skb); +- if (skb->len < NLMSG_HDRLEN || skb->len < nlh->nlmsg_len || ++ if (skb->len < nlmsg_total_size(sizeof(*frn)) || ++ skb->len < nlh->nlmsg_len || + nlmsg_len(nlh) < sizeof(*frn)) + return; + diff --git a/queue-4.10/ipv6-make-sure-to-initialize-sockc.tsflags-before-first-use.patch b/queue-4.10/ipv6-make-sure-to-initialize-sockc.tsflags-before-first-use.patch new file mode 100644 index 00000000000..0c736abdfea --- /dev/null +++ b/queue-4.10/ipv6-make-sure-to-initialize-sockc.tsflags-before-first-use.patch @@ -0,0 +1,43 @@ +From foo@baz Mon Mar 27 18:18:08 CEST 2017 +From: Alexander Potapenko +Date: Tue, 21 Mar 2017 17:14:27 +0100 +Subject: ipv6: make sure to initialize sockc.tsflags before first use + +From: Alexander Potapenko + + +[ Upstream commit d515684d78148884d5fc425ba904c50f03844020 ] + +In the case udp_sk(sk)->pending is AF_INET6, udpv6_sendmsg() would +jump to do_append_data, skipping the initialization of sockc.tsflags. +Fix the problem by moving sockc.tsflags initialization earlier. + +The bug was detected with KMSAN. + +Fixes: c14ac9451c34 ("sock: enable timestamping using control messages") +Signed-off-by: Alexander Potapenko +Acked-by: Soheil Hassas Yeganeh +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv6/udp.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/net/ipv6/udp.c ++++ b/net/ipv6/udp.c +@@ -1022,6 +1022,7 @@ int udpv6_sendmsg(struct sock *sk, struc + ipc6.hlimit = -1; + ipc6.tclass = -1; + ipc6.dontfrag = -1; ++ sockc.tsflags = sk->sk_tsflags; + + /* destination address check */ + if (sin6) { +@@ -1146,7 +1147,6 @@ do_udp_sendmsg: + + fl6.flowi6_mark = sk->sk_mark; + fl6.flowi6_uid = sk->sk_uid; +- sockc.tsflags = sk->sk_tsflags; + + if (msg->msg_controllen) { + opt = &opt_space; diff --git a/queue-4.10/net-bcmgenet-do-not-suspend-phy-if-wake-on-lan-is-enabled.patch b/queue-4.10/net-bcmgenet-do-not-suspend-phy-if-wake-on-lan-is-enabled.patch new file mode 100644 index 00000000000..98ccd8cadac --- /dev/null +++ b/queue-4.10/net-bcmgenet-do-not-suspend-phy-if-wake-on-lan-is-enabled.patch @@ -0,0 +1,43 @@ +From foo@baz Mon Mar 27 18:18:08 CEST 2017 +From: Florian Fainelli +Date: Wed, 15 Mar 2017 12:57:21 -0700 +Subject: net: bcmgenet: Do not suspend PHY if Wake-on-LAN is enabled + +From: Florian Fainelli + + +[ Upstream commit 5371bbf4b295eea334ed453efa286afa2c3ccff3 ] + +Suspending the PHY would be putting it in a low power state where it +may no longer allow us to do Wake-on-LAN. + +Fixes: cc013fb48898 ("net: bcmgenet: correctly suspend and resume PHY device") +Signed-off-by: Florian Fainelli +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/broadcom/genet/bcmgenet.c | 6 ++++-- + 1 file changed, 4 insertions(+), 2 deletions(-) + +--- a/drivers/net/ethernet/broadcom/genet/bcmgenet.c ++++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.c +@@ -3395,7 +3395,8 @@ static int bcmgenet_suspend(struct devic + + bcmgenet_netif_stop(dev); + +- phy_suspend(priv->phydev); ++ if (!device_may_wakeup(d)) ++ phy_suspend(priv->phydev); + + netif_device_detach(dev); + +@@ -3492,7 +3493,8 @@ static int bcmgenet_resume(struct device + + netif_device_attach(dev); + +- phy_resume(priv->phydev); ++ if (!device_may_wakeup(d)) ++ phy_resume(priv->phydev); + + if (priv->eee.eee_enabled) + bcmgenet_eee_enable_set(dev, true); diff --git a/queue-4.10/net-bcmgenet-remove-bcmgenet_internal_phy_setup.patch b/queue-4.10/net-bcmgenet-remove-bcmgenet_internal_phy_setup.patch new file mode 100644 index 00000000000..9a24b7350f7 --- /dev/null +++ b/queue-4.10/net-bcmgenet-remove-bcmgenet_internal_phy_setup.patch @@ -0,0 +1,85 @@ +From foo@baz Mon Mar 27 18:18:08 CEST 2017 +From: Doug Berger +Date: Tue, 21 Mar 2017 14:01:06 -0700 +Subject: net: bcmgenet: remove bcmgenet_internal_phy_setup() + +From: Doug Berger + + +[ Upstream commit 31739eae738ccbe8b9d627c3f2251017ca03f4d2 ] + +Commit 6ac3ce8295e6 ("net: bcmgenet: Remove excessive PHY reset") +removed the bcmgenet_mii_reset() function from bcmgenet_power_up() and +bcmgenet_internal_phy_setup() functions. In so doing it broke the reset +of the internal PHY devices used by the GENETv1-GENETv3 which required +this reset before the UniMAC was enabled. It also broke the internal +GPHY devices used by the GENETv4 because the config_init that installed +the AFE workaround was no longer occurring after the reset of the GPHY +performed by bcmgenet_phy_power_set() in bcmgenet_internal_phy_setup(). 
+In addition the code in bcmgenet_internal_phy_setup() related to the +"enable APD" comment goes with the bcmgenet_mii_reset() so it should +have also been removed. + +Commit bd4060a6108b ("net: bcmgenet: Power on integrated GPHY in +bcmgenet_power_up()") moved the bcmgenet_phy_power_set() call to the +bcmgenet_power_up() function, but failed to remove it from the +bcmgenet_internal_phy_setup() function. Had it done so, the +bcmgenet_internal_phy_setup() function would have been empty and could +have been removed at that time. + +Commit 5dbebbb44a6a ("net: bcmgenet: Software reset EPHY after power on") +was submitted to correct the functional problems introduced by +commit 6ac3ce8295e6 ("net: bcmgenet: Remove excessive PHY reset"). It +was included in v4.4 and made available on 4.3-stable. Unfortunately, +it didn't fully revert the commit because this bcmgenet_mii_reset() +doesn't apply the soft reset to the internal GPHY used by GENETv4 like +the previous one did. This prevents the restoration of the AFE work- +arounds for internal GPHY devices after the bcmgenet_phy_power_set() in +bcmgenet_internal_phy_setup(). + +This commit takes the alternate approach of removing the unnecessary +bcmgenet_internal_phy_setup() function which shouldn't have been in v4.3 +so that when bcmgenet_mii_reset() was restored it should have only gone +into bcmgenet_power_up(). This will avoid the problems while also +removing the redundancy (and hopefully some of the confusion). + +Fixes: 6ac3ce8295e6 ("net: bcmgenet: Remove excessive PHY reset") +Signed-off-by: Doug Berger +Reviewed-by: Florian Fainelli +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/broadcom/genet/bcmmii.c | 15 --------------- + 1 file changed, 15 deletions(-) + +--- a/drivers/net/ethernet/broadcom/genet/bcmmii.c ++++ b/drivers/net/ethernet/broadcom/genet/bcmmii.c +@@ -220,20 +220,6 @@ void bcmgenet_phy_power_set(struct net_d + udelay(60); + } + +-static void bcmgenet_internal_phy_setup(struct net_device *dev) +-{ +- struct bcmgenet_priv *priv = netdev_priv(dev); +- u32 reg; +- +- /* Power up PHY */ +- bcmgenet_phy_power_set(dev, true); +- /* enable APD */ +- reg = bcmgenet_ext_readl(priv, EXT_EXT_PWR_MGMT); +- reg |= EXT_PWR_DN_EN_LD; +- bcmgenet_ext_writel(priv, reg, EXT_EXT_PWR_MGMT); +- bcmgenet_mii_reset(dev); +-} +- + static void bcmgenet_moca_phy_setup(struct bcmgenet_priv *priv) + { + u32 reg; +@@ -281,7 +267,6 @@ int bcmgenet_mii_config(struct net_devic + + if (priv->internal_phy) { + phy_name = "internal PHY"; +- bcmgenet_internal_phy_setup(dev); + } else if (priv->phy_interface == PHY_INTERFACE_MODE_MOCA) { + phy_name = "MoCA"; + bcmgenet_moca_phy_setup(priv); diff --git a/queue-4.10/net-mlx5-add-missing-entries-for-set-query-rate-limit-commands.patch b/queue-4.10/net-mlx5-add-missing-entries-for-set-query-rate-limit-commands.patch new file mode 100644 index 00000000000..cc0ebc81e7d --- /dev/null +++ b/queue-4.10/net-mlx5-add-missing-entries-for-set-query-rate-limit-commands.patch @@ -0,0 +1,44 @@ +From foo@baz Mon Mar 27 18:18:08 CEST 2017 +From: Or Gerlitz +Date: Tue, 21 Mar 2017 15:59:12 +0200 +Subject: net/mlx5: Add missing entries for set/query rate limit commands + +From: Or Gerlitz + + +[ Upstream commit 1f30a86c58093046dc3e49c23d2618894e098f7a ] + +The switch cases for the rate limit set and query commands were +missing, which could get us wrong under fw error or driver reset +flow, fix that. 
+ +Fixes: 1466cc5b23d1 ('net/mlx5: Rate limit tables support') +Signed-off-by: Or Gerlitz +Reviewed-by: Hadar Hen Zion +Signed-off-by: Saeed Mahameed +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/mellanox/mlx5/core/cmd.c | 4 ++++ + 1 file changed, 4 insertions(+) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c +@@ -361,6 +361,8 @@ static int mlx5_internal_err_ret_value(s + case MLX5_CMD_OP_QUERY_VPORT_COUNTER: + case MLX5_CMD_OP_ALLOC_Q_COUNTER: + case MLX5_CMD_OP_QUERY_Q_COUNTER: ++ case MLX5_CMD_OP_SET_RATE_LIMIT: ++ case MLX5_CMD_OP_QUERY_RATE_LIMIT: + case MLX5_CMD_OP_ALLOC_PD: + case MLX5_CMD_OP_ALLOC_UAR: + case MLX5_CMD_OP_CONFIG_INT_MODERATION: +@@ -497,6 +499,8 @@ const char *mlx5_command_str(int command + MLX5_COMMAND_STR_CASE(ALLOC_Q_COUNTER); + MLX5_COMMAND_STR_CASE(DEALLOC_Q_COUNTER); + MLX5_COMMAND_STR_CASE(QUERY_Q_COUNTER); ++ MLX5_COMMAND_STR_CASE(SET_RATE_LIMIT); ++ MLX5_COMMAND_STR_CASE(QUERY_RATE_LIMIT); + MLX5_COMMAND_STR_CASE(ALLOC_PD); + MLX5_COMMAND_STR_CASE(DEALLOC_PD); + MLX5_COMMAND_STR_CASE(ALLOC_UAR); diff --git a/queue-4.10/net-mlx5-e-switch-don-t-allow-changing-inline-mode-when-flows-are-configured.patch b/queue-4.10/net-mlx5-e-switch-don-t-allow-changing-inline-mode-when-flows-are-configured.patch new file mode 100644 index 00000000000..432cc1357ab --- /dev/null +++ b/queue-4.10/net-mlx5-e-switch-don-t-allow-changing-inline-mode-when-flows-are-configured.patch @@ -0,0 +1,69 @@ +From foo@baz Mon Mar 27 18:18:08 CEST 2017 +From: Roi Dayan +Date: Tue, 21 Mar 2017 15:59:14 +0200 +Subject: net/mlx5: E-Switch, Don't allow changing inline mode when flows are configured + +From: Roi Dayan + + +[ Upstream commit 375f51e2b5b7b9a42b3139aea519cbb1bfc5d6ef ] + +Changing the eswitch inline mode can potentially cause already configured +flows not to match the policy. E.g. set policy L4, add some L4 rules, +set policy to L2 --> bad! 
Hence we disallow it. + +Keep track of how many offloaded rules are now set and refuse +inline mode changes if this isn't zero. + +Fixes: bffaa916588e ("net/mlx5: E-Switch, Add control for inline mode") +Signed-off-by: Roi Dayan +Reviewed-by: Or Gerlitz +Signed-off-by: Saeed Mahameed +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/mellanox/mlx5/core/eswitch.h | 1 + + drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c | 8 ++++++++ + 2 files changed, 9 insertions(+) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h ++++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h +@@ -201,6 +201,7 @@ struct mlx5_esw_offload { + struct mlx5_eswitch_rep *vport_reps; + DECLARE_HASHTABLE(encap_tbl, 8); + u8 inline_mode; ++ u64 num_flows; + }; + + struct mlx5_eswitch { +--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c +@@ -93,6 +93,8 @@ mlx5_eswitch_add_offloaded_rule(struct m + spec, &flow_act, dest, i); + if (IS_ERR(rule)) + mlx5_fc_destroy(esw->dev, counter); ++ else ++ esw->offloads.num_flows++; + + return rule; + } +@@ -108,6 +110,7 @@ mlx5_eswitch_del_offloaded_rule(struct m + counter = mlx5_flow_rule_counter(rule); + mlx5_del_flow_rules(rule); + mlx5_fc_destroy(esw->dev, counter); ++ esw->offloads.num_flows--; + } + } + +@@ -919,6 +922,11 @@ int mlx5_devlink_eswitch_inline_mode_set + MLX5_CAP_INLINE_MODE_VPORT_CONTEXT) + return -EOPNOTSUPP; + ++ if (esw->offloads.num_flows > 0) { ++ esw_warn(dev, "Can't set inline mode when flows are configured\n"); ++ return -EOPNOTSUPP; ++ } ++ + err = esw_inline_mode_from_devlink(mode, &mlx5_mode); + if (err) + goto out; diff --git a/queue-4.10/net-mlx5-increase-number-of-max-qps-in-default-profile.patch b/queue-4.10/net-mlx5-increase-number-of-max-qps-in-default-profile.patch new file mode 100644 index 00000000000..1aaf360886f --- /dev/null +++ 
b/queue-4.10/net-mlx5-increase-number-of-max-qps-in-default-profile.patch @@ -0,0 +1,34 @@ +From foo@baz Mon Mar 27 18:18:08 CEST 2017 +From: Maor Gottlieb +Date: Tue, 21 Mar 2017 15:59:17 +0200 +Subject: net/mlx5: Increase number of max QPs in default profile + +From: Maor Gottlieb + + +[ Upstream commit 5f40b4ed975c26016cf41953b7510fe90718e21c ] + +With ConnectX-4 sharing SRQs from the same space as QPs, we hit a +limit preventing some applications to allocate needed QPs amount. +Double the size to 256K. + +Fixes: e126ba97dba9e ('mlx5: Add driver for Mellanox Connect-IB adapters') +Signed-off-by: Maor Gottlieb +Signed-off-by: Saeed Mahameed +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/mellanox/mlx5/core/main.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c +@@ -87,7 +87,7 @@ static struct mlx5_profile profile[] = { + [2] = { + .mask = MLX5_PROF_MASK_QP_SIZE | + MLX5_PROF_MASK_MR_CACHE, +- .log_max_qp = 17, ++ .log_max_qp = 18, + .mr_cache[0] = { + .size = 500, + .limit = 250 diff --git a/queue-4.10/net-mlx5e-avoid-supporting-udp-tunnel-port-ndo-for-vf-reps.patch b/queue-4.10/net-mlx5e-avoid-supporting-udp-tunnel-port-ndo-for-vf-reps.patch new file mode 100644 index 00000000000..904d7737c73 --- /dev/null +++ b/queue-4.10/net-mlx5e-avoid-supporting-udp-tunnel-port-ndo-for-vf-reps.patch @@ -0,0 +1,127 @@ +From foo@baz Mon Mar 27 18:18:08 CEST 2017 +From: Paul Blakey +Date: Tue, 21 Mar 2017 15:59:16 +0200 +Subject: net/mlx5e: Avoid supporting udp tunnel port ndo for VF reps + +From: Paul Blakey + + +[ Upstream commit 1ad9a00ae0efc2e9337148d6c382fad3d27bf99a ] + +This was added to allow the TC offloading code to identify offloading +encap/decap vxlan rules. + +The VF reps are effectively related to the same mlx5 PCI device as the +PF. 
Since the kernel invokes the (say) delete ndo for each netdev, the +FW erred on multiple vxlan dst port deletes when the port was deleted +from the system. + +We fix that by keeping the registration to be carried out only by the +PF. Since the PF serves as the uplink device, the VF reps will look +up a port there and realize if they are ok to offload that. + +Tested: + + + ip link add vxlan1 type vxlan id 44 dev ens5f0 dstport 9999 + ip link set vxlan1 up + ip link del dev vxlan1 + +Fixes: 4a25730eb202 ('net/mlx5e: Add ndo_udp_tunnel_add to VF representors') +Signed-off-by: Paul Blakey +Reviewed-by: Or Gerlitz +Signed-off-by: Saeed Mahameed +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/mellanox/mlx5/core/en.h | 4 ---- + drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 8 ++++---- + drivers/net/ethernet/mellanox/mlx5/core/en_rep.c | 2 -- + drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 9 +++++++-- + 4 files changed, 11 insertions(+), 12 deletions(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h +@@ -921,10 +921,6 @@ void mlx5e_destroy_netdev(struct mlx5_co + int mlx5e_attach_netdev(struct mlx5_core_dev *mdev, struct net_device *netdev); + void mlx5e_detach_netdev(struct mlx5_core_dev *mdev, struct net_device *netdev); + u32 mlx5e_choose_lro_timeout(struct mlx5_core_dev *mdev, u32 wanted_timeout); +-void mlx5e_add_vxlan_port(struct net_device *netdev, +- struct udp_tunnel_info *ti); +-void mlx5e_del_vxlan_port(struct net_device *netdev, +- struct udp_tunnel_info *ti); + + int mlx5e_get_offload_stats(int attr_id, const struct net_device *dev, + void *sp); +--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +@@ -3055,8 +3055,8 @@ static int mlx5e_get_vf_stats(struct net + vf_stats); + } + +-void mlx5e_add_vxlan_port(struct net_device *netdev, +- struct udp_tunnel_info *ti) ++static void 
mlx5e_add_vxlan_port(struct net_device *netdev, ++ struct udp_tunnel_info *ti) + { + struct mlx5e_priv *priv = netdev_priv(netdev); + +@@ -3069,8 +3069,8 @@ void mlx5e_add_vxlan_port(struct net_dev + mlx5e_vxlan_queue_work(priv, ti->sa_family, be16_to_cpu(ti->port), 1); + } + +-void mlx5e_del_vxlan_port(struct net_device *netdev, +- struct udp_tunnel_info *ti) ++static void mlx5e_del_vxlan_port(struct net_device *netdev, ++ struct udp_tunnel_info *ti) + { + struct mlx5e_priv *priv = netdev_priv(netdev); + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c +@@ -394,8 +394,6 @@ static const struct net_device_ops mlx5e + .ndo_get_phys_port_name = mlx5e_rep_get_phys_port_name, + .ndo_setup_tc = mlx5e_rep_ndo_setup_tc, + .ndo_get_stats64 = mlx5e_rep_get_stats, +- .ndo_udp_tunnel_add = mlx5e_add_vxlan_port, +- .ndo_udp_tunnel_del = mlx5e_del_vxlan_port, + .ndo_has_offload_stats = mlx5e_has_offload_stats, + .ndo_get_offload_stats = mlx5e_get_offload_stats, + }; +--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +@@ -264,12 +264,15 @@ static int parse_tunnel_attr(struct mlx5 + skb_flow_dissector_target(f->dissector, + FLOW_DISSECTOR_KEY_ENC_PORTS, + f->mask); ++ struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; ++ struct net_device *up_dev = mlx5_eswitch_get_uplink_netdev(esw); ++ struct mlx5e_priv *up_priv = netdev_priv(up_dev); + + /* Full udp dst port must be given */ + if (memchr_inv(&mask->dst, 0xff, sizeof(mask->dst))) + goto vxlan_match_offload_err; + +- if (mlx5e_vxlan_lookup_port(priv, be16_to_cpu(key->dst)) && ++ if (mlx5e_vxlan_lookup_port(up_priv, be16_to_cpu(key->dst)) && + MLX5_CAP_ESW(priv->mdev, vxlan_encap_decap)) + parse_vxlan_attr(spec, f); + else { +@@ -827,6 +830,8 @@ static int mlx5e_attach_encap(struct mlx + struct mlx5_esw_flow_attr *attr) + { + struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; ++ struct net_device *up_dev = 
mlx5_eswitch_get_uplink_netdev(esw); ++ struct mlx5e_priv *up_priv = netdev_priv(up_dev); + unsigned short family = ip_tunnel_info_af(tun_info); + struct ip_tunnel_key *key = &tun_info->key; + struct mlx5_encap_info info; +@@ -849,7 +854,7 @@ vxlan_encap_offload_err: + return -EOPNOTSUPP; + } + +- if (mlx5e_vxlan_lookup_port(priv, be16_to_cpu(key->tp_dst)) && ++ if (mlx5e_vxlan_lookup_port(up_priv, be16_to_cpu(key->tp_dst)) && + MLX5_CAP_ESW(priv->mdev, vxlan_encap_decap)) { + info.tp_dst = key->tp_dst; + info.tun_id = tunnel_id_to_key32(key->tun_id); diff --git a/queue-4.10/net-mlx5e-change-the-tc-offload-rule-add-del-code-path-to-be-per-nic-or-e-switch.patch b/queue-4.10/net-mlx5e-change-the-tc-offload-rule-add-del-code-path-to-be-per-nic-or-e-switch.patch new file mode 100644 index 00000000000..4bf31b1e300 --- /dev/null +++ b/queue-4.10/net-mlx5e-change-the-tc-offload-rule-add-del-code-path-to-be-per-nic-or-e-switch.patch @@ -0,0 +1,144 @@ +From foo@baz Mon Mar 27 18:18:08 CEST 2017 +From: Or Gerlitz +Date: Tue, 21 Mar 2017 15:59:13 +0200 +Subject: net/mlx5e: Change the TC offload rule add/del code path to be per NIC or E-Switch + +From: Or Gerlitz + + +[ Upstream commit d85cdccbb3fe9a632ec9d0f4e4526c8c84fc3523 ] + +Refactor the code to deal with add/del TC rules to have handler per NIC/E-switch +offloading use case, and push the latter into the e-switch code. This provides +better separation and is to be used in down-stream patch for applying a fix. + +Fixes: bffaa916588e ("net/mlx5: E-Switch, Add control for inline mode") +Signed-off-by: Or Gerlitz +Reviewed-by: Roi Dayan +Signed-off-by: Saeed Mahameed +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 57 ++++++++----- + drivers/net/ethernet/mellanox/mlx5/core/eswitch.h | 5 + + drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c | 14 +++ + 3 files changed, 58 insertions(+), 18 deletions(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +@@ -128,6 +128,23 @@ err_create_ft: + return rule; + } + ++static void mlx5e_tc_del_nic_flow(struct mlx5e_priv *priv, ++ struct mlx5e_tc_flow *flow) ++{ ++ struct mlx5_fc *counter = NULL; ++ ++ if (!IS_ERR(flow->rule)) { ++ counter = mlx5_flow_rule_counter(flow->rule); ++ mlx5_del_flow_rules(flow->rule); ++ mlx5_fc_destroy(priv->mdev, counter); ++ } ++ ++ if (!mlx5e_tc_num_filters(priv) && (priv->fs.tc.t)) { ++ mlx5_destroy_flow_table(priv->fs.tc.t); ++ priv->fs.tc.t = NULL; ++ } ++} ++ + static struct mlx5_flow_handle * + mlx5e_tc_add_fdb_flow(struct mlx5e_priv *priv, + struct mlx5_flow_spec *spec, +@@ -144,7 +161,24 @@ mlx5e_tc_add_fdb_flow(struct mlx5e_priv + } + + static void mlx5e_detach_encap(struct mlx5e_priv *priv, +- struct mlx5e_tc_flow *flow) { ++ struct mlx5e_tc_flow *flow); ++ ++static void mlx5e_tc_del_fdb_flow(struct mlx5e_priv *priv, ++ struct mlx5e_tc_flow *flow) ++{ ++ struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; ++ ++ mlx5_eswitch_del_offloaded_rule(esw, flow->rule, flow->attr); ++ ++ mlx5_eswitch_del_vlan_action(esw, flow->attr); ++ ++ if (flow->attr->action & MLX5_FLOW_CONTEXT_ACTION_ENCAP) ++ mlx5e_detach_encap(priv, flow); ++} ++ ++static void mlx5e_detach_encap(struct mlx5e_priv *priv, ++ struct mlx5e_tc_flow *flow) ++{ + struct list_head *next = flow->encap.next; + + list_del(&flow->encap); +@@ -169,24 +203,11 @@ static void mlx5e_tc_del_flow(struct mlx + struct mlx5e_tc_flow *flow) + { + struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; +- struct mlx5_fc *counter = NULL; + +- if (!IS_ERR(flow->rule)) { +- counter = 
mlx5_flow_rule_counter(flow->rule); +- mlx5_del_flow_rules(flow->rule); +- mlx5_fc_destroy(priv->mdev, counter); +- } +- +- if (esw && esw->mode == SRIOV_OFFLOADS) { +- mlx5_eswitch_del_vlan_action(esw, flow->attr); +- if (flow->attr->action & MLX5_FLOW_CONTEXT_ACTION_ENCAP) +- mlx5e_detach_encap(priv, flow); +- } +- +- if (!mlx5e_tc_num_filters(priv) && (priv->fs.tc.t)) { +- mlx5_destroy_flow_table(priv->fs.tc.t); +- priv->fs.tc.t = NULL; +- } ++ if (esw && esw->mode == SRIOV_OFFLOADS) ++ mlx5e_tc_del_fdb_flow(priv, flow); ++ else ++ mlx5e_tc_del_nic_flow(priv, flow); + } + + static void parse_vxlan_attr(struct mlx5_flow_spec *spec, +--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h ++++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h +@@ -263,6 +263,11 @@ struct mlx5_flow_handle * + mlx5_eswitch_add_offloaded_rule(struct mlx5_eswitch *esw, + struct mlx5_flow_spec *spec, + struct mlx5_esw_flow_attr *attr); ++void ++mlx5_eswitch_del_offloaded_rule(struct mlx5_eswitch *esw, ++ struct mlx5_flow_handle *rule, ++ struct mlx5_esw_flow_attr *attr); ++ + struct mlx5_flow_handle * + mlx5_eswitch_create_vport_rx_rule(struct mlx5_eswitch *esw, int vport, u32 tirn); + +--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c +@@ -97,6 +97,20 @@ mlx5_eswitch_add_offloaded_rule(struct m + return rule; + } + ++void ++mlx5_eswitch_del_offloaded_rule(struct mlx5_eswitch *esw, ++ struct mlx5_flow_handle *rule, ++ struct mlx5_esw_flow_attr *attr) ++{ ++ struct mlx5_fc *counter = NULL; ++ ++ if (!IS_ERR(rule)) { ++ counter = mlx5_flow_rule_counter(rule); ++ mlx5_del_flow_rules(rule); ++ mlx5_fc_destroy(esw->dev, counter); ++ } ++} ++ + static int esw_set_global_vlan_pop(struct mlx5_eswitch *esw, u8 val) + { + struct mlx5_eswitch_rep *rep; diff --git a/queue-4.10/net-mlx5e-count-gso-packets-correctly.patch b/queue-4.10/net-mlx5e-count-gso-packets-correctly.patch new file mode 100644 index 
00000000000..6755244b2d2 --- /dev/null +++ b/queue-4.10/net-mlx5e-count-gso-packets-correctly.patch @@ -0,0 +1,71 @@ +From foo@baz Mon Mar 27 18:18:08 CEST 2017 +From: Gal Pressman +Date: Tue, 21 Mar 2017 15:59:18 +0200 +Subject: net/mlx5e: Count GSO packets correctly + +From: Gal Pressman + + +[ Upstream commit d3a4e4da54c7adb420d5f48e89be913b14bdeff1 ] + +TX packets statistics ('tx_packets' counter) used to count GSO packets +as one, even though it contains multiple segments. +This patch will increment the counter by the number of segments, and +align the driver with the behavior of other drivers in the stack. + +Note that no information is lost in this patch due to 'tx_tso_packets' +counter existence. + +Before, ethtool showed: +$ ethtool -S ens6 | egrep "tx_packets|tx_tso_packets" + tx_packets: 61340 + tx_tso_packets: 60954 + tx_packets_phy: 2451115 + +Now, we will see the more logical statistics: +$ ethtool -S ens6 | egrep "tx_packets|tx_tso_packets" + tx_packets: 2451115 + tx_tso_packets: 60954 + tx_packets_phy: 2451115 + +Fixes: e586b3b0baee ("net/mlx5: Ethernet Datapath files") +Signed-off-by: Gal Pressman +Cc: kernel-team@fb.com +Signed-off-by: Saeed Mahameed +Acked-by: Alexei Starovoitov +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/mellanox/mlx5/core/en_tx.c | 5 +++-- + 1 file changed, 3 insertions(+), 2 deletions(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c +@@ -272,15 +272,18 @@ static netdev_tx_t mlx5e_sq_xmit(struct + sq->stats.tso_bytes += skb->len - ihs; + } + ++ sq->stats.packets += skb_shinfo(skb)->gso_segs; + num_bytes = skb->len + (skb_shinfo(skb)->gso_segs - 1) * ihs; + } else { + bf = sq->bf_budget && + !skb->xmit_more && + !skb_shinfo(skb)->nr_frags; + ihs = mlx5e_get_inline_hdr_size(sq, skb, bf); ++ sq->stats.packets++; + num_bytes = max_t(unsigned int, skb->len, ETH_ZLEN); + } + ++ sq->stats.bytes += num_bytes; + wi->num_bytes = num_bytes; + + if (skb_vlan_tag_present(skb)) { +@@ -377,8 +380,6 @@ static netdev_tx_t mlx5e_sq_xmit(struct + if (bf) + sq->bf_budget--; + +- sq->stats.packets++; +- sq->stats.bytes += num_bytes; + return NETDEV_TX_OK; + + dma_unmap_wqe_err: diff --git a/queue-4.10/net-mlx5e-count-lro-packets-correctly.patch b/queue-4.10/net-mlx5e-count-lro-packets-correctly.patch new file mode 100644 index 00000000000..f50566bb872 --- /dev/null +++ b/queue-4.10/net-mlx5e-count-lro-packets-correctly.patch @@ -0,0 +1,54 @@ +From foo@baz Mon Mar 27 18:18:08 CEST 2017 +From: Gal Pressman +Date: Tue, 21 Mar 2017 15:59:19 +0200 +Subject: net/mlx5e: Count LRO packets correctly + +From: Gal Pressman + + +[ Upstream commit 8ab7e2ae15d84ba758b2c8c6f4075722e9bd2a08 ] + +RX packets statistics ('rx_packets' counter) used to count LRO packets +as one, even though it contains multiple segments. +This patch will increment the counter by the number of segments, and +align the driver with the behavior of other drivers in the stack. + +Note that no information is lost in this patch due to 'rx_lro_packets' +counter existence. 
+ +Before, ethtool showed: +$ ethtool -S ens6 | egrep "rx_packets|rx_lro_packets" + rx_packets: 435277 + rx_lro_packets: 35847 + rx_packets_phy: 1935066 + +Now, we will see the more logical statistics: +$ ethtool -S ens6 | egrep "rx_packets|rx_lro_packets" + rx_packets: 1935066 + rx_lro_packets: 35847 + rx_packets_phy: 1935066 + +Fixes: e586b3b0baee ("net/mlx5: Ethernet Datapath files") +Signed-off-by: Gal Pressman +Cc: kernel-team@fb.com +Signed-off-by: Saeed Mahameed +Acked-by: Alexei Starovoitov +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 4 ++++ + 1 file changed, 4 insertions(+) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +@@ -603,6 +603,10 @@ static inline void mlx5e_build_rx_skb(st + if (lro_num_seg > 1) { + mlx5e_lro_update_hdr(skb, cqe, cqe_bcnt); + skb_shinfo(skb)->gso_size = DIV_ROUND_UP(cqe_bcnt, lro_num_seg); ++ /* Subtract one since we already counted this as one ++ * "regular" packet in mlx5e_complete_rx_cqe() ++ */ ++ rq->stats.packets += lro_num_seg - 1; + rq->stats.lro_packets++; + rq->stats.lro_bytes += cqe_bcnt; + } diff --git a/queue-4.10/net-mlx5e-use-the-proper-uapi-values-when-offloading-tc-vlan-actions.patch b/queue-4.10/net-mlx5e-use-the-proper-uapi-values-when-offloading-tc-vlan-actions.patch new file mode 100644 index 00000000000..2daa3c3f6ba --- /dev/null +++ b/queue-4.10/net-mlx5e-use-the-proper-uapi-values-when-offloading-tc-vlan-actions.patch @@ -0,0 +1,45 @@ +From foo@baz Mon Mar 27 18:18:08 CEST 2017 +From: Or Gerlitz +Date: Tue, 21 Mar 2017 15:59:15 +0200 +Subject: net/mlx5e: Use the proper UAPI values when offloading TC vlan actions + +From: Or Gerlitz + + +[ Upstream commit 09c91ddf2cd33489c2c14edfef43ae38d412888e ] + +Currently we use the non UAPI values and we miss erring on +the modify action which is not supported, fix that. 
+ +Fixes: 8b32580df1cb ('net/mlx5e: Add TC vlan action for SRIOV offloads') +Signed-off-by: Or Gerlitz +Reported-by: Petr Machata +Reviewed-by: Jiri Pirko +Signed-off-by: Saeed Mahameed +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 6 ++++-- + 1 file changed, 4 insertions(+), 2 deletions(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +@@ -974,14 +974,16 @@ static int parse_tc_fdb_actions(struct m + } + + if (is_tcf_vlan(a)) { +- if (tcf_vlan_action(a) == VLAN_F_POP) { ++ if (tcf_vlan_action(a) == TCA_VLAN_ACT_POP) { + attr->action |= MLX5_FLOW_CONTEXT_ACTION_VLAN_POP; +- } else if (tcf_vlan_action(a) == VLAN_F_PUSH) { ++ } else if (tcf_vlan_action(a) == TCA_VLAN_ACT_PUSH) { + if (tcf_vlan_push_proto(a) != htons(ETH_P_8021Q)) + return -EOPNOTSUPP; + + attr->action |= MLX5_FLOW_CONTEXT_ACTION_VLAN_PUSH; + attr->vlan = tcf_vlan_push_vid(a); ++ } else { /* action is TCA_VLAN_ACT_MODIFY */ ++ return -EOPNOTSUPP; + } + continue; + } diff --git a/queue-4.10/net-openvswitch-set-the-ipv6-source-tunnel-key-address-attribute-correctly.patch b/queue-4.10/net-openvswitch-set-the-ipv6-source-tunnel-key-address-attribute-correctly.patch new file mode 100644 index 00000000000..e78d8234fd2 --- /dev/null +++ b/queue-4.10/net-openvswitch-set-the-ipv6-source-tunnel-key-address-attribute-correctly.patch @@ -0,0 +1,36 @@ +From foo@baz Mon Mar 27 18:18:08 CEST 2017 +From: Or Gerlitz +Date: Wed, 15 Mar 2017 18:10:47 +0200 +Subject: net/openvswitch: Set the ipv6 source tunnel key address attribute correctly + +From: Or Gerlitz + + +[ Upstream commit 3d20f1f7bd575d147ffa75621fa560eea0aec690 ] + +When dealing with ipv6 source tunnel key address attribute +(OVS_TUNNEL_KEY_ATTR_IPV6_SRC) we are wrongly setting the tunnel +dst ip, fix that. 
+ +Fixes: 6b26ba3a7d95 ('openvswitch: netlink attributes for IPv6 tunneling') +Signed-off-by: Or Gerlitz +Reported-by: Paul Blakey +Acked-by: Jiri Benc +Acked-by: Joe Stringer +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/openvswitch/flow_netlink.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/net/openvswitch/flow_netlink.c ++++ b/net/openvswitch/flow_netlink.c +@@ -588,7 +588,7 @@ static int ip_tun_from_nlattr(const stru + ipv4 = true; + break; + case OVS_TUNNEL_KEY_ATTR_IPV6_SRC: +- SW_FLOW_KEY_PUT(match, tun_key.u.ipv6.dst, ++ SW_FLOW_KEY_PUT(match, tun_key.u.ipv6.src, + nla_get_in6_addr(a), is_mask); + ipv6 = true; + break; diff --git a/queue-4.10/net-properly-release-sk_frag.page.patch b/queue-4.10/net-properly-release-sk_frag.page.patch new file mode 100644 index 00000000000..473ad1e7471 --- /dev/null +++ b/queue-4.10/net-properly-release-sk_frag.page.patch @@ -0,0 +1,52 @@ +From foo@baz Mon Mar 27 18:18:08 CEST 2017 +From: Eric Dumazet +Date: Wed, 15 Mar 2017 13:21:28 -0700 +Subject: net: properly release sk_frag.page + +From: Eric Dumazet + + +[ Upstream commit 22a0e18eac7a9e986fec76c60fa4a2926d1291e2 ] + +I mistakenly added the code to release sk->sk_frag in +sk_common_release() instead of sk_destruct() + +TCP sockets using sk->sk_allocation == GFP_ATOMIC do not call +sk_common_release() at close time, thus leaking one (order-3) page. + +iSCSI is using such sockets. + +Fixes: 5640f7685831 ("net: use a per task frag allocator") +Signed-off-by: Eric Dumazet +Signed-off-by: David S.
Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/core/sock.c | 10 +++++----- + 1 file changed, 5 insertions(+), 5 deletions(-) + +--- a/net/core/sock.c ++++ b/net/core/sock.c +@@ -1444,6 +1444,11 @@ static void __sk_destruct(struct rcu_hea + pr_debug("%s: optmem leakage (%d bytes) detected\n", + __func__, atomic_read(&sk->sk_omem_alloc)); + ++ if (sk->sk_frag.page) { ++ put_page(sk->sk_frag.page); ++ sk->sk_frag.page = NULL; ++ } ++ + if (sk->sk_peer_cred) + put_cred(sk->sk_peer_cred); + put_pid(sk->sk_peer_pid); +@@ -2774,11 +2779,6 @@ void sk_common_release(struct sock *sk) + + sk_refcnt_debug_release(sk); + +- if (sk->sk_frag.page) { +- put_page(sk->sk_frag.page); +- sk->sk_frag.page = NULL; +- } +- + sock_put(sk); + } + EXPORT_SYMBOL(sk_common_release); diff --git a/queue-4.10/net-solve-a-napi-race.patch b/queue-4.10/net-solve-a-napi-race.patch new file mode 100644 index 00000000000..f5c615a2b01 --- /dev/null +++ b/queue-4.10/net-solve-a-napi-race.patch @@ -0,0 +1,255 @@ +From foo@baz Mon Mar 27 18:18:08 CEST 2017 +From: Eric Dumazet +Date: Thu, 16 Mar 2017 19:02:33 -0700 +Subject: net: solve a NAPI race + +From: Eric Dumazet + + +commit 39e6c8208d7b6fb9d2047850fb3327db567b564b upstream. + +While playing with mlx4 hardware timestamping of RX packets, I found +that some packets were received by TCP stack with a ~200 ms delay... + +Since the timestamp was provided by the NIC, and my probe was added +in tcp_v4_rcv() while in BH handler, I was confident it was not +a sender issue, or a drop in the network. + +This would happen with a very low probability, but hurting RPC +workloads. + +A NAPI driver normally arms the IRQ after the napi_complete_done(), +after NAPI_STATE_SCHED is cleared, so that the hard irq handler can grab +it. + +Problem is that if another point in the stack grabs NAPI_STATE_SCHED bit +while IRQ are not disabled, we might have later an IRQ firing and +finding this bit set, right before napi_complete_done() clears it. 
+ +This can happen with busy polling users, or if gro_flush_timeout is +used. But some other uses of napi_schedule() in drivers can cause this +as well. + +thread 1 thread 2 (could be on same cpu, or not) + +// busy polling or napi_watchdog() +napi_schedule(); +... +napi->poll() + +device polling: +read 2 packets from ring buffer + Additional 3rd packet is +available. + device hard irq + + // does nothing because +NAPI_STATE_SCHED bit is owned by thread 1 + napi_schedule(); + +napi_complete_done(napi, 2); +rearm_irq(); + +Note that rearm_irq() will not force the device to send an additional +IRQ for the packet it already signaled (3rd packet in my example) + +This patch adds a new NAPI_STATE_MISSED bit, that napi_schedule_prep() +can set if it could not grab NAPI_STATE_SCHED + +Then napi_complete_done() properly reschedules the napi to make sure +we do not miss something. + +Since we manipulate multiple bits at once, use cmpxchg() like in +sk_busy_loop() to provide proper transactions. + +In v2, I changed napi_watchdog() to use a relaxed variant of +napi_schedule_prep() : No need to set NAPI_STATE_MISSED from this point. + +In v3, I added more details in the changelog and clears +NAPI_STATE_MISSED in busy_poll_stop() + +In v4, I added the ideas given by Alexander Duyck in v3 review + +Signed-off-by: Eric Dumazet +Cc: Alexander Duyck +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + include/linux/netdevice.h | 29 +++++----------- + net/core/dev.c | 81 ++++++++++++++++++++++++++++++++++++++++++---- + 2 files changed, 83 insertions(+), 27 deletions(-) + +--- a/include/linux/netdevice.h ++++ b/include/linux/netdevice.h +@@ -330,6 +330,7 @@ struct napi_struct { + + enum { + NAPI_STATE_SCHED, /* Poll is scheduled */ ++ NAPI_STATE_MISSED, /* reschedule a napi */ + NAPI_STATE_DISABLE, /* Disable pending */ + NAPI_STATE_NPSVC, /* Netpoll - don't dequeue from poll_list */ + NAPI_STATE_HASHED, /* In NAPI hash (busy polling possible) */ +@@ -338,12 +339,13 @@ enum { + }; + + enum { +- NAPIF_STATE_SCHED = (1UL << NAPI_STATE_SCHED), +- NAPIF_STATE_DISABLE = (1UL << NAPI_STATE_DISABLE), +- NAPIF_STATE_NPSVC = (1UL << NAPI_STATE_NPSVC), +- NAPIF_STATE_HASHED = (1UL << NAPI_STATE_HASHED), +- NAPIF_STATE_NO_BUSY_POLL = (1UL << NAPI_STATE_NO_BUSY_POLL), +- NAPIF_STATE_IN_BUSY_POLL = (1UL << NAPI_STATE_IN_BUSY_POLL), ++ NAPIF_STATE_SCHED = BIT(NAPI_STATE_SCHED), ++ NAPIF_STATE_MISSED = BIT(NAPI_STATE_MISSED), ++ NAPIF_STATE_DISABLE = BIT(NAPI_STATE_DISABLE), ++ NAPIF_STATE_NPSVC = BIT(NAPI_STATE_NPSVC), ++ NAPIF_STATE_HASHED = BIT(NAPI_STATE_HASHED), ++ NAPIF_STATE_NO_BUSY_POLL = BIT(NAPI_STATE_NO_BUSY_POLL), ++ NAPIF_STATE_IN_BUSY_POLL = BIT(NAPI_STATE_IN_BUSY_POLL), + }; + + enum gro_result { +@@ -413,20 +415,7 @@ static inline bool napi_disable_pending( + return test_bit(NAPI_STATE_DISABLE, &n->state); + } + +-/** +- * napi_schedule_prep - check if NAPI can be scheduled +- * @n: NAPI context +- * +- * Test if NAPI routine is already running, and if not mark +- * it as running. This is used as a condition variable to +- * insure only one NAPI poll instance runs. We also make +- * sure there is no pending NAPI disable. 
+- */ +-static inline bool napi_schedule_prep(struct napi_struct *n) +-{ +- return !napi_disable_pending(n) && +- !test_and_set_bit(NAPI_STATE_SCHED, &n->state); +-} ++bool napi_schedule_prep(struct napi_struct *n); + + /** + * napi_schedule - schedule NAPI poll +--- a/net/core/dev.c ++++ b/net/core/dev.c +@@ -4913,6 +4913,39 @@ void __napi_schedule(struct napi_struct + EXPORT_SYMBOL(__napi_schedule); + + /** ++ * napi_schedule_prep - check if napi can be scheduled ++ * @n: napi context ++ * ++ * Test if NAPI routine is already running, and if not mark ++ * it as running. This is used as a condition variable ++ * insure only one NAPI poll instance runs. We also make ++ * sure there is no pending NAPI disable. ++ */ ++bool napi_schedule_prep(struct napi_struct *n) ++{ ++ unsigned long val, new; ++ ++ do { ++ val = READ_ONCE(n->state); ++ if (unlikely(val & NAPIF_STATE_DISABLE)) ++ return false; ++ new = val | NAPIF_STATE_SCHED; ++ ++ /* Sets STATE_MISSED bit if STATE_SCHED was already set ++ * This was suggested by Alexander Duyck, as compiler ++ * emits better code than : ++ * if (val & NAPIF_STATE_SCHED) ++ * new |= NAPIF_STATE_MISSED; ++ */ ++ new |= (val & NAPIF_STATE_SCHED) / NAPIF_STATE_SCHED * ++ NAPIF_STATE_MISSED; ++ } while (cmpxchg(&n->state, val, new) != val); ++ ++ return !(val & NAPIF_STATE_SCHED); ++} ++EXPORT_SYMBOL(napi_schedule_prep); ++ ++/** + * __napi_schedule_irqoff - schedule for receive + * @n: entry to schedule + * +@@ -4943,7 +4976,7 @@ EXPORT_SYMBOL(__napi_complete); + + bool napi_complete_done(struct napi_struct *n, int work_done) + { +- unsigned long flags; ++ unsigned long flags, val, new; + + /* + * 1) Don't let napi dequeue from the cpu poll list +@@ -4967,14 +5000,33 @@ bool napi_complete_done(struct napi_stru + else + napi_gro_flush(n, false); + } +- if (likely(list_empty(&n->poll_list))) { +- WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state)); +- } else { ++ if (unlikely(!list_empty(&n->poll_list))) { + /* If 
n->poll_list is not empty, we need to mask irqs */ + local_irq_save(flags); +- __napi_complete(n); ++ list_del_init(&n->poll_list); + local_irq_restore(flags); + } ++ ++ do { ++ val = READ_ONCE(n->state); ++ ++ WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED)); ++ ++ new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED); ++ ++ /* If STATE_MISSED was set, leave STATE_SCHED set, ++ * because we will call napi->poll() one more time. ++ * This C code was suggested by Alexander Duyck to help gcc. ++ */ ++ new |= (val & NAPIF_STATE_MISSED) / NAPIF_STATE_MISSED * ++ NAPIF_STATE_SCHED; ++ } while (cmpxchg(&n->state, val, new) != val); ++ ++ if (unlikely(val & NAPIF_STATE_MISSED)) { ++ __napi_schedule(n); ++ return false; ++ } ++ + return true; + } + EXPORT_SYMBOL(napi_complete_done); +@@ -5000,6 +5052,16 @@ static void busy_poll_stop(struct napi_s + { + int rc; + ++ /* Busy polling means there is a high chance device driver hard irq ++ * could not grab NAPI_STATE_SCHED, and that NAPI_STATE_MISSED was ++ * set in napi_schedule_prep(). ++ * Since we are about to call napi->poll() once more, we can safely ++ * clear NAPI_STATE_MISSED. ++ * ++ * Note: x86 could use a single "lock and ..." instruction ++ * to perform these two clear_bit() ++ */ ++ clear_bit(NAPI_STATE_MISSED, &napi->state); + clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state); + + local_bh_disable(); +@@ -5146,8 +5208,13 @@ static enum hrtimer_restart napi_watchdo + struct napi_struct *napi; + + napi = container_of(timer, struct napi_struct, timer); +- if (napi->gro_list) +- napi_schedule(napi); ++ ++ /* Note : we use a relaxed variant of napi_schedule_prep() not setting ++ * NAPI_STATE_MISSED, since we do not react to a device IRQ. 
++ */ ++ if (napi->gro_list && !napi_disable_pending(napi) && ++ !test_and_set_bit(NAPI_STATE_SCHED, &napi->state)) ++ __napi_schedule_irqoff(napi); + + return HRTIMER_NORESTART; + } diff --git a/queue-4.10/net-unix-properly-re-increment-inflight-counter-of-gc-discarded-candidates.patch b/queue-4.10/net-unix-properly-re-increment-inflight-counter-of-gc-discarded-candidates.patch new file mode 100644 index 00000000000..ccc382070b2 --- /dev/null +++ b/queue-4.10/net-unix-properly-re-increment-inflight-counter-of-gc-discarded-candidates.patch @@ -0,0 +1,111 @@ +From foo@baz Mon Mar 27 18:18:08 CEST 2017 +From: Andrey Ulanov +Date: Tue, 14 Mar 2017 20:16:42 -0700 +Subject: net: unix: properly re-increment inflight counter of GC discarded candidates + +From: Andrey Ulanov + + +[ Upstream commit 7df9c24625b9981779afb8fcdbe2bb4765e61147 ] + +Dmitry has reported that a BUG_ON() condition in unix_notinflight() +may be triggered by a simple code that forwards unix socket in an +SCM_RIGHTS message. +That is caused by incorrect unix socket GC implementation in unix_gc(). + +The GC first collects list of candidates, then (a) decrements their +"children's" inflight counter, (b) checks which inflight counters are +now 0, and then (c) increments all inflight counters back. +(a) and (c) are done by calling scan_children() with inc_inflight or +dec_inflight as the second argument. + +Commit 6209344f5a37 ("net: unix: fix inflight counting bug in garbage +collector") changed scan_children() such that it no longer considers +sockets that do not have UNIX_GC_CANDIDATE flag. It also added a block +of code that unsets this flag _before_ invoking +scan_children(, dec_inflight, ). This may lead to incorrect inflight +counters for some sockets. + +This change fixes this bug by changing order of operations: +UNIX_GC_CANDIDATE is now unset only after all inflight counters are +restored to the original state. + + kernel BUG at net/unix/garbage.c:149!
+ RIP: 0010:[] [] + unix_notinflight+0x3b4/0x490 net/unix/garbage.c:149 + Call Trace: + [] unix_detach_fds.isra.19+0xff/0x170 net/unix/af_unix.c:1487 + [] unix_destruct_scm+0xf9/0x210 net/unix/af_unix.c:1496 + [] skb_release_head_state+0x101/0x200 net/core/skbuff.c:655 + [] skb_release_all+0x1a/0x60 net/core/skbuff.c:668 + [] __kfree_skb+0x1a/0x30 net/core/skbuff.c:684 + [] kfree_skb+0x184/0x570 net/core/skbuff.c:705 + [] unix_release_sock+0x5b5/0xbd0 net/unix/af_unix.c:559 + [] unix_release+0x49/0x90 net/unix/af_unix.c:836 + [] sock_release+0x92/0x1f0 net/socket.c:570 + [] sock_close+0x1b/0x20 net/socket.c:1017 + [] __fput+0x34e/0x910 fs/file_table.c:208 + [] ____fput+0x1a/0x20 fs/file_table.c:244 + [] task_work_run+0x1a0/0x280 kernel/task_work.c:116 + [< inline >] exit_task_work include/linux/task_work.h:21 + [] do_exit+0x183a/0x2640 kernel/exit.c:828 + [] do_group_exit+0x14e/0x420 kernel/exit.c:931 + [] get_signal+0x663/0x1880 kernel/signal.c:2307 + [] do_signal+0xc5/0x2190 arch/x86/kernel/signal.c:807 + [] exit_to_usermode_loop+0x1ea/0x2d0 + arch/x86/entry/common.c:156 + [< inline >] prepare_exit_to_usermode arch/x86/entry/common.c:190 + [] syscall_return_slowpath+0x4d3/0x570 + arch/x86/entry/common.c:259 + [] entry_SYSCALL_64_fastpath+0xc4/0xc6 + +Link: https://lkml.org/lkml/2017/3/6/252 +Signed-off-by: Andrey Ulanov +Reported-by: Dmitry Vyukov +Fixes: 6209344 ("net: unix: fix inflight counting bug in garbage collector") +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/unix/garbage.c | 17 +++++++++-------- + 1 file changed, 9 insertions(+), 8 deletions(-) + +--- a/net/unix/garbage.c ++++ b/net/unix/garbage.c +@@ -146,6 +146,7 @@ void unix_notinflight(struct user_struct + if (s) { + struct unix_sock *u = unix_sk(s); + ++ BUG_ON(!atomic_long_read(&u->inflight)); + BUG_ON(list_empty(&u->link)); + + if (atomic_long_dec_and_test(&u->inflight)) +@@ -341,6 +342,14 @@ void unix_gc(void) + } + list_del(&cursor); + ++ /* Now gc_candidates contains only garbage. Restore original ++ * inflight counters for these as well, and remove the skbuffs ++ * which are creating the cycle(s). ++ */ ++ skb_queue_head_init(&hitlist); ++ list_for_each_entry(u, &gc_candidates, link) ++ scan_children(&u->sk, inc_inflight, &hitlist); ++ + /* not_cycle_list contains those sockets which do not make up a + * cycle. Restore these to the inflight list. + */ +@@ -350,14 +359,6 @@ void unix_gc(void) + list_move_tail(&u->link, &gc_inflight_list); + } + +- /* Now gc_candidates contains only garbage. Restore original +- * inflight counters for these as well, and remove the skbuffs +- * which are creating the cycle(s). +- */ +- skb_queue_head_init(&hitlist); +- list_for_each_entry(u, &gc_candidates, link) +- scan_children(&u->sk, inc_inflight, &hitlist); +- + spin_unlock(&unix_gc_lock); + + /* Here we are. Hitlist is filled. Die. 
*/ diff --git a/queue-4.10/net-vrf-reset-rt6i_idev-in-local-dst-after-put.patch b/queue-4.10/net-vrf-reset-rt6i_idev-in-local-dst-after-put.patch new file mode 100644 index 00000000000..662aee09f74 --- /dev/null +++ b/queue-4.10/net-vrf-reset-rt6i_idev-in-local-dst-after-put.patch @@ -0,0 +1,41 @@ +From foo@baz Mon Mar 27 18:18:08 CEST 2017 +From: David Ahern +Date: Fri, 17 Mar 2017 16:07:11 -0700 +Subject: net: vrf: Reset rt6i_idev in local dst after put + +From: David Ahern + + +[ Upstream commit 3dc857f0e8fc22610a59cbb346ba62c6e921863f ] + +The VRF driver takes a reference to the inet6_dev on the VRF device for +its rt6_local dst when handling local traffic through the VRF device as +a loopback. When the device is deleted the driver does a put on the idev +but does not reset rt6i_idev in the rt6_info struct. When the dst is +destroyed, dst_destroy calls ip6_dst_destroy which does a second put for +what is essentially the same reference causing it to be prematurely freed. +Reset rt6i_idev after the put in the vrf driver. + +Fixes: b4869aa2f881e ("net: vrf: ipv6 support for local traffic to + local addresses") +Signed-off-by: David Ahern +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/vrf.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +--- a/drivers/net/vrf.c ++++ b/drivers/net/vrf.c +@@ -462,8 +462,10 @@ static void vrf_rt6_release(struct net_d + } + + if (rt6_local) { +- if (rt6_local->rt6i_idev) ++ if (rt6_local->rt6i_idev) { + in6_dev_put(rt6_local->rt6i_idev); ++ rt6_local->rt6i_idev = NULL; ++ } + + dst = &rt6_local->dst; + dev_put(dst->dev); diff --git a/queue-4.10/openvswitch-add-missing-case-ovs_tunnel_key_attr_pad.patch b/queue-4.10/openvswitch-add-missing-case-ovs_tunnel_key_attr_pad.patch new file mode 100644 index 00000000000..59e19410414 --- /dev/null +++ b/queue-4.10/openvswitch-add-missing-case-ovs_tunnel_key_attr_pad.patch @@ -0,0 +1,34 @@ +From foo@baz Mon Mar 27 18:18:08 CEST 2017 +From: Kris Murphy +Date: Thu, 16 Mar 2017 10:51:28 -0500 +Subject: openvswitch: Add missing case OVS_TUNNEL_KEY_ATTR_PAD + +From: Kris Murphy + + +[ Upstream commit 8f3dbfd79ed9ef9770305a7cc4e13dfd31ad2cd0 ] + +Added a case for OVS_TUNNEL_KEY_ATTR_PAD to the switch statement +in ip_tun_from_nlattr in order to prevent the default case +returning an error. + +Fixes: b46f6ded906e ("libnl: nla_put_be64(): align on a 64-bit area") +Signed-off-by: Kris Murphy +Acked-by: Joe Stringer +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/openvswitch/flow_netlink.c | 2 ++ + 1 file changed, 2 insertions(+) + +--- a/net/openvswitch/flow_netlink.c ++++ b/net/openvswitch/flow_netlink.c +@@ -649,6 +649,8 @@ static int ip_tun_from_nlattr(const stru + tun_flags |= TUNNEL_VXLAN_OPT; + opts_type = type; + break; ++ case OVS_TUNNEL_KEY_ATTR_PAD: ++ break; + default: + OVS_NLERR(log, "Unknown IP tunnel attribute %d", + type); diff --git a/queue-4.10/qmi_wwan-add-dell-dw5811e.patch b/queue-4.10/qmi_wwan-add-dell-dw5811e.patch new file mode 100644 index 00000000000..d1946ed730e --- /dev/null +++ b/queue-4.10/qmi_wwan-add-dell-dw5811e.patch @@ -0,0 +1,32 @@ +From foo@baz Mon Mar 27 18:18:08 CEST 2017 +From: Bjørn Mork +Date: Fri, 17 Mar 2017 17:20:48 +0100 +Subject: qmi_wwan: add Dell DW5811e + +From: Bjørn Mork + + +[ Upstream commit 6bd845d1cf98b45c634baacb8381436dad3c2dd0 ] + +This is a Dell branded Sierra Wireless EM7455. It is operating in +MBIM mode by default, but can be configured to provide two QMI/RMNET +functions. + +Signed-off-by: Bjørn Mork +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/usb/qmi_wwan.c | 2 ++ + 1 file changed, 2 insertions(+) + +--- a/drivers/net/usb/qmi_wwan.c ++++ b/drivers/net/usb/qmi_wwan.c +@@ -924,6 +924,8 @@ static const struct usb_device_id produc + {QMI_FIXED_INTF(0x413c, 0x81a9, 8)}, /* Dell Wireless 5808e Gobi(TM) 4G LTE Mobile Broadband Card */ + {QMI_FIXED_INTF(0x413c, 0x81b1, 8)}, /* Dell Wireless 5809e Gobi(TM) 4G LTE Mobile Broadband Card */ + {QMI_FIXED_INTF(0x413c, 0x81b3, 8)}, /* Dell Wireless 5809e Gobi(TM) 4G LTE Mobile Broadband Card (rev3) */ ++ {QMI_FIXED_INTF(0x413c, 0x81b6, 8)}, /* Dell Wireless 5811e */ ++ {QMI_FIXED_INTF(0x413c, 0x81b6, 10)}, /* Dell Wireless 5811e */ + {QMI_FIXED_INTF(0x03f0, 0x4e1d, 8)}, /* HP lt4111 LTE/EV-DO/HSPA+ Gobi 4G Module */ + {QMI_FIXED_INTF(0x22de, 0x9061, 3)}, /* WeTelecom WPD-600N */ + {QMI_FIXED_INTF(0x1e0e, 0x9001, 5)}, /* SIMCom 7230E */ diff --git a/queue-4.10/series b/queue-4.10/series new file mode 100644 index 00000000000..d1fb48b27de --- /dev/null +++ b/queue-4.10/series @@ -0,0 +1,24 @@ +net-openvswitch-set-the-ipv6-source-tunnel-key-address-attribute-correctly.patch +net-bcmgenet-do-not-suspend-phy-if-wake-on-lan-is-enabled.patch +net-properly-release-sk_frag.page.patch +amd-xgbe-fix-jumbo-mtu-processing-on-newer-hardware.patch +openvswitch-add-missing-case-ovs_tunnel_key_attr_pad.patch +net-unix-properly-re-increment-inflight-counter-of-gc-discarded-candidates.patch +qmi_wwan-add-dell-dw5811e.patch +net-vrf-reset-rt6i_idev-in-local-dst-after-put.patch +net-mlx5-add-missing-entries-for-set-query-rate-limit-commands.patch +net-mlx5e-change-the-tc-offload-rule-add-del-code-path-to-be-per-nic-or-e-switch.patch +net-mlx5-e-switch-don-t-allow-changing-inline-mode-when-flows-are-configured.patch +net-mlx5e-use-the-proper-uapi-values-when-offloading-tc-vlan-actions.patch +net-mlx5e-avoid-supporting-udp-tunnel-port-ndo-for-vf-reps.patch +net-mlx5-increase-number-of-max-qps-in-default-profile.patch 
+net-mlx5e-count-gso-packets-correctly.patch +net-mlx5e-count-lro-packets-correctly.patch +ipv6-make-sure-to-initialize-sockc.tsflags-before-first-use.patch +net-bcmgenet-remove-bcmgenet_internal_phy_setup.patch +ipv4-provide-stronger-user-input-validation-in-nl_fib_input.patch +socket-bpf-fix-sk_filter-use-after-free-in-sk_clone_lock.patch +genetlink-fix-counting-regression-on-ctrl_dumpfamily.patch +tcp-initialize-icsk_ack.lrcvtime-at-session-start-time.patch +amd-xgbe-fix-the-ecc-related-bit-position-definitions.patch +net-solve-a-napi-race.patch diff --git a/queue-4.10/socket-bpf-fix-sk_filter-use-after-free-in-sk_clone_lock.patch b/queue-4.10/socket-bpf-fix-sk_filter-use-after-free-in-sk_clone_lock.patch new file mode 100644 index 00000000000..46ddb927645 --- /dev/null +++ b/queue-4.10/socket-bpf-fix-sk_filter-use-after-free-in-sk_clone_lock.patch @@ -0,0 +1,65 @@ +From foo@baz Mon Mar 27 18:18:08 CEST 2017 +From: Daniel Borkmann +Date: Wed, 22 Mar 2017 13:08:08 +0100 +Subject: socket, bpf: fix sk_filter use after free in sk_clone_lock + +From: Daniel Borkmann + + +[ Upstream commit a97e50cc4cb67e1e7bff56f6b41cda62ca832336 ] + +In sk_clone_lock(), we create a new socket and inherit most of the +parent's members via sock_copy() which memcpy()'s various sections. +Now, in case the parent socket had a BPF socket filter attached, +then newsk->sk_filter points to the same instance as the original +sk->sk_filter. + +sk_filter_charge() is then called on the newsk->sk_filter to take a +reference and should that fail due to hitting max optmem, we bail +out and release the newsk instance. + +The issue is that commit 278571baca2a ("net: filter: simplify socket +charging") wrongly combined the dismantle path with the failure path +of xfrm_sk_clone_policy(). This means, even when charging failed, we +call sk_free_unlock_clone() on the newsk, which then still points to +the same sk_filter as the original sk. 
+ +Thus, sk_free_unlock_clone() calls into __sk_destruct() eventually +where it tests for present sk_filter and calls sk_filter_uncharge() +on it, which potentially lets sk_omem_alloc wrap around and releases +the eBPF prog and sk_filter structure from the (still intact) parent. + +Fix it by making sure that when sk_filter_charge() failed, we reset +newsk->sk_filter back to NULL before passing to sk_free_unlock_clone(), +so that we don't mess with the parent's sk_filter. + +Only if xfrm_sk_clone_policy() fails, we did reach the point where +either the parent's filter was NULL and as a result newsk's as well +or where we previously had a successful sk_filter_charge(), thus for +that case, we do need sk_filter_uncharge() to release the prior taken +reference on sk_filter. + +Fixes: 278571baca2a ("net: filter: simplify socket charging") +Signed-off-by: Daniel Borkmann +Acked-by: Alexei Starovoitov +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/core/sock.c | 6 ++++++ + 1 file changed, 6 insertions(+) + +--- a/net/core/sock.c ++++ b/net/core/sock.c +@@ -1545,6 +1545,12 @@ struct sock *sk_clone_lock(const struct + is_charged = sk_filter_charge(newsk, filter); + + if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) { ++ /* We need to make sure that we don't uncharge the new ++ * socket if we couldn't charge it in the first place ++ * as otherwise we uncharge the parent's filter. 
++ */ ++ if (!is_charged) ++ RCU_INIT_POINTER(newsk->sk_filter, NULL); + /* It is still raw copy of parent, so invalidate + * destructor and make plain sk_free() */ + newsk->sk_destruct = NULL; diff --git a/queue-4.10/tcp-initialize-icsk_ack.lrcvtime-at-session-start-time.patch b/queue-4.10/tcp-initialize-icsk_ack.lrcvtime-at-session-start-time.patch new file mode 100644 index 00000000000..d4b7029d8b2 --- /dev/null +++ b/queue-4.10/tcp-initialize-icsk_ack.lrcvtime-at-session-start-time.patch @@ -0,0 +1,55 @@ +From foo@baz Mon Mar 27 18:18:08 CEST 2017 +From: Eric Dumazet +Date: Wed, 22 Mar 2017 08:10:21 -0700 +Subject: tcp: initialize icsk_ack.lrcvtime at session start time + +From: Eric Dumazet + + +[ Upstream commit 15bb7745e94a665caf42bfaabf0ce062845b533b ] + +icsk_ack.lrcvtime has a 0 value at socket creation time. + +tcpi_last_data_recv can have bogus value if no payload is ever received. + +This patch initializes icsk_ack.lrcvtime for active sessions +in tcp_finish_connect(), and for passive sessions in +tcp_create_openreq_child() + +Signed-off-by: Eric Dumazet +Acked-by: Neal Cardwell +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv4/tcp_input.c | 2 +- + net/ipv4/tcp_minisocks.c | 1 + + 2 files changed, 2 insertions(+), 1 deletion(-) + +--- a/net/ipv4/tcp_input.c ++++ b/net/ipv4/tcp_input.c +@@ -5571,6 +5571,7 @@ void tcp_finish_connect(struct sock *sk, + struct inet_connection_sock *icsk = inet_csk(sk); + + tcp_set_state(sk, TCP_ESTABLISHED); ++ icsk->icsk_ack.lrcvtime = tcp_time_stamp; + + if (skb) { + icsk->icsk_af_ops->sk_rx_dst_set(sk, skb); +@@ -5789,7 +5790,6 @@ static int tcp_rcv_synsent_state_process + * to stand against the temptation 8) --ANK + */ + inet_csk_schedule_ack(sk); +- icsk->icsk_ack.lrcvtime = tcp_time_stamp; + tcp_enter_quickack_mode(sk); + inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, + TCP_DELACK_MAX, TCP_RTO_MAX); +--- a/net/ipv4/tcp_minisocks.c ++++ b/net/ipv4/tcp_minisocks.c +@@ -466,6 +466,7 @@ struct sock *tcp_create_openreq_child(co + newtp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); + minmax_reset(&newtp->rtt_min, tcp_time_stamp, ~0U); + newicsk->icsk_rto = TCP_TIMEOUT_INIT; ++ newicsk->icsk_ack.lrcvtime = tcp_time_stamp; + + newtp->packets_out = 0; + newtp->retrans_out = 0; diff --git a/queue-4.9/series b/queue-4.9/series new file mode 100644 index 00000000000..38d832cf3a0 --- /dev/null +++ b/queue-4.9/series @@ -0,0 +1,18 @@ +net-openvswitch-set-the-ipv6-source-tunnel-key-address-attribute-correctly.patch +net-bcmgenet-do-not-suspend-phy-if-wake-on-lan-is-enabled.patch +net-properly-release-sk_frag.page.patch +amd-xgbe-fix-jumbo-mtu-processing-on-newer-hardware.patch +openvswitch-add-missing-case-ovs_tunnel_key_attr_pad.patch +net-unix-properly-re-increment-inflight-counter-of-gc-discarded-candidates.patch +qmi_wwan-add-dell-dw5811e.patch +net-vrf-reset-rt6i_idev-in-local-dst-after-put.patch +net-mlx5-add-missing-entries-for-set-query-rate-limit-commands.patch +net-mlx5e-use-the-proper-uapi-values-when-offloading-tc-vlan-actions.patch +net-mlx5-increase-number-of-max-qps-in-default-profile.patch 
+net-mlx5e-count-gso-packets-correctly.patch +net-mlx5e-count-lro-packets-correctly.patch +ipv6-make-sure-to-initialize-sockc.tsflags-before-first-use.patch +net-bcmgenet-remove-bcmgenet_internal_phy_setup.patch +ipv4-provide-stronger-user-input-validation-in-nl_fib_input.patch +socket-bpf-fix-sk_filter-use-after-free-in-sk_clone_lock.patch +tcp-initialize-icsk_ack.lrcvtime-at-session-start-time.patch