From 5118fe1daf568d9bb5ee8fbb63ad6e82ef68a6a6 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Mon, 27 Mar 2017 18:24:37 +0200 Subject: [PATCH] 4.4-stable patches added patches: amd-xgbe-fix-jumbo-mtu-processing-on-newer-hardware.patch ipv4-provide-stronger-user-input-validation-in-nl_fib_input.patch net-bcmgenet-do-not-suspend-phy-if-wake-on-lan-is-enabled.patch net-bcmgenet-remove-bcmgenet_internal_phy_setup.patch net-mlx5-increase-number-of-max-qps-in-default-profile.patch net-mlx5e-count-lro-packets-correctly.patch net-openvswitch-set-the-ipv6-source-tunnel-key-address-attribute-correctly.patch net-properly-release-sk_frag.page.patch net-unix-properly-re-increment-inflight-counter-of-gc-discarded-candidates.patch socket-bpf-fix-sk_filter-use-after-free-in-sk_clone_lock.patch tcp-initialize-icsk_ack.lrcvtime-at-session-start-time.patch --- ...mbo-mtu-processing-on-newer-hardware.patch | 284 ++++++++++++++++++ ...ser-input-validation-in-nl_fib_input.patch | 39 +++ ...uspend-phy-if-wake-on-lan-is-enabled.patch | 43 +++ ...t-remove-bcmgenet_internal_phy_setup.patch | 85 ++++++ ...number-of-max-qps-in-default-profile.patch | 34 +++ ...et-mlx5e-count-lro-packets-correctly.patch | 54 ++++ ...nnel-key-address-attribute-correctly.patch | 36 +++ .../net-properly-release-sk_frag.page.patch | 52 ++++ ...t-counter-of-gc-discarded-candidates.patch | 111 +++++++ queue-4.4/series | 11 + ...lter-use-after-free-in-sk_clone_lock.patch | 65 ++++ ...k_ack.lrcvtime-at-session-start-time.patch | 55 ++++ 12 files changed, 869 insertions(+) create mode 100644 queue-4.4/amd-xgbe-fix-jumbo-mtu-processing-on-newer-hardware.patch create mode 100644 queue-4.4/ipv4-provide-stronger-user-input-validation-in-nl_fib_input.patch create mode 100644 queue-4.4/net-bcmgenet-do-not-suspend-phy-if-wake-on-lan-is-enabled.patch create mode 100644 queue-4.4/net-bcmgenet-remove-bcmgenet_internal_phy_setup.patch create mode 100644 
queue-4.4/net-mlx5-increase-number-of-max-qps-in-default-profile.patch create mode 100644 queue-4.4/net-mlx5e-count-lro-packets-correctly.patch create mode 100644 queue-4.4/net-openvswitch-set-the-ipv6-source-tunnel-key-address-attribute-correctly.patch create mode 100644 queue-4.4/net-properly-release-sk_frag.page.patch create mode 100644 queue-4.4/net-unix-properly-re-increment-inflight-counter-of-gc-discarded-candidates.patch create mode 100644 queue-4.4/series create mode 100644 queue-4.4/socket-bpf-fix-sk_filter-use-after-free-in-sk_clone_lock.patch create mode 100644 queue-4.4/tcp-initialize-icsk_ack.lrcvtime-at-session-start-time.patch diff --git a/queue-4.4/amd-xgbe-fix-jumbo-mtu-processing-on-newer-hardware.patch b/queue-4.4/amd-xgbe-fix-jumbo-mtu-processing-on-newer-hardware.patch new file mode 100644 index 00000000000..26746dae745 --- /dev/null +++ b/queue-4.4/amd-xgbe-fix-jumbo-mtu-processing-on-newer-hardware.patch @@ -0,0 +1,284 @@ +From foo@baz Mon Mar 27 18:22:09 CEST 2017 +From: "Lendacky, Thomas" +Date: Wed, 15 Mar 2017 15:11:23 -0500 +Subject: amd-xgbe: Fix jumbo MTU processing on newer hardware + +From: "Lendacky, Thomas" + + +[ Upstream commit 622c36f143fc9566ba49d7cec994c2da1182d9e2 ] + +Newer hardware does not provide a cumulative payload length when multiple +descriptors are needed to handle the data. Once the MTU increases beyond +the size that can be handled by a single descriptor, the SKB does not get +built properly by the driver. + +The driver will now calculate the size of the data buffers used by the +hardware. The first buffer of the first descriptor is for packet headers +or packet headers and data when the headers can't be split. Subsequent +descriptors in a multi-descriptor chain will not use the first buffer. The +second buffer is used by all the descriptors in the chain for payload data. 
+Based on whether the driver is processing the first, intermediate, or last +descriptor it can calculate the buffer usage and build the SKB properly. + +Tested and verified on both old and new hardware. + +Signed-off-by: Tom Lendacky +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/amd/xgbe/xgbe-common.h | 6 + + drivers/net/ethernet/amd/xgbe/xgbe-dev.c | 20 +++-- + drivers/net/ethernet/amd/xgbe/xgbe-drv.c | 102 +++++++++++++++++----------- + 3 files changed, 78 insertions(+), 50 deletions(-) + +--- a/drivers/net/ethernet/amd/xgbe/xgbe-common.h ++++ b/drivers/net/ethernet/amd/xgbe/xgbe-common.h +@@ -913,8 +913,8 @@ + #define RX_PACKET_ATTRIBUTES_CSUM_DONE_WIDTH 1 + #define RX_PACKET_ATTRIBUTES_VLAN_CTAG_INDEX 1 + #define RX_PACKET_ATTRIBUTES_VLAN_CTAG_WIDTH 1 +-#define RX_PACKET_ATTRIBUTES_INCOMPLETE_INDEX 2 +-#define RX_PACKET_ATTRIBUTES_INCOMPLETE_WIDTH 1 ++#define RX_PACKET_ATTRIBUTES_LAST_INDEX 2 ++#define RX_PACKET_ATTRIBUTES_LAST_WIDTH 1 + #define RX_PACKET_ATTRIBUTES_CONTEXT_NEXT_INDEX 3 + #define RX_PACKET_ATTRIBUTES_CONTEXT_NEXT_WIDTH 1 + #define RX_PACKET_ATTRIBUTES_CONTEXT_INDEX 4 +@@ -923,6 +923,8 @@ + #define RX_PACKET_ATTRIBUTES_RX_TSTAMP_WIDTH 1 + #define RX_PACKET_ATTRIBUTES_RSS_HASH_INDEX 6 + #define RX_PACKET_ATTRIBUTES_RSS_HASH_WIDTH 1 ++#define RX_PACKET_ATTRIBUTES_FIRST_INDEX 7 ++#define RX_PACKET_ATTRIBUTES_FIRST_WIDTH 1 + + #define RX_NORMAL_DESC0_OVT_INDEX 0 + #define RX_NORMAL_DESC0_OVT_WIDTH 16 +--- a/drivers/net/ethernet/amd/xgbe/xgbe-dev.c ++++ b/drivers/net/ethernet/amd/xgbe/xgbe-dev.c +@@ -1658,10 +1658,15 @@ static int xgbe_dev_read(struct xgbe_cha + + /* Get the header length */ + if (XGMAC_GET_BITS_LE(rdesc->desc3, RX_NORMAL_DESC3, FD)) { ++ XGMAC_SET_BITS(packet->attributes, RX_PACKET_ATTRIBUTES, ++ FIRST, 1); + rdata->rx.hdr_len = XGMAC_GET_BITS_LE(rdesc->desc2, + RX_NORMAL_DESC2, HL); + if (rdata->rx.hdr_len) + pdata->ext_stats.rx_split_header_packets++; ++ } else { ++ 
XGMAC_SET_BITS(packet->attributes, RX_PACKET_ATTRIBUTES, ++ FIRST, 0); + } + + /* Get the RSS hash */ +@@ -1684,19 +1689,16 @@ static int xgbe_dev_read(struct xgbe_cha + } + } + +- /* Get the packet length */ +- rdata->rx.len = XGMAC_GET_BITS_LE(rdesc->desc3, RX_NORMAL_DESC3, PL); +- +- if (!XGMAC_GET_BITS_LE(rdesc->desc3, RX_NORMAL_DESC3, LD)) { +- /* Not all the data has been transferred for this packet */ +- XGMAC_SET_BITS(packet->attributes, RX_PACKET_ATTRIBUTES, +- INCOMPLETE, 1); ++ /* Not all the data has been transferred for this packet */ ++ if (!XGMAC_GET_BITS_LE(rdesc->desc3, RX_NORMAL_DESC3, LD)) + return 0; +- } + + /* This is the last of the data for this packet */ + XGMAC_SET_BITS(packet->attributes, RX_PACKET_ATTRIBUTES, +- INCOMPLETE, 0); ++ LAST, 1); ++ ++ /* Get the packet length */ ++ rdata->rx.len = XGMAC_GET_BITS_LE(rdesc->desc3, RX_NORMAL_DESC3, PL); + + /* Set checksum done indicator as appropriate */ + if (netdev->features & NETIF_F_RXCSUM) +--- a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c ++++ b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c +@@ -1760,13 +1760,12 @@ static struct sk_buff *xgbe_create_skb(s + { + struct sk_buff *skb; + u8 *packet; +- unsigned int copy_len; + + skb = napi_alloc_skb(napi, rdata->rx.hdr.dma_len); + if (!skb) + return NULL; + +- /* Start with the header buffer which may contain just the header ++ /* Pull in the header buffer which may contain just the header + * or the header plus data + */ + dma_sync_single_range_for_cpu(pdata->dev, rdata->rx.hdr.dma_base, +@@ -1775,30 +1774,49 @@ static struct sk_buff *xgbe_create_skb(s + + packet = page_address(rdata->rx.hdr.pa.pages) + + rdata->rx.hdr.pa.pages_offset; +- copy_len = (rdata->rx.hdr_len) ? 
rdata->rx.hdr_len : len; +- copy_len = min(rdata->rx.hdr.dma_len, copy_len); +- skb_copy_to_linear_data(skb, packet, copy_len); +- skb_put(skb, copy_len); +- +- len -= copy_len; +- if (len) { +- /* Add the remaining data as a frag */ +- dma_sync_single_range_for_cpu(pdata->dev, +- rdata->rx.buf.dma_base, +- rdata->rx.buf.dma_off, +- rdata->rx.buf.dma_len, +- DMA_FROM_DEVICE); +- +- skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, +- rdata->rx.buf.pa.pages, +- rdata->rx.buf.pa.pages_offset, +- len, rdata->rx.buf.dma_len); +- rdata->rx.buf.pa.pages = NULL; +- } ++ skb_copy_to_linear_data(skb, packet, len); ++ skb_put(skb, len); + + return skb; + } + ++static unsigned int xgbe_rx_buf1_len(struct xgbe_ring_data *rdata, ++ struct xgbe_packet_data *packet) ++{ ++ /* Always zero if not the first descriptor */ ++ if (!XGMAC_GET_BITS(packet->attributes, RX_PACKET_ATTRIBUTES, FIRST)) ++ return 0; ++ ++ /* First descriptor with split header, return header length */ ++ if (rdata->rx.hdr_len) ++ return rdata->rx.hdr_len; ++ ++ /* First descriptor but not the last descriptor and no split header, ++ * so the full buffer was used ++ */ ++ if (!XGMAC_GET_BITS(packet->attributes, RX_PACKET_ATTRIBUTES, LAST)) ++ return rdata->rx.hdr.dma_len; ++ ++ /* First descriptor and last descriptor and no split header, so ++ * calculate how much of the buffer was used ++ */ ++ return min_t(unsigned int, rdata->rx.hdr.dma_len, rdata->rx.len); ++} ++ ++static unsigned int xgbe_rx_buf2_len(struct xgbe_ring_data *rdata, ++ struct xgbe_packet_data *packet, ++ unsigned int len) ++{ ++ /* Always the full buffer if not the last descriptor */ ++ if (!XGMAC_GET_BITS(packet->attributes, RX_PACKET_ATTRIBUTES, LAST)) ++ return rdata->rx.buf.dma_len; ++ ++ /* Last descriptor so calculate how much of the buffer was used ++ * for the last bit of data ++ */ ++ return rdata->rx.len - len; ++} ++ + static int xgbe_tx_poll(struct xgbe_channel *channel) + { + struct xgbe_prv_data *pdata = channel->pdata; +@@ -1881,8 
+1899,8 @@ static int xgbe_rx_poll(struct xgbe_chan + struct napi_struct *napi; + struct sk_buff *skb; + struct skb_shared_hwtstamps *hwtstamps; +- unsigned int incomplete, error, context_next, context; +- unsigned int len, rdesc_len, max_len; ++ unsigned int last, error, context_next, context; ++ unsigned int len, buf1_len, buf2_len, max_len; + unsigned int received = 0; + int packet_count = 0; + +@@ -1892,7 +1910,7 @@ static int xgbe_rx_poll(struct xgbe_chan + if (!ring) + return 0; + +- incomplete = 0; ++ last = 0; + context_next = 0; + + napi = (pdata->per_channel_irq) ? &channel->napi : &pdata->napi; +@@ -1926,9 +1944,8 @@ read_again: + received++; + ring->cur++; + +- incomplete = XGMAC_GET_BITS(packet->attributes, +- RX_PACKET_ATTRIBUTES, +- INCOMPLETE); ++ last = XGMAC_GET_BITS(packet->attributes, RX_PACKET_ATTRIBUTES, ++ LAST); + context_next = XGMAC_GET_BITS(packet->attributes, + RX_PACKET_ATTRIBUTES, + CONTEXT_NEXT); +@@ -1937,7 +1954,7 @@ read_again: + CONTEXT); + + /* Earlier error, just drain the remaining data */ +- if ((incomplete || context_next) && error) ++ if ((!last || context_next) && error) + goto read_again; + + if (error || packet->errors) { +@@ -1949,16 +1966,22 @@ read_again: + } + + if (!context) { +- /* Length is cumulative, get this descriptor's length */ +- rdesc_len = rdata->rx.len - len; +- len += rdesc_len; ++ /* Get the data length in the descriptor buffers */ ++ buf1_len = xgbe_rx_buf1_len(rdata, packet); ++ len += buf1_len; ++ buf2_len = xgbe_rx_buf2_len(rdata, packet, len); ++ len += buf2_len; + +- if (rdesc_len && !skb) { ++ if (!skb) { + skb = xgbe_create_skb(pdata, napi, rdata, +- rdesc_len); +- if (!skb) ++ buf1_len); ++ if (!skb) { + error = 1; +- } else if (rdesc_len) { ++ goto skip_data; ++ } ++ } ++ ++ if (buf2_len) { + dma_sync_single_range_for_cpu(pdata->dev, + rdata->rx.buf.dma_base, + rdata->rx.buf.dma_off, +@@ -1968,13 +1991,14 @@ read_again: + skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, + 
rdata->rx.buf.pa.pages, + rdata->rx.buf.pa.pages_offset, +- rdesc_len, ++ buf2_len, + rdata->rx.buf.dma_len); + rdata->rx.buf.pa.pages = NULL; + } + } + +- if (incomplete || context_next) ++skip_data: ++ if (!last || context_next) + goto read_again; + + if (!skb) +@@ -2033,7 +2057,7 @@ next_packet: + } + + /* Check if we need to save state before leaving */ +- if (received && (incomplete || context_next)) { ++ if (received && (!last || context_next)) { + rdata = XGBE_GET_DESC_DATA(ring, ring->cur); + rdata->state_saved = 1; + rdata->state.skb = skb; diff --git a/queue-4.4/ipv4-provide-stronger-user-input-validation-in-nl_fib_input.patch b/queue-4.4/ipv4-provide-stronger-user-input-validation-in-nl_fib_input.patch new file mode 100644 index 00000000000..0f1b94c621b --- /dev/null +++ b/queue-4.4/ipv4-provide-stronger-user-input-validation-in-nl_fib_input.patch @@ -0,0 +1,39 @@ +From foo@baz Mon Mar 27 18:22:09 CEST 2017 +From: Eric Dumazet +Date: Tue, 21 Mar 2017 19:22:28 -0700 +Subject: ipv4: provide stronger user input validation in nl_fib_input() + +From: Eric Dumazet + + +[ Upstream commit c64c0b3cac4c5b8cb093727d2c19743ea3965c0b ] + +Alexander reported a KMSAN splat caused by reads of uninitialized +field (tb_id_in) from user provided struct fib_result_nl + +It turns out nl_fib_input() sanity tests on user input is a bit +wrong : + +User can pretend nlh->nlmsg_len is big enough, but provide +at sendmsg() time a too small buffer. + +Reported-by: Alexander Potapenko +Signed-off-by: Eric Dumazet +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv4/fib_frontend.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +--- a/net/ipv4/fib_frontend.c ++++ b/net/ipv4/fib_frontend.c +@@ -1080,7 +1080,8 @@ static void nl_fib_input(struct sk_buff + + net = sock_net(skb->sk); + nlh = nlmsg_hdr(skb); +- if (skb->len < NLMSG_HDRLEN || skb->len < nlh->nlmsg_len || ++ if (skb->len < nlmsg_total_size(sizeof(*frn)) || ++ skb->len < nlh->nlmsg_len || + nlmsg_len(nlh) < sizeof(*frn)) + return; + diff --git a/queue-4.4/net-bcmgenet-do-not-suspend-phy-if-wake-on-lan-is-enabled.patch b/queue-4.4/net-bcmgenet-do-not-suspend-phy-if-wake-on-lan-is-enabled.patch new file mode 100644 index 00000000000..b2838a58a93 --- /dev/null +++ b/queue-4.4/net-bcmgenet-do-not-suspend-phy-if-wake-on-lan-is-enabled.patch @@ -0,0 +1,43 @@ +From foo@baz Mon Mar 27 18:22:09 CEST 2017 +From: Florian Fainelli +Date: Wed, 15 Mar 2017 12:57:21 -0700 +Subject: net: bcmgenet: Do not suspend PHY if Wake-on-LAN is enabled + +From: Florian Fainelli + + +[ Upstream commit 5371bbf4b295eea334ed453efa286afa2c3ccff3 ] + +Suspending the PHY would be putting it in a low power state where it +may no longer allow us to do Wake-on-LAN. + +Fixes: cc013fb48898 ("net: bcmgenet: correctly suspend and resume PHY device") +Signed-off-by: Florian Fainelli +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/broadcom/genet/bcmgenet.c | 6 ++++-- + 1 file changed, 4 insertions(+), 2 deletions(-) + +--- a/drivers/net/ethernet/broadcom/genet/bcmgenet.c ++++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.c +@@ -3495,7 +3495,8 @@ static int bcmgenet_suspend(struct devic + + bcmgenet_netif_stop(dev); + +- phy_suspend(priv->phydev); ++ if (!device_may_wakeup(d)) ++ phy_suspend(priv->phydev); + + netif_device_detach(dev); + +@@ -3592,7 +3593,8 @@ static int bcmgenet_resume(struct device + + netif_device_attach(dev); + +- phy_resume(priv->phydev); ++ if (!device_may_wakeup(d)) ++ phy_resume(priv->phydev); + + if (priv->eee.eee_enabled) + bcmgenet_eee_enable_set(dev, true); diff --git a/queue-4.4/net-bcmgenet-remove-bcmgenet_internal_phy_setup.patch b/queue-4.4/net-bcmgenet-remove-bcmgenet_internal_phy_setup.patch new file mode 100644 index 00000000000..cf7758d38d8 --- /dev/null +++ b/queue-4.4/net-bcmgenet-remove-bcmgenet_internal_phy_setup.patch @@ -0,0 +1,85 @@ +From foo@baz Mon Mar 27 18:22:09 CEST 2017 +From: Doug Berger +Date: Tue, 21 Mar 2017 14:01:06 -0700 +Subject: net: bcmgenet: remove bcmgenet_internal_phy_setup() + +From: Doug Berger + + +[ Upstream commit 31739eae738ccbe8b9d627c3f2251017ca03f4d2 ] + +Commit 6ac3ce8295e6 ("net: bcmgenet: Remove excessive PHY reset") +removed the bcmgenet_mii_reset() function from bcmgenet_power_up() and +bcmgenet_internal_phy_setup() functions. In so doing it broke the reset +of the internal PHY devices used by the GENETv1-GENETv3 which required +this reset before the UniMAC was enabled. It also broke the internal +GPHY devices used by the GENETv4 because the config_init that installed +the AFE workaround was no longer occurring after the reset of the GPHY +performed by bcmgenet_phy_power_set() in bcmgenet_internal_phy_setup(). 
+In addition the code in bcmgenet_internal_phy_setup() related to the +"enable APD" comment goes with the bcmgenet_mii_reset() so it should +have also been removed. + +Commit bd4060a6108b ("net: bcmgenet: Power on integrated GPHY in +bcmgenet_power_up()") moved the bcmgenet_phy_power_set() call to the +bcmgenet_power_up() function, but failed to remove it from the +bcmgenet_internal_phy_setup() function. Had it done so, the +bcmgenet_internal_phy_setup() function would have been empty and could +have been removed at that time. + +Commit 5dbebbb44a6a ("net: bcmgenet: Software reset EPHY after power on") +was submitted to correct the functional problems introduced by +commit 6ac3ce8295e6 ("net: bcmgenet: Remove excessive PHY reset"). It +was included in v4.4 and made available on 4.3-stable. Unfortunately, +it didn't fully revert the commit because this bcmgenet_mii_reset() +doesn't apply the soft reset to the internal GPHY used by GENETv4 like +the previous one did. This prevents the restoration of the AFE work- +arounds for internal GPHY devices after the bcmgenet_phy_power_set() in +bcmgenet_internal_phy_setup(). + +This commit takes the alternate approach of removing the unnecessary +bcmgenet_internal_phy_setup() function which shouldn't have been in v4.3 +so that when bcmgenet_mii_reset() was restored it should have only gone +into bcmgenet_power_up(). This will avoid the problems while also +removing the redundancy (and hopefully some of the confusion). + +Fixes: 6ac3ce8295e6 ("net: bcmgenet: Remove excessive PHY reset") +Signed-off-by: Doug Berger +Reviewed-by: Florian Fainelli +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/broadcom/genet/bcmmii.c | 15 --------------- + 1 file changed, 15 deletions(-) + +--- a/drivers/net/ethernet/broadcom/genet/bcmmii.c ++++ b/drivers/net/ethernet/broadcom/genet/bcmmii.c +@@ -220,20 +220,6 @@ void bcmgenet_phy_power_set(struct net_d + udelay(60); + } + +-static void bcmgenet_internal_phy_setup(struct net_device *dev) +-{ +- struct bcmgenet_priv *priv = netdev_priv(dev); +- u32 reg; +- +- /* Power up PHY */ +- bcmgenet_phy_power_set(dev, true); +- /* enable APD */ +- reg = bcmgenet_ext_readl(priv, EXT_EXT_PWR_MGMT); +- reg |= EXT_PWR_DN_EN_LD; +- bcmgenet_ext_writel(priv, reg, EXT_EXT_PWR_MGMT); +- bcmgenet_mii_reset(dev); +-} +- + static void bcmgenet_moca_phy_setup(struct bcmgenet_priv *priv) + { + u32 reg; +@@ -281,7 +267,6 @@ int bcmgenet_mii_config(struct net_devic + + if (priv->internal_phy) { + phy_name = "internal PHY"; +- bcmgenet_internal_phy_setup(dev); + } else if (priv->phy_interface == PHY_INTERFACE_MODE_MOCA) { + phy_name = "MoCA"; + bcmgenet_moca_phy_setup(priv); diff --git a/queue-4.4/net-mlx5-increase-number-of-max-qps-in-default-profile.patch b/queue-4.4/net-mlx5-increase-number-of-max-qps-in-default-profile.patch new file mode 100644 index 00000000000..27e345d201f --- /dev/null +++ b/queue-4.4/net-mlx5-increase-number-of-max-qps-in-default-profile.patch @@ -0,0 +1,34 @@ +From foo@baz Mon Mar 27 18:22:09 CEST 2017 +From: Maor Gottlieb +Date: Tue, 21 Mar 2017 15:59:17 +0200 +Subject: net/mlx5: Increase number of max QPs in default profile + +From: Maor Gottlieb + + +[ Upstream commit 5f40b4ed975c26016cf41953b7510fe90718e21c ] + +With ConnectX-4 sharing SRQs from the same space as QPs, we hit a +limit preventing some applications to allocate needed QPs amount. +Double the size to 256K. + +Fixes: e126ba97dba9e ('mlx5: Add driver for Mellanox Connect-IB adapters') +Signed-off-by: Maor Gottlieb +Signed-off-by: Saeed Mahameed +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/mellanox/mlx5/core/main.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c +@@ -85,7 +85,7 @@ static struct mlx5_profile profile[] = { + [2] = { + .mask = MLX5_PROF_MASK_QP_SIZE | + MLX5_PROF_MASK_MR_CACHE, +- .log_max_qp = 17, ++ .log_max_qp = 18, + .mr_cache[0] = { + .size = 500, + .limit = 250 diff --git a/queue-4.4/net-mlx5e-count-lro-packets-correctly.patch b/queue-4.4/net-mlx5e-count-lro-packets-correctly.patch new file mode 100644 index 00000000000..14317f7eb2c --- /dev/null +++ b/queue-4.4/net-mlx5e-count-lro-packets-correctly.patch @@ -0,0 +1,54 @@ +From foo@baz Mon Mar 27 18:22:09 CEST 2017 +From: Gal Pressman +Date: Tue, 21 Mar 2017 15:59:19 +0200 +Subject: net/mlx5e: Count LRO packets correctly + +From: Gal Pressman + + +[ Upstream commit 8ab7e2ae15d84ba758b2c8c6f4075722e9bd2a08 ] + +RX packets statistics ('rx_packets' counter) used to count LRO packets +as one, even though it contains multiple segments. +This patch will increment the counter by the number of segments, and +align the driver with the behavior of other drivers in the stack. + +Note that no information is lost in this patch due to 'rx_lro_packets' +counter existence. + +Before, ethtool showed: +$ ethtool -S ens6 | egrep "rx_packets|rx_lro_packets" + rx_packets: 435277 + rx_lro_packets: 35847 + rx_packets_phy: 1935066 + +Now, we will see the more logical statistics: +$ ethtool -S ens6 | egrep "rx_packets|rx_lro_packets" + rx_packets: 1935066 + rx_lro_packets: 35847 + rx_packets_phy: 1935066 + +Fixes: e586b3b0baee ("net/mlx5: Ethernet Datapath files") +Signed-off-by: Gal Pressman +Cc: kernel-team@fb.com +Signed-off-by: Saeed Mahameed +Acked-by: Alexei Starovoitov +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 4 ++++ + 1 file changed, 4 insertions(+) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +@@ -197,6 +197,10 @@ static inline void mlx5e_build_rx_skb(st + if (lro_num_seg > 1) { + mlx5e_lro_update_hdr(skb, cqe); + skb_shinfo(skb)->gso_size = DIV_ROUND_UP(cqe_bcnt, lro_num_seg); ++ /* Subtract one since we already counted this as one ++ * "regular" packet in mlx5e_complete_rx_cqe() ++ */ ++ rq->stats.packets += lro_num_seg - 1; + rq->stats.lro_packets++; + rq->stats.lro_bytes += cqe_bcnt; + } diff --git a/queue-4.4/net-openvswitch-set-the-ipv6-source-tunnel-key-address-attribute-correctly.patch b/queue-4.4/net-openvswitch-set-the-ipv6-source-tunnel-key-address-attribute-correctly.patch new file mode 100644 index 00000000000..b348a3883d2 --- /dev/null +++ b/queue-4.4/net-openvswitch-set-the-ipv6-source-tunnel-key-address-attribute-correctly.patch @@ -0,0 +1,36 @@ +From foo@baz Mon Mar 27 18:22:09 CEST 2017 +From: Or Gerlitz +Date: Wed, 15 Mar 2017 18:10:47 +0200 +Subject: net/openvswitch: Set the ipv6 source tunnel key address attribute correctly + +From: Or Gerlitz + + +[ Upstream commit 3d20f1f7bd575d147ffa75621fa560eea0aec690 ] + +When dealing with ipv6 source tunnel key address attribute +(OVS_TUNNEL_KEY_ATTR_IPV6_SRC) we are wrongly setting the tunnel +dst ip, fix that. + +Fixes: 6b26ba3a7d95 ('openvswitch: netlink attributes for IPv6 tunneling') +Signed-off-by: Or Gerlitz +Reported-by: Paul Blakey +Acked-by: Jiri Benc +Acked-by: Joe Stringer +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/openvswitch/flow_netlink.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/net/openvswitch/flow_netlink.c ++++ b/net/openvswitch/flow_netlink.c +@@ -588,7 +588,7 @@ static int ip_tun_from_nlattr(const stru + ipv4 = true; + break; + case OVS_TUNNEL_KEY_ATTR_IPV6_SRC: +- SW_FLOW_KEY_PUT(match, tun_key.u.ipv6.dst, ++ SW_FLOW_KEY_PUT(match, tun_key.u.ipv6.src, + nla_get_in6_addr(a), is_mask); + ipv6 = true; + break; diff --git a/queue-4.4/net-properly-release-sk_frag.page.patch b/queue-4.4/net-properly-release-sk_frag.page.patch new file mode 100644 index 00000000000..a251a4c83ae --- /dev/null +++ b/queue-4.4/net-properly-release-sk_frag.page.patch @@ -0,0 +1,52 @@ +From foo@baz Mon Mar 27 18:22:09 CEST 2017 +From: Eric Dumazet +Date: Wed, 15 Mar 2017 13:21:28 -0700 +Subject: net: properly release sk_frag.page + +From: Eric Dumazet + + +[ Upstream commit 22a0e18eac7a9e986fec76c60fa4a2926d1291e2 ] + +I mistakenly added the code to release sk->sk_frag in +sk_common_release() instead of sk_destruct() + +TCP sockets using sk->sk_allocation == GFP_ATOMIC do not call +sk_common_release() at close time, thus leaking one (order-3) page. + +iSCSI is using such sockets. + +Fixes: 5640f7685831 ("net: use a per task frag allocator") +Signed-off-by: Eric Dumazet +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/core/sock.c | 10 +++++----- + 1 file changed, 5 insertions(+), 5 deletions(-) + +--- a/net/core/sock.c ++++ b/net/core/sock.c +@@ -1459,6 +1459,11 @@ void sk_destruct(struct sock *sk) + pr_debug("%s: optmem leakage (%d bytes) detected\n", + __func__, atomic_read(&sk->sk_omem_alloc)); + ++ if (sk->sk_frag.page) { ++ put_page(sk->sk_frag.page); ++ sk->sk_frag.page = NULL; ++ } ++ + if (sk->sk_peer_cred) + put_cred(sk->sk_peer_cred); + put_pid(sk->sk_peer_pid); +@@ -2691,11 +2696,6 @@ void sk_common_release(struct sock *sk) + + sk_refcnt_debug_release(sk); + +- if (sk->sk_frag.page) { +- put_page(sk->sk_frag.page); +- sk->sk_frag.page = NULL; +- } +- + sock_put(sk); + } + EXPORT_SYMBOL(sk_common_release); diff --git a/queue-4.4/net-unix-properly-re-increment-inflight-counter-of-gc-discarded-candidates.patch b/queue-4.4/net-unix-properly-re-increment-inflight-counter-of-gc-discarded-candidates.patch new file mode 100644 index 00000000000..e0b4d5adf30 --- /dev/null +++ b/queue-4.4/net-unix-properly-re-increment-inflight-counter-of-gc-discarded-candidates.patch @@ -0,0 +1,111 @@ +From foo@baz Mon Mar 27 18:22:09 CEST 2017 +From: Andrey Ulanov +Date: Tue, 14 Mar 2017 20:16:42 -0700 +Subject: net: unix: properly re-increment inflight counter of GC discarded candidates + +From: Andrey Ulanov + + +[ Upstream commit 7df9c24625b9981779afb8fcdbe2bb4765e61147 ] + +Dmitry has reported that a BUG_ON() condition in unix_notinflight() +may be triggered by a simple code that forwards unix socket in an +SCM_RIGHTS message. +That is caused by incorrect unix socket GC implementation in unix_gc(). + +The GC first collects list of candidates, then (a) decrements their +"children's" inflight counter, (b) checks which inflight counters are +now 0, and then (c) increments all inflight counters back. +(a) and (c) are done by calling scan_children() with inc_inflight or +dec_inflight as the second argument. 
+ +Commit 6209344f5a37 ("net: unix: fix inflight counting bug in garbage +collector") changed scan_children() such that it no longer considers +sockets that do not have UNIX_GC_CANDIDATE flag. It also added a block +of code that unsets this flag _before_ invoking +scan_children(, dec_inflight, ). This may lead to incorrect inflight +counters for some sockets. + +This change fixes this bug by changing order of operations: +UNIX_GC_CANDIDATE is now unset only after all inflight counters are +restored to the original state. + + kernel BUG at net/unix/garbage.c:149! + RIP: 0010:[] [] + unix_notinflight+0x3b4/0x490 net/unix/garbage.c:149 + Call Trace: + [] unix_detach_fds.isra.19+0xff/0x170 net/unix/af_unix.c:1487 + [] unix_destruct_scm+0xf9/0x210 net/unix/af_unix.c:1496 + [] skb_release_head_state+0x101/0x200 net/core/skbuff.c:655 + [] skb_release_all+0x1a/0x60 net/core/skbuff.c:668 + [] __kfree_skb+0x1a/0x30 net/core/skbuff.c:684 + [] kfree_skb+0x184/0x570 net/core/skbuff.c:705 + [] unix_release_sock+0x5b5/0xbd0 net/unix/af_unix.c:559 + [] unix_release+0x49/0x90 net/unix/af_unix.c:836 + [] sock_release+0x92/0x1f0 net/socket.c:570 + [] sock_close+0x1b/0x20 net/socket.c:1017 + [] __fput+0x34e/0x910 fs/file_table.c:208 + [] ____fput+0x1a/0x20 fs/file_table.c:244 + [] task_work_run+0x1a0/0x280 kernel/task_work.c:116 + [< inline >] exit_task_work include/linux/task_work.h:21 + [] do_exit+0x183a/0x2640 kernel/exit.c:828 + [] do_group_exit+0x14e/0x420 kernel/exit.c:931 + [] get_signal+0x663/0x1880 kernel/signal.c:2307 + [] do_signal+0xc5/0x2190 arch/x86/kernel/signal.c:807 + [] exit_to_usermode_loop+0x1ea/0x2d0 + arch/x86/entry/common.c:156 + [< inline >] prepare_exit_to_usermode arch/x86/entry/common.c:190 + [] syscall_return_slowpath+0x4d3/0x570 + arch/x86/entry/common.c:259 + [] entry_SYSCALL_64_fastpath+0xc4/0xc6 + +Link: https://lkml.org/lkml/2017/3/6/252 +Signed-off-by: Andrey Ulanov +Reported-by: Dmitry Vyukov +Fixes: 6209344 ("net: unix: fix inflight counting bug 
in garbage collector") +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/unix/garbage.c | 17 +++++++++-------- + 1 file changed, 9 insertions(+), 8 deletions(-) + +--- a/net/unix/garbage.c ++++ b/net/unix/garbage.c +@@ -146,6 +146,7 @@ void unix_notinflight(struct user_struct + if (s) { + struct unix_sock *u = unix_sk(s); + ++ BUG_ON(!atomic_long_read(&u->inflight)); + BUG_ON(list_empty(&u->link)); + + if (atomic_long_dec_and_test(&u->inflight)) +@@ -341,6 +342,14 @@ void unix_gc(void) + } + list_del(&cursor); + ++ /* Now gc_candidates contains only garbage. Restore original ++ * inflight counters for these as well, and remove the skbuffs ++ * which are creating the cycle(s). ++ */ ++ skb_queue_head_init(&hitlist); ++ list_for_each_entry(u, &gc_candidates, link) ++ scan_children(&u->sk, inc_inflight, &hitlist); ++ + /* not_cycle_list contains those sockets which do not make up a + * cycle. Restore these to the inflight list. + */ +@@ -350,14 +359,6 @@ void unix_gc(void) + list_move_tail(&u->link, &gc_inflight_list); + } + +- /* Now gc_candidates contains only garbage. Restore original +- * inflight counters for these as well, and remove the skbuffs +- * which are creating the cycle(s). +- */ +- skb_queue_head_init(&hitlist); +- list_for_each_entry(u, &gc_candidates, link) +- scan_children(&u->sk, inc_inflight, &hitlist); +- + spin_unlock(&unix_gc_lock); + + /* Here we are. Hitlist is filled. Die. 
*/ diff --git a/queue-4.4/series b/queue-4.4/series new file mode 100644 index 00000000000..57f63b52232 --- /dev/null +++ b/queue-4.4/series @@ -0,0 +1,11 @@ +net-openvswitch-set-the-ipv6-source-tunnel-key-address-attribute-correctly.patch +net-bcmgenet-do-not-suspend-phy-if-wake-on-lan-is-enabled.patch +net-properly-release-sk_frag.page.patch +amd-xgbe-fix-jumbo-mtu-processing-on-newer-hardware.patch +net-unix-properly-re-increment-inflight-counter-of-gc-discarded-candidates.patch +net-mlx5-increase-number-of-max-qps-in-default-profile.patch +net-mlx5e-count-lro-packets-correctly.patch +net-bcmgenet-remove-bcmgenet_internal_phy_setup.patch +ipv4-provide-stronger-user-input-validation-in-nl_fib_input.patch +socket-bpf-fix-sk_filter-use-after-free-in-sk_clone_lock.patch +tcp-initialize-icsk_ack.lrcvtime-at-session-start-time.patch diff --git a/queue-4.4/socket-bpf-fix-sk_filter-use-after-free-in-sk_clone_lock.patch b/queue-4.4/socket-bpf-fix-sk_filter-use-after-free-in-sk_clone_lock.patch new file mode 100644 index 00000000000..ae4989a611b --- /dev/null +++ b/queue-4.4/socket-bpf-fix-sk_filter-use-after-free-in-sk_clone_lock.patch @@ -0,0 +1,65 @@ +From foo@baz Mon Mar 27 18:22:09 CEST 2017 +From: Daniel Borkmann +Date: Wed, 22 Mar 2017 13:08:08 +0100 +Subject: socket, bpf: fix sk_filter use after free in sk_clone_lock + +From: Daniel Borkmann + + +[ Upstream commit a97e50cc4cb67e1e7bff56f6b41cda62ca832336 ] + +In sk_clone_lock(), we create a new socket and inherit most of the +parent's members via sock_copy() which memcpy()'s various sections. +Now, in case the parent socket had a BPF socket filter attached, +then newsk->sk_filter points to the same instance as the original +sk->sk_filter. + +sk_filter_charge() is then called on the newsk->sk_filter to take a +reference and should that fail due to hitting max optmem, we bail +out and release the newsk instance. 
+ +The issue is that commit 278571baca2a ("net: filter: simplify socket +charging") wrongly combined the dismantle path with the failure path +of xfrm_sk_clone_policy(). This means, even when charging failed, we +call sk_free_unlock_clone() on the newsk, which then still points to +the same sk_filter as the original sk. + +Thus, sk_free_unlock_clone() calls into __sk_destruct() eventually +where it tests for present sk_filter and calls sk_filter_uncharge() +on it, which potentially lets sk_omem_alloc wrap around and releases +the eBPF prog and sk_filter structure from the (still intact) parent. + +Fix it by making sure that when sk_filter_charge() failed, we reset +newsk->sk_filter back to NULL before passing to sk_free_unlock_clone(), +so that we don't mess with the parent's sk_filter. + +Only if xfrm_sk_clone_policy() fails, we did reach the point where +either the parent's filter was NULL and as a result newsk's as well +or where we previously had a successful sk_filter_charge(), thus for +that case, we do need sk_filter_uncharge() to release the prior taken +reference on sk_filter. + +Fixes: 278571baca2a ("net: filter: simplify socket charging") +Signed-off-by: Daniel Borkmann +Acked-by: Alexei Starovoitov +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/core/sock.c | 6 ++++++ + 1 file changed, 6 insertions(+) + +--- a/net/core/sock.c ++++ b/net/core/sock.c +@@ -1557,6 +1557,12 @@ struct sock *sk_clone_lock(const struct + is_charged = sk_filter_charge(newsk, filter); + + if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) { ++ /* We need to make sure that we don't uncharge the new ++ * socket if we couldn't charge it in the first place ++ * as otherwise we uncharge the parent's filter.
++ */ ++ if (!is_charged) ++ RCU_INIT_POINTER(newsk->sk_filter, NULL); + /* It is still raw copy of parent, so invalidate + * destructor and make plain sk_free() */ + newsk->sk_destruct = NULL; diff --git a/queue-4.4/tcp-initialize-icsk_ack.lrcvtime-at-session-start-time.patch b/queue-4.4/tcp-initialize-icsk_ack.lrcvtime-at-session-start-time.patch new file mode 100644 index 00000000000..3535c109fc2 --- /dev/null +++ b/queue-4.4/tcp-initialize-icsk_ack.lrcvtime-at-session-start-time.patch @@ -0,0 +1,55 @@ +From foo@baz Mon Mar 27 18:22:09 CEST 2017 +From: Eric Dumazet +Date: Wed, 22 Mar 2017 08:10:21 -0700 +Subject: tcp: initialize icsk_ack.lrcvtime at session start time + +From: Eric Dumazet + + +[ Upstream commit 15bb7745e94a665caf42bfaabf0ce062845b533b ] + +icsk_ack.lrcvtime has a 0 value at socket creation time. + +tcpi_last_data_recv can have a bogus value if no payload is ever received. + +This patch initializes icsk_ack.lrcvtime for active sessions +in tcp_finish_connect(), and for passive sessions in +tcp_create_openreq_child(). + +Signed-off-by: Eric Dumazet +Acked-by: Neal Cardwell +Signed-off-by: David S.
Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv4/tcp_input.c | 2 +- + net/ipv4/tcp_minisocks.c | 1 + + 2 files changed, 2 insertions(+), 1 deletion(-) + +--- a/net/ipv4/tcp_input.c ++++ b/net/ipv4/tcp_input.c +@@ -5435,6 +5435,7 @@ void tcp_finish_connect(struct sock *sk, + struct inet_connection_sock *icsk = inet_csk(sk); + + tcp_set_state(sk, TCP_ESTABLISHED); ++ icsk->icsk_ack.lrcvtime = tcp_time_stamp; + + if (skb) { + icsk->icsk_af_ops->sk_rx_dst_set(sk, skb); +@@ -5647,7 +5648,6 @@ static int tcp_rcv_synsent_state_process + * to stand against the temptation 8) --ANK + */ + inet_csk_schedule_ack(sk); +- icsk->icsk_ack.lrcvtime = tcp_time_stamp; + tcp_enter_quickack_mode(sk); + inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, + TCP_DELACK_MAX, TCP_RTO_MAX); +--- a/net/ipv4/tcp_minisocks.c ++++ b/net/ipv4/tcp_minisocks.c +@@ -472,6 +472,7 @@ struct sock *tcp_create_openreq_child(co + newtp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); + newtp->rtt_min[0].rtt = ~0U; + newicsk->icsk_rto = TCP_TIMEOUT_INIT; ++ newicsk->icsk_ack.lrcvtime = tcp_time_stamp; + + newtp->packets_out = 0; + newtp->retrans_out = 0; -- 2.47.3