From: Greg Kroah-Hartman Date: Wed, 1 Aug 2018 06:20:13 +0000 (+0200) Subject: 4.17-stable patches X-Git-Tag: v4.17.12~2 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=61cf20af956c9ff6e962c8d9fd9b04f78093d0b0;p=thirdparty%2Fkernel%2Fstable-queue.git 4.17-stable patches added patches: cxgb4-added-missing-break-in-ndo_udp_tunnel_-add-del.patch ipv4-remove-bug_on-from-fib_compute_spec_dst.patch net-ena-fix-use-of-uninitialized-dma-address-bits-field.patch net-fix-amd-xgbe-flow-control-issue.patch net-lan78xx-fix-rx-handling-before-first-packet-is-send.patch net-mdio-mux-bcm-iproc-fix-wrong-getter-and-setter-pair.patch net-rollback-orig-value-on-failure-of-dev_qdisc_change_tx_queue_len.patch net-stmmac-align-dma-stuff-to-largest-cache-line-length.patch netdevsim-don-t-leak-devlink-resources.patch netlink-do-not-subscribe-to-non-existent-groups.patch netlink-don-t-shift-with-ub-on-nlk-ngroups.patch rds-rdma-fix-the-null-ptr-deref-in-rds_ib_get_mr.patch tcp-ack-immediately-when-a-cwr-packet-arrives.patch tcp-add-max_quickacks-param-to-tcp_incr_quickack-and-tcp_enter_quickack_mode.patch tcp-add-one-more-quick-ack-after-after-ecn-events.patch tcp-do-not-aggressively-quick-ack-after-ecn-events.patch tcp-do-not-force-quickack-when-receiving-out-of-order-packets.patch tcp-refactor-tcp_ecn_check_ce-to-remove-sk-type-cast.patch tcp_bbr-fix-bw-probing-to-raise-in-flight-data-for-very-small-bdps.patch virtio_net-fix-incosistent-received-bytes-counter.patch xen-netfront-wait-xenbus-state-change-when-load-module-manually.patch --- diff --git a/queue-4.17/cxgb4-added-missing-break-in-ndo_udp_tunnel_-add-del.patch b/queue-4.17/cxgb4-added-missing-break-in-ndo_udp_tunnel_-add-del.patch new file mode 100644 index 00000000000..d766ada1924 --- /dev/null +++ b/queue-4.17/cxgb4-added-missing-break-in-ndo_udp_tunnel_-add-del.patch @@ -0,0 +1,40 @@ +From foo@baz Wed Aug 1 08:19:18 CEST 2018 +From: Arjun Vynipadath +Date: Wed, 25 Jul 2018 19:39:52 +0530 +Subject: cxgb4: Added missing 
break in ndo_udp_tunnel_{add/del} + +From: Arjun Vynipadath + +[ Upstream commit 942a656f1f228f06a37adad0e6c347773cfe7bd6 ] + +Break statements were missing for Geneve case in +ndo_udp_tunnel_{add/del}, thereby raw mac matchall +entries were not getting added. + +Fixes: c746fc0e8b2d("cxgb4: add geneve offload support for T6") +Signed-off-by: Arjun Vynipadath +Signed-off-by: Ganesh Goudar +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c | 2 ++ + 1 file changed, 2 insertions(+) + +--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c ++++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c +@@ -3066,6 +3066,7 @@ static void cxgb_del_udp_tunnel(struct n + + adapter->geneve_port = 0; + t4_write_reg(adapter, MPS_RX_GENEVE_TYPE_A, 0); ++ break; + default: + return; + } +@@ -3151,6 +3152,7 @@ static void cxgb_add_udp_tunnel(struct n + + t4_write_reg(adapter, MPS_RX_GENEVE_TYPE_A, + GENEVE_V(be16_to_cpu(ti->port)) | GENEVE_EN_F); ++ break; + default: + return; + } diff --git a/queue-4.17/ipv4-remove-bug_on-from-fib_compute_spec_dst.patch b/queue-4.17/ipv4-remove-bug_on-from-fib_compute_spec_dst.patch new file mode 100644 index 00000000000..066c75d6a04 --- /dev/null +++ b/queue-4.17/ipv4-remove-bug_on-from-fib_compute_spec_dst.patch @@ -0,0 +1,48 @@ +From foo@baz Wed Aug 1 08:19:18 CEST 2018 +From: Lorenzo Bianconi +Date: Fri, 27 Jul 2018 18:15:46 +0200 +Subject: ipv4: remove BUG_ON() from fib_compute_spec_dst + +From: Lorenzo Bianconi + +[ Upstream commit 9fc12023d6f51551d6ca9ed7e02ecc19d79caf17 ] + +Remove BUG_ON() from fib_compute_spec_dst routine and check +in_dev pointer during flowi4 data structure initialization. +fib_compute_spec_dst routine can be run concurrently with device removal +where ip_ptr net_device pointer is set to NULL. 
This can happen +if userspace enables pkt info on UDP rx socket and the device +is removed while traffic is flowing + +Fixes: 35ebf65e851c ("ipv4: Create and use fib_compute_spec_dst() helper") +Signed-off-by: Lorenzo Bianconi +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv4/fib_frontend.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/net/ipv4/fib_frontend.c ++++ b/net/ipv4/fib_frontend.c +@@ -292,19 +292,19 @@ __be32 fib_compute_spec_dst(struct sk_bu + return ip_hdr(skb)->daddr; + + in_dev = __in_dev_get_rcu(dev); +- BUG_ON(!in_dev); + + net = dev_net(dev); + + scope = RT_SCOPE_UNIVERSE; + if (!ipv4_is_zeronet(ip_hdr(skb)->saddr)) { ++ bool vmark = in_dev && IN_DEV_SRC_VMARK(in_dev); + struct flowi4 fl4 = { + .flowi4_iif = LOOPBACK_IFINDEX, + .flowi4_oif = l3mdev_master_ifindex_rcu(dev), + .daddr = ip_hdr(skb)->saddr, + .flowi4_tos = RT_TOS(ip_hdr(skb)->tos), + .flowi4_scope = scope, +- .flowi4_mark = IN_DEV_SRC_VMARK(in_dev) ? skb->mark : 0, ++ .flowi4_mark = vmark ? skb->mark : 0, + }; + if (!fib_lookup(net, &fl4, &res, 0)) + return FIB_RES_PREFSRC(net, res); diff --git a/queue-4.17/net-ena-fix-use-of-uninitialized-dma-address-bits-field.patch b/queue-4.17/net-ena-fix-use-of-uninitialized-dma-address-bits-field.patch new file mode 100644 index 00000000000..5fadaa454ee --- /dev/null +++ b/queue-4.17/net-ena-fix-use-of-uninitialized-dma-address-bits-field.patch @@ -0,0 +1,44 @@ +From foo@baz Wed Aug 1 08:19:18 CEST 2018 +From: Gal Pressman +Date: Thu, 26 Jul 2018 23:40:33 +0300 +Subject: net: ena: Fix use of uninitialized DMA address bits field + +From: Gal Pressman + +[ Upstream commit 101f0cd4f2216d32f1b8a75a2154cf3997484ee2 ] + +UBSAN triggers the following undefined behaviour warnings: +[...] +[ 13.236124] UBSAN: Undefined behaviour in drivers/net/ethernet/amazon/ena/ena_eth_com.c:468:22 +[ 13.240043] shift exponent 64 is too large for 64-bit type 'long long unsigned int' +[...] 
+[ 13.744769] UBSAN: Undefined behaviour in drivers/net/ethernet/amazon/ena/ena_eth_com.c:373:4 +[ 13.748694] shift exponent 64 is too large for 64-bit type 'long long unsigned int' +[...] + +When splitting the address to high and low, GENMASK_ULL is used to generate +a bitmask with dma_addr_bits field from io_sq (in ena_com_prepare_tx and +ena_com_add_single_rx_desc). +The problem is that dma_addr_bits is not initialized with a proper value +(besides being cleared in ena_com_create_io_queue). +Assign dma_addr_bits the correct value that is stored in ena_dev when +initializing the SQ. + +Fixes: 1738cd3ed342 ("net: ena: Add a driver for Amazon Elastic Network Adapters (ENA)") +Signed-off-by: Gal Pressman +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/amazon/ena/ena_com.c | 1 + + 1 file changed, 1 insertion(+) + +--- a/drivers/net/ethernet/amazon/ena/ena_com.c ++++ b/drivers/net/ethernet/amazon/ena/ena_com.c +@@ -333,6 +333,7 @@ static int ena_com_init_io_sq(struct ena + + memset(&io_sq->desc_addr, 0x0, sizeof(io_sq->desc_addr)); + ++ io_sq->dma_addr_bits = ena_dev->dma_addr_bits; + io_sq->desc_entry_size = + (io_sq->direction == ENA_COM_IO_QUEUE_DIRECTION_TX) ? + sizeof(struct ena_eth_io_tx_desc) : diff --git a/queue-4.17/net-fix-amd-xgbe-flow-control-issue.patch b/queue-4.17/net-fix-amd-xgbe-flow-control-issue.patch new file mode 100644 index 00000000000..b3da99b6855 --- /dev/null +++ b/queue-4.17/net-fix-amd-xgbe-flow-control-issue.patch @@ -0,0 +1,42 @@ +From foo@baz Wed Aug 1 08:19:18 CEST 2018 +From: tangpengpeng +Date: Thu, 26 Jul 2018 14:45:16 +0800 +Subject: net: fix amd-xgbe flow-control issue + +From: tangpengpeng + +[ Upstream commit 7f3fc7ddf719cd6faaf787722c511f6918ac6aab ] + +If we enable or disable xgbe flow-control by ethtool, +it doesn't work. The parameter is not properly +assigned, so we need to adjust the assignment order +of the parameters. 
+ +Fixes: c1ce2f77366b ("amd-xgbe: Fix flow control setting logic") +Signed-off-by: tangpengpeng +Acked-by: Tom Lendacky +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/amd/xgbe/xgbe-mdio.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c ++++ b/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c +@@ -1128,14 +1128,14 @@ static void xgbe_phy_adjust_link(struct + + if (pdata->tx_pause != pdata->phy.tx_pause) { + new_state = 1; +- pdata->hw_if.config_tx_flow_control(pdata); + pdata->tx_pause = pdata->phy.tx_pause; ++ pdata->hw_if.config_tx_flow_control(pdata); + } + + if (pdata->rx_pause != pdata->phy.rx_pause) { + new_state = 1; +- pdata->hw_if.config_rx_flow_control(pdata); + pdata->rx_pause = pdata->phy.rx_pause; ++ pdata->hw_if.config_rx_flow_control(pdata); + } + + /* Speed support */ diff --git a/queue-4.17/net-lan78xx-fix-rx-handling-before-first-packet-is-send.patch b/queue-4.17/net-lan78xx-fix-rx-handling-before-first-packet-is-send.patch new file mode 100644 index 00000000000..bd1fcbb04b0 --- /dev/null +++ b/queue-4.17/net-lan78xx-fix-rx-handling-before-first-packet-is-send.patch @@ -0,0 +1,35 @@ +From foo@baz Wed Aug 1 08:19:18 CEST 2018 +From: Stefan Wahren +Date: Sat, 28 Jul 2018 09:52:10 +0200 +Subject: net: lan78xx: fix rx handling before first packet is send + +From: Stefan Wahren + +[ Upstream commit 136f55f660192ce04af091642efc75d85e017364 ] + +As long as the bh tasklet isn't scheduled once, no packet from the rx path +will be handled. Since the tx path also schedules the same tasklet, +this situation only persists until the first packet transmission. +So fix this issue by scheduling the tasklet after link reset. + +Link: https://github.com/raspberrypi/linux/issues/2617 +Fixes: 55d7de9de6c3 ("Microchip's LAN7800 family USB 2/3 to 10/100/1000 Ethernet") +Suggested-by: Floris Bos +Signed-off-by: Stefan Wahren +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/usb/lan78xx.c | 2 ++ + 1 file changed, 2 insertions(+) + +--- a/drivers/net/usb/lan78xx.c ++++ b/drivers/net/usb/lan78xx.c +@@ -1216,6 +1216,8 @@ static int lan78xx_link_reset(struct lan + mod_timer(&dev->stat_monitor, + jiffies + STAT_UPDATE_TIMER); + } ++ ++ tasklet_schedule(&dev->bh); + } + + return ret; diff --git a/queue-4.17/net-mdio-mux-bcm-iproc-fix-wrong-getter-and-setter-pair.patch b/queue-4.17/net-mdio-mux-bcm-iproc-fix-wrong-getter-and-setter-pair.patch new file mode 100644 index 00000000000..d20204f51bc --- /dev/null +++ b/queue-4.17/net-mdio-mux-bcm-iproc-fix-wrong-getter-and-setter-pair.patch @@ -0,0 +1,35 @@ +From foo@baz Wed Aug 1 08:19:18 CEST 2018 +From: Anton Vasilyev +Date: Fri, 27 Jul 2018 18:57:47 +0300 +Subject: net: mdio-mux: bcm-iproc: fix wrong getter and setter pair + +From: Anton Vasilyev + +[ Upstream commit b0753408aadf32c7ece9e6b765017881e54af833 ] + +mdio_mux_iproc_probe() uses platform_set_drvdata() to store md pointer +in device, whereas mdio_mux_iproc_remove() restores md pointer by +dev_get_platdata(&pdev->dev). This leads to wrong resources release. + +The patch replaces getter to platform_get_drvdata. + +Fixes: 98bc865a1ec8 ("net: mdio-mux: Add MDIO mux driver for iProc SoCs") +Signed-off-by: Anton Vasilyev +Reviewed-by: Andrew Lunn +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/phy/mdio-mux-bcm-iproc.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/drivers/net/phy/mdio-mux-bcm-iproc.c ++++ b/drivers/net/phy/mdio-mux-bcm-iproc.c +@@ -218,7 +218,7 @@ out: + + static int mdio_mux_iproc_remove(struct platform_device *pdev) + { +- struct iproc_mdiomux_desc *md = dev_get_platdata(&pdev->dev); ++ struct iproc_mdiomux_desc *md = platform_get_drvdata(pdev); + + mdio_mux_uninit(md->mux_handle); + mdiobus_unregister(md->mii_bus); diff --git a/queue-4.17/net-rollback-orig-value-on-failure-of-dev_qdisc_change_tx_queue_len.patch b/queue-4.17/net-rollback-orig-value-on-failure-of-dev_qdisc_change_tx_queue_len.patch new file mode 100644 index 00000000000..f364cc754d9 --- /dev/null +++ b/queue-4.17/net-rollback-orig-value-on-failure-of-dev_qdisc_change_tx_queue_len.patch @@ -0,0 +1,60 @@ +From foo@baz Wed Aug 1 08:19:18 CEST 2018 +From: Tariq Toukan +Date: Tue, 24 Jul 2018 14:12:20 +0300 +Subject: net: rollback orig value on failure of dev_qdisc_change_tx_queue_len + +From: Tariq Toukan + +[ Upstream commit 7effaf06c3cdef6855e127886c7405b9ab62f90d ] + +Fix dev_change_tx_queue_len so it rolls back original value +upon a failure in dev_qdisc_change_tx_queue_len. +This is already done for notifiers' failures, share the code. + +In case of failure in dev_qdisc_change_tx_queue_len, some tx queues +would still be of the new length, while they should be reverted. +Currently, the revert is not done, and is marked with a TODO label +in dev_qdisc_change_tx_queue_len, and should find some nice solution +to do it. +Yet it is still better to not apply the newly requested value. + +Fixes: 48bfd55e7e41 ("net_sched: plug in qdisc ops change_tx_queue_len") +Signed-off-by: Tariq Toukan +Reviewed-by: Eran Ben Elisha +Reported-by: Ran Rozenstein +Cc: Cong Wang +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/core/dev.c | 17 ++++++++++------- + 1 file changed, 10 insertions(+), 7 deletions(-) + +--- a/net/core/dev.c ++++ b/net/core/dev.c +@@ -7113,16 +7113,19 @@ int dev_change_tx_queue_len(struct net_d + dev->tx_queue_len = new_len; + res = call_netdevice_notifiers(NETDEV_CHANGE_TX_QUEUE_LEN, dev); + res = notifier_to_errno(res); +- if (res) { +- netdev_err(dev, +- "refused to change device tx_queue_len\n"); +- dev->tx_queue_len = orig_len; +- return res; +- } +- return dev_qdisc_change_tx_queue_len(dev); ++ if (res) ++ goto err_rollback; ++ res = dev_qdisc_change_tx_queue_len(dev); ++ if (res) ++ goto err_rollback; + } + + return 0; ++ ++err_rollback: ++ netdev_err(dev, "refused to change device tx_queue_len\n"); ++ dev->tx_queue_len = orig_len; ++ return res; + } + + /** diff --git a/queue-4.17/net-stmmac-align-dma-stuff-to-largest-cache-line-length.patch b/queue-4.17/net-stmmac-align-dma-stuff-to-largest-cache-line-length.patch new file mode 100644 index 00000000000..ee769b8165d --- /dev/null +++ b/queue-4.17/net-stmmac-align-dma-stuff-to-largest-cache-line-length.patch @@ -0,0 +1,38 @@ +From foo@baz Wed Aug 1 08:19:18 CEST 2018 +From: Eugeniy Paltsev +Date: Thu, 26 Jul 2018 15:05:37 +0300 +Subject: NET: stmmac: align DMA stuff to largest cache line length + +From: Eugeniy Paltsev + +[ Upstream commit 9939a46d90c6c76f4533d534dbadfa7b39dc6acc ] + +As for today STMMAC_ALIGN macro (which is used to align DMA stuff) +relies on L1 line length (L1_CACHE_BYTES). +This isn't correct in case of system with several cache levels +which might have L1 cache line length smaller than L2 line. This +can lead to sharing one cache line between DMA buffer and other +data, so we can lose this data while invalidate DMA buffer before +DMA transaction. + +Fix that by using SMP_CACHE_BYTES instead of L1_CACHE_BYTES for +aligning. + +Signed-off-by: Eugeniy Paltsev +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c ++++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +@@ -51,7 +51,7 @@ + #include + #include "dwmac1000.h" + +-#define STMMAC_ALIGN(x) L1_CACHE_ALIGN(x) ++#define STMMAC_ALIGN(x) __ALIGN_KERNEL(x, SMP_CACHE_BYTES) + #define TSO_MAX_BUFF_SIZE (SZ_16K - 1) + + /* Module parameters */ diff --git a/queue-4.17/netdevsim-don-t-leak-devlink-resources.patch b/queue-4.17/netdevsim-don-t-leak-devlink-resources.patch new file mode 100644 index 00000000000..111c0a885a8 --- /dev/null +++ b/queue-4.17/netdevsim-don-t-leak-devlink-resources.patch @@ -0,0 +1,31 @@ +From foo@baz Wed Aug 1 08:19:18 CEST 2018 +From: Jakub Kicinski +Date: Wed, 25 Jul 2018 15:39:27 -0700 +Subject: netdevsim: don't leak devlink resources + +From: Jakub Kicinski + +[ Upstream commit c259b4fb33ee6e7667bf1d34bf0803b7c5fdbdce ] + +Devlink resources registered with devlink_resource_register() have +to be unregistered. + +Fixes: 37923ed6b8ce ("netdevsim: Add simple FIB resource controller via devlink") +Signed-off-by: Jakub Kicinski +Reviewed-by: Quentin Monnet +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/netdevsim/devlink.c | 1 + + 1 file changed, 1 insertion(+) + +--- a/drivers/net/netdevsim/devlink.c ++++ b/drivers/net/netdevsim/devlink.c +@@ -206,6 +206,7 @@ void nsim_devlink_teardown(struct netdev + struct net *net = nsim_to_net(ns); + bool *reg_devlink = net_generic(net, nsim_devlink_id); + ++ devlink_resources_unregister(ns->devlink, NULL); + devlink_unregister(ns->devlink); + devlink_free(ns->devlink); + ns->devlink = NULL; diff --git a/queue-4.17/netlink-do-not-subscribe-to-non-existent-groups.patch b/queue-4.17/netlink-do-not-subscribe-to-non-existent-groups.patch new file mode 100644 index 00000000000..2a61d9815c2 --- /dev/null +++ b/queue-4.17/netlink-do-not-subscribe-to-non-existent-groups.patch @@ -0,0 +1,35 @@ +From foo@baz Wed Aug 1 08:19:18 CEST 2018 +From: Dmitry Safonov +Date: Fri, 27 Jul 2018 16:54:44 +0100 +Subject: netlink: Do not subscribe to non-existent groups + +From: Dmitry Safonov + +[ Upstream commit 7acf9d4237c46894e0fa0492dd96314a41742e84 ] + +Make ABI more strict about subscribing to group > ngroups. +Code doesn't check for that and it looks bogus. +(one can subscribe to non-existing group) +Still, it's possible to bind() to all possible groups with (-1) + +Cc: "David S. Miller" +Cc: Herbert Xu +Cc: Steffen Klassert +Cc: netdev@vger.kernel.org +Signed-off-by: Dmitry Safonov +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/netlink/af_netlink.c | 1 + + 1 file changed, 1 insertion(+) + +--- a/net/netlink/af_netlink.c ++++ b/net/netlink/af_netlink.c +@@ -1008,6 +1008,7 @@ static int netlink_bind(struct socket *s + if (err) + return err; + } ++ groups &= (1UL << nlk->ngroups) - 1; + + bound = nlk->bound; + if (bound) { diff --git a/queue-4.17/netlink-don-t-shift-with-ub-on-nlk-ngroups.patch b/queue-4.17/netlink-don-t-shift-with-ub-on-nlk-ngroups.patch new file mode 100644 index 00000000000..e618c4cbb3d --- /dev/null +++ b/queue-4.17/netlink-don-t-shift-with-ub-on-nlk-ngroups.patch @@ -0,0 +1,37 @@ +From foo@baz Wed Aug 1 08:19:18 CEST 2018 +From: Dmitry Safonov +Date: Mon, 30 Jul 2018 18:32:36 +0100 +Subject: netlink: Don't shift with UB on nlk->ngroups + +From: Dmitry Safonov + +[ Upstream commit 61f4b23769f0cc72ae62c9a81cf08f0397d40da8 ] + +On i386 nlk->ngroups might be 32 or 0. Which leads to UB, resulting in +hang during boot. +Check for 0 ngroups and use (unsigned long long) as a type to shift. + +Fixes: 7acf9d4237c4 ("netlink: Do not subscribe to non-existent groups"). +Reported-by: kernel test robot +Signed-off-by: Dmitry Safonov +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/netlink/af_netlink.c | 6 +++++- + 1 file changed, 5 insertions(+), 1 deletion(-) + +--- a/net/netlink/af_netlink.c ++++ b/net/netlink/af_netlink.c +@@ -1008,7 +1008,11 @@ static int netlink_bind(struct socket *s + if (err) + return err; + } +- groups &= (1UL << nlk->ngroups) - 1; ++ ++ if (nlk->ngroups == 0) ++ groups = 0; ++ else ++ groups &= (1ULL << nlk->ngroups) - 1; + + bound = nlk->bound; + if (bound) { diff --git a/queue-4.17/rds-rdma-fix-the-null-ptr-deref-in-rds_ib_get_mr.patch b/queue-4.17/rds-rdma-fix-the-null-ptr-deref-in-rds_ib_get_mr.patch new file mode 100644 index 00000000000..64a431ff62f --- /dev/null +++ b/queue-4.17/rds-rdma-fix-the-null-ptr-deref-in-rds_ib_get_mr.patch @@ -0,0 +1,255 @@ +From foo@baz Wed Aug 1 08:19:18 CEST 2018 +From: Avinash Repaka +Date: Tue, 24 Jul 2018 20:31:58 -0700 +Subject: RDS: RDMA: Fix the NULL-ptr deref in rds_ib_get_mr + +From: Avinash Repaka + +[ Upstream commit 9e630bcb7701f94dbd729fe57d37c089c763ad9f ] + +Registration of a memory region(MR) through FRMR/fastreg(unlike FMR) +needs a connection/qp. With a proxy qp, this dependency on connection +will be removed, but that needs more infrastructure patches, which is a +work in progress. + +As an intermediate fix, the get_mr returns EOPNOTSUPP when connection +details are not populated. The MR registration through sendmsg() will +continue to work even with fast registration, since connection in this +case is formed upfront. 
+ +This patch fixes the following crash: +kasan: GPF could be caused by NULL-ptr deref or user memory access +general protection fault: 0000 [#1] SMP KASAN +Modules linked in: +CPU: 1 PID: 4244 Comm: syzkaller468044 Not tainted 4.16.0-rc6+ #361 +Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS +Google 01/01/2011 +RIP: 0010:rds_ib_get_mr+0x5c/0x230 net/rds/ib_rdma.c:544 +RSP: 0018:ffff8801b059f890 EFLAGS: 00010202 +RAX: dffffc0000000000 RBX: ffff8801b07e1300 RCX: ffffffff8562d96e +RDX: 000000000000000d RSI: 0000000000000001 RDI: 0000000000000068 +RBP: ffff8801b059f8b8 R08: ffffed0036274244 R09: ffff8801b13a1200 +R10: 0000000000000004 R11: ffffed0036274243 R12: ffff8801b13a1200 +R13: 0000000000000001 R14: ffff8801ca09fa9c R15: 0000000000000000 +FS: 00007f4d050af700(0000) GS:ffff8801db300000(0000) +knlGS:0000000000000000 +CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 +CR2: 00007f4d050aee78 CR3: 00000001b0d9b006 CR4: 00000000001606e0 +DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 +DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 +Call Trace: + __rds_rdma_map+0x710/0x1050 net/rds/rdma.c:271 + rds_get_mr_for_dest+0x1d4/0x2c0 net/rds/rdma.c:357 + rds_setsockopt+0x6cc/0x980 net/rds/af_rds.c:347 + SYSC_setsockopt net/socket.c:1849 [inline] + SyS_setsockopt+0x189/0x360 net/socket.c:1828 + do_syscall_64+0x281/0x940 arch/x86/entry/common.c:287 + entry_SYSCALL_64_after_hwframe+0x42/0xb7 +RIP: 0033:0x4456d9 +RSP: 002b:00007f4d050aedb8 EFLAGS: 00000246 ORIG_RAX: 0000000000000036 +RAX: ffffffffffffffda RBX: 00000000006dac3c RCX: 00000000004456d9 +RDX: 0000000000000007 RSI: 0000000000000114 RDI: 0000000000000004 +RBP: 00000000006dac38 R08: 00000000000000a0 R09: 0000000000000000 +R10: 0000000020000380 R11: 0000000000000246 R12: 0000000000000000 +R13: 00007fffbfb36d6f R14: 00007f4d050af9c0 R15: 0000000000000005 +Code: fa 48 c1 ea 03 80 3c 02 00 0f 85 cc 01 00 00 4c 8b bb 80 04 00 00 +48 +b8 00 00 00 00 00 fc ff df 49 8d 
7f 68 48 89 fa 48 c1 ea 03 <80> 3c 02 +00 0f +85 9c 01 00 00 4d 8b 7f 68 48 b8 00 00 00 00 00 +RIP: rds_ib_get_mr+0x5c/0x230 net/rds/ib_rdma.c:544 RSP: +ffff8801b059f890 +---[ end trace 7e1cea13b85473b0 ]--- + +Reported-by: syzbot+b51c77ef956678a65834@syzkaller.appspotmail.com +Signed-off-by: Santosh Shilimkar +Signed-off-by: Avinash Repaka + +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/rds/ib_frmr.c | 5 +++++ + net/rds/ib_mr.h | 3 ++- + net/rds/ib_rdma.c | 21 +++++++++++++-------- + net/rds/rdma.c | 13 ++++++++----- + net/rds/rds.h | 5 ++++- + net/rds/send.c | 12 +++++++----- + 6 files changed, 39 insertions(+), 20 deletions(-) + +--- a/net/rds/ib_frmr.c ++++ b/net/rds/ib_frmr.c +@@ -344,6 +344,11 @@ struct rds_ib_mr *rds_ib_reg_frmr(struct + struct rds_ib_frmr *frmr; + int ret; + ++ if (!ic) { ++ /* TODO: Add FRWR support for RDS_GET_MR using proxy qp*/ ++ return ERR_PTR(-EOPNOTSUPP); ++ } ++ + do { + if (ibmr) + rds_ib_free_frmr(ibmr, true); +--- a/net/rds/ib_mr.h ++++ b/net/rds/ib_mr.h +@@ -115,7 +115,8 @@ void rds_ib_get_mr_info(struct rds_ib_de + struct rds_info_rdma_connection *iinfo); + void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *); + void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, +- struct rds_sock *rs, u32 *key_ret); ++ struct rds_sock *rs, u32 *key_ret, ++ struct rds_connection *conn); + void rds_ib_sync_mr(void *trans_private, int dir); + void rds_ib_free_mr(void *trans_private, int invalidate); + void rds_ib_flush_mrs(void); +--- a/net/rds/ib_rdma.c ++++ b/net/rds/ib_rdma.c +@@ -537,11 +537,12 @@ void rds_ib_flush_mrs(void) + } + + void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, +- struct rds_sock *rs, u32 *key_ret) ++ struct rds_sock *rs, u32 *key_ret, ++ struct rds_connection *conn) + { + struct rds_ib_device *rds_ibdev; + struct rds_ib_mr *ibmr = NULL; +- struct rds_ib_connection *ic = rs->rs_conn->c_transport_data; ++ struct rds_ib_connection *ic = NULL; + int ret; + + 
rds_ibdev = rds_ib_get_device(rs->rs_bound_addr); +@@ -550,6 +551,9 @@ void *rds_ib_get_mr(struct scatterlist * + goto out; + } + ++ if (conn) ++ ic = conn->c_transport_data; ++ + if (!rds_ibdev->mr_8k_pool || !rds_ibdev->mr_1m_pool) { + ret = -ENODEV; + goto out; +@@ -559,17 +563,18 @@ void *rds_ib_get_mr(struct scatterlist * + ibmr = rds_ib_reg_frmr(rds_ibdev, ic, sg, nents, key_ret); + else + ibmr = rds_ib_reg_fmr(rds_ibdev, sg, nents, key_ret); +- if (ibmr) +- rds_ibdev = NULL; +- +- out: +- if (!ibmr) ++ if (IS_ERR(ibmr)) { ++ ret = PTR_ERR(ibmr); + pr_warn("RDS/IB: rds_ib_get_mr failed (errno=%d)\n", ret); ++ } else { ++ return ibmr; ++ } + ++ out: + if (rds_ibdev) + rds_ib_dev_put(rds_ibdev); + +- return ibmr; ++ return ERR_PTR(ret); + } + + void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *pool) +--- a/net/rds/rdma.c ++++ b/net/rds/rdma.c +@@ -170,7 +170,8 @@ static int rds_pin_pages(unsigned long u + } + + static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args, +- u64 *cookie_ret, struct rds_mr **mr_ret) ++ u64 *cookie_ret, struct rds_mr **mr_ret, ++ struct rds_conn_path *cp) + { + struct rds_mr *mr = NULL, *found; + unsigned int nr_pages; +@@ -269,7 +270,8 @@ static int __rds_rdma_map(struct rds_soc + * Note that dma_map() implies that pending writes are + * flushed to RAM, so no dma_sync is needed here. */ + trans_private = rs->rs_transport->get_mr(sg, nents, rs, +- &mr->r_key); ++ &mr->r_key, ++ cp ? 
cp->cp_conn : NULL); + + if (IS_ERR(trans_private)) { + for (i = 0 ; i < nents; i++) +@@ -330,7 +332,7 @@ int rds_get_mr(struct rds_sock *rs, char + sizeof(struct rds_get_mr_args))) + return -EFAULT; + +- return __rds_rdma_map(rs, &args, NULL, NULL); ++ return __rds_rdma_map(rs, &args, NULL, NULL, NULL); + } + + int rds_get_mr_for_dest(struct rds_sock *rs, char __user *optval, int optlen) +@@ -354,7 +356,7 @@ int rds_get_mr_for_dest(struct rds_sock + new_args.cookie_addr = args.cookie_addr; + new_args.flags = args.flags; + +- return __rds_rdma_map(rs, &new_args, NULL, NULL); ++ return __rds_rdma_map(rs, &new_args, NULL, NULL, NULL); + } + + /* +@@ -782,7 +784,8 @@ int rds_cmsg_rdma_map(struct rds_sock *r + rm->m_rdma_cookie != 0) + return -EINVAL; + +- return __rds_rdma_map(rs, CMSG_DATA(cmsg), &rm->m_rdma_cookie, &rm->rdma.op_rdma_mr); ++ return __rds_rdma_map(rs, CMSG_DATA(cmsg), &rm->m_rdma_cookie, ++ &rm->rdma.op_rdma_mr, rm->m_conn_path); + } + + /* +--- a/net/rds/rds.h ++++ b/net/rds/rds.h +@@ -464,6 +464,8 @@ struct rds_message { + struct scatterlist *op_sg; + } data; + }; ++ ++ struct rds_conn_path *m_conn_path; + }; + + /* +@@ -544,7 +546,8 @@ struct rds_transport { + unsigned int avail); + void (*exit)(void); + void *(*get_mr)(struct scatterlist *sg, unsigned long nr_sg, +- struct rds_sock *rs, u32 *key_ret); ++ struct rds_sock *rs, u32 *key_ret, ++ struct rds_connection *conn); + void (*sync_mr)(void *trans_private, int direction); + void (*free_mr)(void *trans_private, int invalidate); + void (*flush_mrs)(void); +--- a/net/rds/send.c ++++ b/net/rds/send.c +@@ -1169,6 +1169,13 @@ int rds_sendmsg(struct socket *sock, str + rs->rs_conn = conn; + } + ++ if (conn->c_trans->t_mp_capable) ++ cpath = &conn->c_path[rds_send_mprds_hash(rs, conn)]; ++ else ++ cpath = &conn->c_path[0]; ++ ++ rm->m_conn_path = cpath; ++ + /* Parse any control messages the user may have included. 
*/ + ret = rds_cmsg_send(rs, rm, msg, &allocated_mr); + if (ret) { +@@ -1192,11 +1199,6 @@ int rds_sendmsg(struct socket *sock, str + goto out; + } + +- if (conn->c_trans->t_mp_capable) +- cpath = &conn->c_path[rds_send_mprds_hash(rs, conn)]; +- else +- cpath = &conn->c_path[0]; +- + if (rds_destroy_pending(conn)) { + ret = -EAGAIN; + goto out; diff --git a/queue-4.17/series b/queue-4.17/series index 7d2c73685a4..d3e2ea6243e 100644 --- a/queue-4.17/series +++ b/queue-4.17/series @@ -311,3 +311,24 @@ net-dsa-qca8k-force-cpu-port-to-its-highest-bandwidth.patch net-dsa-qca8k-enable-rxmac-when-bringing-up-a-port.patch net-dsa-qca8k-add-qca8334-binding-documentation.patch net-dsa-qca8k-allow-overwriting-cpu-port-setting.patch +ipv4-remove-bug_on-from-fib_compute_spec_dst.patch +netdevsim-don-t-leak-devlink-resources.patch +net-ena-fix-use-of-uninitialized-dma-address-bits-field.patch +net-fix-amd-xgbe-flow-control-issue.patch +net-lan78xx-fix-rx-handling-before-first-packet-is-send.patch +net-mdio-mux-bcm-iproc-fix-wrong-getter-and-setter-pair.patch +net-stmmac-align-dma-stuff-to-largest-cache-line-length.patch +rds-rdma-fix-the-null-ptr-deref-in-rds_ib_get_mr.patch +tcp_bbr-fix-bw-probing-to-raise-in-flight-data-for-very-small-bdps.patch +virtio_net-fix-incosistent-received-bytes-counter.patch +xen-netfront-wait-xenbus-state-change-when-load-module-manually.patch +cxgb4-added-missing-break-in-ndo_udp_tunnel_-add-del.patch +net-rollback-orig-value-on-failure-of-dev_qdisc_change_tx_queue_len.patch +netlink-do-not-subscribe-to-non-existent-groups.patch +netlink-don-t-shift-with-ub-on-nlk-ngroups.patch +tcp-do-not-force-quickack-when-receiving-out-of-order-packets.patch +tcp-add-max_quickacks-param-to-tcp_incr_quickack-and-tcp_enter_quickack_mode.patch +tcp-do-not-aggressively-quick-ack-after-ecn-events.patch +tcp-refactor-tcp_ecn_check_ce-to-remove-sk-type-cast.patch +tcp-add-one-more-quick-ack-after-after-ecn-events.patch 
+tcp-ack-immediately-when-a-cwr-packet-arrives.patch diff --git a/queue-4.17/tcp-ack-immediately-when-a-cwr-packet-arrives.patch b/queue-4.17/tcp-ack-immediately-when-a-cwr-packet-arrives.patch new file mode 100644 index 00000000000..84f15f45087 --- /dev/null +++ b/queue-4.17/tcp-ack-immediately-when-a-cwr-packet-arrives.patch @@ -0,0 +1,92 @@ +From foo@baz Wed Aug 1 08:19:18 CEST 2018 +From: Lawrence Brakmo +Date: Mon, 23 Jul 2018 17:49:39 -0700 +Subject: tcp: ack immediately when a cwr packet arrives + +From: Lawrence Brakmo + +[ Upstream commit 9aee40006190a3cda9a4d2dbae71e92617c8c362 ] + +We observed high 99 and 99.9% latencies when doing RPCs with DCTCP. The +problem is triggered when the last packet of a request arrives CE +marked. The reply will carry the ECE mark causing TCP to shrink its cwnd +to 1 (because there are no packets in flight). When the 1st packet of +the next request arrives, the ACK was sometimes delayed even though it +is CWR marked, adding up to 40ms to the RPC latency. + +This patch ensures that CWR marked data packets arriving will be acked +immediately. + +Packetdrill script to reproduce the problem: + +0.000 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 +0.000 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 +0.000 setsockopt(3, SOL_TCP, TCP_CONGESTION, "dctcp", 5) = 0 +0.000 bind(3, ..., ...) = 0 +0.000 listen(3, 1) = 0 + +0.100 < [ect0] SEW 0:0(0) win 32792 +0.100 > SE. 0:0(0) ack 1 +0.110 < [ect0] . 1:1(0) ack 1 win 257 +0.200 accept(3, ..., ...) = 4 + +0.200 < [ect0] . 1:1001(1000) ack 1 win 257 +0.200 > [ect01] . 1:1(0) ack 1001 + +0.200 write(4, ..., 1) = 1 +0.200 > [ect01] P. 1:2(1) ack 1001 + +0.200 < [ect0] . 1001:2001(1000) ack 2 win 257 +0.200 write(4, ..., 1) = 1 +0.200 > [ect01] P. 2:3(1) ack 2001 + +0.200 < [ect0] . 2001:3001(1000) ack 3 win 257 +0.200 < [ect0] . 3001:4001(1000) ack 3 win 257 +0.200 > [ect01] . 3:3(0) ack 4001 + +0.210 < [ce] P. 
4001:4501(500) ack 3 win 257 + ++0.001 read(4, ..., 4500) = 4500 ++0 write(4, ..., 1) = 1 ++0 > [ect01] PE. 3:4(1) ack 4501 + ++0.010 < [ect0] W. 4501:5501(1000) ack 4 win 257 +// Previously the ACK sequence below would be 4501, causing a long RTO ++0.040~+0.045 > [ect01] . 4:4(0) ack 5501 // delayed ack + ++0.311 < [ect0] . 5501:6501(1000) ack 4 win 257 // More data ++0 > [ect01] . 4:4(0) ack 6501 // now acks everything + ++0.500 < F. 9501:9501(0) ack 4 win 257 + +Modified based on comments by Neal Cardwell + +Signed-off-by: Lawrence Brakmo +Acked-by: Neal Cardwell +Acked-by: Yuchung Cheng +Signed-off-by: Eric Dumazet +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv4/tcp_input.c | 9 ++++++++- + 1 file changed, 8 insertions(+), 1 deletion(-) + +--- a/net/ipv4/tcp_input.c ++++ b/net/ipv4/tcp_input.c +@@ -227,8 +227,15 @@ static void tcp_ecn_queue_cwr(struct tcp + + static void tcp_ecn_accept_cwr(struct tcp_sock *tp, const struct sk_buff *skb) + { +- if (tcp_hdr(skb)->cwr) ++ if (tcp_hdr(skb)->cwr) { + tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR; ++ ++ /* If the sender is telling us it has entered CWR, then its ++ * cwnd may be very low (even just 1 packet), so we should ACK ++ * immediately. 
++ */ ++ tcp_enter_quickack_mode((struct sock *)tp, 2); ++ } + } + + static void tcp_ecn_withdraw_cwr(struct tcp_sock *tp) diff --git a/queue-4.17/tcp-add-max_quickacks-param-to-tcp_incr_quickack-and-tcp_enter_quickack_mode.patch b/queue-4.17/tcp-add-max_quickacks-param-to-tcp_incr_quickack-and-tcp_enter_quickack_mode.patch new file mode 100644 index 00000000000..98ccb97ed0f --- /dev/null +++ b/queue-4.17/tcp-add-max_quickacks-param-to-tcp_incr_quickack-and-tcp_enter_quickack_mode.patch @@ -0,0 +1,150 @@ +From foo@baz Wed Aug 1 08:19:18 CEST 2018 +From: Eric Dumazet +Date: Mon, 21 May 2018 15:08:56 -0700 +Subject: tcp: add max_quickacks param to tcp_incr_quickack and tcp_enter_quickack_mode + +From: Eric Dumazet + +[ Upstream commit 9a9c9b51e54618861420093ae6e9b50a961914c5 ] + +We want to add finer control of the number of ACK packets sent after +ECN events. + +This patch is not changing current behavior, it only enables following +change. + +Signed-off-by: Eric Dumazet +Acked-by: Soheil Hassas Yeganeh +Acked-by: Neal Cardwell +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + include/net/tcp.h | 2 +- + net/ipv4/tcp_dctcp.c | 4 ++-- + net/ipv4/tcp_input.c | 24 +++++++++++++----------- + 3 files changed, 16 insertions(+), 14 deletions(-) + +--- a/include/net/tcp.h ++++ b/include/net/tcp.h +@@ -342,7 +342,7 @@ ssize_t tcp_splice_read(struct socket *s + struct pipe_inode_info *pipe, size_t len, + unsigned int flags); + +-void tcp_enter_quickack_mode(struct sock *sk); ++void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks); + static inline void tcp_dec_quickack_mode(struct sock *sk, + const unsigned int pkts) + { +--- a/net/ipv4/tcp_dctcp.c ++++ b/net/ipv4/tcp_dctcp.c +@@ -138,7 +138,7 @@ static void dctcp_ce_state_0_to_1(struct + */ + if (inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER) + __tcp_send_ack(sk, ca->prior_rcv_nxt); +- tcp_enter_quickack_mode(sk); ++ tcp_enter_quickack_mode(sk, 1); + } + + ca->prior_rcv_nxt = tp->rcv_nxt; +@@ -159,7 +159,7 @@ static void dctcp_ce_state_1_to_0(struct + */ + if (inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER) + __tcp_send_ack(sk, ca->prior_rcv_nxt); +- tcp_enter_quickack_mode(sk); ++ tcp_enter_quickack_mode(sk, 1); + } + + ca->prior_rcv_nxt = tp->rcv_nxt; +--- a/net/ipv4/tcp_input.c ++++ b/net/ipv4/tcp_input.c +@@ -184,21 +184,23 @@ static void tcp_measure_rcv_mss(struct s + } + } + +-static void tcp_incr_quickack(struct sock *sk) ++static void tcp_incr_quickack(struct sock *sk, unsigned int max_quickacks) + { + struct inet_connection_sock *icsk = inet_csk(sk); + unsigned int quickacks = tcp_sk(sk)->rcv_wnd / (2 * icsk->icsk_ack.rcv_mss); + + if (quickacks == 0) + quickacks = 2; ++ quickacks = min(quickacks, max_quickacks); + if (quickacks > icsk->icsk_ack.quick) +- icsk->icsk_ack.quick = min(quickacks, TCP_MAX_QUICKACKS); ++ icsk->icsk_ack.quick = quickacks; + } + +-void tcp_enter_quickack_mode(struct sock *sk) ++void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks) + { + struct inet_connection_sock *icsk = 
inet_csk(sk); +- tcp_incr_quickack(sk); ++ ++ tcp_incr_quickack(sk, max_quickacks); + icsk->icsk_ack.pingpong = 0; + icsk->icsk_ack.ato = TCP_ATO_MIN; + } +@@ -243,7 +245,7 @@ static void __tcp_ecn_check_ce(struct tc + * it is probably a retransmit. + */ + if (tp->ecn_flags & TCP_ECN_SEEN) +- tcp_enter_quickack_mode((struct sock *)tp); ++ tcp_enter_quickack_mode((struct sock *)tp, TCP_MAX_QUICKACKS); + break; + case INET_ECN_CE: + if (tcp_ca_needs_ecn((struct sock *)tp)) +@@ -251,7 +253,7 @@ static void __tcp_ecn_check_ce(struct tc + + if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) { + /* Better not delay acks, sender can have a very low cwnd */ +- tcp_enter_quickack_mode((struct sock *)tp); ++ tcp_enter_quickack_mode((struct sock *)tp, TCP_MAX_QUICKACKS); + tp->ecn_flags |= TCP_ECN_DEMAND_CWR; + } + tp->ecn_flags |= TCP_ECN_SEEN; +@@ -666,7 +668,7 @@ static void tcp_event_data_recv(struct s + /* The _first_ data packet received, initialize + * delayed ACK engine. + */ +- tcp_incr_quickack(sk); ++ tcp_incr_quickack(sk, TCP_MAX_QUICKACKS); + icsk->icsk_ack.ato = TCP_ATO_MIN; + } else { + int m = now - icsk->icsk_ack.lrcvtime; +@@ -682,7 +684,7 @@ static void tcp_event_data_recv(struct s + /* Too long gap. Apparently sender failed to + * restart window, so that we send ACKs quickly. 
+ */ +- tcp_incr_quickack(sk); ++ tcp_incr_quickack(sk, TCP_MAX_QUICKACKS); + sk_mem_reclaim(sk); + } + } +@@ -4136,7 +4138,7 @@ static void tcp_send_dupack(struct sock + if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq && + before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { + NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST); +- tcp_enter_quickack_mode(sk); ++ tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS); + + if (tcp_is_sack(tp) && sock_net(sk)->ipv4.sysctl_tcp_dsack) { + u32 end_seq = TCP_SKB_CB(skb)->end_seq; +@@ -4667,7 +4669,7 @@ queue_and_out: + tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); + + out_of_window: +- tcp_enter_quickack_mode(sk); ++ tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS); + inet_csk_schedule_ack(sk); + drop: + tcp_drop(sk, skb); +@@ -5744,7 +5746,7 @@ static int tcp_rcv_synsent_state_process + * to stand against the temptation 8) --ANK + */ + inet_csk_schedule_ack(sk); +- tcp_enter_quickack_mode(sk); ++ tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS); + inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, + TCP_DELACK_MAX, TCP_RTO_MAX); + diff --git a/queue-4.17/tcp-add-one-more-quick-ack-after-after-ecn-events.patch b/queue-4.17/tcp-add-one-more-quick-ack-after-after-ecn-events.patch new file mode 100644 index 00000000000..a357de08458 --- /dev/null +++ b/queue-4.17/tcp-add-one-more-quick-ack-after-after-ecn-events.patch @@ -0,0 +1,48 @@ +From foo@baz Wed Aug 1 08:19:18 CEST 2018 +From: Eric Dumazet +Date: Wed, 27 Jun 2018 08:47:21 -0700 +Subject: tcp: add one more quick ack after after ECN events + +From: Eric Dumazet + +[ Upstream commit 15ecbe94a45ef88491ca459b26efdd02f91edb6d ] + +Larry Brakmo proposal ( https://patchwork.ozlabs.org/patch/935233/ +tcp: force cwnd at least 2 in tcp_cwnd_reduction) made us rethink +about our recent patch removing ~16 quick acks after ECN events. 
+ +tcp_enter_quickack_mode(sk, 1) makes sure one immediate ack is sent, +but in the case the sender cwnd was lowered to 1, we do not want +to have a delayed ack for the next packet we will receive. + +Fixes: 522040ea5fdd ("tcp: do not aggressively quick ack after ECN events") +Signed-off-by: Eric Dumazet +Reported-by: Neal Cardwell +Cc: Lawrence Brakmo +Acked-by: Neal Cardwell +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv4/tcp_input.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/net/ipv4/tcp_input.c ++++ b/net/ipv4/tcp_input.c +@@ -247,7 +247,7 @@ static void __tcp_ecn_check_ce(struct so + * it is probably a retransmit. + */ + if (tp->ecn_flags & TCP_ECN_SEEN) +- tcp_enter_quickack_mode(sk, 1); ++ tcp_enter_quickack_mode(sk, 2); + break; + case INET_ECN_CE: + if (tcp_ca_needs_ecn(sk)) +@@ -255,7 +255,7 @@ static void __tcp_ecn_check_ce(struct so + + if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) { + /* Better not delay acks, sender can have a very low cwnd */ +- tcp_enter_quickack_mode(sk, 1); ++ tcp_enter_quickack_mode(sk, 2); + tp->ecn_flags |= TCP_ECN_DEMAND_CWR; + } + tp->ecn_flags |= TCP_ECN_SEEN; diff --git a/queue-4.17/tcp-do-not-aggressively-quick-ack-after-ecn-events.patch b/queue-4.17/tcp-do-not-aggressively-quick-ack-after-ecn-events.patch new file mode 100644 index 00000000000..81377c2459a --- /dev/null +++ b/queue-4.17/tcp-do-not-aggressively-quick-ack-after-ecn-events.patch @@ -0,0 +1,50 @@ +From foo@baz Wed Aug 1 08:19:18 CEST 2018 +From: Eric Dumazet +Date: Mon, 21 May 2018 15:08:57 -0700 +Subject: tcp: do not aggressively quick ack after ECN events + +From: Eric Dumazet + +[ Upstream commit 522040ea5fdd1c33bbf75e1d7c7c0422b96a94ef ] + +ECN signals currently forces TCP to enter quickack mode for +up to 16 (TCP_MAX_QUICKACKS) following incoming packets. + +We believe this is not needed, and only sending one immediate ack +for the current packet should be enough. 
+ +This should reduce the extra load noticed in DCTCP environments, +after congestion events. + +This is part 2 of our effort to reduce pure ACK packets. + +Signed-off-by: Eric Dumazet +Acked-by: Soheil Hassas Yeganeh +Acked-by: Yuchung Cheng +Acked-by: Neal Cardwell +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv4/tcp_input.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/net/ipv4/tcp_input.c ++++ b/net/ipv4/tcp_input.c +@@ -245,7 +245,7 @@ static void __tcp_ecn_check_ce(struct tc + * it is probably a retransmit. + */ + if (tp->ecn_flags & TCP_ECN_SEEN) +- tcp_enter_quickack_mode((struct sock *)tp, TCP_MAX_QUICKACKS); ++ tcp_enter_quickack_mode((struct sock *)tp, 1); + break; + case INET_ECN_CE: + if (tcp_ca_needs_ecn((struct sock *)tp)) +@@ -253,7 +253,7 @@ static void __tcp_ecn_check_ce(struct tc + + if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) { + /* Better not delay acks, sender can have a very low cwnd */ +- tcp_enter_quickack_mode((struct sock *)tp, TCP_MAX_QUICKACKS); ++ tcp_enter_quickack_mode((struct sock *)tp, 1); + tp->ecn_flags |= TCP_ECN_DEMAND_CWR; + } + tp->ecn_flags |= TCP_ECN_SEEN; diff --git a/queue-4.17/tcp-do-not-force-quickack-when-receiving-out-of-order-packets.patch b/queue-4.17/tcp-do-not-force-quickack-when-receiving-out-of-order-packets.patch new file mode 100644 index 00000000000..73529f513d5 --- /dev/null +++ b/queue-4.17/tcp-do-not-force-quickack-when-receiving-out-of-order-packets.patch @@ -0,0 +1,37 @@ +From foo@baz Wed Aug 1 08:19:18 CEST 2018 +From: Eric Dumazet +Date: Thu, 17 May 2018 14:47:25 -0700 +Subject: tcp: do not force quickack when receiving out-of-order packets + +From: Eric Dumazet + +[ Upstream commit a3893637e1eb0ef5eb1bbc52b3a8d2dfa317a35d ] + +As explained in commit 9f9843a751d0 ("tcp: properly handle stretch +acks in slow start"), TCP stacks have to consider how many packets +are acknowledged in one single ACK, because of GRO, but also +because of ACK compression 
or losses. + +We plan to add SACK compression in the following patch, we +must therefore not call tcp_enter_quickack_mode() + +Signed-off-by: Eric Dumazet +Acked-by: Neal Cardwell +Acked-by: Soheil Hassas Yeganeh +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv4/tcp_input.c | 2 -- + 1 file changed, 2 deletions(-) + +--- a/net/ipv4/tcp_input.c ++++ b/net/ipv4/tcp_input.c +@@ -4678,8 +4678,6 @@ drop: + if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt + tcp_receive_window(tp))) + goto out_of_window; + +- tcp_enter_quickack_mode(sk); +- + if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { + /* Partial packet, seq < rcv_next < end_seq */ + SOCK_DEBUG(sk, "partial packet: rcv_next %X seq %X - %X\n", diff --git a/queue-4.17/tcp-refactor-tcp_ecn_check_ce-to-remove-sk-type-cast.patch b/queue-4.17/tcp-refactor-tcp_ecn_check_ce-to-remove-sk-type-cast.patch new file mode 100644 index 00000000000..1424c59253d --- /dev/null +++ b/queue-4.17/tcp-refactor-tcp_ecn_check_ce-to-remove-sk-type-cast.patch @@ -0,0 +1,97 @@ +From foo@baz Wed Aug 1 08:19:18 CEST 2018 +From: Yousuk Seung +Date: Mon, 4 Jun 2018 15:29:51 -0700 +Subject: tcp: refactor tcp_ecn_check_ce to remove sk type cast + +From: Yousuk Seung + +[ Upstream commit f4c9f85f3b2cb7669830cd04d0be61192a4d2436 ] + +Refactor tcp_ecn_check_ce and __tcp_ecn_check_ce to accept struct sock* +instead of tcp_sock* to clean up type casts. This is a pure refactor +patch. + +Signed-off-by: Yousuk Seung +Signed-off-by: Neal Cardwell +Signed-off-by: Yuchung Cheng +Signed-off-by: Eric Dumazet +Acked-by: Soheil Hassas Yeganeh +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv4/tcp_input.c | 26 ++++++++++++++------------ + 1 file changed, 14 insertions(+), 12 deletions(-) + +--- a/net/ipv4/tcp_input.c ++++ b/net/ipv4/tcp_input.c +@@ -236,8 +236,10 @@ static void tcp_ecn_withdraw_cwr(struct + tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR; + } + +-static void __tcp_ecn_check_ce(struct tcp_sock *tp, const struct sk_buff *skb) ++static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb) + { ++ struct tcp_sock *tp = tcp_sk(sk); ++ + switch (TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK) { + case INET_ECN_NOT_ECT: + /* Funny extension: if ECT is not set on a segment, +@@ -245,31 +247,31 @@ static void __tcp_ecn_check_ce(struct tc + * it is probably a retransmit. + */ + if (tp->ecn_flags & TCP_ECN_SEEN) +- tcp_enter_quickack_mode((struct sock *)tp, 1); ++ tcp_enter_quickack_mode(sk, 1); + break; + case INET_ECN_CE: +- if (tcp_ca_needs_ecn((struct sock *)tp)) +- tcp_ca_event((struct sock *)tp, CA_EVENT_ECN_IS_CE); ++ if (tcp_ca_needs_ecn(sk)) ++ tcp_ca_event(sk, CA_EVENT_ECN_IS_CE); + + if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) { + /* Better not delay acks, sender can have a very low cwnd */ +- tcp_enter_quickack_mode((struct sock *)tp, 1); ++ tcp_enter_quickack_mode(sk, 1); + tp->ecn_flags |= TCP_ECN_DEMAND_CWR; + } + tp->ecn_flags |= TCP_ECN_SEEN; + break; + default: +- if (tcp_ca_needs_ecn((struct sock *)tp)) +- tcp_ca_event((struct sock *)tp, CA_EVENT_ECN_NO_CE); ++ if (tcp_ca_needs_ecn(sk)) ++ tcp_ca_event(sk, CA_EVENT_ECN_NO_CE); + tp->ecn_flags |= TCP_ECN_SEEN; + break; + } + } + +-static void tcp_ecn_check_ce(struct tcp_sock *tp, const struct sk_buff *skb) ++static void tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb) + { +- if (tp->ecn_flags & TCP_ECN_OK) +- __tcp_ecn_check_ce(tp, skb); ++ if (tcp_sk(sk)->ecn_flags & TCP_ECN_OK) ++ __tcp_ecn_check_ce(sk, skb); + } + + static void tcp_ecn_rcv_synack(struct tcp_sock *tp, const struct tcphdr *th) +@@ -690,7 +692,7 
@@ static void tcp_event_data_recv(struct s + } + icsk->icsk_ack.lrcvtime = now; + +- tcp_ecn_check_ce(tp, skb); ++ tcp_ecn_check_ce(sk, skb); + + if (skb->len >= 128) + tcp_grow_window(sk, skb); +@@ -4406,7 +4408,7 @@ static void tcp_data_queue_ofo(struct so + u32 seq, end_seq; + bool fragstolen; + +- tcp_ecn_check_ce(tp, skb); ++ tcp_ecn_check_ce(sk, skb); + + if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) { + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFODROP); diff --git a/queue-4.17/tcp_bbr-fix-bw-probing-to-raise-in-flight-data-for-very-small-bdps.patch b/queue-4.17/tcp_bbr-fix-bw-probing-to-raise-in-flight-data-for-very-small-bdps.patch new file mode 100644 index 00000000000..2b4b929cd7f --- /dev/null +++ b/queue-4.17/tcp_bbr-fix-bw-probing-to-raise-in-flight-data-for-very-small-bdps.patch @@ -0,0 +1,56 @@ +From foo@baz Wed Aug 1 08:19:18 CEST 2018 +From: Neal Cardwell +Date: Fri, 27 Jul 2018 17:19:12 -0400 +Subject: tcp_bbr: fix bw probing to raise in-flight data for very small BDPs + +From: Neal Cardwell + +[ Upstream commit 383d470936c05554219094a4d364d964cb324827 ] + +For some very small BDPs (with just a few packets) there was a +quantization effect where the target number of packets in flight +during the super-unity-gain (1.25x) phase of gain cycling was +implicitly truncated to a number of packets no larger than the normal +unity-gain (1.0x) phase of gain cycling. This meant that in multi-flow +scenarios some flows could get stuck with a lower bandwidth, because +they did not push enough packets inflight to discover that there was +more bandwidth available. This was really only an issue in multi-flow +LAN scenarios, where RTTs and BDPs are low enough for this to be an +issue. + +This fix ensures that gain cycling can raise inflight for small BDPs +by ensuring that in PROBE_BW mode target inflight values with a +super-unity gain are always greater than inflight values with a gain +<= 1. 
Importantly, this applies whether the inflight value is +calculated for use as a cwnd value, or as a target inflight value for +the end of the super-unity phase in bbr_is_next_cycle_phase() (both +need to be bigger to ensure we can probe with more packets in flight +reliably). + +This is a candidate fix for stable releases. + +Fixes: 0f8782ea1497 ("tcp_bbr: add BBR congestion control") +Signed-off-by: Neal Cardwell +Acked-by: Yuchung Cheng +Acked-by: Soheil Hassas Yeganeh +Acked-by: Priyaranjan Jha +Reviewed-by: Eric Dumazet +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv4/tcp_bbr.c | 4 ++++ + 1 file changed, 4 insertions(+) + +--- a/net/ipv4/tcp_bbr.c ++++ b/net/ipv4/tcp_bbr.c +@@ -354,6 +354,10 @@ static u32 bbr_target_cwnd(struct sock * + /* Reduce delayed ACKs by rounding up cwnd to the next even number. */ + cwnd = (cwnd + 1) & ~1U; + ++ /* Ensure gain cycling gets inflight above BDP even for small BDPs. */ ++ if (bbr->mode == BBR_PROBE_BW && gain > BBR_UNIT) ++ cwnd += 2; ++ + return cwnd; + } + diff --git a/queue-4.17/virtio_net-fix-incosistent-received-bytes-counter.patch b/queue-4.17/virtio_net-fix-incosistent-received-bytes-counter.patch new file mode 100644 index 00000000000..4b98f07f66f --- /dev/null +++ b/queue-4.17/virtio_net-fix-incosistent-received-bytes-counter.patch @@ -0,0 +1,171 @@ +From foo@baz Wed Aug 1 08:19:18 CEST 2018 +From: Toshiaki Makita +Date: Mon, 23 Jul 2018 23:36:04 +0900 +Subject: virtio_net: Fix incosistent received bytes counter + +From: Toshiaki Makita + +[ Upstream commit ecbc42ca5d665e9238a4cdb595024d2e6cf87f2d ] + +When received packets are dropped in virtio_net driver, received packets +counter is incremented but bytes counter is not. +As a result, for instance if we drop all packets by XDP, only received +is counted and bytes stays 0, which looks inconsistent. 
+IMHO received packets/bytes should be counted if packets are produced by +the hypervisor, like what common NICs on physical machines are doing. +So fix the bytes counter. + +Signed-off-by: Toshiaki Makita +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/virtio_net.c | 41 +++++++++++++++++++++++------------------ + 1 file changed, 23 insertions(+), 18 deletions(-) + +--- a/drivers/net/virtio_net.c ++++ b/drivers/net/virtio_net.c +@@ -551,7 +551,8 @@ static struct sk_buff *receive_small(str + struct receive_queue *rq, + void *buf, void *ctx, + unsigned int len, +- unsigned int *xdp_xmit) ++ unsigned int *xdp_xmit, ++ unsigned int *rbytes) + { + struct sk_buff *skb; + struct bpf_prog *xdp_prog; +@@ -567,6 +568,7 @@ static struct sk_buff *receive_small(str + int err; + + len -= vi->hdr_len; ++ *rbytes += len; + + rcu_read_lock(); + xdp_prog = rcu_dereference(rq->xdp_prog); +@@ -666,11 +668,13 @@ static struct sk_buff *receive_big(struc + struct virtnet_info *vi, + struct receive_queue *rq, + void *buf, +- unsigned int len) ++ unsigned int len, ++ unsigned int *rbytes) + { + struct page *page = buf; + struct sk_buff *skb = page_to_skb(vi, rq, page, 0, len, PAGE_SIZE); + ++ *rbytes += len - vi->hdr_len; + if (unlikely(!skb)) + goto err; + +@@ -688,7 +692,8 @@ static struct sk_buff *receive_mergeable + void *buf, + void *ctx, + unsigned int len, +- unsigned int *xdp_xmit) ++ unsigned int *xdp_xmit, ++ unsigned int *rbytes) + { + struct virtio_net_hdr_mrg_rxbuf *hdr = buf; + u16 num_buf = virtio16_to_cpu(vi->vdev, hdr->num_buffers); +@@ -702,6 +707,7 @@ static struct sk_buff *receive_mergeable + int err; + + head_skb = NULL; ++ *rbytes += len - vi->hdr_len; + + rcu_read_lock(); + xdp_prog = rcu_dereference(rq->xdp_prog); +@@ -831,6 +837,7 @@ static struct sk_buff *receive_mergeable + goto err_buf; + } + ++ *rbytes += len; + page = virt_to_head_page(buf); + + truesize = mergeable_ctx_to_truesize(ctx); +@@ -886,6 +893,7 @@ err_skb: + 
dev->stats.rx_length_errors++; + break; + } ++ *rbytes += len; + page = virt_to_head_page(buf); + put_page(page); + } +@@ -896,14 +904,13 @@ xdp_xmit: + return NULL; + } + +-static int receive_buf(struct virtnet_info *vi, struct receive_queue *rq, +- void *buf, unsigned int len, void **ctx, +- unsigned int *xdp_xmit) ++static void receive_buf(struct virtnet_info *vi, struct receive_queue *rq, ++ void *buf, unsigned int len, void **ctx, ++ unsigned int *xdp_xmit, unsigned int *rbytes) + { + struct net_device *dev = vi->dev; + struct sk_buff *skb; + struct virtio_net_hdr_mrg_rxbuf *hdr; +- int ret; + + if (unlikely(len < vi->hdr_len + ETH_HLEN)) { + pr_debug("%s: short packet %i\n", dev->name, len); +@@ -915,23 +922,22 @@ static int receive_buf(struct virtnet_in + } else { + put_page(virt_to_head_page(buf)); + } +- return 0; ++ return; + } + + if (vi->mergeable_rx_bufs) +- skb = receive_mergeable(dev, vi, rq, buf, ctx, len, xdp_xmit); ++ skb = receive_mergeable(dev, vi, rq, buf, ctx, len, xdp_xmit, ++ rbytes); + else if (vi->big_packets) +- skb = receive_big(dev, vi, rq, buf, len); ++ skb = receive_big(dev, vi, rq, buf, len, rbytes); + else +- skb = receive_small(dev, vi, rq, buf, ctx, len, xdp_xmit); ++ skb = receive_small(dev, vi, rq, buf, ctx, len, xdp_xmit, rbytes); + + if (unlikely(!skb)) +- return 0; ++ return; + + hdr = skb_vnet_hdr(skb); + +- ret = skb->len; +- + if (hdr->hdr.flags & VIRTIO_NET_HDR_F_DATA_VALID) + skb->ip_summed = CHECKSUM_UNNECESSARY; + +@@ -948,12 +954,11 @@ static int receive_buf(struct virtnet_in + ntohs(skb->protocol), skb->len, skb->pkt_type); + + napi_gro_receive(&rq->napi, skb); +- return ret; ++ return; + + frame_err: + dev->stats.rx_frame_errors++; + dev_kfree_skb(skb); +- return 0; + } + + /* Unlike mergeable buffers, all buffers are allocated to the +@@ -1203,13 +1208,13 @@ static int virtnet_receive(struct receiv + + while (received < budget && + (buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx))) { +- bytes += receive_buf(vi, rq, 
buf, len, ctx, xdp_xmit); ++ receive_buf(vi, rq, buf, len, ctx, xdp_xmit, &bytes); + received++; + } + } else { + while (received < budget && + (buf = virtqueue_get_buf(rq->vq, &len)) != NULL) { +- bytes += receive_buf(vi, rq, buf, len, NULL, xdp_xmit); ++ receive_buf(vi, rq, buf, len, NULL, xdp_xmit, &bytes); + received++; + } + } diff --git a/queue-4.17/xen-netfront-wait-xenbus-state-change-when-load-module-manually.patch b/queue-4.17/xen-netfront-wait-xenbus-state-change-when-load-module-manually.patch new file mode 100644 index 00000000000..529d2019574 --- /dev/null +++ b/queue-4.17/xen-netfront-wait-xenbus-state-change-when-load-module-manually.patch @@ -0,0 +1,67 @@ +From foo@baz Wed Aug 1 08:19:18 CEST 2018 +From: Xiao Liang +Date: Fri, 27 Jul 2018 17:56:08 +0800 +Subject: xen-netfront: wait xenbus state change when load module manually + +From: Xiao Liang + +[ Upstream commit 822fb18a82abaf4ee7058793d95d340f5dab7bfc ] + +When loading module manually, after call xenbus_switch_state to initializes +the state of the netfront device, the driver state did not change so fast +that may lead no dev created in latest kernel. This patch adds wait to make +sure xenbus knows the driver is not in closed/unknown state. + +Current state: +[vm]# ethtool eth0 +Settings for eth0: + Link detected: yes +[vm]# modprobe -r xen_netfront +[vm]# modprobe xen_netfront +[vm]# ethtool eth0 +Settings for eth0: +Cannot get device settings: No such device +Cannot get wake-on-lan settings: No such device +Cannot get message level: No such device +Cannot get link status: No such device +No data available + +With the patch installed. +[vm]# ethtool eth0 +Settings for eth0: + Link detected: yes +[vm]# modprobe -r xen_netfront +[vm]# modprobe xen_netfront +[vm]# ethtool eth0 +Settings for eth0: + Link detected: yes + +Signed-off-by: Xiao Liang +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/xen-netfront.c | 6 ++++++ + 1 file changed, 6 insertions(+) + +--- a/drivers/net/xen-netfront.c ++++ b/drivers/net/xen-netfront.c +@@ -87,6 +87,7 @@ struct netfront_cb { + /* IRQ name is queue name with "-tx" or "-rx" appended */ + #define IRQ_NAME_SIZE (QUEUE_NAME_SIZE + 3) + ++static DECLARE_WAIT_QUEUE_HEAD(module_load_q); + static DECLARE_WAIT_QUEUE_HEAD(module_unload_q); + + struct netfront_stats { +@@ -1330,6 +1331,11 @@ static struct net_device *xennet_create_ + netif_carrier_off(netdev); + + xenbus_switch_state(dev, XenbusStateInitialising); ++ wait_event(module_load_q, ++ xenbus_read_driver_state(dev->otherend) != ++ XenbusStateClosed && ++ xenbus_read_driver_state(dev->otherend) != ++ XenbusStateUnknown); + return netdev; + + exit: