From: Greg Kroah-Hartman Date: Sat, 12 Aug 2023 18:53:26 +0000 (+0200) Subject: 6.1-stable patches X-Git-Tag: v4.14.323~31 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=c185de85fa97f02f9561283c4440e511974bf950;p=thirdparty%2Fkernel%2Fstable-queue.git 6.1-stable patches added patches: dmaengine-mcf-edma-fix-a-potential-un-allocated-memory-access.patch dmaengine-owl-dma-modify-mismatched-function-name.patch gpio-sim-mark-the-gpio-chip-as-a-one-that-can-sleep.patch gpio-ws16c48-fix-off-by-one-error-in-ws16c48-resource-region-extent.patch ibmvnic-do-partial-reset-on-login-failure.patch ibmvnic-enforce-stronger-sanity-checks-on-login-response.patch ibmvnic-ensure-login-failure-recovery-is-safe-from-other-resets.patch ibmvnic-handle-dma-unmapping-of-login-buffs-in-release-functions.patch ibmvnic-unmap-dma-login-rsp-buffer-on-send-login-fail.patch net-dsa-ocelot-call-dsa_tag_8021q_unregister-under-rtnl_lock-on-driver-remove.patch net-hns3-add-wait-until-mac-link-down.patch net-hns3-fix-deadlock-issue-when-externel_lb-and-reset-are-executed-together.patch net-hns3-fix-strscpy-causing-content-truncation-issue.patch net-hns3-refactor-hclge_mac_link_status_wait-for-interface-reuse.patch net-mlx5-allow-0-for-total-host-vfs.patch net-mlx5-lag-check-correct-bucket-when-modifying-lag.patch net-mlx5-reload-auxiliary-devices-in-pci-error-handlers.patch net-mlx5-skip-clock-update-work-when-device-is-in-error-state.patch net-phy-at803x-remove-set-get-wol-callbacks-for-ar8032.patch nexthop-fix-infinite-nexthop-bucket-dump-when-using-maximum-nexthop-id.patch nexthop-fix-infinite-nexthop-dump-when-using-maximum-nexthop-id.patch nexthop-make-nexthop-bucket-dump-more-efficient.patch --- diff --git a/queue-6.1/dmaengine-mcf-edma-fix-a-potential-un-allocated-memory-access.patch b/queue-6.1/dmaengine-mcf-edma-fix-a-potential-un-allocated-memory-access.patch new file mode 100644 index 00000000000..1e6d9606058 --- /dev/null +++ 
b/queue-6.1/dmaengine-mcf-edma-fix-a-potential-un-allocated-memory-access.patch @@ -0,0 +1,61 @@ +From 0a46781c89dece85386885a407244ca26e5c1c44 Mon Sep 17 00:00:00 2001 +From: Christophe JAILLET +Date: Wed, 12 Jul 2023 18:26:45 +0530 +Subject: dmaengine: mcf-edma: Fix a potential un-allocated memory access + +From: Christophe JAILLET + +commit 0a46781c89dece85386885a407244ca26e5c1c44 upstream. + +When 'mcf_edma' is allocated, some space is allocated for a +flexible array at the end of the struct. 'chans' item are allocated, that is +to say 'pdata->dma_channels'. + +Then, this number of item is stored in 'mcf_edma->n_chans'. + +A few lines later, if 'mcf_edma->n_chans' is 0, then a default value of 64 +is set. + +This ends to no space allocated by devm_kzalloc() because chans was 0, but +64 items are read and/or written in some not allocated memory. + +Change the logic to define a default value before allocating the memory. + +Fixes: e7a3ff92eaf1 ("dmaengine: fsl-edma: add ColdFire mcf5441x edma support") +Signed-off-by: Christophe JAILLET +Link: https://lore.kernel.org/r/f55d914407c900828f6fad3ea5fa791a5f17b9a4.1685172449.git.christophe.jaillet@wanadoo.fr +Signed-off-by: Vinod Koul +Signed-off-by: Greg Kroah-Hartman +--- + drivers/dma/mcf-edma.c | 13 +++++++------ + 1 file changed, 7 insertions(+), 6 deletions(-) + +--- a/drivers/dma/mcf-edma.c ++++ b/drivers/dma/mcf-edma.c +@@ -191,7 +191,13 @@ static int mcf_edma_probe(struct platfor + return -EINVAL; + } + +- chans = pdata->dma_channels; ++ if (!pdata->dma_channels) { ++ dev_info(&pdev->dev, "setting default channel number to 64"); ++ chans = 64; ++ } else { ++ chans = pdata->dma_channels; ++ } ++ + len = sizeof(*mcf_edma) + sizeof(*mcf_chan) * chans; + mcf_edma = devm_kzalloc(&pdev->dev, len, GFP_KERNEL); + if (!mcf_edma) +@@ -203,11 +209,6 @@ static int mcf_edma_probe(struct platfor + mcf_edma->drvdata = &mcf_data; + mcf_edma->big_endian = 1; + +- if (!mcf_edma->n_chans) { +- dev_info(&pdev->dev, "setting 
default channel number to 64"); +- mcf_edma->n_chans = 64; +- } +- + mutex_init(&mcf_edma->fsl_edma_mutex); + + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); diff --git a/queue-6.1/dmaengine-owl-dma-modify-mismatched-function-name.patch b/queue-6.1/dmaengine-owl-dma-modify-mismatched-function-name.patch new file mode 100644 index 00000000000..3def81bec66 --- /dev/null +++ b/queue-6.1/dmaengine-owl-dma-modify-mismatched-function-name.patch @@ -0,0 +1,34 @@ +From 74d7221c1f9c9f3a8c316a3557ca7dca8b99d14c Mon Sep 17 00:00:00 2001 +From: Zhang Jianhua +Date: Sat, 22 Jul 2023 15:32:44 +0000 +Subject: dmaengine: owl-dma: Modify mismatched function name + +From: Zhang Jianhua + +commit 74d7221c1f9c9f3a8c316a3557ca7dca8b99d14c upstream. + +No functional modification involved. + +drivers/dma/owl-dma.c:208: warning: expecting prototype for struct owl_dma_pchan. Prototype was for struct owl_dma_vchan instead HDRTEST usr/include/sound/asequencer.h + +Fixes: 47e20577c24d ("dmaengine: Add Actions Semi Owl family S900 DMA driver") +Signed-off-by: Zhang Jianhua +Reviewed-by: Randy Dunlap +Link: https://lore.kernel.org/r/20230722153244.2086949-1-chris.zjh@huawei.com +Signed-off-by: Vinod Koul +Signed-off-by: Greg Kroah-Hartman +--- + drivers/dma/owl-dma.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/drivers/dma/owl-dma.c ++++ b/drivers/dma/owl-dma.c +@@ -192,7 +192,7 @@ struct owl_dma_pchan { + }; + + /** +- * struct owl_dma_pchan - Wrapper for DMA ENGINE channel ++ * struct owl_dma_vchan - Wrapper for DMA ENGINE channel + * @vc: wrapped virtual channel + * @pchan: the physical channel utilized by this channel + * @txd: active transaction on this channel diff --git a/queue-6.1/gpio-sim-mark-the-gpio-chip-as-a-one-that-can-sleep.patch b/queue-6.1/gpio-sim-mark-the-gpio-chip-as-a-one-that-can-sleep.patch new file mode 100644 index 00000000000..69b9b615043 --- /dev/null +++ b/queue-6.1/gpio-sim-mark-the-gpio-chip-as-a-one-that-can-sleep.patch @@ -0,0 +1,32 @@ 
+From 5a78d5db9c90c9dc84212f40a5f2687b7cafc8ec Mon Sep 17 00:00:00 2001 +From: Bartosz Golaszewski +Date: Tue, 1 Aug 2023 21:09:51 +0200 +Subject: gpio: sim: mark the GPIO chip as a one that can sleep + +From: Bartosz Golaszewski + +commit 5a78d5db9c90c9dc84212f40a5f2687b7cafc8ec upstream. + +Simulated chips use a mutex for synchronization in driver callbacks so +they must not be called from interrupt context. Set the can_sleep field +of the GPIO chip to true to force users to only use threaded irqs. + +Fixes: cb8c474e79be ("gpio: sim: new testing module") +Signed-off-by: Bartosz Golaszewski +Reviewed-by: Andy Shevchenko +Reviewed-by: Linus Walleij +Signed-off-by: Greg Kroah-Hartman +--- + drivers/gpio/gpio-sim.c | 1 + + 1 file changed, 1 insertion(+) + +--- a/drivers/gpio/gpio-sim.c ++++ b/drivers/gpio/gpio-sim.c +@@ -425,6 +425,7 @@ static int gpio_sim_add_bank(struct fwno + gc->set_config = gpio_sim_set_config; + gc->to_irq = gpio_sim_to_irq; + gc->free = gpio_sim_free; ++ gc->can_sleep = true; + + ret = devm_gpiochip_add_data(dev, gc, chip); + if (ret) diff --git a/queue-6.1/gpio-ws16c48-fix-off-by-one-error-in-ws16c48-resource-region-extent.patch b/queue-6.1/gpio-ws16c48-fix-off-by-one-error-in-ws16c48-resource-region-extent.patch new file mode 100644 index 00000000000..36f001f626a --- /dev/null +++ b/queue-6.1/gpio-ws16c48-fix-off-by-one-error-in-ws16c48-resource-region-extent.patch @@ -0,0 +1,38 @@ +From 33f83d13ded164cd49ce2a3bd2770115abc64e6f Mon Sep 17 00:00:00 2001 +From: William Breathitt Gray +Date: Thu, 10 Aug 2023 18:00:44 -0400 +Subject: gpio: ws16c48: Fix off-by-one error in WS16C48 resource region extent + +From: William Breathitt Gray + +commit 33f83d13ded164cd49ce2a3bd2770115abc64e6f upstream. + +The WinSystems WS16C48 I/O address region spans offsets 0x0 through 0xA, +which is a total of 11 bytes. 
Fix the WS16C48_EXTENT define to the +correct value of 11 so that access to necessary device registers is +properly requested in the ws16c48_probe() callback by the +devm_request_region() function call. + +Fixes: 2c05a0f29f41 ("gpio: ws16c48: Implement and utilize register structures") +Cc: stable@vger.kernel.org +Cc: Paul Demetrotion +Signed-off-by: William Breathitt Gray +Reviewed-by: Andy Shevchenko +Reviewed-by: Linus Walleij +Signed-off-by: Bartosz Golaszewski +Signed-off-by: Greg Kroah-Hartman +--- + drivers/gpio/gpio-ws16c48.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/drivers/gpio/gpio-ws16c48.c ++++ b/drivers/gpio/gpio-ws16c48.c +@@ -18,7 +18,7 @@ + #include + #include + +-#define WS16C48_EXTENT 10 ++#define WS16C48_EXTENT 11 + #define MAX_NUM_WS16C48 max_num_isa_dev(WS16C48_EXTENT) + + static unsigned int base[MAX_NUM_WS16C48]; diff --git a/queue-6.1/ibmvnic-do-partial-reset-on-login-failure.patch b/queue-6.1/ibmvnic-do-partial-reset-on-login-failure.patch new file mode 100644 index 00000000000..1d70eb6e004 --- /dev/null +++ b/queue-6.1/ibmvnic-do-partial-reset-on-login-failure.patch @@ -0,0 +1,113 @@ +From 23cc5f667453ca7645a24c8d21bf84dbf61107b2 Mon Sep 17 00:00:00 2001 +From: Nick Child +Date: Wed, 9 Aug 2023 17:10:37 -0500 +Subject: ibmvnic: Do partial reset on login failure + +From: Nick Child + +commit 23cc5f667453ca7645a24c8d21bf84dbf61107b2 upstream. + +Perform a partial reset before sending a login request if any of the +following are true: + 1. If a previous request times out. This can be dangerous because the + VIOS could still receive the old login request at any point after + the timeout. Therefore, it is best to re-register the CRQ's and + sub-CRQ's before retrying. + 2. If the previous request returns an error that is not described in + PAPR. PAPR provides procedures if the login returns with partial + success or aborted return codes (section L.5.1) but other values + do not have a defined procedure. 
Previously, these conditions + just returned error from the login function rather than trying + to resolve the issue. + This can cause further issues since most callers of the login + function are not prepared to handle an error when logging in. This + improper cleanup can lead to the device being permanently DOWN'd. + For example, if the VIOS believes that the device is already logged + in then it will return INVALID_STATE (-7). If we never re-register + CRQ's then it will always think that the device is already logged + in. This leaves the device inoperable. + +The partial reset involves freeing the sub-CRQs, freeing the CRQ then +registering and initializing a new CRQ and sub-CRQs. This essentially +restarts all communication with VIOS to allow for a fresh login attempt +that will be unhindered by any previous failed attempts. + +Fixes: dff515a3e71d ("ibmvnic: Harden device login requests") +Signed-off-by: Nick Child +Reviewed-by: Simon Horman +Link: https://lore.kernel.org/r/20230809221038.51296-4-nnac123@linux.ibm.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/ibm/ibmvnic.c | 46 ++++++++++++++++++++++++++++++++----- + 1 file changed, 40 insertions(+), 6 deletions(-) + +--- a/drivers/net/ethernet/ibm/ibmvnic.c ++++ b/drivers/net/ethernet/ibm/ibmvnic.c +@@ -96,6 +96,8 @@ static int pending_scrq(struct ibmvnic_a + static union sub_crq *ibmvnic_next_scrq(struct ibmvnic_adapter *, + struct ibmvnic_sub_crq_queue *); + static int ibmvnic_poll(struct napi_struct *napi, int data); ++static int reset_sub_crq_queues(struct ibmvnic_adapter *adapter); ++static inline void reinit_init_done(struct ibmvnic_adapter *adapter); + static void send_query_map(struct ibmvnic_adapter *adapter); + static int send_request_map(struct ibmvnic_adapter *, dma_addr_t, u32, u8); + static int send_request_unmap(struct ibmvnic_adapter *, u8); +@@ -1336,11 +1338,9 @@ static int ibmvnic_login(struct net_devi + + if 
(!wait_for_completion_timeout(&adapter->init_done, + timeout)) { +- netdev_warn(netdev, "Login timed out, retrying...\n"); +- retry = true; +- adapter->init_done_rc = 0; +- retry_count++; +- continue; ++ netdev_warn(netdev, "Login timed out\n"); ++ adapter->login_pending = false; ++ goto partial_reset; + } + + if (adapter->init_done_rc == ABORTED) { +@@ -1385,7 +1385,41 @@ static int ibmvnic_login(struct net_devi + } else if (adapter->init_done_rc) { + netdev_warn(netdev, "Adapter login failed, init_done_rc = %d\n", + adapter->init_done_rc); +- return -EIO; ++ ++partial_reset: ++ /* adapter login failed, so free any CRQs or sub-CRQs ++ * and register again before attempting to login again. ++ * If we don't do this then the VIOS may think that ++ * we are already logged in and reject any subsequent ++ * attempts ++ */ ++ netdev_warn(netdev, ++ "Freeing and re-registering CRQs before attempting to login again\n"); ++ retry = true; ++ adapter->init_done_rc = 0; ++ retry_count++; ++ release_sub_crqs(adapter, true); ++ reinit_init_done(adapter); ++ release_crq_queue(adapter); ++ /* If we don't sleep here then we risk an unnecessary ++ * failover event from the VIOS. This is a known VIOS ++ * issue caused by a vnic device freeing and registering ++ * a CRQ too quickly. 
++ */ ++ msleep(1500); ++ rc = init_crq_queue(adapter); ++ if (rc) { ++ netdev_err(netdev, "login recovery: init CRQ failed %d\n", ++ rc); ++ return -EIO; ++ } ++ ++ rc = ibmvnic_reset_init(adapter, false); ++ if (rc) { ++ netdev_err(netdev, "login recovery: Reset init failed %d\n", ++ rc); ++ return -EIO; ++ } + } + } while (retry); + diff --git a/queue-6.1/ibmvnic-enforce-stronger-sanity-checks-on-login-response.patch b/queue-6.1/ibmvnic-enforce-stronger-sanity-checks-on-login-response.patch new file mode 100644 index 00000000000..2a698e7601d --- /dev/null +++ b/queue-6.1/ibmvnic-enforce-stronger-sanity-checks-on-login-response.patch @@ -0,0 +1,77 @@ +From db17ba719bceb52f0ae4ebca0e4c17d9a3bebf05 Mon Sep 17 00:00:00 2001 +From: Nick Child +Date: Wed, 9 Aug 2023 17:10:34 -0500 +Subject: ibmvnic: Enforce stronger sanity checks on login response + +From: Nick Child + +commit db17ba719bceb52f0ae4ebca0e4c17d9a3bebf05 upstream. + +Ensure that all offsets in a login response buffer are within the size +of the allocated response buffer. Any offsets or lengths that surpass +the allocation are likely the result of an incomplete response buffer. +In these cases, a full reset is necessary. + +When attempting to login, the ibmvnic device will allocate a response +buffer and pass a reference to the VIOS. The VIOS will then send the +ibmvnic device a LOGIN_RSP CRQ to signal that the buffer has been filled +with data. If the ibmvnic device does not get a response in 20 seconds, +the old buffer is freed and a new login request is sent. With 2 +outstanding requests, any LOGIN_RSP CRQ's could be for the older +login request. If this is the case then the login response buffer (which +is for the newer login request) could be incomplete and contain invalid +data. Therefore, we must enforce strict sanity checks on the response +buffer values. + +Testing has shown that the `off_rxadd_buff_size` value is filled in last +by the VIOS and will be the smoking gun for these circumstances. 
+ +Until VIOS can implement a mechanism for tracking outstanding response +buffers and a method for mapping a LOGIN_RSP CRQ to a particular login +response buffer, the best ibmvnic can do in this situation is perform a +full reset. + +Fixes: dff515a3e71d ("ibmvnic: Harden device login requests") +Signed-off-by: Nick Child +Reviewed-by: Simon Horman +Link: https://lore.kernel.org/r/20230809221038.51296-1-nnac123@linux.ibm.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/ibm/ibmvnic.c | 18 ++++++++++++++++++ + 1 file changed, 18 insertions(+) + +--- a/drivers/net/ethernet/ibm/ibmvnic.c ++++ b/drivers/net/ethernet/ibm/ibmvnic.c +@@ -5192,6 +5192,7 @@ static int handle_login_rsp(union ibmvni + int num_tx_pools; + int num_rx_pools; + u64 *size_array; ++ u32 rsp_len; + int i; + + /* CHECK: Test/set of login_pending does not need to be atomic +@@ -5243,6 +5244,23 @@ static int handle_login_rsp(union ibmvni + ibmvnic_reset(adapter, VNIC_RESET_FATAL); + return -EIO; + } ++ ++ rsp_len = be32_to_cpu(login_rsp->len); ++ if (be32_to_cpu(login->login_rsp_len) < rsp_len || ++ rsp_len <= be32_to_cpu(login_rsp->off_txsubm_subcrqs) || ++ rsp_len <= be32_to_cpu(login_rsp->off_rxadd_subcrqs) || ++ rsp_len <= be32_to_cpu(login_rsp->off_rxadd_buff_size) || ++ rsp_len <= be32_to_cpu(login_rsp->off_supp_tx_desc)) { ++ /* This can happen if a login request times out and there are ++ * 2 outstanding login requests sent, the LOGIN_RSP crq ++ * could have been for the older login request. 
So we are ++ * parsing the newer response buffer which may be incomplete ++ */ ++ dev_err(dev, "FATAL: Login rsp offsets/lengths invalid\n"); ++ ibmvnic_reset(adapter, VNIC_RESET_FATAL); ++ return -EIO; ++ } ++ + size_array = (u64 *)((u8 *)(adapter->login_rsp_buf) + + be32_to_cpu(adapter->login_rsp_buf->off_rxadd_buff_size)); + /* variable buffer sizes are not supported, so just read the diff --git a/queue-6.1/ibmvnic-ensure-login-failure-recovery-is-safe-from-other-resets.patch b/queue-6.1/ibmvnic-ensure-login-failure-recovery-is-safe-from-other-resets.patch new file mode 100644 index 00000000000..c5dc5506761 --- /dev/null +++ b/queue-6.1/ibmvnic-ensure-login-failure-recovery-is-safe-from-other-resets.patch @@ -0,0 +1,145 @@ +From 6db541ae279bd4e76dbd939e5fbf298396166242 Mon Sep 17 00:00:00 2001 +From: Nick Child +Date: Wed, 9 Aug 2023 17:10:38 -0500 +Subject: ibmvnic: Ensure login failure recovery is safe from other resets + +From: Nick Child + +commit 6db541ae279bd4e76dbd939e5fbf298396166242 upstream. + +If a login request fails, the recovery process should be protected +against parallel resets. It is a known issue that freeing and +registering CRQ's in quick succession can result in a failover CRQ from +the VIOS. Processing a failover during login recovery is dangerous for +two reasons: + 1. This will result in two parallel initialization processes, this can + cause serious issues during login. + 2. It is possible that the failover CRQ is received but never executed. + We get notified of a pending failover through a transport event CRQ. + The reset is not performed until a INIT CRQ request is received. + Previously, if CRQ init fails during login recovery, then the ibmvnic + irq is freed and the login process returned error. If failover_pending + is true (a transport event was received), then the ibmvnic device + would never be able to process the reset since it cannot receive the + CRQ_INIT request due to the irq being freed. 
This leaved the device + in a inoperable state. + +Therefore, the login failure recovery process must be hardened against +these possible issues. Possible failovers (due to quick CRQ free and +init) must be avoided and any issues during re-initialization should be +dealt with instead of being propagated up the stack. This logic is +similar to that of ibmvnic_probe(). + +Fixes: dff515a3e71d ("ibmvnic: Harden device login requests") +Signed-off-by: Nick Child +Reviewed-by: Simon Horman +Link: https://lore.kernel.org/r/20230809221038.51296-5-nnac123@linux.ibm.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/ibm/ibmvnic.c | 70 +++++++++++++++++++++++++------------ + 1 file changed, 48 insertions(+), 22 deletions(-) + +--- a/drivers/net/ethernet/ibm/ibmvnic.c ++++ b/drivers/net/ethernet/ibm/ibmvnic.c +@@ -115,6 +115,7 @@ static void ibmvnic_tx_scrq_clean_buffer + static void free_long_term_buff(struct ibmvnic_adapter *adapter, + struct ibmvnic_long_term_buff *ltb); + static void ibmvnic_disable_irqs(struct ibmvnic_adapter *adapter); ++static void flush_reset_queue(struct ibmvnic_adapter *adapter); + + struct ibmvnic_stat { + char name[ETH_GSTRING_LEN]; +@@ -1316,8 +1317,8 @@ static const char *adapter_state_to_stri + + static int ibmvnic_login(struct net_device *netdev) + { ++ unsigned long flags, timeout = msecs_to_jiffies(20000); + struct ibmvnic_adapter *adapter = netdev_priv(netdev); +- unsigned long timeout = msecs_to_jiffies(20000); + int retry_count = 0; + int retries = 10; + bool retry; +@@ -1382,6 +1383,7 @@ static int ibmvnic_login(struct net_devi + "SCRQ irq initialization failed\n"); + return rc; + } ++ /* Default/timeout error handling, reset and start fresh */ + } else if (adapter->init_done_rc) { + netdev_warn(netdev, "Adapter login failed, init_done_rc = %d\n", + adapter->init_done_rc); +@@ -1397,29 +1399,53 @@ partial_reset: + "Freeing and re-registering CRQs before attempting to login again\n"); + retry 
= true; + adapter->init_done_rc = 0; +- retry_count++; + release_sub_crqs(adapter, true); +- reinit_init_done(adapter); +- release_crq_queue(adapter); +- /* If we don't sleep here then we risk an unnecessary +- * failover event from the VIOS. This is a known VIOS +- * issue caused by a vnic device freeing and registering +- * a CRQ too quickly. ++ /* Much of this is similar logic as ibmvnic_probe(), ++ * we are essentially re-initializing communication ++ * with the server. We really should not run any ++ * resets/failovers here because this is already a form ++ * of reset and we do not want parallel resets occurring + */ +- msleep(1500); +- rc = init_crq_queue(adapter); +- if (rc) { +- netdev_err(netdev, "login recovery: init CRQ failed %d\n", +- rc); +- return -EIO; +- } +- +- rc = ibmvnic_reset_init(adapter, false); +- if (rc) { +- netdev_err(netdev, "login recovery: Reset init failed %d\n", +- rc); +- return -EIO; +- } ++ do { ++ reinit_init_done(adapter); ++ /* Clear any failovers we got in the previous ++ * pass since we are re-initializing the CRQ ++ */ ++ adapter->failover_pending = false; ++ release_crq_queue(adapter); ++ /* If we don't sleep here then we risk an ++ * unnecessary failover event from the VIOS. ++ * This is a known VIOS issue caused by a vnic ++ * device freeing and registering a CRQ too ++ * quickly. ++ */ ++ msleep(1500); ++ /* Avoid any resets, since we are currently ++ * resetting. 
++ */ ++ spin_lock_irqsave(&adapter->rwi_lock, flags); ++ flush_reset_queue(adapter); ++ spin_unlock_irqrestore(&adapter->rwi_lock, ++ flags); ++ ++ rc = init_crq_queue(adapter); ++ if (rc) { ++ netdev_err(netdev, "login recovery: init CRQ failed %d\n", ++ rc); ++ return -EIO; ++ } ++ ++ rc = ibmvnic_reset_init(adapter, false); ++ if (rc) ++ netdev_err(netdev, "login recovery: Reset init failed %d\n", ++ rc); ++ /* IBMVNIC_CRQ_INIT will return EAGAIN if it ++ * fails, since ibmvnic_reset_init will free ++ * irq's in failure, we won't be able to receive ++ * new CRQs so we need to keep trying. probe() ++ * handles this similarly. ++ */ ++ } while (rc == -EAGAIN && retry_count++ < retries); + } + } while (retry); + diff --git a/queue-6.1/ibmvnic-handle-dma-unmapping-of-login-buffs-in-release-functions.patch b/queue-6.1/ibmvnic-handle-dma-unmapping-of-login-buffs-in-release-functions.patch new file mode 100644 index 00000000000..1f9320eddda --- /dev/null +++ b/queue-6.1/ibmvnic-handle-dma-unmapping-of-login-buffs-in-release-functions.patch @@ -0,0 +1,73 @@ +From d78a671eb8996af19d6311ecdee9790d2fa479f0 Mon Sep 17 00:00:00 2001 +From: Nick Child +Date: Wed, 9 Aug 2023 17:10:36 -0500 +Subject: ibmvnic: Handle DMA unmapping of login buffs in release functions + +From: Nick Child + +commit d78a671eb8996af19d6311ecdee9790d2fa479f0 upstream. + +Rather than leaving the DMA unmapping of the login buffers to the +login response handler, move this work into the login release functions. +Previously, these functions were only used for freeing the allocated +buffers. This could lead to issues if there are more than one +outstanding login buffer requests, which is possible if a login request +times out. + +If a login request times out, then there is another call to send login. +The send login function makes a call to the login buffer release +function. In the past, this freed the buffers but did not DMA unmap. 
+Therefore, the VIOS could still write to the old login (now freed) +buffer. It is for this reason that it is a good idea to leave the DMA +unmap call to the login buffers release function. + +Since the login buffer release functions now handle DMA unmapping, +remove the duplicate DMA unmapping in handle_login_rsp(). + +Fixes: dff515a3e71d ("ibmvnic: Harden device login requests") +Signed-off-by: Nick Child +Reviewed-by: Simon Horman +Link: https://lore.kernel.org/r/20230809221038.51296-3-nnac123@linux.ibm.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/ibm/ibmvnic.c | 15 ++++++++++----- + 1 file changed, 10 insertions(+), 5 deletions(-) + +--- a/drivers/net/ethernet/ibm/ibmvnic.c ++++ b/drivers/net/ethernet/ibm/ibmvnic.c +@@ -1397,12 +1397,22 @@ static int ibmvnic_login(struct net_devi + + static void release_login_buffer(struct ibmvnic_adapter *adapter) + { ++ if (!adapter->login_buf) ++ return; ++ ++ dma_unmap_single(&adapter->vdev->dev, adapter->login_buf_token, ++ adapter->login_buf_sz, DMA_TO_DEVICE); + kfree(adapter->login_buf); + adapter->login_buf = NULL; + } + + static void release_login_rsp_buffer(struct ibmvnic_adapter *adapter) + { ++ if (!adapter->login_rsp_buf) ++ return; ++ ++ dma_unmap_single(&adapter->vdev->dev, adapter->login_rsp_buf_token, ++ adapter->login_rsp_buf_sz, DMA_FROM_DEVICE); + kfree(adapter->login_rsp_buf); + adapter->login_rsp_buf = NULL; + } +@@ -5207,11 +5217,6 @@ static int handle_login_rsp(union ibmvni + } + adapter->login_pending = false; + +- dma_unmap_single(dev, adapter->login_buf_token, adapter->login_buf_sz, +- DMA_TO_DEVICE); +- dma_unmap_single(dev, adapter->login_rsp_buf_token, +- adapter->login_rsp_buf_sz, DMA_FROM_DEVICE); +- + /* If the number of queues requested can't be allocated by the + * server, the login response will return with code 1. We will need + * to resend the login buffer with fewer queues requested. 
diff --git a/queue-6.1/ibmvnic-unmap-dma-login-rsp-buffer-on-send-login-fail.patch b/queue-6.1/ibmvnic-unmap-dma-login-rsp-buffer-on-send-login-fail.patch new file mode 100644 index 00000000000..11d51358481 --- /dev/null +++ b/queue-6.1/ibmvnic-unmap-dma-login-rsp-buffer-on-send-login-fail.patch @@ -0,0 +1,41 @@ +From 411c565b4bc63e9584a8493882bd566e35a90588 Mon Sep 17 00:00:00 2001 +From: Nick Child +Date: Wed, 9 Aug 2023 17:10:35 -0500 +Subject: ibmvnic: Unmap DMA login rsp buffer on send login fail + +From: Nick Child + +commit 411c565b4bc63e9584a8493882bd566e35a90588 upstream. + +If the LOGIN CRQ fails to send then we must DMA unmap the response +buffer. Previously, if the CRQ failed then the memory was freed without +DMA unmapping. + +Fixes: c98d9cc4170d ("ibmvnic: send_login should check for crq errors") +Signed-off-by: Nick Child +Reviewed-by: Simon Horman +Link: https://lore.kernel.org/r/20230809221038.51296-2-nnac123@linux.ibm.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/ibm/ibmvnic.c | 5 ++++- + 1 file changed, 4 insertions(+), 1 deletion(-) + +--- a/drivers/net/ethernet/ibm/ibmvnic.c ++++ b/drivers/net/ethernet/ibm/ibmvnic.c +@@ -4626,11 +4626,14 @@ static int send_login(struct ibmvnic_ada + if (rc) { + adapter->login_pending = false; + netdev_err(adapter->netdev, "Failed to send login, rc=%d\n", rc); +- goto buf_rsp_map_failed; ++ goto buf_send_failed; + } + + return 0; + ++buf_send_failed: ++ dma_unmap_single(dev, rsp_buffer_token, rsp_buffer_size, ++ DMA_FROM_DEVICE); + buf_rsp_map_failed: + kfree(login_rsp_buffer); + adapter->login_rsp_buf = NULL; diff --git a/queue-6.1/net-dsa-ocelot-call-dsa_tag_8021q_unregister-under-rtnl_lock-on-driver-remove.patch b/queue-6.1/net-dsa-ocelot-call-dsa_tag_8021q_unregister-under-rtnl_lock-on-driver-remove.patch new file mode 100644 index 00000000000..2d968a2b2e0 --- /dev/null +++ 
b/queue-6.1/net-dsa-ocelot-call-dsa_tag_8021q_unregister-under-rtnl_lock-on-driver-remove.patch @@ -0,0 +1,91 @@ +From a94c16a2fda010866b8858a386a8bfbeba4f72c5 Mon Sep 17 00:00:00 2001 +From: Vladimir Oltean +Date: Thu, 3 Aug 2023 16:42:53 +0300 +Subject: net: dsa: ocelot: call dsa_tag_8021q_unregister() under rtnl_lock() on driver remove + +From: Vladimir Oltean + +commit a94c16a2fda010866b8858a386a8bfbeba4f72c5 upstream. + +When the tagging protocol in current use is "ocelot-8021q" and we unbind +the driver, we see this splat: + +$ echo '0000:00:00.2' > /sys/bus/pci/drivers/fsl_enetc/unbind +mscc_felix 0000:00:00.5 swp0: left promiscuous mode +sja1105 spi2.0: Link is Down +DSA: tree 1 torn down +mscc_felix 0000:00:00.5 swp2: left promiscuous mode +sja1105 spi2.2: Link is Down +DSA: tree 3 torn down +fsl_enetc 0000:00:00.2 eno2: left promiscuous mode +mscc_felix 0000:00:00.5: Link is Down +------------[ cut here ]------------ +RTNL: assertion failed at net/dsa/tag_8021q.c (409) +WARNING: CPU: 1 PID: 329 at net/dsa/tag_8021q.c:409 dsa_tag_8021q_unregister+0x12c/0x1a0 +Modules linked in: +CPU: 1 PID: 329 Comm: bash Not tainted 6.5.0-rc3+ #771 +pc : dsa_tag_8021q_unregister+0x12c/0x1a0 +lr : dsa_tag_8021q_unregister+0x12c/0x1a0 +Call trace: + dsa_tag_8021q_unregister+0x12c/0x1a0 + felix_tag_8021q_teardown+0x130/0x150 + felix_teardown+0x3c/0xd8 + dsa_tree_teardown_switches+0xbc/0xe0 + dsa_unregister_switch+0x168/0x260 + felix_pci_remove+0x30/0x60 + pci_device_remove+0x4c/0x100 + device_release_driver_internal+0x188/0x288 + device_links_unbind_consumers+0xfc/0x138 + device_release_driver_internal+0xe0/0x288 + device_driver_detach+0x24/0x38 + unbind_store+0xd8/0x108 + drv_attr_store+0x30/0x50 +---[ end trace 0000000000000000 ]--- +------------[ cut here ]------------ +RTNL: assertion failed at net/8021q/vlan_core.c (376) +WARNING: CPU: 1 PID: 329 at net/8021q/vlan_core.c:376 vlan_vid_del+0x1b8/0x1f0 +CPU: 1 PID: 329 Comm: bash Tainted: G W 6.5.0-rc3+ #771 +pc : 
vlan_vid_del+0x1b8/0x1f0 +lr : vlan_vid_del+0x1b8/0x1f0 + dsa_tag_8021q_unregister+0x8c/0x1a0 + felix_tag_8021q_teardown+0x130/0x150 + felix_teardown+0x3c/0xd8 + dsa_tree_teardown_switches+0xbc/0xe0 + dsa_unregister_switch+0x168/0x260 + felix_pci_remove+0x30/0x60 + pci_device_remove+0x4c/0x100 + device_release_driver_internal+0x188/0x288 + device_links_unbind_consumers+0xfc/0x138 + device_release_driver_internal+0xe0/0x288 + device_driver_detach+0x24/0x38 + unbind_store+0xd8/0x108 + drv_attr_store+0x30/0x50 +DSA: tree 0 torn down + +This was somewhat not so easy to spot, because "ocelot-8021q" is not the +default tagging protocol, and thus, not everyone who tests the unbinding +path may have switched to it beforehand. The default +felix_tag_npi_teardown() does not require rtnl_lock() to be held. + +Fixes: 7c83a7c539ab ("net: dsa: add a second tagger for Ocelot switches based on tag_8021q") +Signed-off-by: Vladimir Oltean +Link: https://lore.kernel.org/r/20230803134253.2711124-1-vladimir.oltean@nxp.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/dsa/ocelot/felix.c | 2 ++ + 1 file changed, 2 insertions(+) + +--- a/drivers/net/dsa/ocelot/felix.c ++++ b/drivers/net/dsa/ocelot/felix.c +@@ -1606,8 +1606,10 @@ static void felix_teardown(struct dsa_sw + struct felix *felix = ocelot_to_felix(ocelot); + struct dsa_port *dp; + ++ rtnl_lock(); + if (felix->tag_proto_ops) + felix->tag_proto_ops->teardown(ds); ++ rtnl_unlock(); + + dsa_switch_for_each_available_port(dp, ds) + ocelot_deinit_port(ocelot, dp->index); diff --git a/queue-6.1/net-hns3-add-wait-until-mac-link-down.patch b/queue-6.1/net-hns3-add-wait-until-mac-link-down.patch new file mode 100644 index 00000000000..d00cdc89504 --- /dev/null +++ b/queue-6.1/net-hns3-add-wait-until-mac-link-down.patch @@ -0,0 +1,54 @@ +From 6265e242f7b95f2c1195b42ec912b84ad161470e Mon Sep 17 00:00:00 2001 +From: Jie Wang +Date: Mon, 7 Aug 2023 19:34:51 +0800 +Subject: net: hns3: add wait until mac 
link down + +From: Jie Wang + +commit 6265e242f7b95f2c1195b42ec912b84ad161470e upstream. + +In some configure flow of hns3 driver, for example, change mtu, it will +disable MAC through firmware before configuration. But firmware disables +MAC asynchronously. The rx traffic may be not stopped in this case. + +So fixes it by waiting until mac link is down. + +Fixes: a9775bb64aa7 ("net: hns3: fix set and get link ksettings issue") +Signed-off-by: Jie Wang +Signed-off-by: Jijie Shao +Reviewed-by: Leon Romanovsky +Link: https://lore.kernel.org/r/20230807113452.474224-4-shaojijie@huawei.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 10 +++++++++- + 1 file changed, 9 insertions(+), 1 deletion(-) + +--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c ++++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +@@ -7569,6 +7569,8 @@ static void hclge_enable_fd(struct hnae3 + + static void hclge_cfg_mac_mode(struct hclge_dev *hdev, bool enable) + { ++#define HCLGE_LINK_STATUS_WAIT_CNT 3 ++ + struct hclge_desc desc; + struct hclge_config_mac_mode_cmd *req = + (struct hclge_config_mac_mode_cmd *)desc.data; +@@ -7593,9 +7595,15 @@ static void hclge_cfg_mac_mode(struct hc + req->txrx_pad_fcs_loop_en = cpu_to_le32(loop_en); + + ret = hclge_cmd_send(&hdev->hw, &desc, 1); +- if (ret) ++ if (ret) { + dev_err(&hdev->pdev->dev, + "mac enable fail, ret =%d.\n", ret); ++ return; ++ } ++ ++ if (!enable) ++ hclge_mac_link_status_wait(hdev, HCLGE_LINK_STATUS_DOWN, ++ HCLGE_LINK_STATUS_WAIT_CNT); + } + + static int hclge_config_switch_param(struct hclge_dev *hdev, int vfid, diff --git a/queue-6.1/net-hns3-fix-deadlock-issue-when-externel_lb-and-reset-are-executed-together.patch b/queue-6.1/net-hns3-fix-deadlock-issue-when-externel_lb-and-reset-are-executed-together.patch new file mode 100644 index 00000000000..422f7ad2e4b --- /dev/null +++ 
b/queue-6.1/net-hns3-fix-deadlock-issue-when-externel_lb-and-reset-are-executed-together.patch @@ -0,0 +1,89 @@ +From ac6257a3ae5db5193b1f19c268e4f72d274ddb88 Mon Sep 17 00:00:00 2001 +From: Yonglong Liu +Date: Mon, 7 Aug 2023 19:34:52 +0800 +Subject: net: hns3: fix deadlock issue when externel_lb and reset are executed together + +From: Yonglong Liu + +commit ac6257a3ae5db5193b1f19c268e4f72d274ddb88 upstream. + +When externel_lb and reset are executed together, a deadlock may +occur: +[ 3147.217009] INFO: task kworker/u321:0:7 blocked for more than 120 seconds. +[ 3147.230483] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. +[ 3147.238999] task:kworker/u321:0 state:D stack: 0 pid: 7 ppid: 2 flags:0x00000008 +[ 3147.248045] Workqueue: hclge hclge_service_task [hclge] +[ 3147.253957] Call trace: +[ 3147.257093] __switch_to+0x7c/0xbc +[ 3147.261183] __schedule+0x338/0x6f0 +[ 3147.265357] schedule+0x50/0xe0 +[ 3147.269185] schedule_preempt_disabled+0x18/0x24 +[ 3147.274488] __mutex_lock.constprop.0+0x1d4/0x5dc +[ 3147.279880] __mutex_lock_slowpath+0x1c/0x30 +[ 3147.284839] mutex_lock+0x50/0x60 +[ 3147.288841] rtnl_lock+0x20/0x2c +[ 3147.292759] hclge_reset_prepare+0x68/0x90 [hclge] +[ 3147.298239] hclge_reset_subtask+0x88/0xe0 [hclge] +[ 3147.303718] hclge_reset_service_task+0x84/0x120 [hclge] +[ 3147.309718] hclge_service_task+0x2c/0x70 [hclge] +[ 3147.315109] process_one_work+0x1d0/0x490 +[ 3147.319805] worker_thread+0x158/0x3d0 +[ 3147.324240] kthread+0x108/0x13c +[ 3147.328154] ret_from_fork+0x10/0x18 + +In externel_lb process, the hns3 driver call napi_disable() +first, then the reset happen, then the restore process of the +externel_lb will fail, and will not call napi_enable(). When +doing externel_lb again, napi_disable() will be double call, +cause a deadlock of rtnl_lock(). 
+ +This patch use the HNS3_NIC_STATE_DOWN state to protect the +calling of napi_disable() and napi_enable() in externel_lb +process, just as the usage in ndo_stop() and ndo_start(). + +Fixes: 04b6ba143521 ("net: hns3: add support for external loopback test") +Signed-off-by: Yonglong Liu +Signed-off-by: Jijie Shao +Reviewed-by: Leon Romanovsky +Link: https://lore.kernel.org/r/20230807113452.474224-5-shaojijie@huawei.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/hisilicon/hns3/hns3_enet.c | 14 +++++++++++++- + 1 file changed, 13 insertions(+), 1 deletion(-) + +diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c +index 9f6890059666..b7b51e56b030 100644 +--- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c ++++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c +@@ -5854,6 +5854,9 @@ void hns3_external_lb_prepare(struct net_device *ndev, bool if_running) + if (!if_running) + return; + ++ if (test_and_set_bit(HNS3_NIC_STATE_DOWN, &priv->state)) ++ return; ++ + netif_carrier_off(ndev); + netif_tx_disable(ndev); + +@@ -5882,7 +5885,16 @@ void hns3_external_lb_restore(struct net_device *ndev, bool if_running) + if (!if_running) + return; + +- hns3_nic_reset_all_ring(priv->ae_handle); ++ if (hns3_nic_resetting(ndev)) ++ return; ++ ++ if (!test_bit(HNS3_NIC_STATE_DOWN, &priv->state)) ++ return; ++ ++ if (hns3_nic_reset_all_ring(priv->ae_handle)) ++ return; ++ ++ clear_bit(HNS3_NIC_STATE_DOWN, &priv->state); + + for (i = 0; i < priv->vector_num; i++) + hns3_vector_enable(&priv->tqp_vector[i]); +-- +2.41.0 + diff --git a/queue-6.1/net-hns3-fix-strscpy-causing-content-truncation-issue.patch b/queue-6.1/net-hns3-fix-strscpy-causing-content-truncation-issue.patch new file mode 100644 index 00000000000..57c20f6b5af --- /dev/null +++ b/queue-6.1/net-hns3-fix-strscpy-causing-content-truncation-issue.patch @@ -0,0 +1,68 @@ +From 5e3d20617b055e725e785e0058426368269949f3 Mon 
Sep 17 00:00:00 2001 +From: Hao Chen +Date: Wed, 9 Aug 2023 10:09:02 +0800 +Subject: net: hns3: fix strscpy causing content truncation issue + +From: Hao Chen + +commit 5e3d20617b055e725e785e0058426368269949f3 upstream. + +hns3_dbg_fill_content()/hclge_dbg_fill_content() aims to integrate some +items into a string for content, and we add '\n' and '\0' in the last +two bytes of content. + +strscpy() will add '\0' in the last byte of the destination buffer (one of the +items), which results in the content print finishing ahead of schedule and +some dump content being truncated. + +One error log is shown below: +cat mac_list/uc +UC MAC_LIST: + +Expected: +UC MAC_LIST: +FUNC_ID MAC_ADDR STATE +pf 00:2b:19:05:03:00 ACTIVE + +The destination buffer is length-bounded and not required to be +NUL-terminated, so just change strscpy() to memcpy() to fix it. + +Fixes: 1cf3d5567f27 ("net: hns3: fix strncpy() not using dest-buf length as length issue") +Signed-off-by: Hao Chen +Signed-off-by: Jijie Shao +Link: https://lore.kernel.org/r/20230809020902.1941471-1-shaojijie@huawei.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c | 4 ++-- + drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.c | 4 ++-- + 2 files changed, 4 insertions(+), 4 deletions(-) + +--- a/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c ++++ b/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c +@@ -458,9 +458,9 @@ static void hns3_dbg_fill_content(char * + if (result) { + if (item_len < strlen(result[i])) + break; +- strscpy(pos, result[i], strlen(result[i])); ++ memcpy(pos, result[i], strlen(result[i])); + } else { +- strscpy(pos, items[i].name, strlen(items[i].name)); ++ memcpy(pos, items[i].name, strlen(items[i].name)); + } + pos += item_len; + len -= item_len; +--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.c ++++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.c +@@ -110,9 +110,9 @@ static void 
hclge_dbg_fill_content(char + if (result) { + if (item_len < strlen(result[i])) + break; +- strscpy(pos, result[i], strlen(result[i])); ++ memcpy(pos, result[i], strlen(result[i])); + } else { +- strscpy(pos, items[i].name, strlen(items[i].name)); ++ memcpy(pos, items[i].name, strlen(items[i].name)); + } + pos += item_len; + len -= item_len; diff --git a/queue-6.1/net-hns3-refactor-hclge_mac_link_status_wait-for-interface-reuse.patch b/queue-6.1/net-hns3-refactor-hclge_mac_link_status_wait-for-interface-reuse.patch new file mode 100644 index 00000000000..faf5ed569fb --- /dev/null +++ b/queue-6.1/net-hns3-refactor-hclge_mac_link_status_wait-for-interface-reuse.patch @@ -0,0 +1,73 @@ +From 08469dacfad25428b66549716811807203744f4f Mon Sep 17 00:00:00 2001 +From: Jie Wang +Date: Mon, 7 Aug 2023 19:34:50 +0800 +Subject: net: hns3: refactor hclge_mac_link_status_wait for interface reuse + +From: Jie Wang + +commit 08469dacfad25428b66549716811807203744f4f upstream. + +Some nic configurations could only be performed after link is down. So this +patch refactor this API for reuse. 
+ +Signed-off-by: Jie Wang +Signed-off-by: Jijie Shao +Reviewed-by: Leon Romanovsky +Link: https://lore.kernel.org/r/20230807113452.474224-3-shaojijie@huawei.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 14 +++++++++----- + 1 file changed, 9 insertions(+), 5 deletions(-) + +--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c ++++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +@@ -72,6 +72,8 @@ static void hclge_restore_hw_table(struc + static void hclge_sync_promisc_mode(struct hclge_dev *hdev); + static void hclge_sync_fd_table(struct hclge_dev *hdev); + static void hclge_update_fec_stats(struct hclge_dev *hdev); ++static int hclge_mac_link_status_wait(struct hclge_dev *hdev, int link_ret, ++ int wait_cnt); + + static struct hnae3_ae_algo ae_algo; + +@@ -7656,10 +7658,9 @@ static void hclge_phy_link_status_wait(s + } while (++i < HCLGE_PHY_LINK_STATUS_NUM); + } + +-static int hclge_mac_link_status_wait(struct hclge_dev *hdev, int link_ret) ++static int hclge_mac_link_status_wait(struct hclge_dev *hdev, int link_ret, ++ int wait_cnt) + { +-#define HCLGE_MAC_LINK_STATUS_NUM 100 +- + int link_status; + int i = 0; + int ret; +@@ -7672,13 +7673,15 @@ static int hclge_mac_link_status_wait(st + return 0; + + msleep(HCLGE_LINK_STATUS_MS); +- } while (++i < HCLGE_MAC_LINK_STATUS_NUM); ++ } while (++i < wait_cnt); + return -EBUSY; + } + + static int hclge_mac_phy_link_status_wait(struct hclge_dev *hdev, bool en, + bool is_phy) + { ++#define HCLGE_MAC_LINK_STATUS_NUM 100 ++ + int link_ret; + + link_ret = en ? 
HCLGE_LINK_STATUS_UP : HCLGE_LINK_STATUS_DOWN; +@@ -7686,7 +7689,8 @@ static int hclge_mac_phy_link_status_wai + if (is_phy) + hclge_phy_link_status_wait(hdev, link_ret); + +- return hclge_mac_link_status_wait(hdev, link_ret); ++ return hclge_mac_link_status_wait(hdev, link_ret, ++ HCLGE_MAC_LINK_STATUS_NUM); + } + + static int hclge_set_app_loopback(struct hclge_dev *hdev, bool en) diff --git a/queue-6.1/net-mlx5-allow-0-for-total-host-vfs.patch b/queue-6.1/net-mlx5-allow-0-for-total-host-vfs.patch new file mode 100644 index 00000000000..40a78162181 --- /dev/null +++ b/queue-6.1/net-mlx5-allow-0-for-total-host-vfs.patch @@ -0,0 +1,33 @@ +From 2dc2b3922d3c0f52d3a792d15dcacfbc4cc76b8f Mon Sep 17 00:00:00 2001 +From: Daniel Jurgens +Date: Tue, 11 Jul 2023 00:28:10 +0300 +Subject: net/mlx5: Allow 0 for total host VFs + +From: Daniel Jurgens + +commit 2dc2b3922d3c0f52d3a792d15dcacfbc4cc76b8f upstream. + +When querying eswitch functions 0 is a valid number of host VFs. After +introducing ARM SRIOV falling through to getting the max value from PCI +results in using the total VFs allowed on the ARM for the host. 
+ +Fixes: 86eec50beaf3 ("net/mlx5: Support querying max VFs from device") +Signed-off-by: Daniel Jurgens +Signed-off-by: Saeed Mahameed +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/mellanox/mlx5/core/sriov.c | 3 +-- + 1 file changed, 1 insertion(+), 2 deletions(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/sriov.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/sriov.c +@@ -264,8 +264,7 @@ static u16 mlx5_get_max_vfs(struct mlx5_ + host_total_vfs = MLX5_GET(query_esw_functions_out, out, + host_params_context.host_total_vfs); + kvfree(out); +- if (host_total_vfs) +- return host_total_vfs; ++ return host_total_vfs; + } + + done: diff --git a/queue-6.1/net-mlx5-lag-check-correct-bucket-when-modifying-lag.patch b/queue-6.1/net-mlx5-lag-check-correct-bucket-when-modifying-lag.patch new file mode 100644 index 00000000000..cb93947b3f8 --- /dev/null +++ b/queue-6.1/net-mlx5-lag-check-correct-bucket-when-modifying-lag.patch @@ -0,0 +1,33 @@ +From 86ed7b773c01ba71617538b3b107c33fd9cf90b8 Mon Sep 17 00:00:00 2001 +From: Shay Drory +Date: Sun, 30 Jul 2023 09:26:27 +0300 +Subject: net/mlx5: LAG, Check correct bucket when modifying LAG + +From: Shay Drory + +commit 86ed7b773c01ba71617538b3b107c33fd9cf90b8 upstream. + +The cited patch introduced buckets in hash mode, but did not update +the ports/bucket check when modifying LAG. +Fix the check. 
+ +Fixes: 352899f384d4 ("net/mlx5: Lag, use buckets in hash mode") +Signed-off-by: Shay Drory +Reviewed-by: Maor Gottlieb +Signed-off-by: Saeed Mahameed +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/mellanox/mlx5/core/lag/port_sel.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/lag/port_sel.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/port_sel.c +@@ -574,7 +574,7 @@ static int __mlx5_lag_modify_definers_de + for (i = 0; i < ldev->ports; i++) { + for (j = 0; j < ldev->buckets; j++) { + idx = i * ldev->buckets + j; +- if (ldev->v2p_map[i] == ports[i]) ++ if (ldev->v2p_map[idx] == ports[idx]) + continue; + + dest.vport.vhca_id = MLX5_CAP_GEN(ldev->pf[ports[idx] - 1].dev, diff --git a/queue-6.1/net-mlx5-reload-auxiliary-devices-in-pci-error-handlers.patch b/queue-6.1/net-mlx5-reload-auxiliary-devices-in-pci-error-handlers.patch new file mode 100644 index 00000000000..f1ea9f27fb7 --- /dev/null +++ b/queue-6.1/net-mlx5-reload-auxiliary-devices-in-pci-error-handlers.patch @@ -0,0 +1,31 @@ +From aab8e1a200b926147db51e3f82fd07bb9edf6a98 Mon Sep 17 00:00:00 2001 +From: Moshe Shemesh +Date: Sun, 23 Jul 2023 11:03:01 +0300 +Subject: net/mlx5: Reload auxiliary devices in pci error handlers + +From: Moshe Shemesh + +commit aab8e1a200b926147db51e3f82fd07bb9edf6a98 upstream. + +Handling pci errors should fully teardown and load back auxiliary +devices, same as done through mlx5 health recovery flow. 
+ +Fixes: 72ed5d5624af ("net/mlx5: Suspend auxiliary devices only in case of PCI device suspend") +Signed-off-by: Moshe Shemesh +Signed-off-by: Saeed Mahameed +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/mellanox/mlx5/core/main.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c +@@ -1794,7 +1794,7 @@ static pci_ers_result_t mlx5_pci_err_det + + mlx5_enter_error_state(dev, false); + mlx5_error_sw_reset(dev); +- mlx5_unload_one(dev, true); ++ mlx5_unload_one(dev, false); + mlx5_drain_health_wq(dev); + mlx5_pci_disable_device(dev); + diff --git a/queue-6.1/net-mlx5-skip-clock-update-work-when-device-is-in-error-state.patch b/queue-6.1/net-mlx5-skip-clock-update-work-when-device-is-in-error-state.patch new file mode 100644 index 00000000000..3f33e13ab25 --- /dev/null +++ b/queue-6.1/net-mlx5-skip-clock-update-work-when-device-is-in-error-state.patch @@ -0,0 +1,44 @@ +From d006207625657322ba8251b6e7e829f9659755dc Mon Sep 17 00:00:00 2001 +From: Moshe Shemesh +Date: Wed, 19 Jul 2023 11:33:44 +0300 +Subject: net/mlx5: Skip clock update work when device is in error state + +From: Moshe Shemesh + +commit d006207625657322ba8251b6e7e829f9659755dc upstream. + +When device is in error state, marked by the flag +MLX5_DEVICE_STATE_INTERNAL_ERROR, the HW and PCI may not be accessible +and so clock update work should be skipped. Furthermore, such access +through PCI in error state, after calling mlx5_pci_disable_device() can +result in failing to recover from pci errors. 
+ +Fixes: ef9814deafd0 ("net/mlx5e: Add HW timestamping (TS) support") +Reported-and-tested-by: Ganesh G R +Closes: https://lore.kernel.org/netdev/9bdb9b9d-140a-7a28-f0de-2e64e873c068@nvidia.com +Signed-off-by: Moshe Shemesh +Reviewed-by: Aya Levin +Signed-off-by: Saeed Mahameed +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c | 5 +++++ + 1 file changed, 5 insertions(+) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c +@@ -198,10 +198,15 @@ static void mlx5_timestamp_overflow(stru + clock = container_of(timer, struct mlx5_clock, timer); + mdev = container_of(clock, struct mlx5_core_dev, clock); + ++ if (mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) ++ goto out; ++ + write_seqlock_irqsave(&clock->lock, flags); + timecounter_read(&timer->tc); + mlx5_update_clock_info_page(mdev); + write_sequnlock_irqrestore(&clock->lock, flags); ++ ++out: + schedule_delayed_work(&timer->overflow_work, timer->overflow_period); + } + diff --git a/queue-6.1/net-phy-at803x-remove-set-get-wol-callbacks-for-ar8032.patch b/queue-6.1/net-phy-at803x-remove-set-get-wol-callbacks-for-ar8032.patch new file mode 100644 index 00000000000..8aa3dffbb0a --- /dev/null +++ b/queue-6.1/net-phy-at803x-remove-set-get-wol-callbacks-for-ar8032.patch @@ -0,0 +1,32 @@ +From d7791cec2304aea22eb2ada944e4d467302f5bfe Mon Sep 17 00:00:00 2001 +From: Li Yang +Date: Wed, 2 Aug 2023 14:13:47 -0500 +Subject: net: phy: at803x: remove set/get wol callbacks for AR8032 + +From: Li Yang + +commit d7791cec2304aea22eb2ada944e4d467302f5bfe upstream. + +Since the AR8032 part does not support wol, remove related callbacks +from it. + +Fixes: 5800091a2061 ("net: phy: at803x: add support for AR8032 PHY") +Signed-off-by: Li Yang +Cc: David Bauer +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/phy/at803x.c | 2 -- + 1 file changed, 2 deletions(-) + +--- a/drivers/net/phy/at803x.c ++++ b/drivers/net/phy/at803x.c +@@ -2087,8 +2087,6 @@ static struct phy_driver at803x_driver[] + .flags = PHY_POLL_CABLE_TEST, + .config_init = at803x_config_init, + .link_change_notify = at803x_link_change_notify, +- .set_wol = at803x_set_wol, +- .get_wol = at803x_get_wol, + .suspend = at803x_suspend, + .resume = at803x_resume, + /* PHY_BASIC_FEATURES */ diff --git a/queue-6.1/nexthop-fix-infinite-nexthop-bucket-dump-when-using-maximum-nexthop-id.patch b/queue-6.1/nexthop-fix-infinite-nexthop-bucket-dump-when-using-maximum-nexthop-id.patch new file mode 100644 index 00000000000..65dc802c459 --- /dev/null +++ b/queue-6.1/nexthop-fix-infinite-nexthop-bucket-dump-when-using-maximum-nexthop-id.patch @@ -0,0 +1,128 @@ +From 8743aeff5bc4dcb5b87b43765f48d5ac3ad7dd9f Mon Sep 17 00:00:00 2001 +From: Ido Schimmel +Date: Tue, 8 Aug 2023 10:52:33 +0300 +Subject: nexthop: Fix infinite nexthop bucket dump when using maximum nexthop ID + +From: Ido Schimmel + +commit 8743aeff5bc4dcb5b87b43765f48d5ac3ad7dd9f upstream. + +A netlink dump callback can return a positive number to signal that more +information needs to be dumped or zero to signal that the dump is +complete. In the second case, the core netlink code will append the +NLMSG_DONE message to the skb in order to indicate to user space that +the dump is complete. + +The nexthop bucket dump callback always returns a positive number if +nexthop buckets were filled in the provided skb, even if the dump is +complete. This means that a dump will span at least two recvmsg() calls +as long as nexthop buckets are present. In the last recvmsg() call the +dump callback will not fill in any nexthop buckets because the previous +call indicated that the dump should restart from the last dumped nexthop +ID plus one. 
+ + # ip link add name dummy1 up type dummy + # ip nexthop add id 1 dev dummy1 + # ip nexthop add id 10 group 1 type resilient buckets 2 + # strace -e sendto,recvmsg -s 5 ip nexthop bucket + sendto(3, [[{nlmsg_len=24, nlmsg_type=RTM_GETNEXTHOPBUCKET, nlmsg_flags=NLM_F_REQUEST|NLM_F_DUMP, nlmsg_seq=1691396980, nlmsg_pid=0}, {family=AF_UNSPEC, data="\x00\x00\x00\x00\x00"...}], {nlmsg_len=0, nlmsg_type=0 /* NLMSG_??? */, nlmsg_flags=0, nlmsg_seq=0, nlmsg_pid=0}], 152, 0, NULL, 0) = 152 + recvmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[{iov_base=NULL, iov_len=0}], msg_iovlen=1, msg_controllen=0, msg_flags=MSG_TRUNC}, MSG_PEEK|MSG_TRUNC) = 128 + recvmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[{iov_base=[[{nlmsg_len=64, nlmsg_type=RTM_NEWNEXTHOPBUCKET, nlmsg_flags=NLM_F_MULTI, nlmsg_seq=1691396980, nlmsg_pid=347}, {family=AF_UNSPEC, data="\x00\x00\x00\x00\x00"...}], [{nlmsg_len=64, nlmsg_type=RTM_NEWNEXTHOPBUCKET, nlmsg_flags=NLM_F_MULTI, nlmsg_seq=1691396980, nlmsg_pid=347}, {family=AF_UNSPEC, data="\x00\x00\x00\x00\x00"...}]], iov_len=32768}], msg_iovlen=1, msg_controllen=0, msg_flags=0}, 0) = 128 + id 10 index 0 idle_time 6.66 nhid 1 + id 10 index 1 idle_time 6.66 nhid 1 + recvmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[{iov_base=NULL, iov_len=0}], msg_iovlen=1, msg_controllen=0, msg_flags=MSG_TRUNC}, MSG_PEEK|MSG_TRUNC) = 20 + recvmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[{iov_base=[{nlmsg_len=20, nlmsg_type=NLMSG_DONE, nlmsg_flags=NLM_F_MULTI, nlmsg_seq=1691396980, nlmsg_pid=347}, 0], iov_len=32768}], msg_iovlen=1, msg_controllen=0, msg_flags=0}, 0) = 20 + +++ exited with 0 +++ + +This behavior is both inefficient and buggy. 
If the last nexthop to be +dumped had the maximum ID of 0xffffffff, then the dump will restart from +0 (0xffffffff + 1) and never end: + + # ip link add name dummy1 up type dummy + # ip nexthop add id 1 dev dummy1 + # ip nexthop add id $((2**32-1)) group 1 type resilient buckets 2 + # ip nexthop bucket + id 4294967295 index 0 idle_time 5.55 nhid 1 + id 4294967295 index 1 idle_time 5.55 nhid 1 + id 4294967295 index 0 idle_time 5.55 nhid 1 + id 4294967295 index 1 idle_time 5.55 nhid 1 + [...] + +Fix by adjusting the dump callback to return zero when the dump is +complete. After the fix only one recvmsg() call is made and the +NLMSG_DONE message is appended to the RTM_NEWNEXTHOPBUCKET responses: + + # ip link add name dummy1 up type dummy + # ip nexthop add id 1 dev dummy1 + # ip nexthop add id $((2**32-1)) group 1 type resilient buckets 2 + # strace -e sendto,recvmsg -s 5 ip nexthop bucket + sendto(3, [[{nlmsg_len=24, nlmsg_type=RTM_GETNEXTHOPBUCKET, nlmsg_flags=NLM_F_REQUEST|NLM_F_DUMP, nlmsg_seq=1691396737, nlmsg_pid=0}, {family=AF_UNSPEC, data="\x00\x00\x00\x00\x00"...}], {nlmsg_len=0, nlmsg_type=0 /* NLMSG_??? 
*/, nlmsg_flags=0, nlmsg_seq=0, nlmsg_pid=0}], 152, 0, NULL, 0) = 152 + recvmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[{iov_base=NULL, iov_len=0}], msg_iovlen=1, msg_controllen=0, msg_flags=MSG_TRUNC}, MSG_PEEK|MSG_TRUNC) = 148 + recvmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[{iov_base=[[{nlmsg_len=64, nlmsg_type=RTM_NEWNEXTHOPBUCKET, nlmsg_flags=NLM_F_MULTI, nlmsg_seq=1691396737, nlmsg_pid=350}, {family=AF_UNSPEC, data="\x00\x00\x00\x00\x00"...}], [{nlmsg_len=64, nlmsg_type=RTM_NEWNEXTHOPBUCKET, nlmsg_flags=NLM_F_MULTI, nlmsg_seq=1691396737, nlmsg_pid=350}, {family=AF_UNSPEC, data="\x00\x00\x00\x00\x00"...}], [{nlmsg_len=20, nlmsg_type=NLMSG_DONE, nlmsg_flags=NLM_F_MULTI, nlmsg_seq=1691396737, nlmsg_pid=350}, 0]], iov_len=32768}], msg_iovlen=1, msg_controllen=0, msg_flags=0}, 0) = 148 + id 4294967295 index 0 idle_time 6.61 nhid 1 + id 4294967295 index 1 idle_time 6.61 nhid 1 + +++ exited with 0 +++ + +Note that if the NLMSG_DONE message cannot be appended because of size +limitations, then another recvmsg() will be needed, but the core netlink +code will not invoke the dump callback and simply reply with a +NLMSG_DONE message since it knows that the callback previously returned +zero. + +Add a test that fails before the fix: + + # ./fib_nexthops.sh -t basic_res + [...] + TEST: Maximum nexthop ID dump [FAIL] + [...] + +And passes after it: + + # ./fib_nexthops.sh -t basic_res + [...] + TEST: Maximum nexthop ID dump [ OK ] + [...] 
+ +Fixes: 8a1bbabb034d ("nexthop: Add netlink handlers for bucket dump") +Signed-off-by: Ido Schimmel +Reviewed-by: Petr Machata +Reviewed-by: David Ahern +Link: https://lore.kernel.org/r/20230808075233.3337922-4-idosch@nvidia.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv4/nexthop.c | 6 +----- + tools/testing/selftests/net/fib_nexthops.sh | 5 +++++ + 2 files changed, 6 insertions(+), 5 deletions(-) + +--- a/net/ipv4/nexthop.c ++++ b/net/ipv4/nexthop.c +@@ -3424,13 +3424,9 @@ static int rtm_dump_nexthop_bucket(struc + + if (err < 0) { + if (likely(skb->len)) +- goto out; +- goto out_err; ++ err = skb->len; + } + +-out: +- err = skb->len; +-out_err: + cb->seq = net->nexthop.seq; + nl_dump_check_consistent(cb, nlmsg_hdr(skb)); + return err; +--- a/tools/testing/selftests/net/fib_nexthops.sh ++++ b/tools/testing/selftests/net/fib_nexthops.sh +@@ -2206,6 +2206,11 @@ basic_res() + run_cmd "$IP nexthop bucket list fdb" + log_test $? 255 "Dump all nexthop buckets with invalid 'fdb' keyword" + ++ # Dump should not loop endlessly when maximum nexthop ID is configured. ++ run_cmd "$IP nexthop add id $((2**32-1)) group 1/2 type resilient buckets 4" ++ run_cmd "timeout 5 $IP nexthop bucket" ++ log_test $? 0 "Maximum nexthop ID dump" ++ + # + # resilient nexthop buckets get requests + # diff --git a/queue-6.1/nexthop-fix-infinite-nexthop-dump-when-using-maximum-nexthop-id.patch b/queue-6.1/nexthop-fix-infinite-nexthop-dump-when-using-maximum-nexthop-id.patch new file mode 100644 index 00000000000..f401b002a52 --- /dev/null +++ b/queue-6.1/nexthop-fix-infinite-nexthop-dump-when-using-maximum-nexthop-id.patch @@ -0,0 +1,119 @@ +From 913f60cacda73ccac8eead94983e5884c03e04cd Mon Sep 17 00:00:00 2001 +From: Ido Schimmel +Date: Tue, 8 Aug 2023 10:52:31 +0300 +Subject: nexthop: Fix infinite nexthop dump when using maximum nexthop ID + +From: Ido Schimmel + +commit 913f60cacda73ccac8eead94983e5884c03e04cd upstream. 
+ +A netlink dump callback can return a positive number to signal that more +information needs to be dumped or zero to signal that the dump is +complete. In the second case, the core netlink code will append the +NLMSG_DONE message to the skb in order to indicate to user space that +the dump is complete. + +The nexthop dump callback always returns a positive number if nexthops +were filled in the provided skb, even if the dump is complete. This +means that a dump will span at least two recvmsg() calls as long as +nexthops are present. In the last recvmsg() call the dump callback will +not fill in any nexthops because the previous call indicated that the +dump should restart from the last dumped nexthop ID plus one. + + # ip nexthop add id 1 blackhole + # strace -e sendto,recvmsg -s 5 ip nexthop + sendto(3, [[{nlmsg_len=24, nlmsg_type=RTM_GETNEXTHOP, nlmsg_flags=NLM_F_REQUEST|NLM_F_DUMP, nlmsg_seq=1691394315, nlmsg_pid=0}, {nh_family=AF_UNSPEC, nh_scope=RT_SCOPE_UNIVERSE, nh_protocol=RTPROT_UNSPEC, nh_flags=0}], {nlmsg_len=0, nlmsg_type=0 /* NLMSG_??? 
*/, nlmsg_flags=0, nlmsg_seq=0, nlmsg_pid=0}], 152, 0, NULL, 0) = 152 + recvmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[{iov_base=NULL, iov_len=0}], msg_iovlen=1, msg_controllen=0, msg_flags=MSG_TRUNC}, MSG_PEEK|MSG_TRUNC) = 36 + recvmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[{iov_base=[{nlmsg_len=36, nlmsg_type=RTM_NEWNEXTHOP, nlmsg_flags=NLM_F_MULTI, nlmsg_seq=1691394315, nlmsg_pid=343}, {nh_family=AF_INET, nh_scope=RT_SCOPE_UNIVERSE, nh_protocol=RTPROT_UNSPEC, nh_flags=0}, [[{nla_len=8, nla_type=NHA_ID}, 1], {nla_len=4, nla_type=NHA_BLACKHOLE}]], iov_len=32768}], msg_iovlen=1, msg_controllen=0, msg_flags=0}, 0) = 36 + id 1 blackhole + recvmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[{iov_base=NULL, iov_len=0}], msg_iovlen=1, msg_controllen=0, msg_flags=MSG_TRUNC}, MSG_PEEK|MSG_TRUNC) = 20 + recvmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[{iov_base=[{nlmsg_len=20, nlmsg_type=NLMSG_DONE, nlmsg_flags=NLM_F_MULTI, nlmsg_seq=1691394315, nlmsg_pid=343}, 0], iov_len=32768}], msg_iovlen=1, msg_controllen=0, msg_flags=0}, 0) = 20 + +++ exited with 0 +++ + +This behavior is both inefficient and buggy. If the last nexthop to be +dumped had the maximum ID of 0xffffffff, then the dump will restart from +0 (0xffffffff + 1) and never end: + + # ip nexthop add id $((2**32-1)) blackhole + # ip nexthop + id 4294967295 blackhole + id 4294967295 blackhole + [...] + +Fix by adjusting the dump callback to return zero when the dump is +complete. 
After the fix only one recvmsg() call is made and the +NLMSG_DONE message is appended to the RTM_NEWNEXTHOP response: + + # ip nexthop add id $((2**32-1)) blackhole + # strace -e sendto,recvmsg -s 5 ip nexthop + sendto(3, [[{nlmsg_len=24, nlmsg_type=RTM_GETNEXTHOP, nlmsg_flags=NLM_F_REQUEST|NLM_F_DUMP, nlmsg_seq=1691394080, nlmsg_pid=0}, {nh_family=AF_UNSPEC, nh_scope=RT_SCOPE_UNIVERSE, nh_protocol=RTPROT_UNSPEC, nh_flags=0}], {nlmsg_len=0, nlmsg_type=0 /* NLMSG_??? */, nlmsg_flags=0, nlmsg_seq=0, nlmsg_pid=0}], 152, 0, NULL, 0) = 152 + recvmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[{iov_base=NULL, iov_len=0}], msg_iovlen=1, msg_controllen=0, msg_flags=MSG_TRUNC}, MSG_PEEK|MSG_TRUNC) = 56 + recvmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[{iov_base=[[{nlmsg_len=36, nlmsg_type=RTM_NEWNEXTHOP, nlmsg_flags=NLM_F_MULTI, nlmsg_seq=1691394080, nlmsg_pid=342}, {nh_family=AF_INET, nh_scope=RT_SCOPE_UNIVERSE, nh_protocol=RTPROT_UNSPEC, nh_flags=0}, [[{nla_len=8, nla_type=NHA_ID}, 4294967295], {nla_len=4, nla_type=NHA_BLACKHOLE}]], [{nlmsg_len=20, nlmsg_type=NLMSG_DONE, nlmsg_flags=NLM_F_MULTI, nlmsg_seq=1691394080, nlmsg_pid=342}, 0]], iov_len=32768}], msg_iovlen=1, msg_controllen=0, msg_flags=0}, 0) = 56 + id 4294967295 blackhole + +++ exited with 0 +++ + +Note that if the NLMSG_DONE message cannot be appended because of size +limitations, then another recvmsg() will be needed, but the core netlink +code will not invoke the dump callback and simply reply with a +NLMSG_DONE message since it knows that the callback previously returned +zero. + +Add a test that fails before the fix: + + # ./fib_nexthops.sh -t basic + [...] + TEST: Maximum nexthop ID dump [FAIL] + [...] + +And passes after it: + + # ./fib_nexthops.sh -t basic + [...] + TEST: Maximum nexthop ID dump [ OK ] + [...] 
+ +Fixes: ab84be7e54fc ("net: Initial nexthop code") +Reported-by: Petr Machata +Closes: https://lore.kernel.org/netdev/87sf91enuf.fsf@nvidia.com/ +Signed-off-by: Ido Schimmel +Reviewed-by: Petr Machata +Reviewed-by: David Ahern +Link: https://lore.kernel.org/r/20230808075233.3337922-2-idosch@nvidia.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv4/nexthop.c | 6 +----- + tools/testing/selftests/net/fib_nexthops.sh | 5 +++++ + 2 files changed, 6 insertions(+), 5 deletions(-) + +--- a/net/ipv4/nexthop.c ++++ b/net/ipv4/nexthop.c +@@ -3221,13 +3221,9 @@ static int rtm_dump_nexthop(struct sk_bu + &rtm_dump_nexthop_cb, &filter); + if (err < 0) { + if (likely(skb->len)) +- goto out; +- goto out_err; ++ err = skb->len; + } + +-out: +- err = skb->len; +-out_err: + cb->seq = net->nexthop.seq; + nl_dump_check_consistent(cb, nlmsg_hdr(skb)); + return err; +--- a/tools/testing/selftests/net/fib_nexthops.sh ++++ b/tools/testing/selftests/net/fib_nexthops.sh +@@ -1981,6 +1981,11 @@ basic() + + run_cmd "$IP link set dev lo up" + ++ # Dump should not loop endlessly when maximum nexthop ID is configured. ++ run_cmd "$IP nexthop add id $((2**32-1)) blackhole" ++ run_cmd "timeout 5 $IP nexthop" ++ log_test $? 0 "Maximum nexthop ID dump" ++ + # + # groups + # diff --git a/queue-6.1/nexthop-make-nexthop-bucket-dump-more-efficient.patch b/queue-6.1/nexthop-make-nexthop-bucket-dump-more-efficient.patch new file mode 100644 index 00000000000..cb3c7d83aca --- /dev/null +++ b/queue-6.1/nexthop-make-nexthop-bucket-dump-more-efficient.patch @@ -0,0 +1,96 @@ +From f10d3d9df49d9e6ee244fda6ca264f901a9c5d85 Mon Sep 17 00:00:00 2001 +From: Ido Schimmel +Date: Tue, 8 Aug 2023 10:52:32 +0300 +Subject: nexthop: Make nexthop bucket dump more efficient + +From: Ido Schimmel + +commit f10d3d9df49d9e6ee244fda6ca264f901a9c5d85 upstream. + +rtm_dump_nexthop_bucket_nh() is used to dump nexthop buckets belonging +to a specific resilient nexthop group. 
The function returns a positive +return code (the skb length) upon both success and failure. + +The above behavior is problematic. When a complete nexthop bucket dump +is requested, the function that walks the different nexthops treats the +non-zero return code as an error. This causes buckets belonging to +different resilient nexthop groups to be dumped using different buffers +even if they can all fit in the same buffer: + + # ip link add name dummy1 up type dummy + # ip nexthop add id 1 dev dummy1 + # ip nexthop add id 10 group 1 type resilient buckets 1 + # ip nexthop add id 20 group 1 type resilient buckets 1 + # strace -e recvmsg -s 0 ip nexthop bucket + [...] + recvmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[...], msg_iovlen=1, msg_controllen=0, msg_flags=0}, 0) = 64 + id 10 index 0 idle_time 10.27 nhid 1 + [...] + recvmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[...], msg_iovlen=1, msg_controllen=0, msg_flags=0}, 0) = 64 + id 20 index 0 idle_time 6.44 nhid 1 + [...] + +Fix by only returning a non-zero return code when an error occurred and +restarting the dump from the bucket index we failed to fill in. This +allows buckets belonging to different resilient nexthop groups to be +dumped using the same buffer: + + # ip link add name dummy1 up type dummy + # ip nexthop add id 1 dev dummy1 + # ip nexthop add id 10 group 1 type resilient buckets 1 + # ip nexthop add id 20 group 1 type resilient buckets 1 + # strace -e recvmsg -s 0 ip nexthop bucket + [...] + recvmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[...], msg_iovlen=1, msg_controllen=0, msg_flags=0}, 0) = 128 + id 10 index 0 idle_time 30.21 nhid 1 + id 20 index 0 idle_time 26.7 nhid 1 + [...] + +While this change is more of a performance improvement change than an +actual bug fix, it is a prerequisite for a subsequent patch that does +fix a bug. 
+ +Fixes: 8a1bbabb034d ("nexthop: Add netlink handlers for bucket dump") +Signed-off-by: Ido Schimmel +Reviewed-by: Petr Machata +Reviewed-by: David Ahern +Link: https://lore.kernel.org/r/20230808075233.3337922-3-idosch@nvidia.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv4/nexthop.c | 16 +++++----------- + 1 file changed, 5 insertions(+), 11 deletions(-) + +--- a/net/ipv4/nexthop.c ++++ b/net/ipv4/nexthop.c +@@ -3363,25 +3363,19 @@ static int rtm_dump_nexthop_bucket_nh(st + dd->filter.res_bucket_nh_id != nhge->nh->id) + continue; + ++ dd->ctx->bucket_index = bucket_index; + err = nh_fill_res_bucket(skb, nh, bucket, bucket_index, + RTM_NEWNEXTHOPBUCKET, portid, + cb->nlh->nlmsg_seq, NLM_F_MULTI, + cb->extack); +- if (err < 0) { +- if (likely(skb->len)) +- goto out; +- goto out_err; +- } ++ if (err) ++ return err; + } + + dd->ctx->done_nh_idx = dd->ctx->nh.idx + 1; +- bucket_index = 0; ++ dd->ctx->bucket_index = 0; + +-out: +- err = skb->len; +-out_err: +- dd->ctx->bucket_index = bucket_index; +- return err; ++ return 0; + } + + static int rtm_dump_nexthop_bucket_cb(struct sk_buff *skb, diff --git a/queue-6.1/series b/queue-6.1/series index 2e9a8fd587d..48aa4306139 100644 --- a/queue-6.1/series +++ b/queue-6.1/series @@ -103,3 +103,25 @@ wifi-cfg80211-fix-sband-iftype-data-lookup-for-ap_vlan.patch rdma-umem-set-iova-in-odp-flow.patch net-tls-avoid-discarding-data-on-record-close.patch net-marvell-prestera-fix-handling-ipv4-routes-with-nhid.patch +net-phy-at803x-remove-set-get-wol-callbacks-for-ar8032.patch +net-dsa-ocelot-call-dsa_tag_8021q_unregister-under-rtnl_lock-on-driver-remove.patch +net-hns3-refactor-hclge_mac_link_status_wait-for-interface-reuse.patch +net-hns3-add-wait-until-mac-link-down.patch +net-hns3-fix-deadlock-issue-when-externel_lb-and-reset-are-executed-together.patch +nexthop-fix-infinite-nexthop-dump-when-using-maximum-nexthop-id.patch +nexthop-make-nexthop-bucket-dump-more-efficient.patch 
+nexthop-fix-infinite-nexthop-bucket-dump-when-using-maximum-nexthop-id.patch +net-hns3-fix-strscpy-causing-content-truncation-issue.patch +dmaengine-mcf-edma-fix-a-potential-un-allocated-memory-access.patch +dmaengine-owl-dma-modify-mismatched-function-name.patch +net-mlx5-allow-0-for-total-host-vfs.patch +net-mlx5-lag-check-correct-bucket-when-modifying-lag.patch +net-mlx5-skip-clock-update-work-when-device-is-in-error-state.patch +net-mlx5-reload-auxiliary-devices-in-pci-error-handlers.patch +ibmvnic-enforce-stronger-sanity-checks-on-login-response.patch +ibmvnic-unmap-dma-login-rsp-buffer-on-send-login-fail.patch +ibmvnic-handle-dma-unmapping-of-login-buffs-in-release-functions.patch +ibmvnic-do-partial-reset-on-login-failure.patch +ibmvnic-ensure-login-failure-recovery-is-safe-from-other-resets.patch +gpio-ws16c48-fix-off-by-one-error-in-ws16c48-resource-region-extent.patch +gpio-sim-mark-the-gpio-chip-as-a-one-that-can-sleep.patch