--- /dev/null
+From 863676fe1ac1b82fc9eb56c242e80acfbfc18b76 Mon Sep 17 00:00:00 2001
+From: Fenghua Yu <fenghua.yu@intel.com>
+Date: Wed, 12 Jul 2023 12:35:05 -0700
+Subject: dmaengine: idxd: Clear PRS disable flag when disabling IDXD device
+
+From: Fenghua Yu <fenghua.yu@intel.com>
+
+commit 863676fe1ac1b82fc9eb56c242e80acfbfc18b76 upstream.
+
+Disabling the IDXD device doesn't reset the Page Request Service (PRS)
+disable flag to its initial value 0. This may cause user confusion,
+because once PRS is disabled the user will see, via the sysfs
+interface, that PRS still retains its previous setting (i.e. disabled)
+even after the device is disabled.
+
+To eliminate this confusion, reset the PRS disable flag to ensure that
+the PRS flag bit reflects the correct state after the device is disabled.
+
+Additionally, simplify the code by setting wq->flags to 0, which clears
+all flag bits, including any future additions.
+
+Fixes: f2dc327131b5 ("dmaengine: idxd: add per wq PRS disable")
+Tested-by: Tony Zhu <tony.zhu@intel.com>
+Signed-off-by: Fenghua Yu <fenghua.yu@intel.com>
+Reviewed-by: Dave Jiang <dave.jiang@intel.com>
+Link: https://lore.kernel.org/r/20230712193505.3440752-1-fenghua.yu@intel.com
+Signed-off-by: Vinod Koul <vkoul@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/dma/idxd/device.c | 4 +---
+ 1 file changed, 1 insertion(+), 3 deletions(-)
+
+diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c
+index 5abbcc61c528..9a15f0d12c79 100644
+--- a/drivers/dma/idxd/device.c
++++ b/drivers/dma/idxd/device.c
+@@ -384,9 +384,7 @@ static void idxd_wq_disable_cleanup(struct idxd_wq *wq)
+ wq->threshold = 0;
+ wq->priority = 0;
+ wq->enqcmds_retries = IDXD_ENQCMDS_RETRIES;
+- clear_bit(WQ_FLAG_DEDICATED, &wq->flags);
+- clear_bit(WQ_FLAG_BLOCK_ON_FAULT, &wq->flags);
+- clear_bit(WQ_FLAG_ATS_DISABLE, &wq->flags);
++ wq->flags = 0;
+ memset(wq->name, 0, WQ_NAME_SIZE);
+ wq->max_xfer_bytes = WQ_DEFAULT_MAX_XFER;
+ idxd_wq_set_max_batch_size(idxd->data->type, wq, WQ_DEFAULT_MAX_BATCH);
+--
+2.41.0
+
--- /dev/null
+From 0a46781c89dece85386885a407244ca26e5c1c44 Mon Sep 17 00:00:00 2001
+From: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
+Date: Wed, 12 Jul 2023 18:26:45 +0530
+Subject: dmaengine: mcf-edma: Fix a potential un-allocated memory access
+
+From: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
+
+commit 0a46781c89dece85386885a407244ca26e5c1c44 upstream.
+
+When 'mcf_edma' is allocated, some space is allocated for a flexible
+array at the end of the struct: 'chans' items are allocated, that is to
+say 'pdata->dma_channels' of them.
+
+Then, this number of items is stored in 'mcf_edma->n_chans'.
+
+A few lines later, if 'mcf_edma->n_chans' is 0, a default value of 64
+is set.
+
+This ends up with no space allocated by devm_kzalloc() because 'chans'
+was 0, but with 64 items being read and/or written in unallocated
+memory.
+
+Change the logic to set the default value before allocating the memory,
+as illustrated in the sketch below.
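+
+A minimal userspace sketch of the ordering bug (illustrative only; the
+names and the alloc_edma() helper are simplified, not the driver code):
+
+    #include <stdio.h>
+    #include <stdlib.h>
+
+    struct edma {
+        int n_chans;
+        int chans[];              /* flexible array member */
+    };
+
+    static struct edma *alloc_edma(int chans)
+    {
+        struct edma *e = calloc(1, sizeof(*e) + sizeof(int) * chans);
+
+        if (e)
+            e->n_chans = chans;
+        return e;
+    }
+
+    int main(void)
+    {
+        int pdata_dma_channels = 0;   /* platform data says 0 */
+
+        /* Buggy order: allocate room for 0 items, then bump the count
+         * to 64; chans[0..63] would now be out-of-bounds accesses.
+         */
+        struct edma *bad = alloc_edma(pdata_dma_channels);
+        if (bad && !bad->n_chans)
+            bad->n_chans = 64;
+
+        /* Fixed order: apply the default first, then allocate. */
+        int chans = pdata_dma_channels ? pdata_dma_channels : 64;
+        struct edma *good = alloc_edma(chans);
+
+        printf("bad=%d good=%d\n", bad ? bad->n_chans : -1,
+               good ? good->n_chans : -1);
+        free(bad);
+        free(good);
+        return 0;
+    }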
+
+Fixes: e7a3ff92eaf1 ("dmaengine: fsl-edma: add ColdFire mcf5441x edma support")
+Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
+Link: https://lore.kernel.org/r/f55d914407c900828f6fad3ea5fa791a5f17b9a4.1685172449.git.christophe.jaillet@wanadoo.fr
+Signed-off-by: Vinod Koul <vkoul@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/dma/mcf-edma.c | 13 +++++++------
+ 1 file changed, 7 insertions(+), 6 deletions(-)
+
+--- a/drivers/dma/mcf-edma.c
++++ b/drivers/dma/mcf-edma.c
+@@ -190,7 +190,13 @@ static int mcf_edma_probe(struct platfor
+ return -EINVAL;
+ }
+
+- chans = pdata->dma_channels;
++ if (!pdata->dma_channels) {
++ dev_info(&pdev->dev, "setting default channel number to 64");
++ chans = 64;
++ } else {
++ chans = pdata->dma_channels;
++ }
++
+ len = sizeof(*mcf_edma) + sizeof(*mcf_chan) * chans;
+ mcf_edma = devm_kzalloc(&pdev->dev, len, GFP_KERNEL);
+ if (!mcf_edma)
+@@ -202,11 +208,6 @@ static int mcf_edma_probe(struct platfor
+ mcf_edma->drvdata = &mcf_data;
+ mcf_edma->big_endian = 1;
+
+- if (!mcf_edma->n_chans) {
+- dev_info(&pdev->dev, "setting default channel number to 64");
+- mcf_edma->n_chans = 64;
+- }
+-
+ mutex_init(&mcf_edma->fsl_edma_mutex);
+
+ mcf_edma->membase = devm_platform_ioremap_resource(pdev, 0);
--- /dev/null
+From 74d7221c1f9c9f3a8c316a3557ca7dca8b99d14c Mon Sep 17 00:00:00 2001
+From: Zhang Jianhua <chris.zjh@huawei.com>
+Date: Sat, 22 Jul 2023 15:32:44 +0000
+Subject: dmaengine: owl-dma: Modify mismatched function name
+
+From: Zhang Jianhua <chris.zjh@huawei.com>
+
+commit 74d7221c1f9c9f3a8c316a3557ca7dca8b99d14c upstream.
+
+No functional modification involved.
+
+drivers/dma/owl-dma.c:208: warning: expecting prototype for struct owl_dma_pchan. Prototype was for struct owl_dma_vchan instead
+
+Fixes: 47e20577c24d ("dmaengine: Add Actions Semi Owl family S900 DMA driver")
+Signed-off-by: Zhang Jianhua <chris.zjh@huawei.com>
+Reviewed-by: Randy Dunlap <rdunlap@infradead.org>
+Link: https://lore.kernel.org/r/20230722153244.2086949-1-chris.zjh@huawei.com
+Signed-off-by: Vinod Koul <vkoul@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/dma/owl-dma.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/drivers/dma/owl-dma.c
++++ b/drivers/dma/owl-dma.c
+@@ -192,7 +192,7 @@ struct owl_dma_pchan {
+ };
+
+ /**
+- * struct owl_dma_pchan - Wrapper for DMA ENGINE channel
++ * struct owl_dma_vchan - Wrapper for DMA ENGINE channel
+ * @vc: wrapped virtual channel
+ * @pchan: the physical channel utilized by this channel
+ * @txd: active transaction on this channel
--- /dev/null
+From 23cc5f667453ca7645a24c8d21bf84dbf61107b2 Mon Sep 17 00:00:00 2001
+From: Nick Child <nnac123@linux.ibm.com>
+Date: Wed, 9 Aug 2023 17:10:37 -0500
+Subject: ibmvnic: Do partial reset on login failure
+
+From: Nick Child <nnac123@linux.ibm.com>
+
+commit 23cc5f667453ca7645a24c8d21bf84dbf61107b2 upstream.
+
+Perform a partial reset before sending a login request if any of the
+following are true:
+ 1. If a previous request times out. This can be dangerous because the
+    VIOS could still receive the old login request at any point after
+    the timeout. Therefore, it is best to re-register the CRQs and
+    sub-CRQs before retrying.
+ 2. If the previous request returns an error that is not described in
+    PAPR. PAPR provides procedures if the login returns with partial
+    success or aborted return codes (section L.5.1) but other values
+    do not have a defined procedure. Previously, these conditions
+    just returned an error from the login function rather than trying
+    to resolve the issue.
+    This can cause further issues since most callers of the login
+    function are not prepared to handle an error when logging in. This
+    improper cleanup can lead to the device being permanently DOWN'd.
+    For example, if the VIOS believes that the device is already logged
+    in then it will return INVALID_STATE (-7). If we never re-register
+    CRQs then it will always think that the device is already logged
+    in. This leaves the device inoperable.
+
+The partial reset involves freeing the sub-CRQs, freeing the CRQ, then
+registering and initializing a new CRQ and sub-CRQs. This essentially
+restarts all communication with the VIOS to allow for a fresh login
+attempt that will be unhindered by any previous failed attempts.
+
+Fixes: dff515a3e71d ("ibmvnic: Harden device login requests")
+Signed-off-by: Nick Child <nnac123@linux.ibm.com>
+Reviewed-by: Simon Horman <horms@kernel.org>
+Link: https://lore.kernel.org/r/20230809221038.51296-4-nnac123@linux.ibm.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/ethernet/ibm/ibmvnic.c | 46 ++++++++++++++++++++++++++++++++-----
+ 1 file changed, 40 insertions(+), 6 deletions(-)
+
+--- a/drivers/net/ethernet/ibm/ibmvnic.c
++++ b/drivers/net/ethernet/ibm/ibmvnic.c
+@@ -97,6 +97,8 @@ static int pending_scrq(struct ibmvnic_a
+ static union sub_crq *ibmvnic_next_scrq(struct ibmvnic_adapter *,
+ struct ibmvnic_sub_crq_queue *);
+ static int ibmvnic_poll(struct napi_struct *napi, int data);
++static int reset_sub_crq_queues(struct ibmvnic_adapter *adapter);
++static inline void reinit_init_done(struct ibmvnic_adapter *adapter);
+ static void send_query_map(struct ibmvnic_adapter *adapter);
+ static int send_request_map(struct ibmvnic_adapter *, dma_addr_t, u32, u8);
+ static int send_request_unmap(struct ibmvnic_adapter *, u8);
+@@ -1527,11 +1529,9 @@ static int ibmvnic_login(struct net_devi
+
+ if (!wait_for_completion_timeout(&adapter->init_done,
+ timeout)) {
+- netdev_warn(netdev, "Login timed out, retrying...\n");
+- retry = true;
+- adapter->init_done_rc = 0;
+- retry_count++;
+- continue;
++ netdev_warn(netdev, "Login timed out\n");
++ adapter->login_pending = false;
++ goto partial_reset;
+ }
+
+ if (adapter->init_done_rc == ABORTED) {
+@@ -1576,7 +1576,41 @@ static int ibmvnic_login(struct net_devi
+ } else if (adapter->init_done_rc) {
+ netdev_warn(netdev, "Adapter login failed, init_done_rc = %d\n",
+ adapter->init_done_rc);
+- return -EIO;
++
++partial_reset:
++ /* adapter login failed, so free any CRQs or sub-CRQs
++ * and register again before attempting to login again.
++ * If we don't do this then the VIOS may think that
++ * we are already logged in and reject any subsequent
++ * attempts
++ */
++ netdev_warn(netdev,
++ "Freeing and re-registering CRQs before attempting to login again\n");
++ retry = true;
++ adapter->init_done_rc = 0;
++ retry_count++;
++ release_sub_crqs(adapter, true);
++ reinit_init_done(adapter);
++ release_crq_queue(adapter);
++ /* If we don't sleep here then we risk an unnecessary
++ * failover event from the VIOS. This is a known VIOS
++ * issue caused by a vnic device freeing and registering
++ * a CRQ too quickly.
++ */
++ msleep(1500);
++ rc = init_crq_queue(adapter);
++ if (rc) {
++ netdev_err(netdev, "login recovery: init CRQ failed %d\n",
++ rc);
++ return -EIO;
++ }
++
++ rc = ibmvnic_reset_init(adapter, false);
++ if (rc) {
++ netdev_err(netdev, "login recovery: Reset init failed %d\n",
++ rc);
++ return -EIO;
++ }
+ }
+ } while (retry);
+
--- /dev/null
+From db17ba719bceb52f0ae4ebca0e4c17d9a3bebf05 Mon Sep 17 00:00:00 2001
+From: Nick Child <nnac123@linux.ibm.com>
+Date: Wed, 9 Aug 2023 17:10:34 -0500
+Subject: ibmvnic: Enforce stronger sanity checks on login response
+
+From: Nick Child <nnac123@linux.ibm.com>
+
+commit db17ba719bceb52f0ae4ebca0e4c17d9a3bebf05 upstream.
+
+Ensure that all offsets in a login response buffer are within the size
+of the allocated response buffer. Any offsets or lengths that surpass
+the allocation are likely the result of an incomplete response buffer.
+In these cases, a full reset is necessary.
+
+When attempting to login, the ibmvnic device will allocate a response
+buffer and pass a reference to the VIOS. The VIOS will then send the
+ibmvnic device a LOGIN_RSP CRQ to signal that the buffer has been filled
+with data. If the ibmvnic device does not get a response in 20 seconds,
+the old buffer is freed and a new login request is sent. With 2
+outstanding requests, any LOGIN_RSP CRQ could be for the older
+login request. If this is the case then the login response buffer (which
+is for the newer login request) could be incomplete and contain invalid
+data. Therefore, we must enforce strict sanity checks on the response
+buffer values.
+
+Testing has shown that the `off_rxadd_buff_size` value is filled in last
+by the VIOS and will be the smoking gun for these circumstances.
+
+Until VIOS can implement a mechanism for tracking outstanding response
+buffers and a method for mapping a LOGIN_RSP CRQ to a particular login
+response buffer, the best ibmvnic can do in this situation is perform a
+full reset.
+
+Fixes: dff515a3e71d ("ibmvnic: Harden device login requests")
+Signed-off-by: Nick Child <nnac123@linux.ibm.com>
+Reviewed-by: Simon Horman <horms@kernel.org>
+Link: https://lore.kernel.org/r/20230809221038.51296-1-nnac123@linux.ibm.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/ethernet/ibm/ibmvnic.c | 18 ++++++++++++++++++
+ 1 file changed, 18 insertions(+)
+
+--- a/drivers/net/ethernet/ibm/ibmvnic.c
++++ b/drivers/net/ethernet/ibm/ibmvnic.c
+@@ -5396,6 +5396,7 @@ static int handle_login_rsp(union ibmvni
+ int num_tx_pools;
+ int num_rx_pools;
+ u64 *size_array;
++ u32 rsp_len;
+ int i;
+
+ /* CHECK: Test/set of login_pending does not need to be atomic
+@@ -5447,6 +5448,23 @@ static int handle_login_rsp(union ibmvni
+ ibmvnic_reset(adapter, VNIC_RESET_FATAL);
+ return -EIO;
+ }
++
++ rsp_len = be32_to_cpu(login_rsp->len);
++ if (be32_to_cpu(login->login_rsp_len) < rsp_len ||
++ rsp_len <= be32_to_cpu(login_rsp->off_txsubm_subcrqs) ||
++ rsp_len <= be32_to_cpu(login_rsp->off_rxadd_subcrqs) ||
++ rsp_len <= be32_to_cpu(login_rsp->off_rxadd_buff_size) ||
++ rsp_len <= be32_to_cpu(login_rsp->off_supp_tx_desc)) {
++ /* This can happen if a login request times out and there are
++ * 2 outstanding login requests sent, the LOGIN_RSP crq
++ * could have been for the older login request. So we are
++ * parsing the newer response buffer which may be incomplete
++ */
++ dev_err(dev, "FATAL: Login rsp offsets/lengths invalid\n");
++ ibmvnic_reset(adapter, VNIC_RESET_FATAL);
++ return -EIO;
++ }
++
+ size_array = (u64 *)((u8 *)(adapter->login_rsp_buf) +
+ be32_to_cpu(adapter->login_rsp_buf->off_rxadd_buff_size));
+ /* variable buffer sizes are not supported, so just read the
--- /dev/null
+From 6db541ae279bd4e76dbd939e5fbf298396166242 Mon Sep 17 00:00:00 2001
+From: Nick Child <nnac123@linux.ibm.com>
+Date: Wed, 9 Aug 2023 17:10:38 -0500
+Subject: ibmvnic: Ensure login failure recovery is safe from other resets
+
+From: Nick Child <nnac123@linux.ibm.com>
+
+commit 6db541ae279bd4e76dbd939e5fbf298396166242 upstream.
+
+If a login request fails, the recovery process should be protected
+against parallel resets. It is a known issue that freeing and
+registering CRQs in quick succession can result in a failover CRQ from
+the VIOS. Processing a failover during login recovery is dangerous for
+two reasons:
+ 1. This will result in two parallel initialization processes, which can
+    cause serious issues during login.
+ 2. It is possible that the failover CRQ is received but never executed.
+    We get notified of a pending failover through a transport event CRQ.
+    The reset is not performed until an INIT CRQ request is received.
+    Previously, if CRQ initialization failed during login recovery, the
+    ibmvnic IRQ was freed and the login process returned an error. If
+    failover_pending is true (a transport event was received), then the
+    ibmvnic device would never be able to process the reset since it
+    cannot receive the CRQ_INIT request due to the IRQ being freed. This
+    left the device in an inoperable state.
+
+Therefore, the login failure recovery process must be hardened against
+these possible issues. Possible failovers (due to quick CRQ free and
+init) must be avoided and any issues during re-initialization should be
+dealt with instead of being propagated up the stack. This logic is
+similar to that of ibmvnic_probe().
+
+Fixes: dff515a3e71d ("ibmvnic: Harden device login requests")
+Signed-off-by: Nick Child <nnac123@linux.ibm.com>
+Reviewed-by: Simon Horman <horms@kernel.org>
+Link: https://lore.kernel.org/r/20230809221038.51296-5-nnac123@linux.ibm.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/ethernet/ibm/ibmvnic.c | 70 +++++++++++++++++++++++++------------
+ 1 file changed, 48 insertions(+), 22 deletions(-)
+
+--- a/drivers/net/ethernet/ibm/ibmvnic.c
++++ b/drivers/net/ethernet/ibm/ibmvnic.c
+@@ -116,6 +116,7 @@ static void ibmvnic_tx_scrq_clean_buffer
+ static void free_long_term_buff(struct ibmvnic_adapter *adapter,
+ struct ibmvnic_long_term_buff *ltb);
+ static void ibmvnic_disable_irqs(struct ibmvnic_adapter *adapter);
++static void flush_reset_queue(struct ibmvnic_adapter *adapter);
+
+ struct ibmvnic_stat {
+ char name[ETH_GSTRING_LEN];
+@@ -1507,8 +1508,8 @@ static const char *adapter_state_to_stri
+
+ static int ibmvnic_login(struct net_device *netdev)
+ {
++ unsigned long flags, timeout = msecs_to_jiffies(20000);
+ struct ibmvnic_adapter *adapter = netdev_priv(netdev);
+- unsigned long timeout = msecs_to_jiffies(20000);
+ int retry_count = 0;
+ int retries = 10;
+ bool retry;
+@@ -1573,6 +1574,7 @@ static int ibmvnic_login(struct net_devi
+ "SCRQ irq initialization failed\n");
+ return rc;
+ }
++ /* Default/timeout error handling, reset and start fresh */
+ } else if (adapter->init_done_rc) {
+ netdev_warn(netdev, "Adapter login failed, init_done_rc = %d\n",
+ adapter->init_done_rc);
+@@ -1588,29 +1590,53 @@ partial_reset:
+ "Freeing and re-registering CRQs before attempting to login again\n");
+ retry = true;
+ adapter->init_done_rc = 0;
+- retry_count++;
+ release_sub_crqs(adapter, true);
+- reinit_init_done(adapter);
+- release_crq_queue(adapter);
+- /* If we don't sleep here then we risk an unnecessary
+- * failover event from the VIOS. This is a known VIOS
+- * issue caused by a vnic device freeing and registering
+- * a CRQ too quickly.
++ /* Much of this is similar logic as ibmvnic_probe(),
++ * we are essentially re-initializing communication
++ * with the server. We really should not run any
++ * resets/failovers here because this is already a form
++ * of reset and we do not want parallel resets occurring
+ */
+- msleep(1500);
+- rc = init_crq_queue(adapter);
+- if (rc) {
+- netdev_err(netdev, "login recovery: init CRQ failed %d\n",
+- rc);
+- return -EIO;
+- }
+-
+- rc = ibmvnic_reset_init(adapter, false);
+- if (rc) {
+- netdev_err(netdev, "login recovery: Reset init failed %d\n",
+- rc);
+- return -EIO;
+- }
++ do {
++ reinit_init_done(adapter);
++ /* Clear any failovers we got in the previous
++ * pass since we are re-initializing the CRQ
++ */
++ adapter->failover_pending = false;
++ release_crq_queue(adapter);
++ /* If we don't sleep here then we risk an
++ * unnecessary failover event from the VIOS.
++ * This is a known VIOS issue caused by a vnic
++ * device freeing and registering a CRQ too
++ * quickly.
++ */
++ msleep(1500);
++ /* Avoid any resets, since we are currently
++ * resetting.
++ */
++ spin_lock_irqsave(&adapter->rwi_lock, flags);
++ flush_reset_queue(adapter);
++ spin_unlock_irqrestore(&adapter->rwi_lock,
++ flags);
++
++ rc = init_crq_queue(adapter);
++ if (rc) {
++ netdev_err(netdev, "login recovery: init CRQ failed %d\n",
++ rc);
++ return -EIO;
++ }
++
++ rc = ibmvnic_reset_init(adapter, false);
++ if (rc)
++ netdev_err(netdev, "login recovery: Reset init failed %d\n",
++ rc);
++ /* IBMVNIC_CRQ_INIT will return EAGAIN if it
++ * fails, since ibmvnic_reset_init will free
++ * irq's in failure, we won't be able to receive
++ * new CRQs so we need to keep trying. probe()
++ * handles this similarly.
++ */
++ } while (rc == -EAGAIN && retry_count++ < retries);
+ }
+ } while (retry);
+
--- /dev/null
+From d78a671eb8996af19d6311ecdee9790d2fa479f0 Mon Sep 17 00:00:00 2001
+From: Nick Child <nnac123@linux.ibm.com>
+Date: Wed, 9 Aug 2023 17:10:36 -0500
+Subject: ibmvnic: Handle DMA unmapping of login buffs in release functions
+
+From: Nick Child <nnac123@linux.ibm.com>
+
+commit d78a671eb8996af19d6311ecdee9790d2fa479f0 upstream.
+
+Rather than leaving the DMA unmapping of the login buffers to the
+login response handler, move this work into the login release functions.
+Previously, these functions were only used for freeing the allocated
+buffers. This could lead to issues if there is more than one
+outstanding login buffer request, which is possible if a login request
+times out.
+
+If a login request times out, then there is another call to send login.
+The send login function makes a call to the login buffer release
+function. In the past, this freed the buffers but did not DMA unmap
+them. Therefore, the VIOS could still write to the old login (now
+freed) buffer. It is for this reason that it is a good idea to leave
+the DMA unmap call to the login buffer release functions.
+
+Since the login buffer release functions now handle DMA unmapping,
+remove the duplicate DMA unmapping in handle_login_rsp().
+
+Fixes: dff515a3e71d ("ibmvnic: Harden device login requests")
+Signed-off-by: Nick Child <nnac123@linux.ibm.com>
+Reviewed-by: Simon Horman <horms@kernel.org>
+Link: https://lore.kernel.org/r/20230809221038.51296-3-nnac123@linux.ibm.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/ethernet/ibm/ibmvnic.c | 15 ++++++++++-----
+ 1 file changed, 10 insertions(+), 5 deletions(-)
+
+--- a/drivers/net/ethernet/ibm/ibmvnic.c
++++ b/drivers/net/ethernet/ibm/ibmvnic.c
+@@ -1588,12 +1588,22 @@ static int ibmvnic_login(struct net_devi
+
+ static void release_login_buffer(struct ibmvnic_adapter *adapter)
+ {
++ if (!adapter->login_buf)
++ return;
++
++ dma_unmap_single(&adapter->vdev->dev, adapter->login_buf_token,
++ adapter->login_buf_sz, DMA_TO_DEVICE);
+ kfree(adapter->login_buf);
+ adapter->login_buf = NULL;
+ }
+
+ static void release_login_rsp_buffer(struct ibmvnic_adapter *adapter)
+ {
++ if (!adapter->login_rsp_buf)
++ return;
++
++ dma_unmap_single(&adapter->vdev->dev, adapter->login_rsp_buf_token,
++ adapter->login_rsp_buf_sz, DMA_FROM_DEVICE);
+ kfree(adapter->login_rsp_buf);
+ adapter->login_rsp_buf = NULL;
+ }
+@@ -5411,11 +5421,6 @@ static int handle_login_rsp(union ibmvni
+ }
+ adapter->login_pending = false;
+
+- dma_unmap_single(dev, adapter->login_buf_token, adapter->login_buf_sz,
+- DMA_TO_DEVICE);
+- dma_unmap_single(dev, adapter->login_rsp_buf_token,
+- adapter->login_rsp_buf_sz, DMA_FROM_DEVICE);
+-
+ /* If the number of queues requested can't be allocated by the
+ * server, the login response will return with code 1. We will need
+ * to resend the login buffer with fewer queues requested.
--- /dev/null
+From 411c565b4bc63e9584a8493882bd566e35a90588 Mon Sep 17 00:00:00 2001
+From: Nick Child <nnac123@linux.ibm.com>
+Date: Wed, 9 Aug 2023 17:10:35 -0500
+Subject: ibmvnic: Unmap DMA login rsp buffer on send login fail
+
+From: Nick Child <nnac123@linux.ibm.com>
+
+commit 411c565b4bc63e9584a8493882bd566e35a90588 upstream.
+
+If the LOGIN CRQ fails to send, then we must DMA unmap the response
+buffer. Previously, if the CRQ failed, the memory was freed without
+being DMA unmapped.
+
+Fixes: c98d9cc4170d ("ibmvnic: send_login should check for crq errors")
+Signed-off-by: Nick Child <nnac123@linux.ibm.com>
+Reviewed-by: Simon Horman <horms@kernel.org>
+Link: https://lore.kernel.org/r/20230809221038.51296-2-nnac123@linux.ibm.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/ethernet/ibm/ibmvnic.c | 5 ++++-
+ 1 file changed, 4 insertions(+), 1 deletion(-)
+
+--- a/drivers/net/ethernet/ibm/ibmvnic.c
++++ b/drivers/net/ethernet/ibm/ibmvnic.c
+@@ -4830,11 +4830,14 @@ static int send_login(struct ibmvnic_ada
+ if (rc) {
+ adapter->login_pending = false;
+ netdev_err(adapter->netdev, "Failed to send login, rc=%d\n", rc);
+- goto buf_rsp_map_failed;
++ goto buf_send_failed;
+ }
+
+ return 0;
+
++buf_send_failed:
++ dma_unmap_single(dev, rsp_buffer_token, rsp_buffer_size,
++ DMA_FROM_DEVICE);
+ buf_rsp_map_failed:
+ kfree(login_rsp_buffer);
+ adapter->login_rsp_buf = NULL;
--- /dev/null
+From a94c16a2fda010866b8858a386a8bfbeba4f72c5 Mon Sep 17 00:00:00 2001
+From: Vladimir Oltean <vladimir.oltean@nxp.com>
+Date: Thu, 3 Aug 2023 16:42:53 +0300
+Subject: net: dsa: ocelot: call dsa_tag_8021q_unregister() under rtnl_lock() on driver remove
+
+From: Vladimir Oltean <vladimir.oltean@nxp.com>
+
+commit a94c16a2fda010866b8858a386a8bfbeba4f72c5 upstream.
+
+When the tagging protocol in current use is "ocelot-8021q" and we unbind
+the driver, we see this splat:
+
+$ echo '0000:00:00.2' > /sys/bus/pci/drivers/fsl_enetc/unbind
+mscc_felix 0000:00:00.5 swp0: left promiscuous mode
+sja1105 spi2.0: Link is Down
+DSA: tree 1 torn down
+mscc_felix 0000:00:00.5 swp2: left promiscuous mode
+sja1105 spi2.2: Link is Down
+DSA: tree 3 torn down
+fsl_enetc 0000:00:00.2 eno2: left promiscuous mode
+mscc_felix 0000:00:00.5: Link is Down
+------------[ cut here ]------------
+RTNL: assertion failed at net/dsa/tag_8021q.c (409)
+WARNING: CPU: 1 PID: 329 at net/dsa/tag_8021q.c:409 dsa_tag_8021q_unregister+0x12c/0x1a0
+Modules linked in:
+CPU: 1 PID: 329 Comm: bash Not tainted 6.5.0-rc3+ #771
+pc : dsa_tag_8021q_unregister+0x12c/0x1a0
+lr : dsa_tag_8021q_unregister+0x12c/0x1a0
+Call trace:
+ dsa_tag_8021q_unregister+0x12c/0x1a0
+ felix_tag_8021q_teardown+0x130/0x150
+ felix_teardown+0x3c/0xd8
+ dsa_tree_teardown_switches+0xbc/0xe0
+ dsa_unregister_switch+0x168/0x260
+ felix_pci_remove+0x30/0x60
+ pci_device_remove+0x4c/0x100
+ device_release_driver_internal+0x188/0x288
+ device_links_unbind_consumers+0xfc/0x138
+ device_release_driver_internal+0xe0/0x288
+ device_driver_detach+0x24/0x38
+ unbind_store+0xd8/0x108
+ drv_attr_store+0x30/0x50
+---[ end trace 0000000000000000 ]---
+------------[ cut here ]------------
+RTNL: assertion failed at net/8021q/vlan_core.c (376)
+WARNING: CPU: 1 PID: 329 at net/8021q/vlan_core.c:376 vlan_vid_del+0x1b8/0x1f0
+CPU: 1 PID: 329 Comm: bash Tainted: G W 6.5.0-rc3+ #771
+pc : vlan_vid_del+0x1b8/0x1f0
+lr : vlan_vid_del+0x1b8/0x1f0
+ dsa_tag_8021q_unregister+0x8c/0x1a0
+ felix_tag_8021q_teardown+0x130/0x150
+ felix_teardown+0x3c/0xd8
+ dsa_tree_teardown_switches+0xbc/0xe0
+ dsa_unregister_switch+0x168/0x260
+ felix_pci_remove+0x30/0x60
+ pci_device_remove+0x4c/0x100
+ device_release_driver_internal+0x188/0x288
+ device_links_unbind_consumers+0xfc/0x138
+ device_release_driver_internal+0xe0/0x288
+ device_driver_detach+0x24/0x38
+ unbind_store+0xd8/0x108
+ drv_attr_store+0x30/0x50
+DSA: tree 0 torn down
+
+This was not so easy to spot, because "ocelot-8021q" is not the default
+tagging protocol, and thus not everyone who tests the unbinding path
+may have switched to it beforehand. The default
+felix_tag_npi_teardown() does not require rtnl_lock() to be held.
+
+Fixes: 7c83a7c539ab ("net: dsa: add a second tagger for Ocelot switches based on tag_8021q")
+Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
+Link: https://lore.kernel.org/r/20230803134253.2711124-1-vladimir.oltean@nxp.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/dsa/ocelot/felix.c | 2 ++
+ 1 file changed, 2 insertions(+)
+
+--- a/drivers/net/dsa/ocelot/felix.c
++++ b/drivers/net/dsa/ocelot/felix.c
+@@ -1625,8 +1625,10 @@ static void felix_teardown(struct dsa_sw
+ struct felix *felix = ocelot_to_felix(ocelot);
+ struct dsa_port *dp;
+
++ rtnl_lock();
+ if (felix->tag_proto_ops)
+ felix->tag_proto_ops->teardown(ds);
++ rtnl_unlock();
+
+ dsa_switch_for_each_available_port(dp, ds)
+ ocelot_deinit_port(ocelot, dp->index);
--- /dev/null
+From f0168042a21292d20007d24ab2e4fc32f79ebf11 Mon Sep 17 00:00:00 2001
+From: Vladimir Oltean <vladimir.oltean@nxp.com>
+Date: Thu, 3 Aug 2023 16:58:57 +0300
+Subject: net: enetc: reimplement RFS/RSS memory clearing as PCI quirk
+
+From: Vladimir Oltean <vladimir.oltean@nxp.com>
+
+commit f0168042a21292d20007d24ab2e4fc32f79ebf11 upstream.
+
+The workaround implemented in commit 3222b5b613db ("net: enetc:
+initialize RFS/RSS memories for unused ports too") is no longer
+effective after commit 6fffbc7ae137 ("PCI: Honor firmware's device
+disabled status"). Thus, it has introduced a regression and we see AER
+errors being reported again:
+
+$ ip link set sw2p0 up && dhclient -i sw2p0 && ip addr show sw2p0
+fsl_enetc 0000:00:00.2 eno2: configuring for fixed/internal link mode
+fsl_enetc 0000:00:00.2 eno2: Link is Up - 2.5Gbps/Full - flow control rx/tx
+mscc_felix 0000:00:00.5 swp2: configuring for fixed/sgmii link mode
+mscc_felix 0000:00:00.5 swp2: Link is Up - 1Gbps/Full - flow control off
+sja1105 spi2.2 sw2p0: configuring for phy/rgmii-id link mode
+sja1105 spi2.2 sw2p0: Link is Up - 1Gbps/Full - flow control off
+pcieport 0000:00:1f.0: AER: Multiple Corrected error received: 0000:00:00.0
+pcieport 0000:00:1f.0: AER: can't find device of ID0000
+
+Rob's suggestion is to reimplement the enetc driver workaround as a
+PCI fixup, and to modify the PCI core to run the fixups for all PCI
+functions. This change handles the first part.
+
+We refactor the common code in enetc_psi_create() and enetc_psi_destroy(),
+and use the PCI fixup only for those functions for which enetc_pf_probe()
+won't get called. This avoids some work being done twice for the PFs
+which are enabled.
+
+Fixes: 6fffbc7ae137 ("PCI: Honor firmware's device disabled status")
+Link: https://lore.kernel.org/netdev/CAL_JsqLsVYiPLx2kcHkDQ4t=hQVCR7NHziDwi9cCFUFhx48Qow@mail.gmail.com/
+Suggested-by: Rob Herring <robh@kernel.org>
+Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/ethernet/freescale/enetc/enetc_pf.c | 103 +++++++++++++++++-------
+ 1 file changed, 73 insertions(+), 30 deletions(-)
+
+--- a/drivers/net/ethernet/freescale/enetc/enetc_pf.c
++++ b/drivers/net/ethernet/freescale/enetc/enetc_pf.c
+@@ -1222,50 +1222,81 @@ static int enetc_pf_register_with_ierb(s
+ return enetc_ierb_register_pf(ierb_pdev, pdev);
+ }
+
+-static int enetc_pf_probe(struct pci_dev *pdev,
+- const struct pci_device_id *ent)
++static struct enetc_si *enetc_psi_create(struct pci_dev *pdev)
+ {
+- struct device_node *node = pdev->dev.of_node;
+- struct enetc_ndev_priv *priv;
+- struct net_device *ndev;
+ struct enetc_si *si;
+- struct enetc_pf *pf;
+ int err;
+
+- err = enetc_pf_register_with_ierb(pdev);
+- if (err == -EPROBE_DEFER)
+- return err;
+- if (err)
+- dev_warn(&pdev->dev,
+- "Could not register with IERB driver: %pe, please update the device tree\n",
+- ERR_PTR(err));
+-
+- err = enetc_pci_probe(pdev, KBUILD_MODNAME, sizeof(*pf));
+- if (err)
+- return dev_err_probe(&pdev->dev, err, "PCI probing failed\n");
++ err = enetc_pci_probe(pdev, KBUILD_MODNAME, sizeof(struct enetc_pf));
++ if (err) {
++ dev_err_probe(&pdev->dev, err, "PCI probing failed\n");
++ goto out;
++ }
+
+ si = pci_get_drvdata(pdev);
+ if (!si->hw.port || !si->hw.global) {
+ err = -ENODEV;
+ dev_err(&pdev->dev, "could not map PF space, probing a VF?\n");
+- goto err_map_pf_space;
++ goto out_pci_remove;
+ }
+
+ err = enetc_setup_cbdr(&pdev->dev, &si->hw, ENETC_CBDR_DEFAULT_SIZE,
+ &si->cbd_ring);
+ if (err)
+- goto err_setup_cbdr;
++ goto out_pci_remove;
+
+ err = enetc_init_port_rfs_memory(si);
+ if (err) {
+ dev_err(&pdev->dev, "Failed to initialize RFS memory\n");
+- goto err_init_port_rfs;
++ goto out_teardown_cbdr;
+ }
+
+ err = enetc_init_port_rss_memory(si);
+ if (err) {
+ dev_err(&pdev->dev, "Failed to initialize RSS memory\n");
+- goto err_init_port_rss;
++ goto out_teardown_cbdr;
++ }
++
++ return si;
++
++out_teardown_cbdr:
++ enetc_teardown_cbdr(&si->cbd_ring);
++out_pci_remove:
++ enetc_pci_remove(pdev);
++out:
++ return ERR_PTR(err);
++}
++
++static void enetc_psi_destroy(struct pci_dev *pdev)
++{
++ struct enetc_si *si = pci_get_drvdata(pdev);
++
++ enetc_teardown_cbdr(&si->cbd_ring);
++ enetc_pci_remove(pdev);
++}
++
++static int enetc_pf_probe(struct pci_dev *pdev,
++ const struct pci_device_id *ent)
++{
++ struct device_node *node = pdev->dev.of_node;
++ struct enetc_ndev_priv *priv;
++ struct net_device *ndev;
++ struct enetc_si *si;
++ struct enetc_pf *pf;
++ int err;
++
++ err = enetc_pf_register_with_ierb(pdev);
++ if (err == -EPROBE_DEFER)
++ return err;
++ if (err)
++ dev_warn(&pdev->dev,
++ "Could not register with IERB driver: %pe, please update the device tree\n",
++ ERR_PTR(err));
++
++ si = enetc_psi_create(pdev);
++ if (IS_ERR(si)) {
++ err = PTR_ERR(si);
++ goto err_psi_create;
+ }
+
+ if (node && !of_device_is_available(node)) {
+@@ -1353,15 +1384,10 @@ err_alloc_si_res:
+ si->ndev = NULL;
+ free_netdev(ndev);
+ err_alloc_netdev:
+-err_init_port_rss:
+-err_init_port_rfs:
+ err_device_disabled:
+ err_setup_mac_addresses:
+- enetc_teardown_cbdr(&si->cbd_ring);
+-err_setup_cbdr:
+-err_map_pf_space:
+- enetc_pci_remove(pdev);
+-
++ enetc_psi_destroy(pdev);
++err_psi_create:
+ return err;
+ }
+
+@@ -1384,12 +1410,29 @@ static void enetc_pf_remove(struct pci_d
+ enetc_free_msix(priv);
+
+ enetc_free_si_resources(priv);
+- enetc_teardown_cbdr(&si->cbd_ring);
+
+ free_netdev(si->ndev);
+
+- enetc_pci_remove(pdev);
++ enetc_psi_destroy(pdev);
++}
++
++static void enetc_fixup_clear_rss_rfs(struct pci_dev *pdev)
++{
++ struct device_node *node = pdev->dev.of_node;
++ struct enetc_si *si;
++
++ /* Only apply quirk for disabled functions. For the ones
++ * that are enabled, enetc_pf_probe() will apply it.
++ */
++ if (node && of_device_is_available(node))
++ return;
++
++ si = enetc_psi_create(pdev);
++ if (si)
++ enetc_psi_destroy(pdev);
+ }
++DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_FREESCALE, ENETC_DEV_ID_PF,
++ enetc_fixup_clear_rss_rfs);
+
+ static const struct pci_device_id enetc_pf_id_table[] = {
+ { PCI_DEVICE(PCI_VENDOR_ID_FREESCALE, ENETC_DEV_ID_PF) },
--- /dev/null
+From 6265e242f7b95f2c1195b42ec912b84ad161470e Mon Sep 17 00:00:00 2001
+From: Jie Wang <wangjie125@huawei.com>
+Date: Mon, 7 Aug 2023 19:34:51 +0800
+Subject: net: hns3: add wait until mac link down
+
+From: Jie Wang <wangjie125@huawei.com>
+
+commit 6265e242f7b95f2c1195b42ec912b84ad161470e upstream.
+
+In some configuration flows of the hns3 driver, for example changing
+the MTU, the driver disables the MAC through firmware before the
+configuration. But the firmware disables the MAC asynchronously, so the
+RX traffic may not be stopped yet in this case.
+
+Fix it by waiting until the MAC link is down.
+
+Fixes: a9775bb64aa7 ("net: hns3: fix set and get link ksettings issue")
+Signed-off-by: Jie Wang <wangjie125@huawei.com>
+Signed-off-by: Jijie Shao <shaojijie@huawei.com>
+Reviewed-by: Leon Romanovsky <leonro@nvidia.com>
+Link: https://lore.kernel.org/r/20230807113452.474224-4-shaojijie@huawei.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 10 +++++++++-
+ 1 file changed, 9 insertions(+), 1 deletion(-)
+
+--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
++++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
+@@ -7569,6 +7569,8 @@ static void hclge_enable_fd(struct hnae3
+
+ static void hclge_cfg_mac_mode(struct hclge_dev *hdev, bool enable)
+ {
++#define HCLGE_LINK_STATUS_WAIT_CNT 3
++
+ struct hclge_desc desc;
+ struct hclge_config_mac_mode_cmd *req =
+ (struct hclge_config_mac_mode_cmd *)desc.data;
+@@ -7593,9 +7595,15 @@ static void hclge_cfg_mac_mode(struct hc
+ req->txrx_pad_fcs_loop_en = cpu_to_le32(loop_en);
+
+ ret = hclge_cmd_send(&hdev->hw, &desc, 1);
+- if (ret)
++ if (ret) {
+ dev_err(&hdev->pdev->dev,
+ "mac enable fail, ret =%d.\n", ret);
++ return;
++ }
++
++ if (!enable)
++ hclge_mac_link_status_wait(hdev, HCLGE_LINK_STATUS_DOWN,
++ HCLGE_LINK_STATUS_WAIT_CNT);
+ }
+
+ static int hclge_config_switch_param(struct hclge_dev *hdev, int vfid,
--- /dev/null
+From ac6257a3ae5db5193b1f19c268e4f72d274ddb88 Mon Sep 17 00:00:00 2001
+From: Yonglong Liu <liuyonglong@huawei.com>
+Date: Mon, 7 Aug 2023 19:34:52 +0800
+Subject: net: hns3: fix deadlock issue when externel_lb and reset are executed together
+
+From: Yonglong Liu <liuyonglong@huawei.com>
+
+commit ac6257a3ae5db5193b1f19c268e4f72d274ddb88 upstream.
+
+When external_lb and reset are executed together, a deadlock may
+occur:
+[ 3147.217009] INFO: task kworker/u321:0:7 blocked for more than 120 seconds.
+[ 3147.230483] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
+[ 3147.238999] task:kworker/u321:0 state:D stack: 0 pid: 7 ppid: 2 flags:0x00000008
+[ 3147.248045] Workqueue: hclge hclge_service_task [hclge]
+[ 3147.253957] Call trace:
+[ 3147.257093] __switch_to+0x7c/0xbc
+[ 3147.261183] __schedule+0x338/0x6f0
+[ 3147.265357] schedule+0x50/0xe0
+[ 3147.269185] schedule_preempt_disabled+0x18/0x24
+[ 3147.274488] __mutex_lock.constprop.0+0x1d4/0x5dc
+[ 3147.279880] __mutex_lock_slowpath+0x1c/0x30
+[ 3147.284839] mutex_lock+0x50/0x60
+[ 3147.288841] rtnl_lock+0x20/0x2c
+[ 3147.292759] hclge_reset_prepare+0x68/0x90 [hclge]
+[ 3147.298239] hclge_reset_subtask+0x88/0xe0 [hclge]
+[ 3147.303718] hclge_reset_service_task+0x84/0x120 [hclge]
+[ 3147.309718] hclge_service_task+0x2c/0x70 [hclge]
+[ 3147.315109] process_one_work+0x1d0/0x490
+[ 3147.319805] worker_thread+0x158/0x3d0
+[ 3147.324240] kthread+0x108/0x13c
+[ 3147.328154] ret_from_fork+0x10/0x18
+
+In the external loopback (external_lb) process, the hns3 driver calls
+napi_disable() first. If the reset happens at that point, the restore
+step of external_lb will fail and napi_enable() will not be called.
+When external_lb is executed again, napi_disable() is called twice,
+causing a deadlock on rtnl_lock().
+
+This patch uses the HNS3_NIC_STATE_DOWN state to protect the calls to
+napi_disable() and napi_enable() in the external_lb process, just as is
+done in ndo_stop() and ndo_start(); see the sketch below.
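+
+A minimal userspace sketch of the guard pattern (assumed semantics,
+hypothetical names; not the driver code): the first caller to set the
+DOWN flag wins, so an interrupted prepare/restore sequence cannot lead
+to a second napi_disable().
+
+    #include <stdio.h>
+    #include <stdatomic.h>
+    #include <stdbool.h>
+
+    static atomic_flag down = ATOMIC_FLAG_INIT;
+
+    static void lb_prepare(void)
+    {
+        /* analogous to test_and_set_bit(HNS3_NIC_STATE_DOWN, ...) */
+        if (atomic_flag_test_and_set(&down)) {
+            printf("prepare: already down, skip napi_disable()\n");
+            return;
+        }
+        printf("prepare: napi_disable()\n");
+    }
+
+    static void lb_restore(bool resetting)
+    {
+        if (resetting)
+            return;               /* the reset path owns recovery now */
+        printf("restore: napi_enable()\n");
+        atomic_flag_clear(&down); /* analogous to clear_bit(...) */
+    }
+
+    int main(void)
+    {
+        lb_prepare();
+        lb_restore(true);  /* a reset interrupts the loopback test */
+        lb_prepare();      /* second run: no double napi_disable() */
+        return 0;
+    }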
+
+Fixes: 04b6ba143521 ("net: hns3: add support for external loopback test")
+Signed-off-by: Yonglong Liu <liuyonglong@huawei.com>
+Signed-off-by: Jijie Shao <shaojijie@huawei.com>
+Reviewed-by: Leon Romanovsky <leonro@nvidia.com>
+Link: https://lore.kernel.org/r/20230807113452.474224-5-shaojijie@huawei.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/ethernet/hisilicon/hns3/hns3_enet.c | 14 +++++++++++++-
+ 1 file changed, 13 insertions(+), 1 deletion(-)
+
+--- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
++++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
+@@ -5854,6 +5854,9 @@ void hns3_external_lb_prepare(struct net
+ if (!if_running)
+ return;
+
++ if (test_and_set_bit(HNS3_NIC_STATE_DOWN, &priv->state))
++ return;
++
+ netif_carrier_off(ndev);
+ netif_tx_disable(ndev);
+
+@@ -5882,7 +5885,16 @@ void hns3_external_lb_restore(struct net
+ if (!if_running)
+ return;
+
+- hns3_nic_reset_all_ring(priv->ae_handle);
++ if (hns3_nic_resetting(ndev))
++ return;
++
++ if (!test_bit(HNS3_NIC_STATE_DOWN, &priv->state))
++ return;
++
++ if (hns3_nic_reset_all_ring(priv->ae_handle))
++ return;
++
++ clear_bit(HNS3_NIC_STATE_DOWN, &priv->state);
+
+ for (i = 0; i < priv->vector_num; i++)
+ hns3_vector_enable(&priv->tqp_vector[i]);
--- /dev/null
+From 5e3d20617b055e725e785e0058426368269949f3 Mon Sep 17 00:00:00 2001
+From: Hao Chen <chenhao418@huawei.com>
+Date: Wed, 9 Aug 2023 10:09:02 +0800
+Subject: net: hns3: fix strscpy causing content truncation issue
+
+From: Hao Chen <chenhao418@huawei.com>
+
+commit 5e3d20617b055e725e785e0058426368269949f3 upstream.
+
+hns3_dbg_fill_content()/hclge_dbg_fill_content() aims to integrate some
+items into a string for the content, and we add '\n' and '\0' in the
+last two bytes of the content.
+
+strscpy() will add '\0' in the last byte of the destination buffer (one
+of the items), which results in the content print finishing early and
+some of the dump content being truncated.
+
+An error log is shown below:
+cat mac_list/uc
+UC MAC_LIST:
+
+Expected:
+UC MAC_LIST:
+FUNC_ID MAC_ADDR STATE
+pf 00:2b:19:05:03:00 ACTIVE
+
+The destination buffer is length-bounded and not required to be
+NUL-terminated, so just change strscpy() to memcpy() to fix it.
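+
+A minimal userspace illustration (the buffer layout is assumed and
+simplified from the driver; strscpy_like() emulates kernel strscpy()'s
+termination behaviour): strscpy(dst, src, size) always NUL-terminates
+within 'size', so calling it with size == strlen(src) plants a '\0' in
+the middle of the pre-padded line and printing stops there, while
+memcpy() copies only the bytes and leaves the padding and the trailing
+"\n\0" intact.
+
+    #include <stdio.h>
+    #include <string.h>
+
+    #define ITEM_LEN 12
+
+    /* emulates kernel strscpy()'s termination for this demo */
+    static void strscpy_like(char *dst, const char *src, size_t size)
+    {
+        size_t n = strlen(src) >= size ? size - 1 : strlen(src);
+
+        memcpy(dst, src, n);
+        dst[n] = '\0';            /* the problematic terminator */
+    }
+
+    static void fill(char *line, size_t len, int use_memcpy)
+    {
+        const char *items[] = { "FUNC_ID", "STATE" };
+        char *pos = line;
+        int i;
+
+        memset(line, ' ', len);
+        line[len - 2] = '\n';
+        line[len - 1] = '\0';
+        for (i = 0; i < 2; i++) {
+            if (use_memcpy)
+                memcpy(pos, items[i], strlen(items[i]));
+            else
+                strscpy_like(pos, items[i], strlen(items[i]));
+            pos += ITEM_LEN;
+        }
+    }
+
+    int main(void)
+    {
+        char line[2 * ITEM_LEN + 2];
+
+        fill(line, sizeof(line), 0);
+        printf("strscpy: [%s]\n", line);  /* truncated after "FUNC_I" */
+        fill(line, sizeof(line), 1);
+        printf("memcpy:  [%s]", line);    /* full padded line */
+        return 0;
+    }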
+
+Fixes: 1cf3d5567f27 ("net: hns3: fix strncpy() not using dest-buf length as length issue")
+Signed-off-by: Hao Chen <chenhao418@huawei.com>
+Signed-off-by: Jijie Shao <shaojijie@huawei.com>
+Link: https://lore.kernel.org/r/20230809020902.1941471-1-shaojijie@huawei.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c | 4 ++--
+ drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.c | 4 ++--
+ 2 files changed, 4 insertions(+), 4 deletions(-)
+
+--- a/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c
++++ b/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c
+@@ -461,9 +461,9 @@ static void hns3_dbg_fill_content(char *
+ if (result) {
+ if (item_len < strlen(result[i]))
+ break;
+- strscpy(pos, result[i], strlen(result[i]));
++ memcpy(pos, result[i], strlen(result[i]));
+ } else {
+- strscpy(pos, items[i].name, strlen(items[i].name));
++ memcpy(pos, items[i].name, strlen(items[i].name));
+ }
+ pos += item_len;
+ len -= item_len;
+--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.c
++++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.c
+@@ -111,9 +111,9 @@ static void hclge_dbg_fill_content(char
+ if (result) {
+ if (item_len < strlen(result[i]))
+ break;
+- strscpy(pos, result[i], strlen(result[i]));
++ memcpy(pos, result[i], strlen(result[i]));
+ } else {
+- strscpy(pos, items[i].name, strlen(items[i].name));
++ memcpy(pos, items[i].name, strlen(items[i].name));
+ }
+ pos += item_len;
+ len -= item_len;
--- /dev/null
+From 08469dacfad25428b66549716811807203744f4f Mon Sep 17 00:00:00 2001
+From: Jie Wang <wangjie125@huawei.com>
+Date: Mon, 7 Aug 2023 19:34:50 +0800
+Subject: net: hns3: refactor hclge_mac_link_status_wait for interface reuse
+
+From: Jie Wang <wangjie125@huawei.com>
+
+commit 08469dacfad25428b66549716811807203744f4f upstream.
+
+Some NIC configurations can only be performed after the link is down,
+so this patch refactors this API for reuse.
+
+Signed-off-by: Jie Wang <wangjie125@huawei.com>
+Signed-off-by: Jijie Shao <shaojijie@huawei.com>
+Reviewed-by: Leon Romanovsky <leonro@nvidia.com>
+Link: https://lore.kernel.org/r/20230807113452.474224-3-shaojijie@huawei.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 14 +++++++++-----
+ 1 file changed, 9 insertions(+), 5 deletions(-)
+
+--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
++++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
+@@ -72,6 +72,8 @@ static void hclge_restore_hw_table(struc
+ static void hclge_sync_promisc_mode(struct hclge_dev *hdev);
+ static void hclge_sync_fd_table(struct hclge_dev *hdev);
+ static void hclge_update_fec_stats(struct hclge_dev *hdev);
++static int hclge_mac_link_status_wait(struct hclge_dev *hdev, int link_ret,
++ int wait_cnt);
+
+ static struct hnae3_ae_algo ae_algo;
+
+@@ -7656,10 +7658,9 @@ static void hclge_phy_link_status_wait(s
+ } while (++i < HCLGE_PHY_LINK_STATUS_NUM);
+ }
+
+-static int hclge_mac_link_status_wait(struct hclge_dev *hdev, int link_ret)
++static int hclge_mac_link_status_wait(struct hclge_dev *hdev, int link_ret,
++ int wait_cnt)
+ {
+-#define HCLGE_MAC_LINK_STATUS_NUM 100
+-
+ int link_status;
+ int i = 0;
+ int ret;
+@@ -7672,13 +7673,15 @@ static int hclge_mac_link_status_wait(st
+ return 0;
+
+ msleep(HCLGE_LINK_STATUS_MS);
+- } while (++i < HCLGE_MAC_LINK_STATUS_NUM);
++ } while (++i < wait_cnt);
+ return -EBUSY;
+ }
+
+ static int hclge_mac_phy_link_status_wait(struct hclge_dev *hdev, bool en,
+ bool is_phy)
+ {
++#define HCLGE_MAC_LINK_STATUS_NUM 100
++
+ int link_ret;
+
+ link_ret = en ? HCLGE_LINK_STATUS_UP : HCLGE_LINK_STATUS_DOWN;
+@@ -7686,7 +7689,8 @@ static int hclge_mac_phy_link_status_wai
+ if (is_phy)
+ hclge_phy_link_status_wait(hdev, link_ret);
+
+- return hclge_mac_link_status_wait(hdev, link_ret);
++ return hclge_mac_link_status_wait(hdev, link_ret,
++ HCLGE_MAC_LINK_STATUS_NUM);
+ }
+
+ static int hclge_set_app_loopback(struct hclge_dev *hdev, bool en)
--- /dev/null
+From 2dc2b3922d3c0f52d3a792d15dcacfbc4cc76b8f Mon Sep 17 00:00:00 2001
+From: Daniel Jurgens <danielj@nvidia.com>
+Date: Tue, 11 Jul 2023 00:28:10 +0300
+Subject: net/mlx5: Allow 0 for total host VFs
+
+From: Daniel Jurgens <danielj@nvidia.com>
+
+commit 2dc2b3922d3c0f52d3a792d15dcacfbc4cc76b8f upstream.
+
+When querying eswitch functions, 0 is a valid number of host VFs. After
+introducing ARM SRIOV, falling through to getting the max value from
+PCI results in using the total VFs allowed on the ARM for the host.
+
+Fixes: 86eec50beaf3 ("net/mlx5: Support querying max VFs from device")
+Signed-off-by: Daniel Jurgens <danielj@nvidia.com>
+Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/ethernet/mellanox/mlx5/core/sriov.c | 3 +--
+ 1 file changed, 1 insertion(+), 2 deletions(-)
+
+--- a/drivers/net/ethernet/mellanox/mlx5/core/sriov.c
++++ b/drivers/net/ethernet/mellanox/mlx5/core/sriov.c
+@@ -264,8 +264,7 @@ static u16 mlx5_get_max_vfs(struct mlx5_
+ host_total_vfs = MLX5_GET(query_esw_functions_out, out,
+ host_params_context.host_total_vfs);
+ kvfree(out);
+- if (host_total_vfs)
+- return host_total_vfs;
++ return host_total_vfs;
+ }
+
+ done:
--- /dev/null
+From 8bfe1e19fb96d89fce14302e35cba0cd9f39d0a1 Mon Sep 17 00:00:00 2001
+From: Yevgeny Kliteynik <kliteyn@nvidia.com>
+Date: Wed, 26 Jul 2023 14:38:03 +0300
+Subject: net/mlx5: DR, Fix wrong allocation of modify hdr pattern
+
+From: Yevgeny Kliteynik <kliteyn@nvidia.com>
+
+commit 8bfe1e19fb96d89fce14302e35cba0cd9f39d0a1 upstream.
+
+Fix the wrong calculation of the modify hdr pattern size, where the
+previously calculated size was not enough to accommodate the required
+number of actions; see the worked example below.
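+
+A worked example with illustrative numbers: for num_of_actions = 5,
+
+    old: chunk_size = ilog2(5)                     = 2  ->  2^2 = 4 < 5
+    new: chunk_size = ilog2(roundup_pow_of_two(5)) = 3  ->  2^3 = 8 >= 5
+
+i.e. plain ilog2() rounds down, so any non-power-of-two action count
+got a chunk one size too small.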
+
+Fixes: da5d0027d666 ("net/mlx5: DR, Add cache for modify header pattern")
+Signed-off-by: Yevgeny Kliteynik <kliteyn@nvidia.com>
+Reviewed-by: Erez Shitrit <erezsh@nvidia.com>
+Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ptrn.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ptrn.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ptrn.c
+index d6947fe13d56..8ca534ef5d03 100644
+--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ptrn.c
++++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ptrn.c
+@@ -82,7 +82,7 @@ dr_ptrn_alloc_pattern(struct mlx5dr_ptrn_mgr *mgr,
+ u32 chunk_size;
+ u32 index;
+
+- chunk_size = ilog2(num_of_actions);
++ chunk_size = ilog2(roundup_pow_of_two(num_of_actions));
+ /* HW modify action index granularity is at least 64B */
+ chunk_size = max_t(u32, chunk_size, DR_CHUNK_SIZE_8);
+
+--
+2.41.0
+
--- /dev/null
+From 86ed7b773c01ba71617538b3b107c33fd9cf90b8 Mon Sep 17 00:00:00 2001
+From: Shay Drory <shayd@nvidia.com>
+Date: Sun, 30 Jul 2023 09:26:27 +0300
+Subject: net/mlx5: LAG, Check correct bucket when modifying LAG
+
+From: Shay Drory <shayd@nvidia.com>
+
+commit 86ed7b773c01ba71617538b3b107c33fd9cf90b8 upstream.
+
+The cited patch introduced buckets in hash mode, but missed updating
+the ports/buckets check when modifying the LAG.
+Fix the check.
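+
+An illustrative layout (hypothetical sizes): with ldev->ports = 2 and
+ldev->buckets = 2, v2p_map[] holds one entry per (port, bucket) pair:
+
+    idx = i * ldev->buckets + j
+    (i=0,j=0) -> 0   (i=0,j=1) -> 1   (i=1,j=0) -> 2   (i=1,j=1) -> 3
+
+Comparing v2p_map[i] == ports[i] only ever looks at entries 0 and 1
+(each twice) and never at 2 and 3; the fixed check compares
+v2p_map[idx] == ports[idx] for every bucket.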
+
+Fixes: 352899f384d4 ("net/mlx5: Lag, use buckets in hash mode")
+Signed-off-by: Shay Drory <shayd@nvidia.com>
+Reviewed-by: Maor Gottlieb <maorg@nvidia.com>
+Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/ethernet/mellanox/mlx5/core/lag/port_sel.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/drivers/net/ethernet/mellanox/mlx5/core/lag/port_sel.c
++++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/port_sel.c
+@@ -574,7 +574,7 @@ static int __mlx5_lag_modify_definers_de
+ for (i = 0; i < ldev->ports; i++) {
+ for (j = 0; j < ldev->buckets; j++) {
+ idx = i * ldev->buckets + j;
+- if (ldev->v2p_map[i] == ports[i])
++ if (ldev->v2p_map[idx] == ports[idx])
+ continue;
+
+ dest.vport.vhca_id = MLX5_CAP_GEN(ldev->pf[ports[idx] - 1].dev,
--- /dev/null
+From aab8e1a200b926147db51e3f82fd07bb9edf6a98 Mon Sep 17 00:00:00 2001
+From: Moshe Shemesh <moshe@nvidia.com>
+Date: Sun, 23 Jul 2023 11:03:01 +0300
+Subject: net/mlx5: Reload auxiliary devices in pci error handlers
+
+From: Moshe Shemesh <moshe@nvidia.com>
+
+commit aab8e1a200b926147db51e3f82fd07bb9edf6a98 upstream.
+
+Handling PCI errors should fully tear down and load back the auxiliary
+devices, the same as is done through the mlx5 health recovery flow.
+
+Fixes: 72ed5d5624af ("net/mlx5: Suspend auxiliary devices only in case of PCI device suspend")
+Signed-off-by: Moshe Shemesh <moshe@nvidia.com>
+Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/ethernet/mellanox/mlx5/core/main.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
++++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
+@@ -1845,7 +1845,7 @@ static pci_ers_result_t mlx5_pci_err_det
+
+ mlx5_enter_error_state(dev, false);
+ mlx5_error_sw_reset(dev);
+- mlx5_unload_one(dev, true);
++ mlx5_unload_one(dev, false);
+ mlx5_drain_health_wq(dev);
+ mlx5_pci_disable_device(dev);
+
--- /dev/null
+From d006207625657322ba8251b6e7e829f9659755dc Mon Sep 17 00:00:00 2001
+From: Moshe Shemesh <moshe@nvidia.com>
+Date: Wed, 19 Jul 2023 11:33:44 +0300
+Subject: net/mlx5: Skip clock update work when device is in error state
+
+From: Moshe Shemesh <moshe@nvidia.com>
+
+commit d006207625657322ba8251b6e7e829f9659755dc upstream.
+
+When the device is in an error state, marked by the flag
+MLX5_DEVICE_STATE_INTERNAL_ERROR, the HW and PCI may not be accessible,
+so the clock update work should be skipped. Furthermore, such access
+through PCI in the error state, after calling mlx5_pci_disable_device(),
+can result in failing to recover from PCI errors.
+
+Fixes: ef9814deafd0 ("net/mlx5e: Add HW timestamping (TS) support")
+Reported-and-tested-by: Ganesh G R <ganeshgr@linux.ibm.com>
+Closes: https://lore.kernel.org/netdev/9bdb9b9d-140a-7a28-f0de-2e64e873c068@nvidia.com
+Signed-off-by: Moshe Shemesh <moshe@nvidia.com>
+Reviewed-by: Aya Levin <ayal@nvidia.com>
+Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c | 5 +++++
+ 1 file changed, 5 insertions(+)
+
+--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c
++++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c
+@@ -221,10 +221,15 @@ static void mlx5_timestamp_overflow(stru
+ clock = container_of(timer, struct mlx5_clock, timer);
+ mdev = container_of(clock, struct mlx5_core_dev, clock);
+
++ if (mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR)
++ goto out;
++
+ write_seqlock_irqsave(&clock->lock, flags);
+ timecounter_read(&timer->tc);
+ mlx5_update_clock_info_page(mdev);
+ write_sequnlock_irqrestore(&clock->lock, flags);
++
++out:
+ schedule_delayed_work(&timer->overflow_work, timer->overflow_period);
+ }
+
--- /dev/null
+From 72cc654970658e88a1cdea08f06b11c218efa4da Mon Sep 17 00:00:00 2001
+From: Gal Pressman <gal@nvidia.com>
+Date: Sun, 16 Jul 2023 14:28:10 +0300
+Subject: net/mlx5e: Take RTNL lock when needed before calling xdp_set_features()
+
+From: Gal Pressman <gal@nvidia.com>
+
+commit 72cc654970658e88a1cdea08f06b11c218efa4da upstream.
+
+Hold RTNL lock when calling xdp_set_features() with a registered netdev,
+as the call triggers the netdev notifiers. This could happen when
+switching from uplink rep to nic profile for example.
+
+This resolves the following call trace:
+
+RTNL: assertion failed at net/core/dev.c (1953)
+WARNING: CPU: 6 PID: 112670 at net/core/dev.c:1953 call_netdevice_notifiers_info+0x7c/0x80
+Modules linked in: sch_mqprio sch_mqprio_lib act_tunnel_key act_mirred act_skbedit cls_matchall nfnetlink_cttimeout act_gact cls_flower sch_ingress bonding ib_umad ip_gre rdma_ucm mlx5_vfio_pci ipip tunnel4 ip6_gre gre mlx5_ib vfio_pci vfio_pci_core vfio_iommu_type1 ib_uverbs vfio mlx5_core ib_ipoib geneve nf_tables ip6_tunnel tunnel6 iptable_raw openvswitch nsh rpcrdma ib_iser libiscsi scsi_transport_iscsi rdma_cm iw_cm ib_cm ib_core xt_conntrack xt_MASQUERADE nf_conntrack_netlink nfnetlink xt_addrtype iptable_nat nf_nat br_netfilter rpcsec_gss_krb5 auth_rpcgss oid_registry overlay zram zsmalloc fuse [last unloaded: ib_uverbs]
+CPU: 6 PID: 112670 Comm: devlink Not tainted 6.4.0-rc7_for_upstream_min_debug_2023_06_28_17_02 #1
+Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
+RIP: 0010:call_netdevice_notifiers_info+0x7c/0x80
+Code: 90 ff 80 3d 2d 6b f7 00 00 75 c5 ba a1 07 00 00 48 c7 c6 e4 ce 0b 82 48 c7 c7 c8 f4 04 82 c6 05 11 6b f7 00 01 e8 a4 7c 8e ff <0f> 0b eb a2 0f 1f 44 00 00 55 48 89 e5 41 54 48 83 e4 f0 48 83 ec
+RSP: 0018:ffff8882a21c3948 EFLAGS: 00010282
+RAX: 0000000000000000 RBX: ffffffff82e6f880 RCX: 0000000000000027
+RDX: ffff88885f99b5c8 RSI: 0000000000000001 RDI: ffff88885f99b5c0
+RBP: 0000000000000028 R08: ffff88887ffabaa8 R09: 0000000000000003
+R10: ffff88887fecbac0 R11: ffff88887ff7bac0 R12: ffff8882a21c3968
+R13: ffff88811c018940 R14: 0000000000000000 R15: ffff8881274401a0
+FS: 00007fe141c81800(0000) GS:ffff88885f980000(0000) knlGS:0000000000000000
+CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+CR2: 00007f787c28b948 CR3: 000000014bcf3005 CR4: 0000000000370ea0
+DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
+DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
+Call Trace:
+ <TASK>
+ ? __warn+0x79/0x120
+ ? call_netdevice_notifiers_info+0x7c/0x80
+ ? report_bug+0x17c/0x190
+ ? handle_bug+0x3c/0x60
+ ? exc_invalid_op+0x14/0x70
+ ? asm_exc_invalid_op+0x16/0x20
+ ? call_netdevice_notifiers_info+0x7c/0x80
+ ? call_netdevice_notifiers_info+0x7c/0x80
+ call_netdevice_notifiers+0x2e/0x50
+ mlx5e_set_xdp_feature+0x21/0x50 [mlx5_core]
+ mlx5e_nic_init+0xf1/0x1a0 [mlx5_core]
+ mlx5e_netdev_init_profile+0x76/0x110 [mlx5_core]
+ mlx5e_netdev_attach_profile+0x1f/0x90 [mlx5_core]
+ mlx5e_netdev_change_profile+0x92/0x160 [mlx5_core]
+ mlx5e_netdev_attach_nic_profile+0x1b/0x30 [mlx5_core]
+ mlx5e_vport_rep_unload+0xaa/0xc0 [mlx5_core]
+ __esw_offloads_unload_rep+0x52/0x60 [mlx5_core]
+ mlx5_esw_offloads_rep_unload+0x52/0x70 [mlx5_core]
+ esw_offloads_unload_rep+0x34/0x70 [mlx5_core]
+ esw_offloads_disable+0x2b/0x90 [mlx5_core]
+ mlx5_eswitch_disable_locked+0x1b9/0x210 [mlx5_core]
+ mlx5_devlink_eswitch_mode_set+0xf5/0x630 [mlx5_core]
+ ? devlink_get_from_attrs_lock+0x9e/0x110
+ devlink_nl_cmd_eswitch_set_doit+0x60/0xe0
+ genl_family_rcv_msg_doit.isra.0+0xc2/0x110
+ genl_rcv_msg+0x17d/0x2b0
+ ? devlink_get_from_attrs_lock+0x110/0x110
+ ? devlink_nl_cmd_eswitch_get_doit+0x290/0x290
+ ? devlink_pernet_pre_exit+0xf0/0xf0
+ ? genl_family_rcv_msg_doit.isra.0+0x110/0x110
+ netlink_rcv_skb+0x54/0x100
+ genl_rcv+0x24/0x40
+ netlink_unicast+0x1f6/0x2c0
+ netlink_sendmsg+0x232/0x4a0
+ sock_sendmsg+0x38/0x60
+ ? _copy_from_user+0x2a/0x60
+ __sys_sendto+0x110/0x160
+ ? __count_memcg_events+0x48/0x90
+ ? handle_mm_fault+0x161/0x260
+ ? do_user_addr_fault+0x278/0x6e0
+ __x64_sys_sendto+0x20/0x30
+ do_syscall_64+0x3d/0x90
+ entry_SYSCALL_64_after_hwframe+0x46/0xb0
+RIP: 0033:0x7fe141b1340a
+Code: d8 64 89 02 48 c7 c0 ff ff ff ff eb b8 0f 1f 00 f3 0f 1e fa 41 89 ca 64 8b 04 25 18 00 00 00 85 c0 75 15 b8 2c 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 7e c3 0f 1f 44 00 00 41 54 48 83 ec 30 44 89
+RSP: 002b:00007fff61d03de8 EFLAGS: 00000246 ORIG_RAX: 000000000000002c
+RAX: ffffffffffffffda RBX: 0000000000afab00 RCX: 00007fe141b1340a
+RDX: 0000000000000038 RSI: 0000000000afab00 RDI: 0000000000000003
+RBP: 0000000000afa910 R08: 00007fe141d80200 R09: 000000000000000c
+R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000
+R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000001
+ </TASK>
+
+Fixes: 4d5ab0ad964d ("net/mlx5e: take into account device reconfiguration for xdp_features flag")
+Signed-off-by: Gal Pressman <gal@nvidia.com>
+Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
+Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 11 +++++++++++
+ 1 file changed, 11 insertions(+)
+
+diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+index 1c820119e438..c27df14df145 100644
+--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
++++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+@@ -5266,6 +5266,7 @@ void mlx5e_destroy_q_counters(struct mlx5e_priv *priv)
+ static int mlx5e_nic_init(struct mlx5_core_dev *mdev,
+ struct net_device *netdev)
+ {
++ const bool take_rtnl = netdev->reg_state == NETREG_REGISTERED;
+ struct mlx5e_priv *priv = netdev_priv(netdev);
+ struct mlx5e_flow_steering *fs;
+ int err;
+@@ -5294,9 +5295,19 @@ static int mlx5e_nic_init(struct mlx5_core_dev *mdev,
+ mlx5_core_err(mdev, "TLS initialization failed, %d\n", err);
+
+ mlx5e_health_create_reporters(priv);
++
++ /* If netdev is already registered (e.g. move from uplink to nic profile),
++ * RTNL lock must be held before triggering netdev notifiers.
++ */
++ if (take_rtnl)
++ rtnl_lock();
++
+ /* update XDP supported features */
+ mlx5e_set_xdp_feature(netdev);
+
++ if (take_rtnl)
++ rtnl_unlock();
++
+ return 0;
+ }
+
+--
+2.41.0
+
--- /dev/null
+From ac5da544a3c2047cbfd715acd9cec8380d7fe5c6 Mon Sep 17 00:00:00 2001
+From: Jianbo Liu <jianbol@nvidia.com>
+Date: Fri, 14 Apr 2023 08:48:20 +0000
+Subject: net/mlx5e: TC, Fix internal port memory leak
+
+From: Jianbo Liu <jianbol@nvidia.com>
+
+commit ac5da544a3c2047cbfd715acd9cec8380d7fe5c6 upstream.
+
+The flow rule can be split, with the extra post_act rules added to the
+post_act table. It's possible to trigger a memleak when the rule
+forwards packets from an internal port and over a tunnel, in the case
+that, for example, CT 'new' state offload is allowed. The int_port
+object is assigned to the flow attribute of the post_act rule and its
+refcnt is incremented by mlx5e_tc_int_port_get(), but
+mlx5e_tc_int_port_put() is never called, so the refcnt is never
+decremented and int_port is never freed.
+
+The kmemleak reports the following error:
+unreferenced object 0xffff888128204b80 (size 64):
+ comm "handler20", pid 50121, jiffies 4296973009 (age 642.932s)
+ hex dump (first 32 bytes):
+ 01 00 00 00 19 00 00 00 03 f0 00 00 04 00 00 00 ................
+ 98 77 67 41 81 88 ff ff 98 77 67 41 81 88 ff ff .wgA.....wgA....
+ backtrace:
+ [<00000000e992680d>] kmalloc_trace+0x27/0x120
+ [<000000009e945a98>] mlx5e_tc_int_port_get+0x3f3/0xe20 [mlx5_core]
+ [<0000000035a537f0>] mlx5e_tc_add_fdb_flow+0x473/0xcf0 [mlx5_core]
+ [<0000000070c2cec6>] __mlx5e_add_fdb_flow+0x7cf/0xe90 [mlx5_core]
+ [<000000005cc84048>] mlx5e_configure_flower+0xd40/0x4c40 [mlx5_core]
+ [<000000004f8a2031>] mlx5e_rep_indr_offload.isra.0+0x10e/0x1c0 [mlx5_core]
+ [<000000007df797dc>] mlx5e_rep_indr_setup_tc_cb+0x90/0x130 [mlx5_core]
+ [<0000000016c15cc3>] tc_setup_cb_add+0x1cf/0x410
+ [<00000000a63305b4>] fl_hw_replace_filter+0x38f/0x670 [cls_flower]
+ [<000000008bc9e77c>] fl_change+0x1fd5/0x4430 [cls_flower]
+ [<00000000e7f766e4>] tc_new_tfilter+0x867/0x2010
+ [<00000000e101c0ef>] rtnetlink_rcv_msg+0x6fc/0x9f0
+ [<00000000e1111d44>] netlink_rcv_skb+0x12c/0x360
+ [<0000000082dd6c8b>] netlink_unicast+0x438/0x710
+ [<00000000fc568f70>] netlink_sendmsg+0x794/0xc50
+ [<0000000016e92590>] sock_sendmsg+0xc5/0x190
+
+Fix this by moving the int_port cleanup code to the flow attribute
+free helper, which is used by all the attribute free paths.
+
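+A minimal userspace sketch of the refcounting contract involved
+(hypothetical simplified types, not the driver code): every get must
+be paired with a put on the shared free path, including for the extra
+post_act attributes created when the rule is split:
+
+	#include <assert.h>
+
+	struct int_port { int refcnt; };
+
+	static struct int_port *int_port_get(struct int_port *p)
+	{
+		p->refcnt++;	/* taken once per flow attribute */
+		return p;
+	}
+
+	static void int_port_put(struct int_port *p)
+	{
+		p->refcnt--;	/* must run for every attribute */
+	}
+
+	int main(void)
+	{
+		struct int_port port = { 0 };
+		struct int_port *a = int_port_get(&port);	/* main rule attr */
+		struct int_port *b = int_port_get(&port);	/* post_act attr */
+
+		int_port_put(a);
+		int_port_put(b);	/* skipping this put is the leak */
+		assert(port.refcnt == 0);
+		return 0;
+	}
+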
+Fixes: 8300f225268b ("net/mlx5e: Create new flow attr for multi table actions")
+Signed-off-by: Jianbo Liu <jianbol@nvidia.com>
+Reviewed-by: Vlad Buslov <vladbu@nvidia.com>
+Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 21 +++++++++++++--------
+ 1 file changed, 13 insertions(+), 8 deletions(-)
+
+--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
++++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+@@ -1943,9 +1943,7 @@ static void mlx5e_tc_del_fdb_flow(struct
+ {
+ struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
+ struct mlx5_flow_attr *attr = flow->attr;
+- struct mlx5_esw_flow_attr *esw_attr;
+
+- esw_attr = attr->esw_attr;
+ mlx5e_put_flow_tunnel_id(flow);
+
+ remove_unready_flow(flow);
+@@ -1966,12 +1964,6 @@ static void mlx5e_tc_del_fdb_flow(struct
+
+ mlx5_tc_ct_match_del(get_ct_priv(priv), &flow->attr->ct_attr);
+
+- if (esw_attr->int_port)
+- mlx5e_tc_int_port_put(mlx5e_get_int_port_priv(priv), esw_attr->int_port);
+-
+- if (esw_attr->dest_int_port)
+- mlx5e_tc_int_port_put(mlx5e_get_int_port_priv(priv), esw_attr->dest_int_port);
+-
+ if (flow_flag_test(flow, L3_TO_L2_DECAP))
+ mlx5e_detach_decap(priv, flow);
+
+@@ -4250,6 +4242,7 @@ static void
+ mlx5_free_flow_attr_actions(struct mlx5e_tc_flow *flow, struct mlx5_flow_attr *attr)
+ {
+ struct mlx5_core_dev *counter_dev = get_flow_counter_dev(flow);
++ struct mlx5_esw_flow_attr *esw_attr;
+
+ if (!attr)
+ return;
+@@ -4267,6 +4260,18 @@ mlx5_free_flow_attr_actions(struct mlx5e
+ mlx5e_tc_detach_mod_hdr(flow->priv, flow, attr);
+ }
+
++ if (mlx5e_is_eswitch_flow(flow)) {
++ esw_attr = attr->esw_attr;
++
++ if (esw_attr->int_port)
++ mlx5e_tc_int_port_put(mlx5e_get_int_port_priv(flow->priv),
++ esw_attr->int_port);
++
++ if (esw_attr->dest_int_port)
++ mlx5e_tc_int_port_put(mlx5e_get_int_port_priv(flow->priv),
++ esw_attr->dest_int_port);
++ }
++
+ mlx5_tc_ct_delete_flow(get_ct_priv(flow->priv), attr);
+
+ free_branch_attr(flow, attr->branch_true);
--- /dev/null
+From 6b5926eb1c034affff3fb44a98cb8c67153847d8 Mon Sep 17 00:00:00 2001
+From: Chris Mi <cmi@nvidia.com>
+Date: Wed, 26 Jul 2023 09:06:33 +0300
+Subject: net/mlx5e: Unoffload post act rule when handling FIB events
+
+From: Chris Mi <cmi@nvidia.com>
+
+commit 6b5926eb1c034affff3fb44a98cb8c67153847d8 upstream.
+
+If the following tc rule is installed on the stack device:
+
+filter parent ffff: protocol ip pref 3 flower chain 1
+filter parent ffff: protocol ip pref 3 flower chain 1 handle 0x1
+ dst_mac 24:25:d0:e1:00:00
+ src_mac 02:25:d0:25:01:02
+ eth_type ipv4
+ ct_state +trk+new
+ in_hw in_hw_count 1
+ action order 1: ct commit zone 0 pipe
+ index 2 ref 1 bind 1 installed 3807 sec used 3779 sec firstused 3800 sec
+ Action statistics:
+ Sent 120 bytes 2 pkt (dropped 0, overlimits 0 requeues 0)
+ backlog 0b 0p requeues 0
+ used_hw_stats delayed
+
+ action order 2: tunnel_key set
+ src_ip 192.168.1.25
+ dst_ip 192.168.1.26
+ key_id 4
+ dst_port 4789
+ csum pipe
+ index 3 ref 1 bind 1 installed 3807 sec used 3779 sec firstused 3800 sec
+ Action statistics:
+ Sent 120 bytes 2 pkt (dropped 0, overlimits 0 requeues 0)
+ backlog 0b 0p requeues 0
+ used_hw_stats delayed
+
+ action order 3: mirred (Egress Redirect to device vxlan1) stolen
+ index 9 ref 1 bind 1 installed 3807 sec used 3779 sec firstused 3800 sec
+ Action statistics:
+ Sent 120 bytes 2 pkt (dropped 0, overlimits 0 requeues 0)
+ backlog 0b 0p requeues 0
+ used_hw_stats delayed
+
+When handling FIB events, the rule in the post act table is not
+deleted. And because the post act rule has packet reformat and modify
+header actions, the following syndromes are also hit:
+
+mlx5_core 0000:08:00.0: mlx5_cmd_out_err:829:(pid 11613): DEALLOC_MODIFY_HEADER_CONTEXT(0x941) op_mod(0x0) failed, status bad resource state(0x9), syndrome (0x1ab444), err(-22)
+mlx5_core 0000:08:00.0: mlx5_cmd_out_err:829:(pid 11613): DEALLOC_PACKET_REFORMAT_CONTEXT(0x93e) op_mod(0x0) failed, status bad resource state(0x9), syndrome (0x179e84), err(-22)
+
+Fix it by unoffloading the post act rule when handling FIB events.
+
+Fixes: 314e1105831b ("net/mlx5e: Add post act offload/unoffload API")
+Signed-off-by: Chris Mi <cmi@nvidia.com>
+Reviewed-by: Vlad Buslov <vladbu@nvidia.com>
+Reviewed-by: Roi Dayan <roid@nvidia.com>
+Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.c | 6 ++++--
+ 1 file changed, 4 insertions(+), 2 deletions(-)
+
+--- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.c
++++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.c
+@@ -1461,10 +1461,12 @@ static void mlx5e_invalidate_encap(struc
+ attr = mlx5e_tc_get_encap_attr(flow);
+ esw_attr = attr->esw_attr;
+
+- if (flow_flag_test(flow, SLOW))
++ if (flow_flag_test(flow, SLOW)) {
+ mlx5e_tc_unoffload_from_slow_path(esw, flow);
+- else
++ } else {
+ mlx5e_tc_unoffload_fdb_rules(esw, flow, flow->attr);
++ mlx5e_tc_unoffload_flow_post_acts(flow);
++ }
+
+ mlx5e_tc_detach_mod_hdr(priv, flow, attr);
+ attr->modify_hdr = NULL;
--- /dev/null
+From d7791cec2304aea22eb2ada944e4d467302f5bfe Mon Sep 17 00:00:00 2001
+From: Li Yang <leoyang.li@nxp.com>
+Date: Wed, 2 Aug 2023 14:13:47 -0500
+Subject: net: phy: at803x: remove set/get wol callbacks for AR8032
+
+From: Li Yang <leoyang.li@nxp.com>
+
+commit d7791cec2304aea22eb2ada944e4d467302f5bfe upstream.
+
+Since the AR8032 part does not support WoL, remove the related
+callbacks from it.
+
+Fixes: 5800091a2061 ("net: phy: at803x: add support for AR8032 PHY")
+Signed-off-by: Li Yang <leoyang.li@nxp.com>
+Cc: David Bauer <mail@david-bauer.net>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/phy/at803x.c | 2 --
+ 1 file changed, 2 deletions(-)
+
+--- a/drivers/net/phy/at803x.c
++++ b/drivers/net/phy/at803x.c
+@@ -2086,8 +2086,6 @@ static struct phy_driver at803x_driver[]
+ .flags = PHY_POLL_CABLE_TEST,
+ .config_init = at803x_config_init,
+ .link_change_notify = at803x_link_change_notify,
+- .set_wol = at803x_set_wol,
+- .get_wol = at803x_get_wol,
+ .suspend = at803x_suspend,
+ .resume = at803x_resume,
+ /* PHY_BASIC_FEATURES */
--- /dev/null
+From 8743aeff5bc4dcb5b87b43765f48d5ac3ad7dd9f Mon Sep 17 00:00:00 2001
+From: Ido Schimmel <idosch@nvidia.com>
+Date: Tue, 8 Aug 2023 10:52:33 +0300
+Subject: nexthop: Fix infinite nexthop bucket dump when using maximum nexthop ID
+
+From: Ido Schimmel <idosch@nvidia.com>
+
+commit 8743aeff5bc4dcb5b87b43765f48d5ac3ad7dd9f upstream.
+
+A netlink dump callback can return a positive number to signal that more
+information needs to be dumped or zero to signal that the dump is
+complete. In the second case, the core netlink code will append the
+NLMSG_DONE message to the skb in order to indicate to user space that
+the dump is complete.
+
+The nexthop bucket dump callback always returns a positive number if
+nexthop buckets were filled in the provided skb, even if the dump is
+complete. This means that a dump will span at least two recvmsg() calls
+as long as nexthop buckets are present. In the last recvmsg() call the
+dump callback will not fill in any nexthop buckets because the previous
+call indicated that the dump should restart from the last dumped nexthop
+ID plus one.
+
+ # ip link add name dummy1 up type dummy
+ # ip nexthop add id 1 dev dummy1
+ # ip nexthop add id 10 group 1 type resilient buckets 2
+ # strace -e sendto,recvmsg -s 5 ip nexthop bucket
+ sendto(3, [[{nlmsg_len=24, nlmsg_type=RTM_GETNEXTHOPBUCKET, nlmsg_flags=NLM_F_REQUEST|NLM_F_DUMP, nlmsg_seq=1691396980, nlmsg_pid=0}, {family=AF_UNSPEC, data="\x00\x00\x00\x00\x00"...}], {nlmsg_len=0, nlmsg_type=0 /* NLMSG_??? */, nlmsg_flags=0, nlmsg_seq=0, nlmsg_pid=0}], 152, 0, NULL, 0) = 152
+ recvmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[{iov_base=NULL, iov_len=0}], msg_iovlen=1, msg_controllen=0, msg_flags=MSG_TRUNC}, MSG_PEEK|MSG_TRUNC) = 128
+ recvmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[{iov_base=[[{nlmsg_len=64, nlmsg_type=RTM_NEWNEXTHOPBUCKET, nlmsg_flags=NLM_F_MULTI, nlmsg_seq=1691396980, nlmsg_pid=347}, {family=AF_UNSPEC, data="\x00\x00\x00\x00\x00"...}], [{nlmsg_len=64, nlmsg_type=RTM_NEWNEXTHOPBUCKET, nlmsg_flags=NLM_F_MULTI, nlmsg_seq=1691396980, nlmsg_pid=347}, {family=AF_UNSPEC, data="\x00\x00\x00\x00\x00"...}]], iov_len=32768}], msg_iovlen=1, msg_controllen=0, msg_flags=0}, 0) = 128
+ id 10 index 0 idle_time 6.66 nhid 1
+ id 10 index 1 idle_time 6.66 nhid 1
+ recvmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[{iov_base=NULL, iov_len=0}], msg_iovlen=1, msg_controllen=0, msg_flags=MSG_TRUNC}, MSG_PEEK|MSG_TRUNC) = 20
+ recvmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[{iov_base=[{nlmsg_len=20, nlmsg_type=NLMSG_DONE, nlmsg_flags=NLM_F_MULTI, nlmsg_seq=1691396980, nlmsg_pid=347}, 0], iov_len=32768}], msg_iovlen=1, msg_controllen=0, msg_flags=0}, 0) = 20
+ +++ exited with 0 +++
+
+This behavior is both inefficient and buggy. If the last nexthop to be
+dumped had the maximum ID of 0xffffffff, then the dump will restart from
+0 (0xffffffff + 1) and never end:
+
+ # ip link add name dummy1 up type dummy
+ # ip nexthop add id 1 dev dummy1
+ # ip nexthop add id $((2**32-1)) group 1 type resilient buckets 2
+ # ip nexthop bucket
+ id 4294967295 index 0 idle_time 5.55 nhid 1
+ id 4294967295 index 1 idle_time 5.55 nhid 1
+ id 4294967295 index 0 idle_time 5.55 nhid 1
+ id 4294967295 index 1 idle_time 5.55 nhid 1
+ [...]
+
+Fix by adjusting the dump callback to return zero when the dump is
+complete. After the fix only one recvmsg() call is made and the
+NLMSG_DONE message is appended to the RTM_NEWNEXTHOPBUCKET responses:
+
+ # ip link add name dummy1 up type dummy
+ # ip nexthop add id 1 dev dummy1
+ # ip nexthop add id $((2**32-1)) group 1 type resilient buckets 2
+ # strace -e sendto,recvmsg -s 5 ip nexthop bucket
+ sendto(3, [[{nlmsg_len=24, nlmsg_type=RTM_GETNEXTHOPBUCKET, nlmsg_flags=NLM_F_REQUEST|NLM_F_DUMP, nlmsg_seq=1691396737, nlmsg_pid=0}, {family=AF_UNSPEC, data="\x00\x00\x00\x00\x00"...}], {nlmsg_len=0, nlmsg_type=0 /* NLMSG_??? */, nlmsg_flags=0, nlmsg_seq=0, nlmsg_pid=0}], 152, 0, NULL, 0) = 152
+ recvmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[{iov_base=NULL, iov_len=0}], msg_iovlen=1, msg_controllen=0, msg_flags=MSG_TRUNC}, MSG_PEEK|MSG_TRUNC) = 148
+ recvmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[{iov_base=[[{nlmsg_len=64, nlmsg_type=RTM_NEWNEXTHOPBUCKET, nlmsg_flags=NLM_F_MULTI, nlmsg_seq=1691396737, nlmsg_pid=350}, {family=AF_UNSPEC, data="\x00\x00\x00\x00\x00"...}], [{nlmsg_len=64, nlmsg_type=RTM_NEWNEXTHOPBUCKET, nlmsg_flags=NLM_F_MULTI, nlmsg_seq=1691396737, nlmsg_pid=350}, {family=AF_UNSPEC, data="\x00\x00\x00\x00\x00"...}], [{nlmsg_len=20, nlmsg_type=NLMSG_DONE, nlmsg_flags=NLM_F_MULTI, nlmsg_seq=1691396737, nlmsg_pid=350}, 0]], iov_len=32768}], msg_iovlen=1, msg_controllen=0, msg_flags=0}, 0) = 148
+ id 4294967295 index 0 idle_time 6.61 nhid 1
+ id 4294967295 index 1 idle_time 6.61 nhid 1
+ +++ exited with 0 +++
+
+Note that if the NLMSG_DONE message cannot be appended because of size
+limitations, then another recvmsg() will be needed, but the core netlink
+code will not invoke the dump callback and simply reply with a
+NLMSG_DONE message since it knows that the callback previously returned
+zero.
+
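+The contract can be simulated in plain userspace C (a sketch of the
+core netlink dump loop under the assumptions above, not kernel code):
+the core keeps calling the callback while it returns a positive
+number, and appends NLMSG_DONE once it returns zero:
+
+	#include <stdio.h>
+
+	/* Stand-in for the dump callback: returns the number of bytes
+	 * written to the buffer, or 0 when the dump is complete. */
+	static int dump_cb(unsigned int *next_index, unsigned int n)
+	{
+		if (*next_index >= n)
+			return 0;	/* complete: core appends NLMSG_DONE */
+		printf("bucket %u\n", (*next_index)++);
+		return 64;	/* buffer filled: more may remain */
+	}
+
+	int main(void)
+	{
+		unsigned int next_index = 0;
+
+		/* Before the fix the callback returned a positive number
+		 * even on the final pass, costing one extra round trip. */
+		while (dump_cb(&next_index, 2) > 0)
+			;
+		printf("NLMSG_DONE\n");
+		return 0;
+	}
+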
+Add a test that fails before the fix:
+
+ # ./fib_nexthops.sh -t basic_res
+ [...]
+ TEST: Maximum nexthop ID dump [FAIL]
+ [...]
+
+And passes after it:
+
+ # ./fib_nexthops.sh -t basic_res
+ [...]
+ TEST: Maximum nexthop ID dump [ OK ]
+ [...]
+
+Fixes: 8a1bbabb034d ("nexthop: Add netlink handlers for bucket dump")
+Signed-off-by: Ido Schimmel <idosch@nvidia.com>
+Reviewed-by: Petr Machata <petrm@nvidia.com>
+Reviewed-by: David Ahern <dsahern@kernel.org>
+Link: https://lore.kernel.org/r/20230808075233.3337922-4-idosch@nvidia.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ipv4/nexthop.c | 6 +-----
+ tools/testing/selftests/net/fib_nexthops.sh | 5 +++++
+ 2 files changed, 6 insertions(+), 5 deletions(-)
+
+--- a/net/ipv4/nexthop.c
++++ b/net/ipv4/nexthop.c
+@@ -3424,13 +3424,9 @@ static int rtm_dump_nexthop_bucket(struc
+
+ if (err < 0) {
+ if (likely(skb->len))
+- goto out;
+- goto out_err;
++ err = skb->len;
+ }
+
+-out:
+- err = skb->len;
+-out_err:
+ cb->seq = net->nexthop.seq;
+ nl_dump_check_consistent(cb, nlmsg_hdr(skb));
+ return err;
+--- a/tools/testing/selftests/net/fib_nexthops.sh
++++ b/tools/testing/selftests/net/fib_nexthops.sh
+@@ -2206,6 +2206,11 @@ basic_res()
+ run_cmd "$IP nexthop bucket list fdb"
+ log_test $? 255 "Dump all nexthop buckets with invalid 'fdb' keyword"
+
++ # Dump should not loop endlessly when maximum nexthop ID is configured.
++ run_cmd "$IP nexthop add id $((2**32-1)) group 1/2 type resilient buckets 4"
++ run_cmd "timeout 5 $IP nexthop bucket"
++ log_test $? 0 "Maximum nexthop ID dump"
++
+ #
+ # resilient nexthop buckets get requests
+ #
--- /dev/null
+From 913f60cacda73ccac8eead94983e5884c03e04cd Mon Sep 17 00:00:00 2001
+From: Ido Schimmel <idosch@nvidia.com>
+Date: Tue, 8 Aug 2023 10:52:31 +0300
+Subject: nexthop: Fix infinite nexthop dump when using maximum nexthop ID
+
+From: Ido Schimmel <idosch@nvidia.com>
+
+commit 913f60cacda73ccac8eead94983e5884c03e04cd upstream.
+
+A netlink dump callback can return a positive number to signal that more
+information needs to be dumped or zero to signal that the dump is
+complete. In the second case, the core netlink code will append the
+NLMSG_DONE message to the skb in order to indicate to user space that
+the dump is complete.
+
+The nexthop dump callback always returns a positive number if nexthops
+were filled in the provided skb, even if the dump is complete. This
+means that a dump will span at least two recvmsg() calls as long as
+nexthops are present. In the last recvmsg() call the dump callback will
+not fill in any nexthops because the previous call indicated that the
+dump should restart from the last dumped nexthop ID plus one.
+
+ # ip nexthop add id 1 blackhole
+ # strace -e sendto,recvmsg -s 5 ip nexthop
+ sendto(3, [[{nlmsg_len=24, nlmsg_type=RTM_GETNEXTHOP, nlmsg_flags=NLM_F_REQUEST|NLM_F_DUMP, nlmsg_seq=1691394315, nlmsg_pid=0}, {nh_family=AF_UNSPEC, nh_scope=RT_SCOPE_UNIVERSE, nh_protocol=RTPROT_UNSPEC, nh_flags=0}], {nlmsg_len=0, nlmsg_type=0 /* NLMSG_??? */, nlmsg_flags=0, nlmsg_seq=0, nlmsg_pid=0}], 152, 0, NULL, 0) = 152
+ recvmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[{iov_base=NULL, iov_len=0}], msg_iovlen=1, msg_controllen=0, msg_flags=MSG_TRUNC}, MSG_PEEK|MSG_TRUNC) = 36
+ recvmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[{iov_base=[{nlmsg_len=36, nlmsg_type=RTM_NEWNEXTHOP, nlmsg_flags=NLM_F_MULTI, nlmsg_seq=1691394315, nlmsg_pid=343}, {nh_family=AF_INET, nh_scope=RT_SCOPE_UNIVERSE, nh_protocol=RTPROT_UNSPEC, nh_flags=0}, [[{nla_len=8, nla_type=NHA_ID}, 1], {nla_len=4, nla_type=NHA_BLACKHOLE}]], iov_len=32768}], msg_iovlen=1, msg_controllen=0, msg_flags=0}, 0) = 36
+ id 1 blackhole
+ recvmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[{iov_base=NULL, iov_len=0}], msg_iovlen=1, msg_controllen=0, msg_flags=MSG_TRUNC}, MSG_PEEK|MSG_TRUNC) = 20
+ recvmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[{iov_base=[{nlmsg_len=20, nlmsg_type=NLMSG_DONE, nlmsg_flags=NLM_F_MULTI, nlmsg_seq=1691394315, nlmsg_pid=343}, 0], iov_len=32768}], msg_iovlen=1, msg_controllen=0, msg_flags=0}, 0) = 20
+ +++ exited with 0 +++
+
+This behavior is both inefficient and buggy. If the last nexthop to be
+dumped had the maximum ID of 0xffffffff, then the dump will restart from
+0 (0xffffffff + 1) and never end:
+
+ # ip nexthop add id $((2**32-1)) blackhole
+ # ip nexthop
+ id 4294967295 blackhole
+ id 4294967295 blackhole
+ [...]
+
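+The endless restart is plain unsigned wrap-around of the resume
+cursor, e.g. (illustrative C only):
+
+	#include <stdint.h>
+	#include <stdio.h>
+
+	int main(void)
+	{
+		uint32_t last_dumped = 0xffffffff;	/* maximum nexthop ID */
+		uint32_t resume = last_dumped + 1;	/* wraps to 0 */
+
+		printf("resume from id %u\n", resume);	/* prints 0 */
+		return 0;
+	}
+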
+Fix by adjusting the dump callback to return zero when the dump is
+complete. After the fix only one recvmsg() call is made and the
+NLMSG_DONE message is appended to the RTM_NEWNEXTHOP response:
+
+ # ip nexthop add id $((2**32-1)) blackhole
+ # strace -e sendto,recvmsg -s 5 ip nexthop
+ sendto(3, [[{nlmsg_len=24, nlmsg_type=RTM_GETNEXTHOP, nlmsg_flags=NLM_F_REQUEST|NLM_F_DUMP, nlmsg_seq=1691394080, nlmsg_pid=0}, {nh_family=AF_UNSPEC, nh_scope=RT_SCOPE_UNIVERSE, nh_protocol=RTPROT_UNSPEC, nh_flags=0}], {nlmsg_len=0, nlmsg_type=0 /* NLMSG_??? */, nlmsg_flags=0, nlmsg_seq=0, nlmsg_pid=0}], 152, 0, NULL, 0) = 152
+ recvmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[{iov_base=NULL, iov_len=0}], msg_iovlen=1, msg_controllen=0, msg_flags=MSG_TRUNC}, MSG_PEEK|MSG_TRUNC) = 56
+ recvmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[{iov_base=[[{nlmsg_len=36, nlmsg_type=RTM_NEWNEXTHOP, nlmsg_flags=NLM_F_MULTI, nlmsg_seq=1691394080, nlmsg_pid=342}, {nh_family=AF_INET, nh_scope=RT_SCOPE_UNIVERSE, nh_protocol=RTPROT_UNSPEC, nh_flags=0}, [[{nla_len=8, nla_type=NHA_ID}, 4294967295], {nla_len=4, nla_type=NHA_BLACKHOLE}]], [{nlmsg_len=20, nlmsg_type=NLMSG_DONE, nlmsg_flags=NLM_F_MULTI, nlmsg_seq=1691394080, nlmsg_pid=342}, 0]], iov_len=32768}], msg_iovlen=1, msg_controllen=0, msg_flags=0}, 0) = 56
+ id 4294967295 blackhole
+ +++ exited with 0 +++
+
+Note that if the NLMSG_DONE message cannot be appended because of size
+limitations, then another recvmsg() will be needed, but the core netlink
+code will not invoke the dump callback and simply reply with a
+NLMSG_DONE message since it knows that the callback previously returned
+zero.
+
+Add a test that fails before the fix:
+
+ # ./fib_nexthops.sh -t basic
+ [...]
+ TEST: Maximum nexthop ID dump [FAIL]
+ [...]
+
+And passes after it:
+
+ # ./fib_nexthops.sh -t basic
+ [...]
+ TEST: Maximum nexthop ID dump [ OK ]
+ [...]
+
+Fixes: ab84be7e54fc ("net: Initial nexthop code")
+Reported-by: Petr Machata <petrm@nvidia.com>
+Closes: https://lore.kernel.org/netdev/87sf91enuf.fsf@nvidia.com/
+Signed-off-by: Ido Schimmel <idosch@nvidia.com>
+Reviewed-by: Petr Machata <petrm@nvidia.com>
+Reviewed-by: David Ahern <dsahern@kernel.org>
+Link: https://lore.kernel.org/r/20230808075233.3337922-2-idosch@nvidia.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ipv4/nexthop.c | 6 +-----
+ tools/testing/selftests/net/fib_nexthops.sh | 5 +++++
+ 2 files changed, 6 insertions(+), 5 deletions(-)
+
+--- a/net/ipv4/nexthop.c
++++ b/net/ipv4/nexthop.c
+@@ -3221,13 +3221,9 @@ static int rtm_dump_nexthop(struct sk_bu
+ &rtm_dump_nexthop_cb, &filter);
+ if (err < 0) {
+ if (likely(skb->len))
+- goto out;
+- goto out_err;
++ err = skb->len;
+ }
+
+-out:
+- err = skb->len;
+-out_err:
+ cb->seq = net->nexthop.seq;
+ nl_dump_check_consistent(cb, nlmsg_hdr(skb));
+ return err;
+--- a/tools/testing/selftests/net/fib_nexthops.sh
++++ b/tools/testing/selftests/net/fib_nexthops.sh
+@@ -1981,6 +1981,11 @@ basic()
+
+ run_cmd "$IP link set dev lo up"
+
++ # Dump should not loop endlessly when maximum nexthop ID is configured.
++ run_cmd "$IP nexthop add id $((2**32-1)) blackhole"
++ run_cmd "timeout 5 $IP nexthop"
++ log_test $? 0 "Maximum nexthop ID dump"
++
+ #
+ # groups
+ #
--- /dev/null
+From f10d3d9df49d9e6ee244fda6ca264f901a9c5d85 Mon Sep 17 00:00:00 2001
+From: Ido Schimmel <idosch@nvidia.com>
+Date: Tue, 8 Aug 2023 10:52:32 +0300
+Subject: nexthop: Make nexthop bucket dump more efficient
+
+From: Ido Schimmel <idosch@nvidia.com>
+
+commit f10d3d9df49d9e6ee244fda6ca264f901a9c5d85 upstream.
+
+rtm_dump_nexthop_bucket_nh() is used to dump nexthop buckets belonging
+to a specific resilient nexthop group. The function returns a positive
+return code (the skb length) upon both success and failure.
+
+The above behavior is problematic. When a complete nexthop bucket dump
+is requested, the function that walks the different nexthops treats the
+non-zero return code as an error. This causes buckets belonging to
+different resilient nexthop groups to be dumped using different buffers
+even if they can all fit in the same buffer:
+
+ # ip link add name dummy1 up type dummy
+ # ip nexthop add id 1 dev dummy1
+ # ip nexthop add id 10 group 1 type resilient buckets 1
+ # ip nexthop add id 20 group 1 type resilient buckets 1
+ # strace -e recvmsg -s 0 ip nexthop bucket
+ [...]
+ recvmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[...], msg_iovlen=1, msg_controllen=0, msg_flags=0}, 0) = 64
+ id 10 index 0 idle_time 10.27 nhid 1
+ [...]
+ recvmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[...], msg_iovlen=1, msg_controllen=0, msg_flags=0}, 0) = 64
+ id 20 index 0 idle_time 6.44 nhid 1
+ [...]
+
+Fix by only returning a non-zero return code when an error occurred and
+restarting the dump from the bucket index we failed to fill in. This
+allows buckets belonging to different resilient nexthop groups to be
+dumped using the same buffer:
+
+ # ip link add name dummy1 up type dummy
+ # ip nexthop add id 1 dev dummy1
+ # ip nexthop add id 10 group 1 type resilient buckets 1
+ # ip nexthop add id 20 group 1 type resilient buckets 1
+ # strace -e recvmsg -s 0 ip nexthop bucket
+ [...]
+ recvmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[...], msg_iovlen=1, msg_controllen=0, msg_flags=0}, 0) = 128
+ id 10 index 0 idle_time 30.21 nhid 1
+ id 20 index 0 idle_time 26.7 nhid 1
+ [...]
+
+While this change is more of a performance improvement than an actual
+bug fix, it is a prerequisite for a subsequent patch that does fix a
+bug.
+
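+The resume pattern after the change, sketched as a generic cursor walk
+(hypothetical userspace C, not the driver code): record the index
+before trying to fill it, propagate the error so the dump restarts
+exactly there, and reset the cursor only once the group is done:
+
+	#include <stdio.h>
+
+	struct ctx { unsigned int bucket_index; };
+
+	static int dump_buckets(struct ctx *ctx, unsigned int n,
+				unsigned int room)
+	{
+		unsigned int used = 0;
+
+		for (unsigned int i = ctx->bucket_index; i < n; i++) {
+			ctx->bucket_index = i;	/* save before filling */
+			if (used == room)
+				return -1;	/* buffer full: resume here */
+			printf("bucket %u\n", i);
+			used++;
+		}
+		ctx->bucket_index = 0;	/* group done: reset the cursor */
+		return 0;	/* lets the walk move to the next group */
+	}
+
+	int main(void)
+	{
+		struct ctx ctx = { 0 };
+
+		while (dump_buckets(&ctx, 8, 5) != 0)
+			printf("resume at %u\n", ctx.bucket_index);
+		return 0;
+	}
+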
+Fixes: 8a1bbabb034d ("nexthop: Add netlink handlers for bucket dump")
+Signed-off-by: Ido Schimmel <idosch@nvidia.com>
+Reviewed-by: Petr Machata <petrm@nvidia.com>
+Reviewed-by: David Ahern <dsahern@kernel.org>
+Link: https://lore.kernel.org/r/20230808075233.3337922-3-idosch@nvidia.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ipv4/nexthop.c | 16 +++++-----------
+ 1 file changed, 5 insertions(+), 11 deletions(-)
+
+--- a/net/ipv4/nexthop.c
++++ b/net/ipv4/nexthop.c
+@@ -3363,25 +3363,19 @@ static int rtm_dump_nexthop_bucket_nh(st
+ dd->filter.res_bucket_nh_id != nhge->nh->id)
+ continue;
+
++ dd->ctx->bucket_index = bucket_index;
+ err = nh_fill_res_bucket(skb, nh, bucket, bucket_index,
+ RTM_NEWNEXTHOPBUCKET, portid,
+ cb->nlh->nlmsg_seq, NLM_F_MULTI,
+ cb->extack);
+- if (err < 0) {
+- if (likely(skb->len))
+- goto out;
+- goto out_err;
+- }
++ if (err)
++ return err;
+ }
+
+ dd->ctx->done_nh_idx = dd->ctx->nh.idx + 1;
+- bucket_index = 0;
++ dd->ctx->bucket_index = 0;
+
+-out:
+- err = skb->len;
+-out_err:
+- dd->ctx->bucket_index = bucket_index;
+- return err;
++ return 0;
+ }
+
+ static int rtm_dump_nexthop_bucket_cb(struct sk_buff *skb,
rdma-bnxt_re-fix-error-handling-in-probe-failure-path.patch
net-tls-avoid-discarding-data-on-record-close.patch
net-marvell-prestera-fix-handling-ipv4-routes-with-nhid.patch
+net-phy-at803x-remove-set-get-wol-callbacks-for-ar8032.patch
+net-dsa-ocelot-call-dsa_tag_8021q_unregister-under-rtnl_lock-on-driver-remove.patch
+net-hns3-refactor-hclge_mac_link_status_wait-for-interface-reuse.patch
+net-hns3-add-wait-until-mac-link-down.patch
+net-hns3-fix-deadlock-issue-when-externel_lb-and-reset-are-executed-together.patch
+net-enetc-reimplement-rfs-rss-memory-clearing-as-pci-quirk.patch
+nexthop-fix-infinite-nexthop-dump-when-using-maximum-nexthop-id.patch
+nexthop-make-nexthop-bucket-dump-more-efficient.patch
+nexthop-fix-infinite-nexthop-bucket-dump-when-using-maximum-nexthop-id.patch
+net-hns3-fix-strscpy-causing-content-truncation-issue.patch
+dmaengine-mcf-edma-fix-a-potential-un-allocated-memory-access.patch
+dmaengine-idxd-clear-prs-disable-flag-when-disabling-idxd-device.patch
+dmaengine-owl-dma-modify-mismatched-function-name.patch
+net-mlx5e-take-rtnl-lock-when-needed-before-calling-xdp_set_features.patch
+net-mlx5e-tc-fix-internal-port-memory-leak.patch
+net-mlx5-dr-fix-wrong-allocation-of-modify-hdr-pattern.patch
+net-mlx5-allow-0-for-total-host-vfs.patch
+net-mlx5e-unoffload-post-act-rule-when-handling-fib-events.patch
+net-mlx5-lag-check-correct-bucket-when-modifying-lag.patch
+net-mlx5-skip-clock-update-work-when-device-is-in-error-state.patch
+net-mlx5-reload-auxiliary-devices-in-pci-error-handlers.patch
+ibmvnic-enforce-stronger-sanity-checks-on-login-response.patch
+ibmvnic-unmap-dma-login-rsp-buffer-on-send-login-fail.patch
+ibmvnic-handle-dma-unmapping-of-login-buffs-in-release-functions.patch
+ibmvnic-do-partial-reset-on-login-failure.patch
+ibmvnic-ensure-login-failure-recovery-is-safe-from-other-resets.patch