--- /dev/null
+From be46e14e1c3651f3c002b34e47ddb0cf0a1d35c7 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 23 Aug 2022 14:15:54 +0800
+Subject: iommu/vt-d: Fix kdump kernels boot failure with scalable mode
+
+From: Lu Baolu <baolu.lu@linux.intel.com>
+
+[ Upstream commit 0c5f6c0d8201a809a6585b07b6263e9db2c874a3 ]
+
+The translation table copying code for kdump kernels is currently based
+on the extended root/context entry formats of ECS mode defined in older
+VT-d v2.5, and doesn't handle the scalable mode formats. This causes
+the kexec capture kernel to fail to boot with DMAR faults if the IOMMU
+was enabled in scalable mode by the previous kernel.
+
+ECS mode has been deprecated since VT-d spec v3.0, and the Intel IOMMU
+driver doesn't support it as there is no real hardware implementation.
+Hence, convert the ECS check in the table copying code into a scalable
+mode check.
+
+The existing copying code consumes a bit in the context entry as a mark
+of a copied entry, and that bit needs to work for the old format as
+well as for the extended context entries. Since it's hard to find such
+a common bit for both legacy and scalable mode context entries, replace
+it with a per-IOMMU bitmap.
+
+Fixes: 7373a8cc38197 ("iommu/vt-d: Setup context and enable RID2PASID support")
+Cc: stable@vger.kernel.org
+Reported-by: Jerry Snitselaar <jsnitsel@redhat.com>
+Tested-by: Wen Jin <wen.jin@intel.com>
+Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
+Link: https://lore.kernel.org/r/20220817011035.3250131-1-baolu.lu@linux.intel.com
+Signed-off-by: Joerg Roedel <jroedel@suse.de>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/iommu/intel/iommu.c | 100 ++++++++++++++++--------------------
+ include/linux/intel-iommu.h | 9 ++--
+ 2 files changed, 50 insertions(+), 59 deletions(-)
+
+diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
+index 40ac3a78d90ef..c0464959cbcdb 100644
+--- a/drivers/iommu/intel/iommu.c
++++ b/drivers/iommu/intel/iommu.c
+@@ -168,38 +168,6 @@ static phys_addr_t root_entry_uctp(struct root_entry *re)
+ return re->hi & VTD_PAGE_MASK;
+ }
+
+-static inline void context_clear_pasid_enable(struct context_entry *context)
+-{
+- context->lo &= ~(1ULL << 11);
+-}
+-
+-static inline bool context_pasid_enabled(struct context_entry *context)
+-{
+- return !!(context->lo & (1ULL << 11));
+-}
+-
+-static inline void context_set_copied(struct context_entry *context)
+-{
+- context->hi |= (1ull << 3);
+-}
+-
+-static inline bool context_copied(struct context_entry *context)
+-{
+- return !!(context->hi & (1ULL << 3));
+-}
+-
+-static inline bool __context_present(struct context_entry *context)
+-{
+- return (context->lo & 1);
+-}
+-
+-bool context_present(struct context_entry *context)
+-{
+- return context_pasid_enabled(context) ?
+- __context_present(context) :
+- __context_present(context) && !context_copied(context);
+-}
+-
+ static inline void context_set_present(struct context_entry *context)
+ {
+ context->lo |= 1;
+@@ -247,6 +215,26 @@ static inline void context_clear_entry(struct context_entry *context)
+ context->hi = 0;
+ }
+
++static inline bool context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
++{
++ if (!iommu->copied_tables)
++ return false;
++
++ return test_bit(((long)bus << 8) | devfn, iommu->copied_tables);
++}
++
++static inline void
++set_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
++{
++ set_bit(((long)bus << 8) | devfn, iommu->copied_tables);
++}
++
++static inline void
++clear_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
++{
++ clear_bit(((long)bus << 8) | devfn, iommu->copied_tables);
++}
++
+ /*
+ * This domain is a statically identity mapping domain.
+ * 1. This domain creats a static 1:1 mapping to all usable memory.
+@@ -644,6 +632,13 @@ struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
+ struct context_entry *context;
+ u64 *entry;
+
++ /*
++ * Except that the caller requested to allocate a new entry,
++ * returning a copied context entry makes no sense.
++ */
++ if (!alloc && context_copied(iommu, bus, devfn))
++ return NULL;
++
+ entry = &root->lo;
+ if (sm_supported(iommu)) {
+ if (devfn >= 0x80) {
+@@ -1770,6 +1765,11 @@ static void free_dmar_iommu(struct intel_iommu *iommu)
+ iommu->domain_ids = NULL;
+ }
+
++ if (iommu->copied_tables) {
++ bitmap_free(iommu->copied_tables);
++ iommu->copied_tables = NULL;
++ }
++
+ g_iommus[iommu->seq_id] = NULL;
+
+ /* free context mapping */
+@@ -1978,7 +1978,7 @@ static int domain_context_mapping_one(struct dmar_domain *domain,
+ goto out_unlock;
+
+ ret = 0;
+- if (context_present(context))
++ if (context_present(context) && !context_copied(iommu, bus, devfn))
+ goto out_unlock;
+
+ /*
+@@ -1990,7 +1990,7 @@ static int domain_context_mapping_one(struct dmar_domain *domain,
+ * in-flight DMA will exist, and we don't need to worry anymore
+ * hereafter.
+ */
+- if (context_copied(context)) {
++ if (context_copied(iommu, bus, devfn)) {
+ u16 did_old = context_domain_id(context);
+
+ if (did_old < cap_ndoms(iommu->cap)) {
+@@ -2001,6 +2001,8 @@ static int domain_context_mapping_one(struct dmar_domain *domain,
+ iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
+ DMA_TLB_DSI_FLUSH);
+ }
++
++ clear_context_copied(iommu, bus, devfn);
+ }
+
+ context_clear_entry(context);
+@@ -2783,32 +2785,14 @@ static int copy_context_table(struct intel_iommu *iommu,
+ /* Now copy the context entry */
+ memcpy(&ce, old_ce + idx, sizeof(ce));
+
+- if (!__context_present(&ce))
++ if (!context_present(&ce))
+ continue;
+
+ did = context_domain_id(&ce);
+ if (did >= 0 && did < cap_ndoms(iommu->cap))
+ set_bit(did, iommu->domain_ids);
+
+- /*
+- * We need a marker for copied context entries. This
+- * marker needs to work for the old format as well as
+- * for extended context entries.
+- *
+- * Bit 67 of the context entry is used. In the old
+- * format this bit is available to software, in the
+- * extended format it is the PGE bit, but PGE is ignored
+- * by HW if PASIDs are disabled (and thus still
+- * available).
+- *
+- * So disable PASIDs first and then mark the entry
+- * copied. This means that we don't copy PASID
+- * translations from the old kernel, but this is fine as
+- * faults there are not fatal.
+- */
+- context_clear_pasid_enable(&ce);
+- context_set_copied(&ce);
+-
++ set_context_copied(iommu, bus, devfn);
+ new_ce[idx] = ce;
+ }
+
+@@ -2835,8 +2819,8 @@ static int copy_translation_tables(struct intel_iommu *iommu)
+ bool new_ext, ext;
+
+ rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
+- ext = !!(rtaddr_reg & DMA_RTADDR_RTT);
+- new_ext = !!ecap_ecs(iommu->ecap);
++ ext = !!(rtaddr_reg & DMA_RTADDR_SMT);
++ new_ext = !!sm_supported(iommu);
+
+ /*
+ * The RTT bit can only be changed when translation is disabled,
+@@ -2847,6 +2831,10 @@ static int copy_translation_tables(struct intel_iommu *iommu)
+ if (new_ext != ext)
+ return -EINVAL;
+
++ iommu->copied_tables = bitmap_zalloc(BIT_ULL(16), GFP_KERNEL);
++ if (!iommu->copied_tables)
++ return -ENOMEM;
++
+ old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
+ if (!old_rt_phys)
+ return -EINVAL;
+diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
+index 5fcf89faa31ab..d72626d71258f 100644
+--- a/include/linux/intel-iommu.h
++++ b/include/linux/intel-iommu.h
+@@ -196,7 +196,6 @@
+ #define ecap_dis(e) (((e) >> 27) & 0x1)
+ #define ecap_nest(e) (((e) >> 26) & 0x1)
+ #define ecap_mts(e) (((e) >> 25) & 0x1)
+-#define ecap_ecs(e) (((e) >> 24) & 0x1)
+ #define ecap_iotlb_offset(e) ((((e) >> 8) & 0x3ff) * 16)
+ #define ecap_max_iotlb_offset(e) (ecap_iotlb_offset(e) + 16)
+ #define ecap_coherent(e) ((e) & 0x1)
+@@ -264,7 +263,6 @@
+ #define DMA_GSTS_CFIS (((u32)1) << 23)
+
+ /* DMA_RTADDR_REG */
+-#define DMA_RTADDR_RTT (((u64)1) << 11)
+ #define DMA_RTADDR_SMT (((u64)1) << 10)
+
+ /* CCMD_REG */
+@@ -579,6 +577,7 @@ struct intel_iommu {
+
+ #ifdef CONFIG_INTEL_IOMMU
+ unsigned long *domain_ids; /* bitmap of domains */
++ unsigned long *copied_tables; /* bitmap of copied tables */
+ spinlock_t lock; /* protect context, domain ids */
+ struct root_entry *root_entry; /* virtual address */
+
+@@ -692,6 +691,11 @@ static inline int nr_pte_to_next_page(struct dma_pte *pte)
+ (struct dma_pte *)ALIGN((unsigned long)pte, VTD_PAGE_SIZE) - pte;
+ }
+
++static inline bool context_present(struct context_entry *context)
++{
++ return (context->lo & 1);
++}
++
+ extern struct dmar_drhd_unit * dmar_find_matched_drhd_unit(struct pci_dev *dev);
+
+ extern int dmar_enable_qi(struct intel_iommu *iommu);
+@@ -776,7 +780,6 @@ static inline void intel_iommu_debugfs_init(void) {}
+ #endif /* CONFIG_INTEL_IOMMU_DEBUGFS */
+
+ extern const struct attribute_group *intel_iommu_groups[];
+-bool context_present(struct context_entry *context);
+ struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
+ u8 devfn, int alloc);
+
+--
+2.35.1
+
--- /dev/null
+From dfd456f054e8df529005b7ee9b68fa3ac0d9a5cb Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 2 Jun 2022 12:41:00 +0300
+Subject: net/mlx5: Introduce ifc bits for using software vhca id
+
+From: Yishai Hadas <yishaih@nvidia.com>
+
+[ Upstream commit 0372c546eca575445331c0ad8902210b70be6d61 ]
+
+Introduce the ifc bits needed to enable the software vhca id
+functionality.
+
+Signed-off-by: Yishai Hadas <yishaih@nvidia.com>
+Reviewed-by: Mark Bloch <mbloch@nvidia.com>
+Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
+Stable-dep-of: 9ca05b0f27de ("RDMA/mlx5: Rely on RoCE fw cap instead of devlink when setting profile")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ include/linux/mlx5/mlx5_ifc.h | 25 +++++++++++++++++++++----
+ 1 file changed, 21 insertions(+), 4 deletions(-)
+
+diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
+index fd7d083a34d33..6d57e5ec9718d 100644
+--- a/include/linux/mlx5/mlx5_ifc.h
++++ b/include/linux/mlx5/mlx5_ifc.h
+@@ -1804,7 +1804,14 @@ struct mlx5_ifc_cmd_hca_cap_2_bits {
+ u8 max_reformat_remove_size[0x8];
+ u8 max_reformat_remove_offset[0x8];
+
+- u8 reserved_at_c0[0x740];
++ u8 reserved_at_c0[0x160];
++
++ u8 reserved_at_220[0x1];
++ u8 sw_vhca_id_valid[0x1];
++ u8 sw_vhca_id[0xe];
++ u8 reserved_at_230[0x10];
++
++ u8 reserved_at_240[0x5c0];
+ };
+
+ enum mlx5_ifc_flow_destination_type {
+@@ -3715,6 +3722,11 @@ struct mlx5_ifc_rmpc_bits {
+ struct mlx5_ifc_wq_bits wq;
+ };
+
++enum {
++ VHCA_ID_TYPE_HW = 0,
++ VHCA_ID_TYPE_SW = 1,
++};
++
+ struct mlx5_ifc_nic_vport_context_bits {
+ u8 reserved_at_0[0x5];
+ u8 min_wqe_inline_mode[0x3];
+@@ -3731,8 +3743,8 @@ struct mlx5_ifc_nic_vport_context_bits {
+ u8 event_on_mc_address_change[0x1];
+ u8 event_on_uc_address_change[0x1];
+
+- u8 reserved_at_40[0xc];
+-
++ u8 vhca_id_type[0x1];
++ u8 reserved_at_41[0xb];
+ u8 affiliation_criteria[0x4];
+ u8 affiliated_vhca_id[0x10];
+
+@@ -7189,7 +7201,12 @@ struct mlx5_ifc_init_hca_in_bits {
+ u8 reserved_at_20[0x10];
+ u8 op_mod[0x10];
+
+- u8 reserved_at_40[0x40];
++ u8 reserved_at_40[0x20];
++
++ u8 reserved_at_60[0x2];
++ u8 sw_vhca_id[0xe];
++ u8 reserved_at_70[0x10];
++
+ u8 sw_owner_id[4][0x20];
+ };
+
+--
+2.35.1
+
--- /dev/null
+From 0389847c9b08301f63601d5d90d814f4faa5df52 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 2 Jun 2022 12:47:34 +0300
+Subject: net/mlx5: Use software VHCA id when it's supported
+
+From: Yishai Hadas <yishaih@nvidia.com>
+
+[ Upstream commit dc402ccc0d7b55922a79505df3000da7deb77a2b ]
+
+Use software VHCA id when it's supported by the firmware.
+
+A unique id is allocated upon mlx5_mdev_init() and freed upon
+mlx5_mdev_uninit(), so it stays the same during the full life cycle of
+the device, including across health recovery if that occurs.
+
+The conjunction of sw_vhca_id with sw_owner_id will be a global unique
+id per function which uses mlx5_core.
+
+The sw_vhca_id is set upon init_hca command and is used to specify the
+VHCA that the NIC vport is affiliated with.
+
+This functionality is needed for migration of VMs which are MPV based
+(i.e. multi port device).
+
+Signed-off-by: Yishai Hadas <yishaih@nvidia.com>
+Reviewed-by: Mark Bloch <mbloch@nvidia.com>
+Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
+Stable-dep-of: 9ca05b0f27de ("RDMA/mlx5: Rely on RoCE fw cap instead of devlink when setting profile")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/mellanox/mlx5/core/fw.c | 4 ++
+ .../net/ethernet/mellanox/mlx5/core/main.c | 49 +++++++++++++++++++
+ .../net/ethernet/mellanox/mlx5/core/vport.c | 14 ++++--
+ include/linux/mlx5/driver.h | 1 +
+ 4 files changed, 65 insertions(+), 3 deletions(-)
+
+diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fw.c b/drivers/net/ethernet/mellanox/mlx5/core/fw.c
+index cfb8bedba5124..079fa44ada71e 100644
+--- a/drivers/net/ethernet/mellanox/mlx5/core/fw.c
++++ b/drivers/net/ethernet/mellanox/mlx5/core/fw.c
+@@ -289,6 +289,10 @@ int mlx5_cmd_init_hca(struct mlx5_core_dev *dev, uint32_t *sw_owner_id)
+ sw_owner_id[i]);
+ }
+
++ if (MLX5_CAP_GEN_2_MAX(dev, sw_vhca_id_valid) &&
++ dev->priv.sw_vhca_id > 0)
++ MLX5_SET(init_hca_in, in, sw_vhca_id, dev->priv.sw_vhca_id);
++
+ return mlx5_cmd_exec_in(dev, init_hca, in);
+ }
+
+diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
+index 616207c3b187a..64d54bba91f69 100644
+--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
++++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
+@@ -90,6 +90,8 @@ module_param_named(prof_sel, prof_sel, uint, 0444);
+ MODULE_PARM_DESC(prof_sel, "profile selector. Valid range 0 - 2");
+
+ static u32 sw_owner_id[4];
++#define MAX_SW_VHCA_ID (BIT(__mlx5_bit_sz(cmd_hca_cap_2, sw_vhca_id)) - 1)
++static DEFINE_IDA(sw_vhca_ida);
+
+ enum {
+ MLX5_ATOMIC_REQ_MODE_BE = 0x0,
+@@ -499,6 +501,31 @@ static int max_uc_list_get_devlink_param(struct mlx5_core_dev *dev)
+ return err;
+ }
+
++static int handle_hca_cap_2(struct mlx5_core_dev *dev, void *set_ctx)
++{
++ void *set_hca_cap;
++ int err;
++
++ if (!MLX5_CAP_GEN_MAX(dev, hca_cap_2))
++ return 0;
++
++ err = mlx5_core_get_caps(dev, MLX5_CAP_GENERAL_2);
++ if (err)
++ return err;
++
++ if (!MLX5_CAP_GEN_2_MAX(dev, sw_vhca_id_valid) ||
++ !(dev->priv.sw_vhca_id > 0))
++ return 0;
++
++ set_hca_cap = MLX5_ADDR_OF(set_hca_cap_in, set_ctx,
++ capability);
++ memcpy(set_hca_cap, dev->caps.hca[MLX5_CAP_GENERAL_2]->cur,
++ MLX5_ST_SZ_BYTES(cmd_hca_cap_2));
++ MLX5_SET(cmd_hca_cap_2, set_hca_cap, sw_vhca_id_valid, 1);
++
++ return set_caps(dev, set_ctx, MLX5_CAP_GENERAL_2);
++}
++
+ static int handle_hca_cap(struct mlx5_core_dev *dev, void *set_ctx)
+ {
+ struct mlx5_profile *prof = &dev->profile;
+@@ -669,6 +696,13 @@ static int set_hca_cap(struct mlx5_core_dev *dev)
+ goto out;
+ }
+
++ memset(set_ctx, 0, set_sz);
++ err = handle_hca_cap_2(dev, set_ctx);
++ if (err) {
++ mlx5_core_err(dev, "handle_hca_cap_2 failed\n");
++ goto out;
++ }
++
+ out:
+ kfree(set_ctx);
+ return err;
+@@ -1512,6 +1546,18 @@ int mlx5_mdev_init(struct mlx5_core_dev *dev, int profile_idx)
+ if (err)
+ goto err_hca_caps;
+
++ /* The conjunction of sw_vhca_id with sw_owner_id will be a global
++ * unique id per function which uses mlx5_core.
++ * Those values are supplied to FW as part of the init HCA command to
++ * be used by both driver and FW when it's applicable.
++ */
++ dev->priv.sw_vhca_id = ida_alloc_range(&sw_vhca_ida, 1,
++ MAX_SW_VHCA_ID,
++ GFP_KERNEL);
++ if (dev->priv.sw_vhca_id < 0)
++ mlx5_core_err(dev, "failed to allocate sw_vhca_id, err=%d\n",
++ dev->priv.sw_vhca_id);
++
+ return 0;
+
+ err_hca_caps:
+@@ -1537,6 +1583,9 @@ void mlx5_mdev_uninit(struct mlx5_core_dev *dev)
+ {
+ struct mlx5_priv *priv = &dev->priv;
+
++ if (priv->sw_vhca_id > 0)
++ ida_free(&sw_vhca_ida, dev->priv.sw_vhca_id);
++
+ mlx5_hca_caps_free(dev);
+ mlx5_adev_cleanup(dev);
+ mlx5_pagealloc_cleanup(dev);
+diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vport.c b/drivers/net/ethernet/mellanox/mlx5/core/vport.c
+index ac020cb780727..d5c3173250309 100644
+--- a/drivers/net/ethernet/mellanox/mlx5/core/vport.c
++++ b/drivers/net/ethernet/mellanox/mlx5/core/vport.c
+@@ -1086,9 +1086,17 @@ int mlx5_nic_vport_affiliate_multiport(struct mlx5_core_dev *master_mdev,
+ goto free;
+
+ MLX5_SET(modify_nic_vport_context_in, in, field_select.affiliation, 1);
+- MLX5_SET(modify_nic_vport_context_in, in,
+- nic_vport_context.affiliated_vhca_id,
+- MLX5_CAP_GEN(master_mdev, vhca_id));
++ if (MLX5_CAP_GEN_2(master_mdev, sw_vhca_id_valid)) {
++ MLX5_SET(modify_nic_vport_context_in, in,
++ nic_vport_context.vhca_id_type, VHCA_ID_TYPE_SW);
++ MLX5_SET(modify_nic_vport_context_in, in,
++ nic_vport_context.affiliated_vhca_id,
++ MLX5_CAP_GEN_2(master_mdev, sw_vhca_id));
++ } else {
++ MLX5_SET(modify_nic_vport_context_in, in,
++ nic_vport_context.affiliated_vhca_id,
++ MLX5_CAP_GEN(master_mdev, vhca_id));
++ }
+ MLX5_SET(modify_nic_vport_context_in, in,
+ nic_vport_context.affiliation_criteria,
+ MLX5_CAP_GEN(port_mdev, affiliate_nic_vport_criteria));
+diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
+index b0b4ac92354a2..0015a08ddbd24 100644
+--- a/include/linux/mlx5/driver.h
++++ b/include/linux/mlx5/driver.h
+@@ -606,6 +606,7 @@ struct mlx5_priv {
+ spinlock_t ctx_lock;
+ struct mlx5_adev **adev;
+ int adev_idx;
++ int sw_vhca_id;
+ struct mlx5_events *events;
+
+ struct mlx5_flow_steering *steering;
+--
+2.35.1
+
--- /dev/null
+From f72f1848bfdd0f07c7a648c8be04c43eb8a38d5b Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Sun, 15 May 2022 07:19:53 +0300
+Subject: RDMA/mlx5: Add a umr recovery flow
+
+From: Aharon Landau <aharonl@nvidia.com>
+
+[ Upstream commit 158e71bb69e368b8b33e8b7c4ac8c111da0c1ae2 ]
+
+When a UMR fails, the UMR QP state changes to an error state. Therefore,
+all further UMR operations will fail too.
+
+Add a recovery flow to the UMR QP, and repost the flushed WQEs.
+
+Link: https://lore.kernel.org/r/6cc24816cca049bd8541317f5e41d3ac659445d3.1652588303.git.leonro@nvidia.com
+Signed-off-by: Aharon Landau <aharonl@nvidia.com>
+Reviewed-by: Michael Guralnik <michaelgur@nvidia.com>
+Signed-off-by: Leon Romanovsky <leon@kernel.org>
+Stable-dep-of: 9b7d4be967f1 ("RDMA/mlx5: Fix UMR cleanup on error flow of driver init")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/infiniband/hw/mlx5/cq.c | 4 ++
+ drivers/infiniband/hw/mlx5/mlx5_ib.h | 12 ++++-
+ drivers/infiniband/hw/mlx5/umr.c | 78 ++++++++++++++++++++++++----
+ 3 files changed, 83 insertions(+), 11 deletions(-)
+
+diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c
+index 08371a80fdc26..be189e0525de6 100644
+--- a/drivers/infiniband/hw/mlx5/cq.c
++++ b/drivers/infiniband/hw/mlx5/cq.c
+@@ -523,6 +523,10 @@ static int mlx5_poll_one(struct mlx5_ib_cq *cq,
+ "Requestor" : "Responder", cq->mcq.cqn);
+ mlx5_ib_dbg(dev, "syndrome 0x%x, vendor syndrome 0x%x\n",
+ err_cqe->syndrome, err_cqe->vendor_err_synd);
++ if (wc->status != IB_WC_WR_FLUSH_ERR &&
++ (*cur_qp)->type == MLX5_IB_QPT_REG_UMR)
++ dev->umrc.state = MLX5_UMR_STATE_RECOVER;
++
+ if (opcode == MLX5_CQE_REQ_ERR) {
+ wq = &(*cur_qp)->sq;
+ wqe_ctr = be16_to_cpu(cqe64->wqe_counter);
+diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
+index 998b67509a533..7460e0dfe6db4 100644
+--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
++++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
+@@ -717,13 +717,23 @@ struct mlx5_ib_umr_context {
+ struct completion done;
+ };
+
++enum {
++ MLX5_UMR_STATE_ACTIVE,
++ MLX5_UMR_STATE_RECOVER,
++ MLX5_UMR_STATE_ERR,
++};
++
+ struct umr_common {
+ struct ib_pd *pd;
+ struct ib_cq *cq;
+ struct ib_qp *qp;
+- /* control access to UMR QP
++ /* Protects from UMR QP overflow
+ */
+ struct semaphore sem;
++ /* Protects from using UMR while the UMR is not active
++ */
++ struct mutex lock;
++ unsigned int state;
+ };
+
+ struct mlx5_cache_ent {
+diff --git a/drivers/infiniband/hw/mlx5/umr.c b/drivers/infiniband/hw/mlx5/umr.c
+index 3a48364c09181..e00b94d1b1ea1 100644
+--- a/drivers/infiniband/hw/mlx5/umr.c
++++ b/drivers/infiniband/hw/mlx5/umr.c
+@@ -176,6 +176,7 @@ int mlx5r_umr_resource_init(struct mlx5_ib_dev *dev)
+ dev->umrc.pd = pd;
+
+ sema_init(&dev->umrc.sem, MAX_UMR_WR);
++ mutex_init(&dev->umrc.lock);
+
+ return 0;
+
+@@ -195,6 +196,31 @@ void mlx5r_umr_resource_cleanup(struct mlx5_ib_dev *dev)
+ ib_dealloc_pd(dev->umrc.pd);
+ }
+
++static int mlx5r_umr_recover(struct mlx5_ib_dev *dev)
++{
++ struct umr_common *umrc = &dev->umrc;
++ struct ib_qp_attr attr;
++ int err;
++
++ attr.qp_state = IB_QPS_RESET;
++ err = ib_modify_qp(umrc->qp, &attr, IB_QP_STATE);
++ if (err) {
++ mlx5_ib_dbg(dev, "Couldn't modify UMR QP\n");
++ goto err;
++ }
++
++ err = mlx5r_umr_qp_rst2rts(dev, umrc->qp);
++ if (err)
++ goto err;
++
++ umrc->state = MLX5_UMR_STATE_ACTIVE;
++ return 0;
++
++err:
++ umrc->state = MLX5_UMR_STATE_ERR;
++ return err;
++}
++
+ static int mlx5r_umr_post_send(struct ib_qp *ibqp, u32 mkey, struct ib_cqe *cqe,
+ struct mlx5r_umr_wqe *wqe, bool with_data)
+ {
+@@ -231,7 +257,7 @@ static int mlx5r_umr_post_send(struct ib_qp *ibqp, u32 mkey, struct ib_cqe *cqe,
+
+ id.ib_cqe = cqe;
+ mlx5r_finish_wqe(qp, ctrl, seg, size, cur_edge, idx, id.wr_id, 0,
+- MLX5_FENCE_MODE_NONE, MLX5_OPCODE_UMR);
++ MLX5_FENCE_MODE_INITIATOR_SMALL, MLX5_OPCODE_UMR);
+
+ mlx5r_ring_db(qp, 1, ctrl);
+
+@@ -270,17 +296,49 @@ static int mlx5r_umr_post_send_wait(struct mlx5_ib_dev *dev, u32 mkey,
+ mlx5r_umr_init_context(&umr_context);
+
+ down(&umrc->sem);
+- err = mlx5r_umr_post_send(umrc->qp, mkey, &umr_context.cqe, wqe,
+- with_data);
+- if (err)
+- mlx5_ib_warn(dev, "UMR post send failed, err %d\n", err);
+- else {
+- wait_for_completion(&umr_context.done);
+- if (umr_context.status != IB_WC_SUCCESS) {
+- mlx5_ib_warn(dev, "reg umr failed (%u)\n",
+- umr_context.status);
++ while (true) {
++ mutex_lock(&umrc->lock);
++ if (umrc->state == MLX5_UMR_STATE_ERR) {
++ mutex_unlock(&umrc->lock);
+ err = -EFAULT;
++ break;
++ }
++
++ if (umrc->state == MLX5_UMR_STATE_RECOVER) {
++ mutex_unlock(&umrc->lock);
++ usleep_range(3000, 5000);
++ continue;
++ }
++
++ err = mlx5r_umr_post_send(umrc->qp, mkey, &umr_context.cqe, wqe,
++ with_data);
++ mutex_unlock(&umrc->lock);
++ if (err) {
++ mlx5_ib_warn(dev, "UMR post send failed, err %d\n",
++ err);
++ break;
+ }
++
++ wait_for_completion(&umr_context.done);
++
++ if (umr_context.status == IB_WC_SUCCESS)
++ break;
++
++ if (umr_context.status == IB_WC_WR_FLUSH_ERR)
++ continue;
++
++ WARN_ON_ONCE(1);
++ mlx5_ib_warn(dev,
++ "reg umr failed (%u). Trying to recover and resubmit the flushed WQEs\n",
++ umr_context.status);
++ mutex_lock(&umrc->lock);
++ err = mlx5r_umr_recover(dev);
++ mutex_unlock(&umrc->lock);
++ if (err)
++ mlx5_ib_warn(dev, "couldn't recover UMR, err %d\n",
++ err);
++ err = -EFAULT;
++ break;
+ }
+ up(&umrc->sem);
+ return err;
+--
+2.35.1
+
--- /dev/null
+From 064b2940c803d63bf31e9693a6e4c4b45c3c8501 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 29 Aug 2022 12:02:29 +0300
+Subject: RDMA/mlx5: Fix UMR cleanup on error flow of driver init
+
+From: Maor Gottlieb <maorg@nvidia.com>
+
+[ Upstream commit 9b7d4be967f16f79a2283b2338709fcc750313ee ]
+
+The cited commit removed from the UMR cleanup flow the checks of
+whether the resources were created. This could lead to a null-ptr-deref
+in case of a failure in the mlx5_ib_stage_ib_reg_init stage.
+
+Fix it by adding a new UMR state that indicates whether the resources
+were created, and checking it in the UMR cleanup flow before destroying
+the resources.
+
+Fixes: 04876c12c19e ("RDMA/mlx5: Move init and cleanup of UMR to umr.c")
+Reviewed-by: Michael Guralnik <michaelgur@nvidia.com>
+Signed-off-by: Maor Gottlieb <maorg@nvidia.com>
+Link: https://lore.kernel.org/r/4cfa61386cf202e9ce330e8d228ce3b25a36326e.1661763459.git.leonro@nvidia.com
+Signed-off-by: Leon Romanovsky <leon@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/infiniband/hw/mlx5/mlx5_ib.h | 1 +
+ drivers/infiniband/hw/mlx5/umr.c | 3 +++
+ 2 files changed, 4 insertions(+)
+
+diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
+index 7460e0dfe6db4..c2cca032a6ed4 100644
+--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
++++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
+@@ -718,6 +718,7 @@ struct mlx5_ib_umr_context {
+ };
+
+ enum {
++ MLX5_UMR_STATE_UNINIT,
+ MLX5_UMR_STATE_ACTIVE,
+ MLX5_UMR_STATE_RECOVER,
+ MLX5_UMR_STATE_ERR,
+diff --git a/drivers/infiniband/hw/mlx5/umr.c b/drivers/infiniband/hw/mlx5/umr.c
+index e00b94d1b1ea1..d5105b5c9979b 100644
+--- a/drivers/infiniband/hw/mlx5/umr.c
++++ b/drivers/infiniband/hw/mlx5/umr.c
+@@ -177,6 +177,7 @@ int mlx5r_umr_resource_init(struct mlx5_ib_dev *dev)
+
+ sema_init(&dev->umrc.sem, MAX_UMR_WR);
+ mutex_init(&dev->umrc.lock);
++ dev->umrc.state = MLX5_UMR_STATE_ACTIVE;
+
+ return 0;
+
+@@ -191,6 +192,8 @@ int mlx5r_umr_resource_init(struct mlx5_ib_dev *dev)
+
+ void mlx5r_umr_resource_cleanup(struct mlx5_ib_dev *dev)
+ {
++ if (dev->umrc.state == MLX5_UMR_STATE_UNINIT)
++ return;
+ ib_destroy_qp(dev->umrc.qp);
+ ib_free_cq(dev->umrc.cq);
+ ib_dealloc_pd(dev->umrc.pd);
+--
+2.35.1
+
--- /dev/null
+From 9954f3b09f233f60ce5ad618a101ab2f046b87a9 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 29 Aug 2022 12:02:27 +0300
+Subject: RDMA/mlx5: Rely on RoCE fw cap instead of devlink when setting
+ profile
+
+From: Maher Sanalla <msanalla@nvidia.com>
+
+[ Upstream commit 9ca05b0f27de928be121cccf07735819dc9e1ed3 ]
+
+When the RDMA auxiliary driver probes, it sets its profile based on the
+devlink driverinit value. The latter might not be in sync with FW yet
+(in case devlink reload is not performed), thus causing a mismatch
+between the RDMA driver and FW. This results in the following FW
+syndrome when the RDMA driver tries to adjust the RoCE state, which
+fails the probe:
+
+"0xC1F678 | modify_nic_vport_context: roce_en set on a vport that
+doesn't support roce"
+
+To prevent this, select the PF profile based on FW RoCE capability
+instead of relying on devlink driverinit value.
+To provide backward compatibility for the RoCE disable feature on older
+FWs where roce_rw is not set (the FW RoCE capability is read-only), keep
+the current behavior, i.e. rely on the devlink driverinit value.
+
+Fixes: fbfa97b4d79f ("net/mlx5: Disable roce at HCA level")
+Reviewed-by: Shay Drory <shayd@nvidia.com>
+Reviewed-by: Michael Guralnik <michaelgur@nvidia.com>
+Reviewed-by: Saeed Mahameed <saeedm@nvidia.com>
+Signed-off-by: Maher Sanalla <msanalla@nvidia.com>
+Link: https://lore.kernel.org/r/cb34ce9a1df4a24c135cb804db87f7d2418bd6cc.1661763459.git.leonro@nvidia.com
+Signed-off-by: Leon Romanovsky <leon@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/infiniband/hw/mlx5/main.c | 2 +-
+ .../net/ethernet/mellanox/mlx5/core/main.c | 23 +++++++++++++++++--
+ include/linux/mlx5/driver.h | 19 +++++++--------
+ 3 files changed, 32 insertions(+), 12 deletions(-)
+
+diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
+index 63c89a72cc352..bb13164124fdb 100644
+--- a/drivers/infiniband/hw/mlx5/main.c
++++ b/drivers/infiniband/hw/mlx5/main.c
+@@ -4336,7 +4336,7 @@ static int mlx5r_probe(struct auxiliary_device *adev,
+ dev->mdev = mdev;
+ dev->num_ports = num_ports;
+
+- if (ll == IB_LINK_LAYER_ETHERNET && !mlx5_is_roce_init_enabled(mdev))
++ if (ll == IB_LINK_LAYER_ETHERNET && !mlx5_get_roce_state(mdev))
+ profile = &raw_eth_profile;
+ else
+ profile = &pf_profile;
+diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
+index 64d54bba91f69..6c8bb74bd8fc6 100644
+--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
++++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
+@@ -501,6 +501,24 @@ static int max_uc_list_get_devlink_param(struct mlx5_core_dev *dev)
+ return err;
+ }
+
++bool mlx5_is_roce_on(struct mlx5_core_dev *dev)
++{
++ struct devlink *devlink = priv_to_devlink(dev);
++ union devlink_param_value val;
++ int err;
++
++ err = devlink_param_driverinit_value_get(devlink,
++ DEVLINK_PARAM_GENERIC_ID_ENABLE_ROCE,
++ &val);
++
++ if (!err)
++ return val.vbool;
++
++ mlx5_core_dbg(dev, "Failed to get param. err = %d\n", err);
++ return MLX5_CAP_GEN(dev, roce);
++}
++EXPORT_SYMBOL(mlx5_is_roce_on);
++
+ static int handle_hca_cap_2(struct mlx5_core_dev *dev, void *set_ctx)
+ {
+ void *set_hca_cap;
+@@ -604,7 +622,8 @@ static int handle_hca_cap(struct mlx5_core_dev *dev, void *set_ctx)
+ MLX5_CAP_GEN_MAX(dev, num_total_dynamic_vf_msix));
+
+ if (MLX5_CAP_GEN(dev, roce_rw_supported))
+- MLX5_SET(cmd_hca_cap, set_hca_cap, roce, mlx5_is_roce_init_enabled(dev));
++ MLX5_SET(cmd_hca_cap, set_hca_cap, roce,
++ mlx5_is_roce_on(dev));
+
+ max_uc_list = max_uc_list_get_devlink_param(dev);
+ if (max_uc_list > 0)
+@@ -630,7 +649,7 @@ static int handle_hca_cap(struct mlx5_core_dev *dev, void *set_ctx)
+ */
+ static bool is_roce_fw_disabled(struct mlx5_core_dev *dev)
+ {
+- return (MLX5_CAP_GEN(dev, roce_rw_supported) && !mlx5_is_roce_init_enabled(dev)) ||
++ return (MLX5_CAP_GEN(dev, roce_rw_supported) && !mlx5_is_roce_on(dev)) ||
+ (!MLX5_CAP_GEN(dev, roce_rw_supported) && !MLX5_CAP_GEN(dev, roce));
+ }
+
+diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
+index 0015a08ddbd24..b3ea245faa515 100644
+--- a/include/linux/mlx5/driver.h
++++ b/include/linux/mlx5/driver.h
+@@ -1275,16 +1275,17 @@ enum {
+ MLX5_TRIGGERED_CMD_COMP = (u64)1 << 32,
+ };
+
+-static inline bool mlx5_is_roce_init_enabled(struct mlx5_core_dev *dev)
++bool mlx5_is_roce_on(struct mlx5_core_dev *dev);
++
++static inline bool mlx5_get_roce_state(struct mlx5_core_dev *dev)
+ {
+- struct devlink *devlink = priv_to_devlink(dev);
+- union devlink_param_value val;
+- int err;
+-
+- err = devlink_param_driverinit_value_get(devlink,
+- DEVLINK_PARAM_GENERIC_ID_ENABLE_ROCE,
+- &val);
+- return err ? MLX5_CAP_GEN(dev, roce) : val.vbool;
++ if (MLX5_CAP_GEN(dev, roce_rw_supported))
++ return MLX5_CAP_GEN(dev, roce);
++
++ /* If RoCE cap is read-only in FW, get RoCE state from devlink
++ * in order to support RoCE enable/disable feature
++ */
++ return mlx5_is_roce_on(dev);
+ }
+
+ #endif /* MLX5_DRIVER_H */
+--
+2.35.1
+
--- /dev/null
+iommu-vt-d-fix-kdump-kernels-boot-failure-with-scala.patch
+net-mlx5-introduce-ifc-bits-for-using-software-vhca-.patch
+net-mlx5-use-software-vhca-id-when-it-s-supported.patch
+rdma-mlx5-rely-on-roce-fw-cap-instead-of-devlink-whe.patch
+rdma-mlx5-add-a-umr-recovery-flow.patch
+rdma-mlx5-fix-umr-cleanup-on-error-flow-of-driver-in.patch