To prepare for assigning vPorts to dedicated MSI-X vectors, remove EQ
sharing among the vPorts and create dedicated EQs for each vPort.
Move the EQ definition from struct mana_context to struct mana_port_context
and update related support functions. Export mana_create_eq() and
mana_destroy_eq() for use by the MANA RDMA driver.
RSS QPs now take a vport reference via pd->vport_use_count to ensure
EQs outlive all QP consumers. The vport must already be configured by
a raw QP before an RSS QP can be created. EQs are only destroyed when
the last QP (raw or RSS) on the PD releases its reference.
Restrict each vport to a single RSS QP. The hardware only supports one
steering configuration (indirection table / hash key) per vport, and
mana_disable_vport_rx() on QP destroy disables RX globally for the
vport. Previously, creating a second RSS QP would silently overwrite
the first QP's steering config and destroy would blackhole all traffic.
This is now explicitly rejected with -EBUSY. Existing applications
(DPDK being the primary RDMA consumer) always create one RSS QP per
vport, so no real-world flows are affected.
Reject cross-port PD sharing for both raw and RSS QPs. Since EQs and
vport configuration are per-port, a PD is bound to the port used by
its first raw QP. Subsequent QPs on the same PD must use the same
port or the creation fails with -EINVAL. Previously this was silently
broken: with shared EQs it appeared to work, but with per-vPort EQs
a cross-port PD would cause wrong-port EQ teardown and corruption.
DPDK creates one PD per port so no existing flows are affected.
Serialize mana_set_channels() and the async per-port queue reset
handler against RDMA vport configuration to prevent RDMA from claiming
the vport during the detach/attach window. A channel_changing flag is
set under apc->vport_mutex before detach and checked by
mana_cfg_vport() when called from the RDMA path, blocking RDMA from
grabbing the vport during the entire window. When the port is down
and RDMA already holds the vport, the channel change is rejected with
-EBUSY.
Signed-off-by: Long Li <longli@microsoft.com>
Link: https://patch.msgid.link/20260605005717.2059954-2-longli@microsoft.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
pd->vport_use_count--;
WARN_ON(pd->vport_use_count < 0);
- if (!pd->vport_use_count)
+ if (!pd->vport_use_count) {
+ mana_destroy_eq(mpc);
mana_uncfg_vport(mpc);
+ }
mutex_unlock(&pd->vport_mutex);
}
pd->vport_use_count++;
if (pd->vport_use_count > 1) {
+ /* Reject cross-port PD sharing. EQs and vport config
+ * are per-port, so the PD must stay bound to the port
+ * that was configured on the first raw QP creation.
+ */
+ if (pd->vport_port != port) {
+ pd->vport_use_count--;
+ mutex_unlock(&pd->vport_mutex);
+ ibdev_dbg(&dev->ib_dev,
+ "PD already bound to port %u\n",
+ pd->vport_port);
+ return -EINVAL;
+ }
ibdev_dbg(&dev->ib_dev,
"Skip as this PD is already configured vport\n");
mutex_unlock(&pd->vport_mutex);
return 0;
}
- err = mana_cfg_vport(mpc, pd->pdn, doorbell_id);
+ pd->vport_port = port;
+
+ err = mana_cfg_vport(mpc, pd->pdn, doorbell_id, true);
if (err) {
pd->vport_use_count--;
mutex_unlock(&pd->vport_mutex);
return err;
}
- mutex_unlock(&pd->vport_mutex);
- pd->tx_shortform_allowed = mpc->tx_shortform_allowed;
- pd->tx_vp_offset = mpc->tx_vp_offset;
+ err = mana_create_eq(mpc);
+ if (err) {
+ mana_uncfg_vport(mpc);
+ pd->vport_use_count--;
+ } else {
+ pd->tx_shortform_allowed = mpc->tx_shortform_allowed;
+ pd->tx_vp_offset = mpc->tx_vp_offset;
+ }
+
+ mutex_unlock(&pd->vport_mutex);
- ibdev_dbg(&dev->ib_dev, "vport handle %llx pdid %x doorbell_id %x\n",
- mpc->port_handle, pd->pdn, doorbell_id);
+ if (!err)
+ ibdev_dbg(&dev->ib_dev, "vport handle %llx pdid %x doorbell_id %x\n",
+ mpc->port_handle, pd->pdn, doorbell_id);
- return 0;
+ return err;
}
int mana_ib_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata)
struct mutex vport_mutex;
int vport_use_count;
+ /* Port bound to this PD for raw QP usage. Only valid when
+ * vport_use_count > 0. A PD can only be associated with a
+ * single physical port because per-port EQs and vport
+ * configuration are tied to the PD's refcount.
+ */
+ u32 vport_port;
+
+ /* Only one RSS QP is allowed per vport because each RSS QP
+ * overwrites the vport steering config (indirection table /
+ * hash key) and mana_disable_vport_rx() on destroy would
+ * blackhole traffic for any other RSS QP on the same vport.
+ */
+ bool has_rss_qp;
+
bool tx_shortform_allowed;
u32 tx_vp_offset;
};
struct ib_qp_init_attr *attr,
struct ib_udata *udata)
{
+ struct mana_ib_pd *mana_pd = container_of(pd, struct mana_ib_pd, ibpd);
struct mana_ib_qp *qp = container_of(ibqp, struct mana_ib_qp, ibqp);
struct mana_ib_dev *mdev =
container_of(pd->device, struct mana_ib_dev, ib_dev);
qp->port = port;
+ /* Take a reference on the vport to ensure EQs outlive this QP.
+ * The vport must already be configured by a raw QP on the
+ * same port — cross-port PD sharing is not supported.
+ * Only one RSS QP per vport is allowed because each one
+ * overwrites the steering config and destroy disables RX
+ * globally.
+ */
+ mutex_lock(&mana_pd->vport_mutex);
+ if (!mana_pd->vport_use_count || mana_pd->vport_port != port) {
+ mutex_unlock(&mana_pd->vport_mutex);
+ ret = -EINVAL;
+ goto fail;
+ }
+ if (mana_pd->has_rss_qp) {
+ mutex_unlock(&mana_pd->vport_mutex);
+ ibdev_dbg(&mdev->ib_dev,
+ "Only one RSS QP per vport is supported\n");
+ ret = -EBUSY;
+ goto fail;
+ }
+ mana_pd->vport_use_count++;
+ mana_pd->has_rss_qp = true;
+ mutex_unlock(&mana_pd->vport_mutex);
+
for (i = 0; i < ind_tbl_size; i++) {
struct mana_obj_spec wq_spec = {};
struct mana_obj_spec cq_spec = {};
cq_spec.gdma_region = cq->queue.gdma_region;
cq_spec.queue_size = cq->cqe * COMP_ENTRY_SIZE;
cq_spec.modr_ctx_id = 0;
- eq = &mpc->ac->eqs[cq->comp_vector];
+ /* Map comp_vector to a per-vPort EQ. The modulo handles
+ * the case where the RDMA-advertised num_comp_vectors
+ * exceeds this port's num_queues (e.g. after ethtool -L
+ * reduces it), remapping to an available EQ rather than
+ * failing the QP creation.
+ */
+ eq = &mpc->eqs[cq->comp_vector % mpc->num_queues];
cq_spec.attached_eq = eq->eq->id;
ret = mana_create_wq_obj(mpc, mpc->port_handle, GDMA_RQ,
&wq_spec, &cq_spec, &wq->rx_object);
if (ret)
- goto fail;
+ goto free_vport;
/* The GDMA regions are now owned by the WQ object */
wq->queue.gdma_region = GDMA_INVALID_DMA_REGION;
ret = mana_ib_install_cq_cb(mdev, cq);
if (ret) {
mana_destroy_wq_obj(mpc, GDMA_RQ, wq->rx_object);
- goto fail;
+ goto free_vport;
}
}
resp.num_entries = i;
ucmd.rx_hash_key_len,
ucmd.rx_hash_key);
if (ret)
- goto fail;
+ goto free_vport;
ret = ib_copy_to_udata(udata, &resp, sizeof(resp));
if (ret) {
err_disable_vport_rx:
mana_disable_vport_rx(mpc);
-fail:
+free_vport:
while (i-- > 0) {
ibwq = ind_tbl->ind_tbl[i];
ibcq = ibwq->cq;
mana_destroy_wq_obj(mpc, GDMA_RQ, wq->rx_object);
}
+ mutex_lock(&mana_pd->vport_mutex);
+ mana_pd->has_rss_qp = false;
+ mutex_unlock(&mana_pd->vport_mutex);
+
+ mana_ib_uncfg_vport(mdev, mana_pd, port);
+
+fail:
kfree(mana_ind_table);
return ret;
err = mana_ib_cfg_vport(mdev, port, pd, mana_ucontext->doorbell);
if (err)
- return -ENODEV;
+ return err;
qp->port = port;
cq_spec.queue_size = send_cq->cqe * COMP_ENTRY_SIZE;
cq_spec.modr_ctx_id = 0;
eq_vec = send_cq->comp_vector;
- eq = &mpc->ac->eqs[eq_vec];
+ if (!mpc->eqs) {
+ err = -EINVAL;
+ goto err_destroy_queue;
+ }
+ /* Map comp_vector to a per-vPort EQ. See comment in
+ * mana_ib_create_qp_rss() for the modulo rationale.
+ */
+ eq = &mpc->eqs[eq_vec % mpc->num_queues];
cq_spec.attached_eq = eq->eq->id;
err = mana_create_wq_obj(mpc, mpc->port_handle, GDMA_SQ, &wq_spec,
{
struct mana_ib_dev *mdev =
container_of(qp->ibqp.device, struct mana_ib_dev, ib_dev);
+ struct ib_pd *ibpd = qp->ibqp.pd;
struct mana_port_context *mpc;
struct net_device *ndev;
+ struct mana_ib_pd *pd;
struct mana_ib_wq *wq;
struct ib_wq *ibwq;
int i;
ndev = mana_ib_get_netdev(qp->ibqp.device, qp->port);
mpc = netdev_priv(ndev);
+ pd = container_of(ibpd, struct mana_ib_pd, ibpd);
/* Disable vPort RX steering before destroying RX WQ objects.
* Otherwise firmware still routes traffic to the destroyed queues,
mana_destroy_wq_obj(mpc, GDMA_RQ, wq->rx_object);
}
+ mutex_lock(&pd->vport_mutex);
+ pd->has_rss_qp = false;
+ mutex_unlock(&pd->vport_mutex);
+
+ mana_ib_uncfg_vport(mdev, pd, qp->port);
+
return 0;
}
rtnl_lock();
+ /* Block RDMA from grabbing the vport during the detach/attach
+ * window, same as mana_set_channels().
+ */
+ mutex_lock(&apc->vport_mutex);
+ apc->channel_changing = true;
+ mutex_unlock(&apc->vport_mutex);
+
/* Pre-allocate buffers to prevent failure in mana_attach later */
err = mana_pre_alloc_rxbufs(apc, ndev->mtu, apc->num_queues);
if (err) {
netdev_err(ndev, "Insufficient memory for reset post tx stall detection\n");
- goto out;
+ goto clear_flag;
}
err = mana_detach(ndev, false);
dealloc_pre_rxbufs:
mana_pre_dealloc_rxbufs(apc);
-out:
+clear_flag:
+ mutex_lock(&apc->vport_mutex);
+ apc->channel_changing = false;
+ mutex_unlock(&apc->vport_mutex);
+
rtnl_unlock();
}
EXPORT_SYMBOL_NS(mana_uncfg_vport, "NET_MANA");
int mana_cfg_vport(struct mana_port_context *apc, u32 protection_dom_id,
- u32 doorbell_pg_id)
+ u32 doorbell_pg_id, bool check_channel_changing)
{
struct mana_config_vport_resp resp = {};
struct mana_config_vport_req req = {};
* Ethernet usage on the same port.
*/
mutex_lock(&apc->vport_mutex);
- if (apc->vport_use_count > 0) {
+ if (apc->vport_use_count > 0 ||
+ (check_channel_changing && apc->channel_changing)) {
mutex_unlock(&apc->vport_mutex);
return -EBUSY;
}
}
EXPORT_SYMBOL_NS(mana_destroy_wq_obj, "NET_MANA");
-static void mana_destroy_eq(struct mana_context *ac)
+void mana_destroy_eq(struct mana_port_context *apc)
{
+ struct mana_context *ac = apc->ac;
struct gdma_context *gc = ac->gdma_dev->gdma_context;
struct gdma_queue *eq;
int i;
- if (!ac->eqs)
+ if (!apc->eqs)
return;
- debugfs_remove_recursive(ac->mana_eqs_debugfs);
- ac->mana_eqs_debugfs = NULL;
+ debugfs_remove_recursive(apc->mana_eqs_debugfs);
+ apc->mana_eqs_debugfs = NULL;
- for (i = 0; i < gc->max_num_queues; i++) {
- eq = ac->eqs[i].eq;
+ for (i = 0; i < apc->num_queues; i++) {
+ eq = apc->eqs[i].eq;
if (!eq)
continue;
mana_gd_destroy_queue(gc, eq);
}
- kfree(ac->eqs);
- ac->eqs = NULL;
+ kfree(apc->eqs);
+ apc->eqs = NULL;
}
+EXPORT_SYMBOL_NS(mana_destroy_eq, "NET_MANA");
-static void mana_create_eq_debugfs(struct mana_context *ac, int i)
+static void mana_create_eq_debugfs(struct mana_port_context *apc, int i)
{
- struct mana_eq eq = ac->eqs[i];
+ struct mana_eq eq = apc->eqs[i];
char eqnum[32];
sprintf(eqnum, "eq%d", i);
- eq.mana_eq_debugfs = debugfs_create_dir(eqnum, ac->mana_eqs_debugfs);
+ eq.mana_eq_debugfs = debugfs_create_dir(eqnum, apc->mana_eqs_debugfs);
debugfs_create_u32("head", 0400, eq.mana_eq_debugfs, &eq.eq->head);
debugfs_create_u32("tail", 0400, eq.mana_eq_debugfs, &eq.eq->tail);
debugfs_create_file("eq_dump", 0400, eq.mana_eq_debugfs, eq.eq, &mana_dbg_q_fops);
}
-static int mana_create_eq(struct mana_context *ac)
+int mana_create_eq(struct mana_port_context *apc)
{
- struct gdma_dev *gd = ac->gdma_dev;
+ struct gdma_dev *gd = apc->ac->gdma_dev;
struct gdma_context *gc = gd->gdma_context;
struct gdma_queue_spec spec = {};
int err;
int i;
- ac->eqs = kzalloc_objs(struct mana_eq, gc->max_num_queues);
- if (!ac->eqs)
+ if (WARN_ON(apc->eqs))
+ return -EEXIST;
+ apc->eqs = kzalloc_objs(struct mana_eq, apc->num_queues);
+ if (!apc->eqs)
return -ENOMEM;
spec.type = GDMA_EQ;
spec.monitor_avl_buf = false;
spec.queue_size = EQ_SIZE;
spec.eq.callback = NULL;
- spec.eq.context = ac->eqs;
+ spec.eq.context = apc->eqs;
spec.eq.log2_throttle_limit = LOG2_EQ_THROTTLE;
- ac->mana_eqs_debugfs = debugfs_create_dir("EQs", gc->mana_pci_debugfs);
+ apc->mana_eqs_debugfs =
+ debugfs_create_dir("EQs", apc->mana_port_debugfs);
- for (i = 0; i < gc->max_num_queues; i++) {
+ for (i = 0; i < apc->num_queues; i++) {
spec.eq.msix_index = (i + 1) % gc->num_msix_usable;
- err = mana_gd_create_mana_eq(gd, &spec, &ac->eqs[i].eq);
+ err = mana_gd_create_mana_eq(gd, &spec, &apc->eqs[i].eq);
if (err) {
dev_err(gc->dev, "Failed to create EQ %d : %d\n", i, err);
goto out;
}
- mana_create_eq_debugfs(ac, i);
+ mana_create_eq_debugfs(apc, i);
}
return 0;
out:
- mana_destroy_eq(ac);
+ mana_destroy_eq(apc);
return err;
}
+EXPORT_SYMBOL_NS(mana_create_eq, "NET_MANA");
static int mana_fence_rq(struct mana_port_context *apc, struct mana_rxq *rxq)
{
spec.monitor_avl_buf = false;
spec.queue_size = cq_size;
spec.cq.callback = mana_schedule_napi;
- spec.cq.parent_eq = ac->eqs[i].eq;
+ spec.cq.parent_eq = apc->eqs[i].eq;
spec.cq.context = cq;
err = mana_gd_create_mana_wq_cq(gd, &spec, &cq->gdma_cq);
if (err)
static int mana_add_rx_queues(struct mana_port_context *apc,
struct net_device *ndev)
{
- struct mana_context *ac = apc->ac;
struct mana_rxq *rxq;
int err = 0;
int i;
for (i = 0; i < apc->num_queues; i++) {
- rxq = mana_create_rxq(apc, i, &ac->eqs[i], ndev);
+ rxq = mana_create_rxq(apc, i, &apc->eqs[i], ndev);
if (!rxq) {
err = -ENOMEM;
netdev_err(ndev, "Failed to create rxq %d : %d\n", i, err);
return err;
}
-static void mana_destroy_vport(struct mana_port_context *apc)
+static void mana_destroy_rxqs(struct mana_port_context *apc)
{
- struct gdma_dev *gd = apc->ac->gdma_dev;
struct mana_rxq *rxq;
u32 rxq_idx;
apc->rxqs[rxq_idx] = NULL;
}
}
+}
+
+static void mana_destroy_vport(struct mana_port_context *apc)
+{
+ struct gdma_dev *gd = apc->ac->gdma_dev;
- mana_destroy_txq(apc);
mana_uncfg_vport(apc);
if (gd->gdma_context->is_pf && !apc->ac->bm_hostmode)
return err;
}
- err = mana_cfg_vport(apc, gd->pdid, gd->doorbell);
- if (err)
+ err = mana_cfg_vport(apc, gd->pdid, gd->doorbell, false);
+ if (err) {
+ if (gd->gdma_context->is_pf && !apc->ac->bm_hostmode)
+ mana_pf_deregister_hw_vport(apc);
return err;
+ }
- return mana_create_txq(apc, net);
+ return 0;
}
static int mana_rss_table_alloc(struct mana_port_context *apc)
err = mana_create_vport(apc, ndev);
if (err) {
- netdev_err(ndev, "Failed to create vPort %u : %d\n", apc->port_idx, err);
+ netdev_err(ndev, "Failed to create vPort %u : %d\n",
+ apc->port_idx, err);
return err;
}
+ err = mana_create_eq(apc);
+ if (err) {
+ netdev_err(ndev, "Failed to create EQ on vPort %u: %d\n",
+ apc->port_idx, err);
+ goto destroy_vport;
+ }
+
+ err = mana_create_txq(apc, ndev);
+ if (err) {
+ netdev_err(ndev, "Failed to create TXQ on vPort %u: %d\n",
+ apc->port_idx, err);
+ goto destroy_eq;
+ }
+
err = netif_set_real_num_tx_queues(ndev, apc->num_queues);
if (err) {
netdev_err(ndev,
"netif_set_real_num_tx_queues () failed for ndev with num_queues %u : %d\n",
apc->num_queues, err);
- goto destroy_vport;
+ goto destroy_txq;
}
err = mana_add_rx_queues(apc, ndev);
if (err)
- goto destroy_vport;
+ goto destroy_rxq;
apc->rss_state = apc->num_queues > 1 ? TRI_STATE_TRUE : TRI_STATE_FALSE;
netdev_err(ndev,
"netif_set_real_num_rx_queues () failed for ndev with num_queues %u : %d\n",
apc->num_queues, err);
- goto destroy_vport;
+ goto destroy_rxq;
}
mana_rss_table_init(apc);
err = mana_config_rss(apc, TRI_STATE_TRUE, true, true);
if (err) {
netdev_err(ndev, "Failed to configure RSS table: %d\n", err);
- goto destroy_vport;
+ goto destroy_rxq;
}
if (gd->gdma_context->is_pf && !apc->ac->bm_hostmode) {
err = mana_pf_register_filter(apc);
if (err)
- goto destroy_vport;
+ goto destroy_rxq;
}
mana_chn_setxdp(apc, mana_xdp_get(apc));
return 0;
+destroy_rxq:
+ mana_destroy_rxqs(apc);
+destroy_txq:
+ mana_destroy_txq(apc);
+destroy_eq:
+ mana_destroy_eq(apc);
destroy_vport:
mana_destroy_vport(apc);
return err;
mana_fence_rqs(apc);
/* Even in err case, still need to cleanup the vPort */
+ mana_destroy_rxqs(apc);
+ mana_destroy_txq(apc);
+ mana_destroy_eq(apc);
mana_destroy_vport(apc);
return 0;
INIT_DELAYED_WORK(&ac->gf_stats_work, mana_gf_stats_work_handler);
- err = mana_create_eq(ac);
- if (err) {
- dev_err(dev, "Failed to create EQs: %d\n", err);
- goto out;
- }
-
err = mana_query_device_cfg(ac, MANA_MAJOR_VERSION, MANA_MINOR_VERSION,
MANA_MICRO_VERSION, &num_ports, &bm_hostmode);
if (err)
free_netdev(ndev);
}
- mana_destroy_eq(ac);
-
if (ac->per_port_queue_reset_wq) {
destroy_workqueue(ac->per_port_queue_reset_wq);
ac->per_port_queue_reset_wq = NULL;
return err;
}
+/* mana_set_channels - change the number of queues on a port
+ *
+ * Returns -EBUSY if RDMA holds the vport with EQs sized to the
+ * current num_queues.
+ */
static int mana_set_channels(struct net_device *ndev,
struct ethtool_channels *channels)
{
unsigned int old_count = apc->num_queues;
int err;
+ /* Set channel_changing to block RDMA from grabbing the vport
+ * during the detach/attach window. mana_cfg_vport() checks
+ * this flag under vport_mutex and returns -EBUSY if set.
+ */
+ mutex_lock(&apc->vport_mutex);
+ if (!apc->port_is_up && apc->vport_use_count) {
+ mutex_unlock(&apc->vport_mutex);
+ return -EBUSY;
+ }
+ apc->channel_changing = true;
+ mutex_unlock(&apc->vport_mutex);
+
err = mana_pre_alloc_rxbufs(apc, ndev->mtu, new_count);
if (err) {
netdev_err(ndev, "Insufficient memory for new allocations");
- return err;
+ goto clear_flag;
}
err = mana_detach(ndev, false);
out:
mana_pre_dealloc_rxbufs(apc);
+clear_flag:
+ mutex_lock(&apc->vport_mutex);
+ apc->channel_changing = false;
+ mutex_unlock(&apc->vport_mutex);
return err;
}
u8 bm_hostmode;
struct mana_ethtool_hc_stats hc_stats;
- struct mana_eq *eqs;
- struct dentry *mana_eqs_debugfs;
struct workqueue_struct *per_port_queue_reset_wq;
/* Workqueue for querying hardware stats */
struct delayed_work gf_stats_work;
u8 mac_addr[ETH_ALEN];
+ struct mana_eq *eqs;
+ struct dentry *mana_eqs_debugfs;
+
enum TRI_STATE rss_state;
mana_handle_t default_rxobj;
struct mutex vport_mutex;
int vport_use_count;
+ /* Set by mana_set_channels() under vport_mutex to block RDMA
+ * from grabbing the vport during the detach/attach window.
+ * Checked by mana_cfg_vport() when called from the RDMA path.
+ */
+ bool channel_changing;
+
/* Net shaper handle*/
struct net_shaper_handle handle;
mana_handle_t wq_obj);
int mana_cfg_vport(struct mana_port_context *apc, u32 protection_dom_id,
- u32 doorbell_pg_id);
+ u32 doorbell_pg_id, bool check_channel_changing);
void mana_uncfg_vport(struct mana_port_context *apc);
+int mana_create_eq(struct mana_port_context *apc);
+void mana_destroy_eq(struct mana_port_context *apc);
struct net_device *mana_get_primary_netdev(struct mana_context *ac,
u32 port_index,