RDMA/mlx5: Add DMAH support for reg_user_mr/reg_user_dmabuf_mr
author    Yishai Hadas <yishaih@nvidia.com>
          Thu, 17 Jul 2025 12:17:32 +0000 (15:17 +0300)
committer Leon Romanovsky <leon@kernel.org>
          Wed, 23 Jul 2025 05:42:11 +0000 (01:42 -0400)
As part of this enhancement, allow the creation of an MKEY associated
with a DMA handle (DMAH); the handle supplies the PCIe TPH attributes
(the processing hint and, optionally, a steering tag index) that are
programmed into the key.
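
For orientation, below is a condensed sketch of what the mr.c changes
in this patch do with the handle. set_mkc_tph() is a hypothetical
helper invented for this note (the patch open-codes this logic in
set_cache_mkc() and reg_create()); every other identifier is taken
from the diff.

static void set_mkc_tph(void *mkc, struct ib_dmah *dmah)
{
	u16 st_index = MLX5_MKC_PCIE_TPH_NO_STEERING_TAG_INDEX;
	u8 ph = MLX5_IB_NO_PH;

	if (dmah) {
		struct mlx5_ib_dmah *mdmah = to_mdmah(dmah);

		/* The DMAH always carries a processing hint; a steering
		 * tag index is present only when a CPU ID was supplied.
		 */
		ph = dmah->ph;
		if (dmah->valid_fields & BIT(IB_DMAH_CPU_ID_EXISTS))
			st_index = mdmah->st_index;
	}

	if (ph == MLX5_IB_NO_PH)
		return;		/* no DMA handle: TPH stays disabled */

	MLX5_SET(mkc, mkc, pcie_tph_en, 1);
	MLX5_SET(mkc, mkc, pcie_tph_ph, ph);
	if (st_index != MLX5_MKC_PCIE_TPH_NO_STEERING_TAG_INDEX)
		MLX5_SET(mkc, mkc, pcie_tph_steering_tag_index, st_index);
}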

Additional notes:

MKEYs with TPH (i.e., TLP Processing Hints) attributes are currently not
UMR-capable unless explicitly enabled by firmware or hardware.
Therefore, to keep such MKEYs in the MR cache, the TPH fields (ph,
st_index) have been added to the rb_key structure, giving each TPH
configuration a dedicated cache bucket.
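
To make the bucketing concrete, here is a minimal userspace model of
the extended key comparison (plain C with stdint types standing in for
the kernel ones; the access_mode and access_flags comparisons are
omitted for brevity). Keys that differ in ph or st_index compare
unequal, so a TPH MKEY is never served from a non-TPH bucket; the real
code is in cache_ent_key_cmp() below.

#include <stdint.h>
#include <stdio.h>

struct rb_key {
	uint8_t ats;
	uint8_t ph;		/* processing hint; 0xff models MLX5_IB_NO_PH */
	uint16_t st_index;	/* TPH steering tag index */
	unsigned int ndescs;
};

static int key_cmp(struct rb_key k1, struct rb_key k2)
{
	int res;

	res = k1.ats - k2.ats;
	if (res)
		return res;
	res = k1.st_index - k2.st_index;
	if (res)
		return res;
	res = k1.ph - k2.ph;
	if (res)
		return res;
	/* ndescs stays last: lookups may settle for the closest size */
	return (int)(k1.ndescs - k2.ndescs);
}

int main(void)
{
	struct rb_key plain = { .ph = 0xff };
	struct rb_key tph = { .ph = 0x2, .st_index = 5 };

	/* Prints 1: the two keys land in different cache buckets */
	printf("%d\n", key_cmp(plain, tph) != 0);
	return 0;
}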

Creating an MKEY with TPH attributes through DEVX, bypassing the
regular kernel verbs flow, is blocked. TPH must follow the standard
InfiniBand flow, where a DMAH is created with the appropriate security
checks and management mechanisms in place.

DMA handles are currently not supported in conjunction with On-Demand
Paging (ODP).

Re-registration of memory regions originally created with TPH attributes
is currently not supported.

Signed-off-by: Yishai Hadas <yishaih@nvidia.com>
Reviewed-by: Edward Srouji <edwards@nvidia.com>
Link: https://patch.msgid.link/1c485651cf8417694ddebb80446c5093d5a791a9.1752752567.git.leon@kernel.org
Signed-off-by: Leon Romanovsky <leon@kernel.org>
drivers/infiniband/hw/mlx5/devx.c
drivers/infiniband/hw/mlx5/mlx5_ib.h
drivers/infiniband/hw/mlx5/mr.c
drivers/infiniband/hw/mlx5/odp.c

diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c
index c03fe5414b9aa0e205f0991ac465dd3e585e4d63..c6ced06111dd0c1c92e63adb560582d82743284c 100644
--- a/drivers/infiniband/hw/mlx5/devx.c
+++ b/drivers/infiniband/hw/mlx5/devx.c
@@ -1393,6 +1393,10 @@ static int devx_handle_mkey_create(struct mlx5_ib_dev *dev,
        }
 
        MLX5_SET(create_mkey_in, in, mkey_umem_valid, 1);
+       /* TPH is not allowed to bypass the regular kernel's verbs flow */
+       MLX5_SET(mkc, mkc, pcie_tph_en, 0);
+       MLX5_SET(mkc, mkc, pcie_tph_steering_tag_index,
+                MLX5_MKC_PCIE_TPH_NO_STEERING_TAG_INDEX);
        return 0;
 }
 
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index dfd231333509dc0c72e229281ba95e43eb8a051c..692ec31d937a9281162deb54c3bcf417bb927a85 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -638,8 +638,13 @@ enum mlx5_mkey_type {
        MLX5_MKEY_IMPLICIT_CHILD,
 };
 
+/* Used for non-existent ph value */
+#define MLX5_IB_NO_PH 0xff
+
 struct mlx5r_cache_rb_key {
        u8 ats:1;
+       u8 ph;
+       u16 st_index;
        unsigned int access_mode;
        unsigned int access_flags;
        unsigned int ndescs;
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index 69f1403cc578d037e9ab16cac975edff2369f6c1..c47acb40c54310965063232ba5070b676a4ba6b8 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -44,6 +44,7 @@
 #include "mlx5_ib.h"
 #include "umr.h"
 #include "data_direct.h"
+#include "dmah.h"
 
 enum {
        MAX_PENDING_REG_MR = 8,
@@ -57,7 +58,7 @@ create_mkey_callback(int status, struct mlx5_async_work *context);
 static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem,
                                     u64 iova, int access_flags,
                                     unsigned long page_size, bool populate,
-                                    int access_mode);
+                                    int access_mode, u16 st_index, u8 ph);
 static int __mlx5_ib_dereg_mr(struct ib_mr *ibmr);
 
 static void set_mkc_access_pd_addr_fields(void *mkc, int acc, u64 start_addr,
@@ -256,6 +257,14 @@ static void set_cache_mkc(struct mlx5_cache_ent *ent, void *mkc)
                 get_mkc_octo_size(ent->rb_key.access_mode,
                                   ent->rb_key.ndescs));
        MLX5_SET(mkc, mkc, log_page_size, PAGE_SHIFT);
+
+       if (ent->rb_key.ph != MLX5_IB_NO_PH) {
+               MLX5_SET(mkc, mkc, pcie_tph_en, 1);
+               MLX5_SET(mkc, mkc, pcie_tph_ph, ent->rb_key.ph);
+               if (ent->rb_key.st_index != MLX5_MKC_PCIE_TPH_NO_STEERING_TAG_INDEX)
+                       MLX5_SET(mkc, mkc, pcie_tph_steering_tag_index,
+                                ent->rb_key.st_index);
+       }
 }
 
 /* Asynchronously schedule new MRs to be populated in the cache. */
@@ -641,6 +650,14 @@ static int cache_ent_key_cmp(struct mlx5r_cache_rb_key key1,
        if (res)
                return res;
 
+       res = key1.st_index - key2.st_index;
+       if (res)
+               return res;
+
+       res = key1.ph - key2.ph;
+       if (res)
+               return res;
+
        /*
         * keep ndescs the last in the compare table since the find function
         * searches for an exact match on all properties and only closest
@@ -712,6 +729,8 @@ mkey_cache_ent_from_rb_key(struct mlx5_ib_dev *dev,
                smallest->rb_key.access_mode == rb_key.access_mode &&
                smallest->rb_key.access_flags == rb_key.access_flags &&
                smallest->rb_key.ats == rb_key.ats &&
+               smallest->rb_key.st_index == rb_key.st_index &&
+               smallest->rb_key.ph == rb_key.ph &&
                smallest->rb_key.ndescs <= ndescs_limit) ?
                       smallest :
                       NULL;
@@ -786,7 +805,8 @@ struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev,
        struct mlx5r_cache_rb_key rb_key = {
                .ndescs = ndescs,
                .access_mode = access_mode,
-               .access_flags = get_unchangeable_access_flags(dev, access_flags)
+               .access_flags = get_unchangeable_access_flags(dev, access_flags),
+               .ph = MLX5_IB_NO_PH,
        };
        struct mlx5_cache_ent *ent = mkey_cache_ent_from_rb_key(dev, rb_key);
 
@@ -943,6 +963,7 @@ int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev)
        struct rb_root *root = &dev->cache.rb_root;
        struct mlx5r_cache_rb_key rb_key = {
                .access_mode = MLX5_MKC_ACCESS_MODE_MTT,
+               .ph = MLX5_IB_NO_PH,
        };
        struct mlx5_cache_ent *ent;
        struct rb_node *node;
@@ -1119,7 +1140,8 @@ static unsigned int mlx5_umem_dmabuf_default_pgsz(struct ib_umem *umem,
 
 static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd,
                                             struct ib_umem *umem, u64 iova,
-                                            int access_flags, int access_mode)
+                                            int access_flags, int access_mode,
+                                            u16 st_index, u8 ph)
 {
        struct mlx5_ib_dev *dev = to_mdev(pd->device);
        struct mlx5r_cache_rb_key rb_key = {};
@@ -1139,6 +1161,8 @@ static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd,
        rb_key.ndescs = ib_umem_num_dma_blocks(umem, page_size);
        rb_key.ats = mlx5_umem_needs_ats(dev, umem, access_flags);
        rb_key.access_flags = get_unchangeable_access_flags(dev, access_flags);
+       rb_key.st_index = st_index;
+       rb_key.ph = ph;
        ent = mkey_cache_ent_from_rb_key(dev, rb_key);
        /*
         * If the MR can't come from the cache then synchronously create an uncached
@@ -1146,7 +1170,8 @@ static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd,
         */
        if (!ent) {
                mutex_lock(&dev->slow_path_mutex);
-               mr = reg_create(pd, umem, iova, access_flags, page_size, false, access_mode);
+               mr = reg_create(pd, umem, iova, access_flags, page_size, false, access_mode,
+                               st_index, ph);
                mutex_unlock(&dev->slow_path_mutex);
                if (IS_ERR(mr))
                        return mr;
@@ -1231,7 +1256,7 @@ err_1:
 static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem,
                                     u64 iova, int access_flags,
                                     unsigned long page_size, bool populate,
-                                    int access_mode)
+                                    int access_mode, u16 st_index, u8 ph)
 {
        struct mlx5_ib_dev *dev = to_mdev(pd->device);
        struct mlx5_ib_mr *mr;
@@ -1241,7 +1266,8 @@ static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem,
        u32 *in;
        int err;
        bool pg_cap = !!(MLX5_CAP_GEN(dev->mdev, pg)) &&
-               (access_mode == MLX5_MKC_ACCESS_MODE_MTT);
+               (access_mode == MLX5_MKC_ACCESS_MODE_MTT) &&
+               (ph == MLX5_IB_NO_PH);
        bool ksm_mode = (access_mode == MLX5_MKC_ACCESS_MODE_KSM);
 
        if (!page_size)
@@ -1305,6 +1331,13 @@ static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem,
                         get_octo_len(iova, umem->length, mr->page_shift));
        }
 
+       if (ph != MLX5_IB_NO_PH) {
+               MLX5_SET(mkc, mkc, pcie_tph_en, 1);
+               MLX5_SET(mkc, mkc, pcie_tph_ph, ph);
+               if (st_index != MLX5_MKC_PCIE_TPH_NO_STEERING_TAG_INDEX)
+                       MLX5_SET(mkc, mkc, pcie_tph_steering_tag_index, st_index);
+       }
+
        err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
        if (err) {
                mlx5_ib_warn(dev, "create mkey failed\n");
@@ -1424,24 +1457,37 @@ struct ib_mr *mlx5_ib_reg_dm_mr(struct ib_pd *pd, struct ib_dm *dm,
 }
 
 static struct ib_mr *create_real_mr(struct ib_pd *pd, struct ib_umem *umem,
-                                   u64 iova, int access_flags)
+                                   u64 iova, int access_flags,
+                                   struct ib_dmah *dmah)
 {
        struct mlx5_ib_dev *dev = to_mdev(pd->device);
        struct mlx5_ib_mr *mr = NULL;
        bool xlt_with_umr;
+       u16 st_index = MLX5_MKC_PCIE_TPH_NO_STEERING_TAG_INDEX;
+       u8 ph = MLX5_IB_NO_PH;
        int err;
 
+       if (dmah) {
+               struct mlx5_ib_dmah *mdmah = to_mdmah(dmah);
+
+               ph = dmah->ph;
+               if (dmah->valid_fields & BIT(IB_DMAH_CPU_ID_EXISTS))
+                       st_index = mdmah->st_index;
+       }
+
        xlt_with_umr = mlx5r_umr_can_load_pas(dev, umem->length);
        if (xlt_with_umr) {
                mr = alloc_cacheable_mr(pd, umem, iova, access_flags,
-                                       MLX5_MKC_ACCESS_MODE_MTT);
+                                       MLX5_MKC_ACCESS_MODE_MTT,
+                                       st_index, ph);
        } else {
                unsigned long page_size = mlx5_umem_mkc_find_best_pgsz(
                                dev, umem, iova, MLX5_MKC_ACCESS_MODE_MTT);
 
                mutex_lock(&dev->slow_path_mutex);
                mr = reg_create(pd, umem, iova, access_flags, page_size,
-                               true, MLX5_MKC_ACCESS_MODE_MTT);
+                               true, MLX5_MKC_ACCESS_MODE_MTT,
+                               st_index, ph);
                mutex_unlock(&dev->slow_path_mutex);
        }
        if (IS_ERR(mr)) {
@@ -1505,7 +1551,9 @@ static struct ib_mr *create_user_odp_mr(struct ib_pd *pd, u64 start, u64 length,
                return ERR_CAST(odp);
 
        mr = alloc_cacheable_mr(pd, &odp->umem, iova, access_flags,
-                               MLX5_MKC_ACCESS_MODE_MTT);
+                               MLX5_MKC_ACCESS_MODE_MTT,
+                               MLX5_MKC_PCIE_TPH_NO_STEERING_TAG_INDEX,
+                               MLX5_IB_NO_PH);
        if (IS_ERR(mr)) {
                ib_umem_release(&odp->umem);
                return ERR_CAST(mr);
@@ -1536,7 +1584,8 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
        struct ib_umem *umem;
        int err;
 
-       if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM) || dmah)
+       if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM) ||
+           ((access_flags & IB_ACCESS_ON_DEMAND) && dmah))
                return ERR_PTR(-EOPNOTSUPP);
 
        mlx5_ib_dbg(dev, "start 0x%llx, iova 0x%llx, length 0x%llx, access_flags 0x%x\n",
@@ -1552,7 +1601,7 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
        umem = ib_umem_get(&dev->ib_dev, start, length, access_flags);
        if (IS_ERR(umem))
                return ERR_CAST(umem);
-       return create_real_mr(pd, umem, iova, access_flags);
+       return create_real_mr(pd, umem, iova, access_flags, dmah);
 }
 
 static void mlx5_ib_dmabuf_invalidate_cb(struct dma_buf_attachment *attach)
@@ -1577,12 +1626,15 @@ static struct dma_buf_attach_ops mlx5_ib_dmabuf_attach_ops = {
 static struct ib_mr *
 reg_user_mr_dmabuf(struct ib_pd *pd, struct device *dma_device,
                   u64 offset, u64 length, u64 virt_addr,
-                  int fd, int access_flags, int access_mode)
+                  int fd, int access_flags, int access_mode,
+                  struct ib_dmah *dmah)
 {
        bool pinned_mode = (access_mode == MLX5_MKC_ACCESS_MODE_KSM);
        struct mlx5_ib_dev *dev = to_mdev(pd->device);
        struct mlx5_ib_mr *mr = NULL;
        struct ib_umem_dmabuf *umem_dmabuf;
+       u16 st_index = MLX5_MKC_PCIE_TPH_NO_STEERING_TAG_INDEX;
+       u8 ph = MLX5_IB_NO_PH;
        int err;
 
        err = mlx5r_umr_resource_init(dev);
@@ -1605,8 +1657,17 @@ reg_user_mr_dmabuf(struct ib_pd *pd, struct device *dma_device,
                return ERR_CAST(umem_dmabuf);
        }
 
+       if (dmah) {
+               struct mlx5_ib_dmah *mdmah = to_mdmah(dmah);
+
+               ph = dmah->ph;
+               if (dmah->valid_fields & BIT(IB_DMAH_CPU_ID_EXISTS))
+                       st_index = mdmah->st_index;
+       }
+
        mr = alloc_cacheable_mr(pd, &umem_dmabuf->umem, virt_addr,
-                               access_flags, access_mode);
+                               access_flags, access_mode,
+                               st_index, ph);
        if (IS_ERR(mr)) {
                ib_umem_release(&umem_dmabuf->umem);
                return ERR_CAST(mr);
@@ -1663,7 +1724,8 @@ reg_user_mr_dmabuf_by_data_direct(struct ib_pd *pd, u64 offset,
        access_flags &= ~IB_ACCESS_RELAXED_ORDERING;
        crossed_mr = reg_user_mr_dmabuf(pd, &data_direct_dev->pdev->dev,
                                        offset, length, virt_addr, fd,
-                                       access_flags, MLX5_MKC_ACCESS_MODE_KSM);
+                                       access_flags, MLX5_MKC_ACCESS_MODE_KSM,
+                                       NULL);
        if (IS_ERR(crossed_mr)) {
                ret = PTR_ERR(crossed_mr);
                goto end;
@@ -1698,7 +1760,7 @@ struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 offset,
        int err;
 
        if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM) ||
-           !IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) || dmah)
+           !IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING))
                return ERR_PTR(-EOPNOTSUPP);
 
        if (uverbs_attr_is_valid(attrs, MLX5_IB_ATTR_REG_DMABUF_MR_ACCESS_FLAGS)) {
@@ -1723,7 +1785,8 @@ struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 offset,
 
        return reg_user_mr_dmabuf(pd, pd->device->dma_device,
                                  offset, length, virt_addr,
-                                 fd, access_flags, MLX5_MKC_ACCESS_MODE_MTT);
+                                 fd, access_flags, MLX5_MKC_ACCESS_MODE_MTT,
+                                 dmah);
 }
 
 /*
@@ -1821,7 +1884,8 @@ struct ib_mr *mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
        struct mlx5_ib_mr *mr = to_mmr(ib_mr);
        int err;
 
-       if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM) || mr->data_direct)
+       if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM) || mr->data_direct ||
+           mr->mmkey.rb_key.ph != MLX5_IB_NO_PH)
                return ERR_PTR(-EOPNOTSUPP);
 
        mlx5_ib_dbg(
@@ -1865,7 +1929,7 @@ struct ib_mr *mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
                atomic_sub(ib_umem_num_pages(umem), &dev->mdev->priv.reg_pages);
 
                return create_real_mr(new_pd, umem, mr->ibmr.iova,
-                                     new_access_flags);
+                                     new_access_flags, NULL);
        }
 
        /*
@@ -1896,7 +1960,7 @@ struct ib_mr *mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
                        }
                        return NULL;
                }
-               return create_real_mr(new_pd, new_umem, iova, new_access_flags);
+               return create_real_mr(new_pd, new_umem, iova, new_access_flags, NULL);
        }
 
        /*
diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
index 1199fc785e085a6cc3dcadc85ebaad569b725fbf..3459844eb5a9fcdd002b5931f4a05bc965d82c08 100644
--- a/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -1883,6 +1883,7 @@ int mlx5_odp_init_mkey_cache(struct mlx5_ib_dev *dev)
        struct mlx5r_cache_rb_key rb_key = {
                .access_mode = MLX5_MKC_ACCESS_MODE_KSM,
                .ndescs = mlx5_imr_ksm_entries,
+               .ph = MLX5_IB_NO_PH,
        };
        struct mlx5_cache_ent *ent;