git.ipfire.org Git - thirdparty/kernel/linux.git/commitdiff
RDMA/irdma: Add support for revocable pinned dmabuf import
authorJacob Moroni <jmoroni@google.com>
Thu, 5 Mar 2026 17:08:26 +0000 (17:08 +0000)
committerLeon Romanovsky <leon@kernel.org>
Sun, 8 Mar 2026 12:37:38 +0000 (08:37 -0400)
Use the new API to support importing pinned dmabufs from exporters
that require revocation, such as VFIO. The revoke semantic is
achieved by issuing a HW invalidation command but not freeing
the key. This prevents further accesses to the region (they will
result in an invalid key AE), but also keeps the key reserved
until the region is actually deregistered (i.e., ibv_dereg_mr)
so that a new MR registration cannot acquire the same key.

Tested with lockdep+kasan and a memfd backed dmabuf.

The rereg_mr path is explicitly blocked in libibverbs for dmabuf MRs
(more specifically, any MR not of type IBV_MR_TYPE_MR), so the rereg_mr
path for dmabufs was tested with a modified libibverbs.

Signed-off-by: Jacob Moroni <jmoroni@google.com>
Link: https://patch.msgid.link/20260305170826.3803155-6-jmoroni@google.com
Signed-off-by: Leon Romanovsky <leon@kernel.org>
drivers/infiniband/hw/irdma/verbs.c

index 7251cd7a21471e80e821210341f118257d3370ed..1d0c2d8453a8ab3afb5c0199eb1be3e9a0cea108 100644 (file)
@@ -3590,6 +3590,36 @@ error:
        return ERR_PTR(err);
 }
 
+static int irdma_hwdereg_mr(struct ib_mr *ib_mr);
+
+static void irdma_umem_dmabuf_revoke(void *priv)
+{
+       /* priv is guaranteed to be valid any time this callback is invoked
+        * because we do not set the callback until after successful iwmr
+        * allocation and initialization.
+        */
+       struct irdma_mr *iwmr = priv;
+       int err;
+
+       /* Invalidate the key in hardware. This does not actually release the
+        * key for potential reuse - that only occurs when the region is fully
+        * deregistered.
+        *
+        * The irdma_hwdereg_mr call is a no-op if the region is not currently
+        * registered with hardware.
+        */
+       err = irdma_hwdereg_mr(&iwmr->ibmr);
+       if (err) {
+               struct irdma_device *iwdev = to_iwdev(iwmr->ibmr.device);
+
+               ibdev_err(&iwdev->ibdev, "dmabuf mr revoke failed %d", err);
+               if (!iwdev->rf->reset) {
+                       iwdev->rf->reset = true;
+                       iwdev->rf->gen_ops.request_reset(iwdev->rf);
+               }
+       }
+}
+
 static struct ib_mr *irdma_reg_user_mr_dmabuf(struct ib_pd *pd, u64 start,
                                              u64 len, u64 virt,
                                              int fd, int access,
@@ -3607,7 +3637,9 @@ static struct ib_mr *irdma_reg_user_mr_dmabuf(struct ib_pd *pd, u64 start,
        if (len > iwdev->rf->sc_dev.hw_attrs.max_mr_size)
                return ERR_PTR(-EINVAL);
 
-       umem_dmabuf = ib_umem_dmabuf_get_pinned(pd->device, start, len, fd, access);
+       umem_dmabuf =
+               ib_umem_dmabuf_get_pinned_revocable_and_lock(pd->device, start,
+                                                            len, fd, access);
        if (IS_ERR(umem_dmabuf)) {
                ibdev_dbg(&iwdev->ibdev, "Failed to get dmabuf umem[%pe]\n",
                          umem_dmabuf);
@@ -3624,12 +3656,20 @@ static struct ib_mr *irdma_reg_user_mr_dmabuf(struct ib_pd *pd, u64 start,
        if (err)
                goto err_iwmr;
 
+       ib_umem_dmabuf_set_revoke_locked(umem_dmabuf, irdma_umem_dmabuf_revoke,
+                                        iwmr);
+       ib_umem_dmabuf_revoke_unlock(umem_dmabuf);
        return &iwmr->ibmr;
 
 err_iwmr:
        irdma_free_iwmr(iwmr);
 
 err_release:
+       ib_umem_dmabuf_revoke_unlock(umem_dmabuf);
+
+       /* Will result in a call to revoke, but driver callback is not set and
+        * is therefore skipped.
+        */
        ib_umem_release(&umem_dmabuf->umem);
 
        return ERR_PTR(err);
@@ -3749,6 +3789,8 @@ static struct ib_mr *irdma_rereg_user_mr(struct ib_mr *ib_mr, int flags,
        struct irdma_device *iwdev = to_iwdev(ib_mr->device);
        struct irdma_mr *iwmr = to_iwmr(ib_mr);
        struct irdma_pbl *iwpbl = &iwmr->iwpbl;
+       bool dmabuf_revocable = iwmr->region && iwmr->region->is_dmabuf;
+       struct ib_umem_dmabuf *umem_dmabuf;
        int ret;
 
        if (len > iwdev->rf->sc_dev.hw_attrs.max_mr_size)
@@ -3757,9 +3799,26 @@ static struct ib_mr *irdma_rereg_user_mr(struct ib_mr *ib_mr, int flags,
        if (flags & ~(IB_MR_REREG_TRANS | IB_MR_REREG_PD | IB_MR_REREG_ACCESS))
                return ERR_PTR(-EOPNOTSUPP);
 
+       if (dmabuf_revocable) {
+               umem_dmabuf = to_ib_umem_dmabuf(iwmr->region);
+
+               ib_umem_dmabuf_revoke_lock(umem_dmabuf);
+
+               /* If the dmabuf has been revoked, it means that the region has
+                * been invalidated in HW. We must not allow it to become valid
+                * again unless the user is requesting a change in translation
+                * which will end up dropping the umem dmabuf and allocating an
+                * entirely new umem anyway.
+                */
+               if (umem_dmabuf->revoked && !(flags & IB_MR_REREG_TRANS)) {
+                       ret = -EINVAL;
+                       goto err_unlock;
+               }
+       }
+
        ret = irdma_hwdereg_mr(ib_mr);
        if (ret)
-               return ERR_PTR(ret);
+               goto err_unlock;
 
        if (flags & IB_MR_REREG_ACCESS)
                iwmr->access = new_access;
@@ -3775,18 +3834,28 @@ static struct ib_mr *irdma_rereg_user_mr(struct ib_mr *ib_mr, int flags,
                                        &iwpbl->pble_alloc);
                        iwpbl->pbl_allocated = false;
                }
+
+               if (dmabuf_revocable) {
+                       /* Must unlock before release to prevent deadlock */
+                       ib_umem_dmabuf_revoke_unlock(umem_dmabuf);
+                       dmabuf_revocable = false;
+               }
+
                if (iwmr->region) {
                        ib_umem_release(iwmr->region);
                        iwmr->region = NULL;
                }
 
                ret = irdma_rereg_mr_trans(iwmr, start, len, virt);
-       } else
+       } else {
                ret = irdma_hwreg_mr(iwdev, iwmr, iwmr->access);
-       if (ret)
-               return ERR_PTR(ret);
+       }
 
-       return NULL;
+err_unlock:
+       if (dmabuf_revocable)
+               ib_umem_dmabuf_revoke_unlock(umem_dmabuf);
+
+       return ret ? ERR_PTR(ret) : NULL;
 }
 
 /**
@@ -3909,6 +3978,7 @@ static int irdma_dereg_mr(struct ib_mr *ib_mr, struct ib_udata *udata)
        struct irdma_mr *iwmr = to_iwmr(ib_mr);
        struct irdma_device *iwdev = to_iwdev(ib_mr->device);
        struct irdma_pbl *iwpbl = &iwmr->iwpbl;
+       bool dmabuf_revocable = iwmr->region && iwmr->region->is_dmabuf;
        int ret;
 
        if (iwmr->type != IRDMA_MEMREG_TYPE_MEM) {
@@ -3923,17 +3993,28 @@ static int irdma_dereg_mr(struct ib_mr *ib_mr, struct ib_udata *udata)
                goto done;
        }
 
-       ret = irdma_hwdereg_mr(ib_mr);
-       if (ret)
-               return ret;
+       if (!dmabuf_revocable) {
+               ret = irdma_hwdereg_mr(ib_mr);
+               if (ret)
+                       return ret;
 
-       irdma_free_stag(iwdev, iwmr->stag);
+               irdma_free_stag(iwdev, iwmr->stag);
+       }
 done:
+       if (iwmr->region)
+               /* For dmabuf MRs, ib_umem_release will trigger a synchronous
+                * call to the revoke callback which will perform the actual HW
+                * invalidation via irdma_hwdereg_mr. We rely on this for its
+                * implicit serialization w.r.t. concurrent revocations. This
+                * must be done before freeing the PBLEs.
+                */
+               ib_umem_release(iwmr->region);
+
        if (iwpbl->pbl_allocated)
                irdma_free_pble(iwdev->rf->pble_rsrc, &iwpbl->pble_alloc);
 
-       if (iwmr->region)
-               ib_umem_release(iwmr->region);
+       if (dmabuf_revocable)
+               irdma_free_stag(iwdev, iwmr->stag);
 
        kfree(iwmr);