From a5c98e9424573649e59988199a3356a79c9e1fd9 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Thu, 1 May 2025 13:17:18 +0100
Subject: [PATCH] io_uring/zcrx: dmabuf backed zerocopy receive

Add support for dmabuf backed zcrx areas. To use it, the user should
pass IORING_ZCRX_AREA_DMABUF in the struct io_uring_zcrx_area_reg flags
field and pass a dmabuf fd in the dmabuf_fd field.
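
As a rough usage sketch (editorial illustration, not part of the patch;
buf_len and dmabuf_fd are placeholders, and the field usage is inferred
from the uapi changes below):

	struct io_uring_zcrx_area_reg area = {
		/* for dmabuf areas, addr is an offset into the buffer */
		.addr = 0,
		/* page aligned length of the region to expose */
		.len = buf_len,
		.flags = IORING_ZCRX_AREA_DMABUF,
		/* dmabuf fd to attach to the interface queue's device */
		.dmabuf_fd = dmabuf_fd,
	};

The area description is then passed, as with normal umem areas, through
the area_ptr field of struct io_uring_zcrx_ifq_reg when registering the
interface queue.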

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/20bb1890e60a82ec945ab36370d1fd54be414ab6.1746097431.git.asml.silence@gmail.com
Link: https://lore.kernel.org/io-uring/6e37db97303212bbd8955f9501cf99b579f8aece.1746547722.git.asml.silence@gmail.com
[axboe: fold in fixup]
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/uapi/linux/io_uring.h |   6 +-
 io_uring/zcrx.c               | 163 ++++++++++++++++++++++++++++++----
 io_uring/zcrx.h               |   7 ++
 3 files changed, 159 insertions(+), 17 deletions(-)

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 130f3bc71a691..5ce096090b0c6 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -990,12 +990,16 @@ struct io_uring_zcrx_offsets {
 	__u64	__resv[2];
 };
 
+enum io_uring_zcrx_area_flags {
+	IORING_ZCRX_AREA_DMABUF		= 1,
+};
+
 struct io_uring_zcrx_area_reg {
 	__u64	addr;
 	__u64	len;
 	__u64	rq_area_token;
 	__u32	flags;
-	__u32	__resv1;
+	__u32	dmabuf_fd;
 	__u64	__resv2[2];
 };
 
diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c
index 34b09beba9926..9a568d0492047 100644
--- a/io_uring/zcrx.c
+++ b/io_uring/zcrx.c
@@ -47,30 +47,118 @@ static inline struct page *io_zcrx_iov_page(const struct net_iov *niov)
 	return area->mem.pages[net_iov_idx(niov)];
 }
 
-static void io_release_area_mem(struct io_zcrx_mem *mem)
+static void io_release_dmabuf(struct io_zcrx_mem *mem)
 {
-	if (mem->pages) {
-		unpin_user_pages(mem->pages, mem->nr_folios);
-		kvfree(mem->pages);
+	if (!IS_ENABLED(CONFIG_DMA_SHARED_BUFFER))
+		return;
+
+	if (mem->sgt)
+		dma_buf_unmap_attachment_unlocked(mem->attach, mem->sgt,
+						  DMA_FROM_DEVICE);
+	if (mem->attach)
+		dma_buf_detach(mem->dmabuf, mem->attach);
+	if (mem->dmabuf)
+		dma_buf_put(mem->dmabuf);
+
+	mem->sgt = NULL;
+	mem->attach = NULL;
+	mem->dmabuf = NULL;
+}
+
+static int io_import_dmabuf(struct io_zcrx_ifq *ifq,
+			    struct io_zcrx_mem *mem,
+			    struct io_uring_zcrx_area_reg *area_reg)
+{
+	unsigned long off = (unsigned long)area_reg->addr;
+	unsigned long len = (unsigned long)area_reg->len;
+	unsigned long total_size = 0;
+	struct scatterlist *sg;
+	int dmabuf_fd = area_reg->dmabuf_fd;
+	int i, ret;
+
+	if (WARN_ON_ONCE(!ifq->dev))
+		return -EFAULT;
+	if (!IS_ENABLED(CONFIG_DMA_SHARED_BUFFER))
+		return -EINVAL;
+
+	mem->is_dmabuf = true;
+	mem->dmabuf = dma_buf_get(dmabuf_fd);
+	if (IS_ERR(mem->dmabuf)) {
+		ret = PTR_ERR(mem->dmabuf);
+		mem->dmabuf = NULL;
+		goto err;
 	}
+
+	mem->attach = dma_buf_attach(mem->dmabuf, ifq->dev);
+	if (IS_ERR(mem->attach)) {
+		ret = PTR_ERR(mem->attach);
+		mem->attach = NULL;
+		goto err;
+	}
+
+	mem->sgt = dma_buf_map_attachment_unlocked(mem->attach, DMA_FROM_DEVICE);
+	if (IS_ERR(mem->sgt)) {
+		ret = PTR_ERR(mem->sgt);
+		mem->sgt = NULL;
+		goto err;
+	}
+
+	for_each_sgtable_dma_sg(mem->sgt, sg, i)
+		total_size += sg_dma_len(sg);
+
+	if (total_size < off + len)
+		return -EINVAL;
+
+	mem->dmabuf_offset = off;
+	mem->size = len;
+	return 0;
+err:
+	io_release_dmabuf(mem);
+	return ret;
 }
 
-static int io_import_area(struct io_zcrx_ifq *ifq,
+static int io_zcrx_map_area_dmabuf(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area)
+{
+	unsigned long off = area->mem.dmabuf_offset;
+	struct scatterlist *sg;
+	unsigned i, niov_idx = 0;
+
+	if (!IS_ENABLED(CONFIG_DMA_SHARED_BUFFER))
+		return -EINVAL;
+
+	for_each_sgtable_dma_sg(area->mem.sgt, sg, i) {
+		dma_addr_t dma = sg_dma_address(sg);
+		unsigned long sg_len = sg_dma_len(sg);
+		unsigned long sg_off = min(sg_len, off);
+
+		off -= sg_off;
+		sg_len -= sg_off;
+		dma += sg_off;
+
+		while (sg_len && niov_idx < area->nia.num_niovs) {
+			struct net_iov *niov = &area->nia.niovs[niov_idx];
+
+			if (net_mp_niov_set_dma_addr(niov, dma))
+				return 0;
+			sg_len -= PAGE_SIZE;
+			dma += PAGE_SIZE;
+			niov_idx++;
+		}
+	}
+	return niov_idx;
+}
+
+static int io_import_umem(struct io_zcrx_ifq *ifq,
 			  struct io_zcrx_mem *mem,
 			  struct io_uring_zcrx_area_reg *area_reg)
 {
 	struct page **pages;
 	int nr_pages;
-	int ret;
 
-	ret = io_validate_user_buf_range(area_reg->addr, area_reg->len);
-	if (ret)
-		return ret;
+	if (area_reg->dmabuf_fd)
+		return -EINVAL;
 	if (!area_reg->addr)
 		return -EFAULT;
-	if (area_reg->addr & ~PAGE_MASK || area_reg->len & ~PAGE_MASK)
-		return -EINVAL;
-
 	pages = io_pin_pages((unsigned long)area_reg->addr, area_reg->len,
 			     &nr_pages);
 	if (IS_ERR(pages))
@@ -82,6 +170,35 @@ static int io_import_area(struct io_zcrx_ifq *ifq,
 	return 0;
 }
 
+static void io_release_area_mem(struct io_zcrx_mem *mem)
+{
+	if (mem->is_dmabuf) {
+		io_release_dmabuf(mem);
+		return;
+	}
+	if (mem->pages) {
+		unpin_user_pages(mem->pages, mem->nr_folios);
+		kvfree(mem->pages);
+	}
+}
+
+static int io_import_area(struct io_zcrx_ifq *ifq,
+			  struct io_zcrx_mem *mem,
+			  struct io_uring_zcrx_area_reg *area_reg)
+{
+	int ret;
+
+	ret = io_validate_user_buf_range(area_reg->addr, area_reg->len);
+	if (ret)
+		return ret;
+	if (area_reg->addr & ~PAGE_MASK || area_reg->len & ~PAGE_MASK)
+		return -EINVAL;
+
+	if (area_reg->flags & IORING_ZCRX_AREA_DMABUF)
+		return io_import_dmabuf(ifq, mem, area_reg);
+	return io_import_umem(ifq, mem, area_reg);
+}
+
 static void io_zcrx_unmap_umem(struct io_zcrx_ifq *ifq,
 			       struct io_zcrx_area *area, int nr_mapped)
 {
@@ -101,7 +218,10 @@ static void __io_zcrx_unmap_area(struct io_zcrx_ifq *ifq,
 {
 	int i;
 
-	io_zcrx_unmap_umem(ifq, area, nr_mapped);
+	if (area->mem.is_dmabuf)
+		io_release_dmabuf(&area->mem);
+	else
+		io_zcrx_unmap_umem(ifq, area, nr_mapped);
 
 	for (i = 0; i < area->nia.num_niovs; i++)
 		net_mp_niov_set_dma_addr(&area->nia.niovs[i], 0);
@@ -145,7 +265,11 @@ static int io_zcrx_map_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area)
 	if (area->is_mapped)
 		return 0;
 
-	nr = io_zcrx_map_area_umem(ifq, area);
+	if (area->mem.is_dmabuf)
+		nr = io_zcrx_map_area_dmabuf(ifq, area);
+	else
+		nr = io_zcrx_map_area_umem(ifq, area);
+
 	if (nr != area->nia.num_niovs) {
 		__io_zcrx_unmap_area(ifq, area, nr);
 		return -EINVAL;
@@ -251,6 +375,8 @@ static void io_zcrx_free_area(struct io_zcrx_area *area)
 	kfree(area);
 }
 
+#define IO_ZCRX_AREA_SUPPORTED_FLAGS	(IORING_ZCRX_AREA_DMABUF)
+
 static int io_zcrx_create_area(struct io_zcrx_ifq *ifq,
 			       struct io_zcrx_area **res,
 			       struct io_uring_zcrx_area_reg *area_reg)
@@ -259,9 +385,11 @@ static int io_zcrx_create_area(struct io_zcrx_ifq *ifq,
 	unsigned nr_iovs;
 	int i, ret;
 
-	if (area_reg->flags || area_reg->rq_area_token)
+	if (area_reg->flags & ~IO_ZCRX_AREA_SUPPORTED_FLAGS)
+		return -EINVAL;
+	if (area_reg->rq_area_token)
 		return -EINVAL;
-	if (area_reg->__resv1 || area_reg->__resv2[0] || area_reg->__resv2[1])
+	if (area_reg->__resv2[0] || area_reg->__resv2[1])
 		return -EINVAL;
 
 	ret = -ENOMEM;
@@ -819,6 +947,9 @@ static ssize_t io_zcrx_copy_chunk(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
 	size_t copied = 0;
 	int ret = 0;
 
+	if (area->mem.is_dmabuf)
+		return -EFAULT;
+
 	while (len) {
 		size_t copy_size = min_t(size_t, PAGE_SIZE, len);
 		const int dst_off = 0;
diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h
index 9c22807af8072..2f5e26389f221 100644
--- a/io_uring/zcrx.h
+++ b/io_uring/zcrx.h
@@ -3,15 +3,22 @@
 #define IOU_ZC_RX_H
 
 #include <linux/io_uring_types.h>
+#include <linux/dma-buf.h>
 #include <linux/socket.h>
 #include <net/page_pool/types.h>
 #include <net/net_trace.h>
 
 struct io_zcrx_mem {
 	unsigned long size;
+	bool is_dmabuf;
 
 	struct page **pages;
 	unsigned long nr_folios;
+
+	struct dma_buf_attachment *attach;
+	struct dma_buf *dmabuf;
+	struct sg_table *sgt;
+	unsigned long dmabuf_offset;
 };
 
 struct io_zcrx_area {
-- 
2.47.2
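
(Editorial aside, not part of the patch: one way to obtain a dmabuf fd
for exercising this path is the udmabuf driver; whether a given NIC can
attach to such a buffer is up to its driver. A minimal sketch, with
buf_len a page aligned size chosen by the caller:

	#define _GNU_SOURCE	/* memfd_create(), F_ADD_SEALS */
	#include <fcntl.h>
	#include <sys/ioctl.h>
	#include <sys/mman.h>
	#include <unistd.h>
	#include <linux/udmabuf.h>

	/* the backing memfd must be sealed against shrinking */
	int memfd = memfd_create("zcrx-area", MFD_ALLOW_SEALING);
	ftruncate(memfd, buf_len);
	fcntl(memfd, F_ADD_SEALS, F_SEAL_SHRINK);

	struct udmabuf_create create = {
		.memfd = memfd,
		.flags = UDMABUF_FLAGS_CLOEXEC,
		.offset = 0,
		.size = buf_len,	/* offset/size must be page aligned */
	};
	int dmabuf_fd = ioctl(open("/dev/udmabuf", O_RDWR),
			      UDMABUF_CREATE, &create);

The resulting dmabuf_fd can then be passed in io_uring_zcrx_area_reg as
sketched in the commit message above.)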