From 026c6fabcec5dce1d6f7c623fdde3dbaeea29db0 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Sun, 16 Mar 2025 07:00:40 +0100 Subject: [PATCH] 6.6-stable patches added patches: io_uring-add-ring-freeing-helper.patch io_uring-don-t-attempt-to-mmap-larger-than-what-the-user-asks-for.patch io_uring-fix-corner-case-forgetting-to-vunmap.patch io_uring-fix-error-pbuf-checking.patch io_uring-get-rid-of-remap_pfn_range-for-mapping-rings-sqes.patch io_uring-kbuf-use-vm_insert_pages-for-mmap-ed-pbuf-ring.patch io_uring-kbuf-vmap-pinned-buffer-ring.patch io_uring-return-error-pointer-from-io_mem_alloc.patch io_uring-unify-io_pin_pages.patch io_uring-use-unpin_user_pages-where-appropriate.patch io_uring-use-vmap-for-ring-mapping.patch mm-add-nommu-variant-of-vm_insert_pages.patch --- .../io_uring-add-ring-freeing-helper.patch | 68 +++ ...p-larger-than-what-the-user-asks-for.patch | 51 +++ ...fix-corner-case-forgetting-to-vunmap.patch | 49 ++ .../io_uring-fix-error-pbuf-checking.patch | 52 +++ ...map_pfn_range-for-mapping-rings-sqes.patch | 245 ++++++++++ ...m_insert_pages-for-mmap-ed-pbuf-ring.patch | 423 ++++++++++++++++++ ...o_uring-kbuf-vmap-pinned-buffer-ring.patch | 112 +++++ ...turn-error-pointer-from-io_mem_alloc.patch | 76 ++++ queue-6.6/io_uring-unify-io_pin_pages.patch | 156 +++++++ ...e-unpin_user_pages-where-appropriate.patch | 40 ++ .../io_uring-use-vmap-for-ring-mapping.patch | 87 ++++ ...add-nommu-variant-of-vm_insert_pages.patch | 36 ++ queue-6.6/series | 12 + 13 files changed, 1407 insertions(+) create mode 100644 queue-6.6/io_uring-add-ring-freeing-helper.patch create mode 100644 queue-6.6/io_uring-don-t-attempt-to-mmap-larger-than-what-the-user-asks-for.patch create mode 100644 queue-6.6/io_uring-fix-corner-case-forgetting-to-vunmap.patch create mode 100644 queue-6.6/io_uring-fix-error-pbuf-checking.patch create mode 100644 queue-6.6/io_uring-get-rid-of-remap_pfn_range-for-mapping-rings-sqes.patch create mode 100644 queue-6.6/io_uring-kbuf-use-vm_insert_pages-for-mmap-ed-pbuf-ring.patch create mode 100644 queue-6.6/io_uring-kbuf-vmap-pinned-buffer-ring.patch create mode 100644 queue-6.6/io_uring-return-error-pointer-from-io_mem_alloc.patch create mode 100644 queue-6.6/io_uring-unify-io_pin_pages.patch create mode 100644 queue-6.6/io_uring-use-unpin_user_pages-where-appropriate.patch create mode 100644 queue-6.6/io_uring-use-vmap-for-ring-mapping.patch create mode 100644 queue-6.6/mm-add-nommu-variant-of-vm_insert_pages.patch diff --git a/queue-6.6/io_uring-add-ring-freeing-helper.patch b/queue-6.6/io_uring-add-ring-freeing-helper.patch new file mode 100644 index 0000000000..fd3c4a2095 --- /dev/null +++ b/queue-6.6/io_uring-add-ring-freeing-helper.patch @@ -0,0 +1,68 @@ +From 8c273186074a591cfdcd4370849676bc3eeb6ecb Mon Sep 17 00:00:00 2001 +From: Jens Axboe +Date: Fri, 5 Nov 2021 17:15:46 -0600 +Subject: io_uring: add ring freeing helper + +From: Jens Axboe + +Commit 9c189eee73af1825ea9c895fafad469de5f82641 upstream. + +We do rings and sqes separately, move them into a helper that does both +the freeing and clearing of the memory. 
+ +Signed-off-by: Jens Axboe +Signed-off-by: Greg Kroah-Hartman +--- + io_uring/io_uring.c | 17 +++++++++++------ + 1 file changed, 11 insertions(+), 6 deletions(-) + +diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c +index ebcb0680f1cc..b211feb0d2b1 100644 +--- a/io_uring/io_uring.c ++++ b/io_uring/io_uring.c +@@ -2525,6 +2525,14 @@ static void io_mem_free(void *ptr) + free_compound_page(page); + } + ++static void io_rings_free(struct io_ring_ctx *ctx) ++{ ++ io_mem_free(ctx->rings); ++ io_mem_free(ctx->sq_sqes); ++ ctx->rings = NULL; ++ ctx->sq_sqes = NULL; ++} ++ + static void *io_mem_alloc(size_t size) + { + gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP; +@@ -2684,8 +2692,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) + mmdrop(ctx->mm_account); + ctx->mm_account = NULL; + } +- io_mem_free(ctx->rings); +- io_mem_free(ctx->sq_sqes); ++ io_rings_free(ctx); + + percpu_ref_exit(&ctx->refs); + free_uid(ctx->user); +@@ -3452,15 +3459,13 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx, + else + size = array_size(sizeof(struct io_uring_sqe), p->sq_entries); + if (size == SIZE_MAX) { +- io_mem_free(ctx->rings); +- ctx->rings = NULL; ++ io_rings_free(ctx); + return -EOVERFLOW; + } + + ptr = io_mem_alloc(size); + if (IS_ERR(ptr)) { +- io_mem_free(ctx->rings); +- ctx->rings = NULL; ++ io_rings_free(ctx); + return PTR_ERR(ptr); + } + +-- +2.47.2 + diff --git a/queue-6.6/io_uring-don-t-attempt-to-mmap-larger-than-what-the-user-asks-for.patch b/queue-6.6/io_uring-don-t-attempt-to-mmap-larger-than-what-the-user-asks-for.patch new file mode 100644 index 0000000000..957322cb1e --- /dev/null +++ b/queue-6.6/io_uring-don-t-attempt-to-mmap-larger-than-what-the-user-asks-for.patch @@ -0,0 +1,51 @@ +From 71318baa99b6fbb65edf76dd0afaad3afd7007cc Mon Sep 17 00:00:00 2001 +From: Jens Axboe +Date: Wed, 29 May 2024 09:38:38 -0600 +Subject: io_uring: don't attempt to mmap larger than what the user asks for + +From: Jens Axboe + +Commit 06fe9b1df1086b42718d632aa57e8f7cd1a66a21 upstream. + +If IORING_FEAT_SINGLE_MMAP is ignored, as can happen if an application +uses an ancient liburing or does setup manually, then 3 mmap's are +required to map the ring into userspace. The kernel will still have +collapsed the mappings, however userspace may ask for mapping them +individually. If so, then we should not use the full number of ring +pages, as it may exceed the partial mapping. Doing so will yield an +-EFAULT from vm_insert_pages(), as we pass in more pages than what the +application asked for. + +Cap the number of pages to match what the application asked for, for +the particular mapping operation. 
+ +Reported-by: Lucas Mülling +Link: https://github.com/axboe/liburing/issues/1157 +Fixes: 3ab1db3c6039 ("io_uring: get rid of remap_pfn_range() for mapping rings/sqes") +Signed-off-by: Jens Axboe +Signed-off-by: Greg Kroah-Hartman +--- + io_uring/io_uring.c | 5 +++-- + 1 file changed, 3 insertions(+), 2 deletions(-) + +--- a/io_uring/io_uring.c ++++ b/io_uring/io_uring.c +@@ -3612,6 +3612,7 @@ static __cold int io_uring_mmap(struct f + struct io_ring_ctx *ctx = file->private_data; + size_t sz = vma->vm_end - vma->vm_start; + long offset = vma->vm_pgoff << PAGE_SHIFT; ++ unsigned int npages; + unsigned long pfn; + void *ptr; + +@@ -3622,8 +3623,8 @@ static __cold int io_uring_mmap(struct f + switch (offset & IORING_OFF_MMAP_MASK) { + case IORING_OFF_SQ_RING: + case IORING_OFF_CQ_RING: +- return io_uring_mmap_pages(ctx, vma, ctx->ring_pages, +- ctx->n_ring_pages); ++ npages = min(ctx->n_ring_pages, (sz + PAGE_SIZE - 1) >> PAGE_SHIFT); ++ return io_uring_mmap_pages(ctx, vma, ctx->ring_pages, npages); + case IORING_OFF_SQES: + return io_uring_mmap_pages(ctx, vma, ctx->sqe_pages, + ctx->n_sqe_pages); diff --git a/queue-6.6/io_uring-fix-corner-case-forgetting-to-vunmap.patch b/queue-6.6/io_uring-fix-corner-case-forgetting-to-vunmap.patch new file mode 100644 index 0000000000..3c2364f957 --- /dev/null +++ b/queue-6.6/io_uring-fix-corner-case-forgetting-to-vunmap.patch @@ -0,0 +1,49 @@ +From fb318430c8de3dcee5727f050dfe3f3dd8c4549c Mon Sep 17 00:00:00 2001 +From: Pavel Begunkov +Date: Mon, 25 Nov 2024 23:10:31 +0000 +Subject: io_uring: fix corner case forgetting to vunmap + +From: Pavel Begunkov + +Commit 43eef70e7e2ac74e7767731dd806720c7fb5e010 upstream. + +io_pages_unmap() is a bit tricky in trying to figure whether the pages +were previously vmap'ed or not. In particular If there is juts one page +it belives there is no need to vunmap. Paired io_pages_map(), however, +could've failed io_mem_alloc_compound() and attempted to +io_mem_alloc_single(), which does vmap, and that leads to unpaired vmap. + +The solution is to fail if io_mem_alloc_compound() can't allocate a +single page. That's the easiest way to deal with it, and those two +functions are getting removed soon, so no need to overcomplicate it. 
+ +Cc: stable@vger.kernel.org +Fixes: 3ab1db3c6039e ("io_uring: get rid of remap_pfn_range() for mapping rings/sqes") +Signed-off-by: Pavel Begunkov +Link: https://lore.kernel.org/r/477e75a3907a2fe83249e49c0a92cd480b2c60e0.1732569842.git.asml.silence@gmail.com +Signed-off-by: Jens Axboe +Signed-off-by: Greg Kroah-Hartman +--- + io_uring/io_uring.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +--- a/io_uring/io_uring.c ++++ b/io_uring/io_uring.c +@@ -2889,6 +2889,8 @@ static void *io_pages_map(struct page ** + ret = io_mem_alloc_compound(pages, nr_pages, size, gfp); + if (!IS_ERR(ret)) + goto done; ++ if (nr_pages == 1) ++ goto fail; + + ret = io_mem_alloc_single(pages, nr_pages, size, gfp); + if (!IS_ERR(ret)) { +@@ -2897,7 +2899,7 @@ done: + *npages = nr_pages; + return ret; + } +- ++fail: + kvfree(pages); + *out_pages = NULL; + *npages = 0; diff --git a/queue-6.6/io_uring-fix-error-pbuf-checking.patch b/queue-6.6/io_uring-fix-error-pbuf-checking.patch new file mode 100644 index 0000000000..a6c576c934 --- /dev/null +++ b/queue-6.6/io_uring-fix-error-pbuf-checking.patch @@ -0,0 +1,52 @@ +From 55b2d61e07a887351cf2996e96f89ade5ab7f1b7 Mon Sep 17 00:00:00 2001 +From: Pavel Begunkov +Date: Thu, 18 Jul 2024 20:00:53 +0100 +Subject: io_uring: fix error pbuf checking + +From: Pavel Begunkov + +Commit bcc87d978b834c298bbdd9c52454c5d0a946e97e upstream. + +Syz reports a problem, which boils down to NULL vs IS_ERR inconsistent +error handling in io_alloc_pbuf_ring(). + +KASAN: null-ptr-deref in range [0x0000000000000000-0x0000000000000007] +RIP: 0010:__io_remove_buffers+0xac/0x700 io_uring/kbuf.c:341 +Call Trace: + + io_put_bl io_uring/kbuf.c:378 [inline] + io_destroy_buffers+0x14e/0x490 io_uring/kbuf.c:392 + io_ring_ctx_free+0xa00/0x1070 io_uring/io_uring.c:2613 + io_ring_exit_work+0x80f/0x8a0 io_uring/io_uring.c:2844 + process_one_work kernel/workqueue.c:3231 [inline] + process_scheduled_works+0xa2c/0x1830 kernel/workqueue.c:3312 + worker_thread+0x86d/0xd40 kernel/workqueue.c:3390 + kthread+0x2f0/0x390 kernel/kthread.c:389 + ret_from_fork+0x4b/0x80 arch/x86/kernel/process.c:147 + ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:244 + +Cc: stable@vger.kernel.org +Reported-by: syzbot+2074b1a3d447915c6f1c@syzkaller.appspotmail.com +Fixes: 87585b05757dc ("io_uring/kbuf: use vm_insert_pages() for mmap'ed pbuf ring") +Signed-off-by: Pavel Begunkov +Link: https://lore.kernel.org/r/c5f9df20560bd9830401e8e48abc029e7cfd9f5e.1721329239.git.asml.silence@gmail.com +Signed-off-by: Jens Axboe +Signed-off-by: Greg Kroah-Hartman +--- + io_uring/kbuf.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +--- a/io_uring/kbuf.c ++++ b/io_uring/kbuf.c +@@ -510,8 +510,10 @@ static int io_alloc_pbuf_ring(struct io_ + ring_size = reg->ring_entries * sizeof(struct io_uring_buf_ring); + + bl->buf_ring = io_pages_map(&bl->buf_pages, &bl->buf_nr_pages, ring_size); +- if (!bl->buf_ring) ++ if (IS_ERR(bl->buf_ring)) { ++ bl->buf_ring = NULL; + return -ENOMEM; ++ } + bl->is_mapped = 1; + bl->is_mmap = 1; + return 0; diff --git a/queue-6.6/io_uring-get-rid-of-remap_pfn_range-for-mapping-rings-sqes.patch b/queue-6.6/io_uring-get-rid-of-remap_pfn_range-for-mapping-rings-sqes.patch new file mode 100644 index 0000000000..4287909561 --- /dev/null +++ b/queue-6.6/io_uring-get-rid-of-remap_pfn_range-for-mapping-rings-sqes.patch @@ -0,0 +1,245 @@ +From bfaf932689d7e59e3558977854b74bde1b137fae Mon Sep 17 00:00:00 2001 +From: Jens Axboe +Date: Wed, 13 Mar 2024 09:56:14 -0600 +Subject: io_uring: get rid of 
remap_pfn_range() for mapping rings/sqes + +From: Jens Axboe + +Commit 3ab1db3c6039e02a9deb9d5091d28d559917a645 upstream. + +Rather than use remap_pfn_range() for this and manually free later, +switch to using vm_insert_pages() and have it Just Work. + +If possible, allocate a single compound page that covers the range that +is needed. If that works, then we can just use page_address() on that +page. If we fail to get a compound page, allocate single pages and use +vmap() to map them into the kernel virtual address space. + +This just covers the rings/sqes, the other remaining user of the mmap +remap_pfn_range() user will be converted separately. Once that is done, +we can kill the old alloc/free code. + +Signed-off-by: Jens Axboe +Signed-off-by: Greg Kroah-Hartman +--- + io_uring/io_uring.c | 139 +++++++++++++++++++++++++++++++++++++++++++++++++--- + io_uring/io_uring.h | 2 + 2 files changed, 133 insertions(+), 8 deletions(-) + +--- a/io_uring/io_uring.c ++++ b/io_uring/io_uring.c +@@ -2683,6 +2683,36 @@ static int io_cqring_wait(struct io_ring + return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0; + } + ++static void io_pages_unmap(void *ptr, struct page ***pages, ++ unsigned short *npages) ++{ ++ bool do_vunmap = false; ++ ++ if (!ptr) ++ return; ++ ++ if (*npages) { ++ struct page **to_free = *pages; ++ int i; ++ ++ /* ++ * Only did vmap for the non-compound multiple page case. ++ * For the compound page, we just need to put the head. ++ */ ++ if (PageCompound(to_free[0])) ++ *npages = 1; ++ else if (*npages > 1) ++ do_vunmap = true; ++ for (i = 0; i < *npages; i++) ++ put_page(to_free[i]); ++ } ++ if (do_vunmap) ++ vunmap(ptr); ++ kvfree(*pages); ++ *pages = NULL; ++ *npages = 0; ++} ++ + void io_mem_free(void *ptr) + { + if (!ptr) +@@ -2787,8 +2817,8 @@ static void *io_sqes_map(struct io_ring_ + static void io_rings_free(struct io_ring_ctx *ctx) + { + if (!(ctx->flags & IORING_SETUP_NO_MMAP)) { +- io_mem_free(ctx->rings); +- io_mem_free(ctx->sq_sqes); ++ io_pages_unmap(ctx->rings, &ctx->ring_pages, &ctx->n_ring_pages); ++ io_pages_unmap(ctx->sq_sqes, &ctx->sqe_pages, &ctx->n_sqe_pages); + } else { + io_pages_free(&ctx->ring_pages, ctx->n_ring_pages); + ctx->n_ring_pages = 0; +@@ -2800,6 +2830,80 @@ static void io_rings_free(struct io_ring + ctx->sq_sqes = NULL; + } + ++static void *io_mem_alloc_compound(struct page **pages, int nr_pages, ++ size_t size, gfp_t gfp) ++{ ++ struct page *page; ++ int i, order; ++ ++ order = get_order(size); ++ if (order > 10) ++ return ERR_PTR(-ENOMEM); ++ else if (order) ++ gfp |= __GFP_COMP; ++ ++ page = alloc_pages(gfp, order); ++ if (!page) ++ return ERR_PTR(-ENOMEM); ++ ++ for (i = 0; i < nr_pages; i++) ++ pages[i] = page + i; ++ ++ return page_address(page); ++} ++ ++static void *io_mem_alloc_single(struct page **pages, int nr_pages, size_t size, ++ gfp_t gfp) ++{ ++ void *ret; ++ int i; ++ ++ for (i = 0; i < nr_pages; i++) { ++ pages[i] = alloc_page(gfp); ++ if (!pages[i]) ++ goto err; ++ } ++ ++ ret = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL); ++ if (ret) ++ return ret; ++err: ++ while (i--) ++ put_page(pages[i]); ++ return ERR_PTR(-ENOMEM); ++} ++ ++static void *io_pages_map(struct page ***out_pages, unsigned short *npages, ++ size_t size) ++{ ++ gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN; ++ struct page **pages; ++ int nr_pages; ++ void *ret; ++ ++ nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; ++ pages = kvmalloc_array(nr_pages, sizeof(struct page *), gfp); ++ if (!pages) ++ return ERR_PTR(-ENOMEM); ++ ++ 
ret = io_mem_alloc_compound(pages, nr_pages, size, gfp); ++ if (!IS_ERR(ret)) ++ goto done; ++ ++ ret = io_mem_alloc_single(pages, nr_pages, size, gfp); ++ if (!IS_ERR(ret)) { ++done: ++ *out_pages = pages; ++ *npages = nr_pages; ++ return ret; ++ } ++ ++ kvfree(pages); ++ *out_pages = NULL; ++ *npages = 0; ++ return ret; ++} ++ + void *io_mem_alloc(size_t size) + { + gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP; +@@ -3463,14 +3567,12 @@ static void *io_uring_validate_mmap_requ + /* Don't allow mmap if the ring was setup without it */ + if (ctx->flags & IORING_SETUP_NO_MMAP) + return ERR_PTR(-EINVAL); +- ptr = ctx->rings; +- break; ++ return ctx->rings; + case IORING_OFF_SQES: + /* Don't allow mmap if the ring was setup without it */ + if (ctx->flags & IORING_SETUP_NO_MMAP) + return ERR_PTR(-EINVAL); +- ptr = ctx->sq_sqes; +- break; ++ return ctx->sq_sqes; + case IORING_OFF_PBUF_RING: { + struct io_buffer_list *bl; + unsigned int bgid; +@@ -3494,11 +3596,22 @@ static void *io_uring_validate_mmap_requ + return ptr; + } + ++int io_uring_mmap_pages(struct io_ring_ctx *ctx, struct vm_area_struct *vma, ++ struct page **pages, int npages) ++{ ++ unsigned long nr_pages = npages; ++ ++ vm_flags_set(vma, VM_DONTEXPAND); ++ return vm_insert_pages(vma, vma->vm_start, pages, &nr_pages); ++} ++ + #ifdef CONFIG_MMU + + static __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma) + { ++ struct io_ring_ctx *ctx = file->private_data; + size_t sz = vma->vm_end - vma->vm_start; ++ long offset = vma->vm_pgoff << PAGE_SHIFT; + unsigned long pfn; + void *ptr; + +@@ -3506,6 +3619,16 @@ static __cold int io_uring_mmap(struct f + if (IS_ERR(ptr)) + return PTR_ERR(ptr); + ++ switch (offset & IORING_OFF_MMAP_MASK) { ++ case IORING_OFF_SQ_RING: ++ case IORING_OFF_CQ_RING: ++ return io_uring_mmap_pages(ctx, vma, ctx->ring_pages, ++ ctx->n_ring_pages); ++ case IORING_OFF_SQES: ++ return io_uring_mmap_pages(ctx, vma, ctx->sqe_pages, ++ ctx->n_sqe_pages); ++ } ++ + pfn = virt_to_phys(ptr) >> PAGE_SHIFT; + return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot); + } +@@ -3795,7 +3918,7 @@ static __cold int io_allocate_scq_urings + return -EOVERFLOW; + + if (!(ctx->flags & IORING_SETUP_NO_MMAP)) +- rings = io_mem_alloc(size); ++ rings = io_pages_map(&ctx->ring_pages, &ctx->n_ring_pages, size); + else + rings = io_rings_map(ctx, p->cq_off.user_addr, size); + +@@ -3820,7 +3943,7 @@ static __cold int io_allocate_scq_urings + } + + if (!(ctx->flags & IORING_SETUP_NO_MMAP)) +- ptr = io_mem_alloc(size); ++ ptr = io_pages_map(&ctx->sqe_pages, &ctx->n_sqe_pages, size); + else + ptr = io_sqes_map(ctx, p->sq_off.user_addr, size); + +--- a/io_uring/io_uring.h ++++ b/io_uring/io_uring.h +@@ -55,6 +55,8 @@ bool io_fill_cqe_req_aux(struct io_kiocb + void __io_commit_cqring_flush(struct io_ring_ctx *ctx); + + struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages); ++int io_uring_mmap_pages(struct io_ring_ctx *ctx, struct vm_area_struct *vma, ++ struct page **pages, int npages); + + struct file *io_file_get_normal(struct io_kiocb *req, int fd); + struct file *io_file_get_fixed(struct io_kiocb *req, int fd, diff --git a/queue-6.6/io_uring-kbuf-use-vm_insert_pages-for-mmap-ed-pbuf-ring.patch b/queue-6.6/io_uring-kbuf-use-vm_insert_pages-for-mmap-ed-pbuf-ring.patch new file mode 100644 index 0000000000..5221441013 --- /dev/null +++ b/queue-6.6/io_uring-kbuf-use-vm_insert_pages-for-mmap-ed-pbuf-ring.patch @@ -0,0 +1,423 @@ +From 
bd9194ea9dc6647ca247b948aa8587fcb348ac6e Mon Sep 17 00:00:00 2001 +From: Jens Axboe +Date: Tue, 12 Mar 2024 20:24:21 -0600 +Subject: io_uring/kbuf: use vm_insert_pages() for mmap'ed pbuf ring + +From: Jens Axboe + +Commit 87585b05757dc70545efb434669708d276125559 upstream. + +Rather than use remap_pfn_range() for this and manually free later, +switch to using vm_insert_page() and have it Just Work. + +This requires a bit of effort on the mmap lookup side, as the ctx +uring_lock isn't held, which otherwise protects buffer_lists from being +torn down, and it's not safe to grab from mmap context that would +introduce an ABBA deadlock between the mmap lock and the ctx uring_lock. +Instead, lookup the buffer_list under RCU, as the the list is RCU freed +already. Use the existing reference count to determine whether it's +possible to safely grab a reference to it (eg if it's not zero already), +and drop that reference when done with the mapping. If the mmap +reference is the last one, the buffer_list and the associated memory can +go away, since the vma insertion has references to the inserted pages at +that point. + +Signed-off-by: Jens Axboe +Signed-off-by: Greg Kroah-Hartman +--- + include/linux/io_uring_types.h | 3 + io_uring/io_uring.c | 58 ++++------------- + io_uring/io_uring.h | 6 + + io_uring/kbuf.c | 137 ++++++++--------------------------------- + io_uring/kbuf.h | 3 + 5 files changed, 48 insertions(+), 159 deletions(-) + +--- a/include/linux/io_uring_types.h ++++ b/include/linux/io_uring_types.h +@@ -326,9 +326,6 @@ struct io_ring_ctx { + + struct list_head io_buffers_cache; + +- /* deferred free list, protected by ->uring_lock */ +- struct hlist_head io_buf_list; +- + /* Keep this last, we don't need it for the fast path */ + struct wait_queue_head poll_wq; + struct io_restriction restrictions; +--- a/io_uring/io_uring.c ++++ b/io_uring/io_uring.c +@@ -311,7 +311,6 @@ static __cold struct io_ring_ctx *io_rin + INIT_LIST_HEAD(&ctx->sqd_list); + INIT_LIST_HEAD(&ctx->cq_overflow_list); + INIT_LIST_HEAD(&ctx->io_buffers_cache); +- INIT_HLIST_HEAD(&ctx->io_buf_list); + io_alloc_cache_init(&ctx->rsrc_node_cache, IO_NODE_ALLOC_CACHE_MAX, + sizeof(struct io_rsrc_node)); + io_alloc_cache_init(&ctx->apoll_cache, IO_ALLOC_CACHE_MAX, +@@ -2682,15 +2681,15 @@ static int io_cqring_wait(struct io_ring + return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? 
ret : 0; + } + +-static void io_pages_unmap(void *ptr, struct page ***pages, +- unsigned short *npages) ++void io_pages_unmap(void *ptr, struct page ***pages, unsigned short *npages, ++ bool put_pages) + { + bool do_vunmap = false; + + if (!ptr) + return; + +- if (*npages) { ++ if (put_pages && *npages) { + struct page **to_free = *pages; + int i; + +@@ -2712,14 +2711,6 @@ static void io_pages_unmap(void *ptr, st + *npages = 0; + } + +-void io_mem_free(void *ptr) +-{ +- if (!ptr) +- return; +- +- folio_put(virt_to_folio(ptr)); +-} +- + static void io_pages_free(struct page ***pages, int npages) + { + struct page **page_array; +@@ -2818,8 +2809,10 @@ static void *io_sqes_map(struct io_ring_ + static void io_rings_free(struct io_ring_ctx *ctx) + { + if (!(ctx->flags & IORING_SETUP_NO_MMAP)) { +- io_pages_unmap(ctx->rings, &ctx->ring_pages, &ctx->n_ring_pages); +- io_pages_unmap(ctx->sq_sqes, &ctx->sqe_pages, &ctx->n_sqe_pages); ++ io_pages_unmap(ctx->rings, &ctx->ring_pages, &ctx->n_ring_pages, ++ true); ++ io_pages_unmap(ctx->sq_sqes, &ctx->sqe_pages, &ctx->n_sqe_pages, ++ true); + } else { + io_pages_free(&ctx->ring_pages, ctx->n_ring_pages); + ctx->n_ring_pages = 0; +@@ -2876,8 +2869,8 @@ err: + return ERR_PTR(-ENOMEM); + } + +-static void *io_pages_map(struct page ***out_pages, unsigned short *npages, +- size_t size) ++void *io_pages_map(struct page ***out_pages, unsigned short *npages, ++ size_t size) + { + gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN; + struct page **pages; +@@ -2909,17 +2902,6 @@ fail: + return ret; + } + +-void *io_mem_alloc(size_t size) +-{ +- gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP; +- void *ret; +- +- ret = (void *) __get_free_pages(gfp, get_order(size)); +- if (ret) +- return ret; +- return ERR_PTR(-ENOMEM); +-} +- + static unsigned long rings_size(struct io_ring_ctx *ctx, unsigned int sq_entries, + unsigned int cq_entries, size_t *sq_offset) + { +@@ -3073,7 +3055,6 @@ static __cold void io_ring_ctx_free(stru + ctx->mm_account = NULL; + } + io_rings_free(ctx); +- io_kbuf_mmap_list_free(ctx); + + percpu_ref_exit(&ctx->refs); + free_uid(ctx->user); +@@ -3563,10 +3544,8 @@ static void *io_uring_validate_mmap_requ + { + struct io_ring_ctx *ctx = file->private_data; + loff_t offset = pgoff << PAGE_SHIFT; +- struct page *page; +- void *ptr; + +- switch (offset & IORING_OFF_MMAP_MASK) { ++ switch ((pgoff << PAGE_SHIFT) & IORING_OFF_MMAP_MASK) { + case IORING_OFF_SQ_RING: + case IORING_OFF_CQ_RING: + /* Don't allow mmap if the ring was setup without it */ +@@ -3581,6 +3560,7 @@ static void *io_uring_validate_mmap_requ + case IORING_OFF_PBUF_RING: { + struct io_buffer_list *bl; + unsigned int bgid; ++ void *ptr; + + bgid = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT; + bl = io_pbuf_get_bl(ctx, bgid); +@@ -3588,17 +3568,11 @@ static void *io_uring_validate_mmap_requ + return bl; + ptr = bl->buf_ring; + io_put_bl(ctx, bl); +- break; ++ return ptr; + } +- default: +- return ERR_PTR(-EINVAL); + } + +- page = virt_to_head_page(ptr); +- if (sz > page_size(page)) +- return ERR_PTR(-EINVAL); +- +- return ptr; ++ return ERR_PTR(-EINVAL); + } + + int io_uring_mmap_pages(struct io_ring_ctx *ctx, struct vm_area_struct *vma, +@@ -3618,7 +3592,6 @@ static __cold int io_uring_mmap(struct f + size_t sz = vma->vm_end - vma->vm_start; + long offset = vma->vm_pgoff << PAGE_SHIFT; + unsigned int npages; +- unsigned long pfn; + void *ptr; + + ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz); +@@ -3633,10 +3606,11 @@ 
static __cold int io_uring_mmap(struct f + case IORING_OFF_SQES: + return io_uring_mmap_pages(ctx, vma, ctx->sqe_pages, + ctx->n_sqe_pages); ++ case IORING_OFF_PBUF_RING: ++ return io_pbuf_mmap(file, vma); + } + +- pfn = virt_to_phys(ptr) >> PAGE_SHIFT; +- return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot); ++ return -EINVAL; + } + + static unsigned long io_uring_mmu_get_unmapped_area(struct file *filp, +--- a/io_uring/io_uring.h ++++ b/io_uring/io_uring.h +@@ -93,8 +93,10 @@ bool __io_alloc_req_refill(struct io_rin + bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task, + bool cancel_all); + +-void *io_mem_alloc(size_t size); +-void io_mem_free(void *ptr); ++void *io_pages_map(struct page ***out_pages, unsigned short *npages, ++ size_t size); ++void io_pages_unmap(void *ptr, struct page ***pages, unsigned short *npages, ++ bool put_pages); + + #if defined(CONFIG_PROVE_LOCKING) + static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx) +--- a/io_uring/kbuf.c ++++ b/io_uring/kbuf.c +@@ -30,25 +30,12 @@ struct io_provide_buf { + __u16 bid; + }; + +-static inline struct io_buffer_list *__io_buffer_get_list(struct io_ring_ctx *ctx, +- unsigned int bgid) +-{ +- return xa_load(&ctx->io_bl_xa, bgid); +-} +- +-struct io_buf_free { +- struct hlist_node list; +- void *mem; +- size_t size; +- int inuse; +-}; +- + static inline struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx, + unsigned int bgid) + { + lockdep_assert_held(&ctx->uring_lock); + +- return __io_buffer_get_list(ctx, bgid); ++ return xa_load(&ctx->io_bl_xa, bgid); + } + + static int io_buffer_add_list(struct io_ring_ctx *ctx, +@@ -199,24 +186,6 @@ void __user *io_buffer_select(struct io_ + return ret; + } + +-/* +- * Mark the given mapped range as free for reuse +- */ +-static void io_kbuf_mark_free(struct io_ring_ctx *ctx, struct io_buffer_list *bl) +-{ +- struct io_buf_free *ibf; +- +- hlist_for_each_entry(ibf, &ctx->io_buf_list, list) { +- if (bl->buf_ring == ibf->mem) { +- ibf->inuse = 0; +- return; +- } +- } +- +- /* can't happen... */ +- WARN_ON_ONCE(1); +-} +- + static int __io_remove_buffers(struct io_ring_ctx *ctx, + struct io_buffer_list *bl, unsigned nbufs) + { +@@ -228,23 +197,16 @@ static int __io_remove_buffers(struct io + + if (bl->is_mapped) { + i = bl->buf_ring->tail - bl->head; +- if (bl->is_mmap) { +- /* +- * io_kbuf_list_free() will free the page(s) at +- * ->release() time. +- */ +- io_kbuf_mark_free(ctx, bl); +- bl->buf_ring = NULL; +- bl->is_mmap = 0; +- } else if (bl->buf_nr_pages) { ++ if (bl->buf_nr_pages) { + int j; + +- for (j = 0; j < bl->buf_nr_pages; j++) +- unpin_user_page(bl->buf_pages[j]); +- kvfree(bl->buf_pages); +- vunmap(bl->buf_ring); +- bl->buf_pages = NULL; +- bl->buf_nr_pages = 0; ++ if (!bl->is_mmap) { ++ for (j = 0; j < bl->buf_nr_pages; j++) ++ unpin_user_page(bl->buf_pages[j]); ++ } ++ io_pages_unmap(bl->buf_ring, &bl->buf_pages, ++ &bl->buf_nr_pages, bl->is_mmap); ++ bl->is_mmap = 0; + } + /* make sure it's seen as empty */ + INIT_LIST_HEAD(&bl->buf_list); +@@ -540,63 +502,17 @@ error_unpin: + return ret; + } + +-/* +- * See if we have a suitable region that we can reuse, rather than allocate +- * both a new io_buf_free and mem region again. We leave it on the list as +- * even a reused entry will need freeing at ring release. 
+- */ +-static struct io_buf_free *io_lookup_buf_free_entry(struct io_ring_ctx *ctx, +- size_t ring_size) +-{ +- struct io_buf_free *ibf, *best = NULL; +- size_t best_dist; +- +- hlist_for_each_entry(ibf, &ctx->io_buf_list, list) { +- size_t dist; +- +- if (ibf->inuse || ibf->size < ring_size) +- continue; +- dist = ibf->size - ring_size; +- if (!best || dist < best_dist) { +- best = ibf; +- if (!dist) +- break; +- best_dist = dist; +- } +- } +- +- return best; +-} +- + static int io_alloc_pbuf_ring(struct io_ring_ctx *ctx, + struct io_uring_buf_reg *reg, + struct io_buffer_list *bl) + { +- struct io_buf_free *ibf; + size_t ring_size; +- void *ptr; + + ring_size = reg->ring_entries * sizeof(struct io_uring_buf_ring); + +- /* Reuse existing entry, if we can */ +- ibf = io_lookup_buf_free_entry(ctx, ring_size); +- if (!ibf) { +- ptr = io_mem_alloc(ring_size); +- if (IS_ERR(ptr)) +- return PTR_ERR(ptr); +- +- /* Allocate and store deferred free entry */ +- ibf = kmalloc(sizeof(*ibf), GFP_KERNEL_ACCOUNT); +- if (!ibf) { +- io_mem_free(ptr); +- return -ENOMEM; +- } +- ibf->mem = ptr; +- ibf->size = ring_size; +- hlist_add_head(&ibf->list, &ctx->io_buf_list); +- } +- ibf->inuse = 1; +- bl->buf_ring = ibf->mem; ++ bl->buf_ring = io_pages_map(&bl->buf_pages, &bl->buf_nr_pages, ring_size); ++ if (!bl->buf_ring) ++ return -ENOMEM; + bl->is_mapped = 1; + bl->is_mmap = 1; + return 0; +@@ -719,18 +635,19 @@ struct io_buffer_list *io_pbuf_get_bl(st + return ERR_PTR(-EINVAL); + } + +-/* +- * Called at or after ->release(), free the mmap'ed buffers that we used +- * for memory mapped provided buffer rings. +- */ +-void io_kbuf_mmap_list_free(struct io_ring_ctx *ctx) +-{ +- struct io_buf_free *ibf; +- struct hlist_node *tmp; +- +- hlist_for_each_entry_safe(ibf, tmp, &ctx->io_buf_list, list) { +- hlist_del(&ibf->list); +- io_mem_free(ibf->mem); +- kfree(ibf); +- } ++int io_pbuf_mmap(struct file *file, struct vm_area_struct *vma) ++{ ++ struct io_ring_ctx *ctx = file->private_data; ++ loff_t pgoff = vma->vm_pgoff << PAGE_SHIFT; ++ struct io_buffer_list *bl; ++ int bgid, ret; ++ ++ bgid = (pgoff & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT; ++ bl = io_pbuf_get_bl(ctx, bgid); ++ if (IS_ERR(bl)) ++ return PTR_ERR(bl); ++ ++ ret = io_uring_mmap_pages(ctx, vma, bl->buf_pages, bl->buf_nr_pages); ++ io_put_bl(ctx, bl); ++ return ret; + } +--- a/io_uring/kbuf.h ++++ b/io_uring/kbuf.h +@@ -54,8 +54,6 @@ int io_provide_buffers(struct io_kiocb * + int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg); + int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg); + +-void io_kbuf_mmap_list_free(struct io_ring_ctx *ctx); +- + unsigned int __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags); + + void io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags); +@@ -63,6 +61,7 @@ void io_kbuf_recycle_legacy(struct io_ki + void io_put_bl(struct io_ring_ctx *ctx, struct io_buffer_list *bl); + struct io_buffer_list *io_pbuf_get_bl(struct io_ring_ctx *ctx, + unsigned long bgid); ++int io_pbuf_mmap(struct file *file, struct vm_area_struct *vma); + + static inline void io_kbuf_recycle_ring(struct io_kiocb *req) + { diff --git a/queue-6.6/io_uring-kbuf-vmap-pinned-buffer-ring.patch b/queue-6.6/io_uring-kbuf-vmap-pinned-buffer-ring.patch new file mode 100644 index 0000000000..5b75f52b31 --- /dev/null +++ b/queue-6.6/io_uring-kbuf-vmap-pinned-buffer-ring.patch @@ -0,0 +1,112 @@ +From 432b583ab581f2c21cad164d396a8e9fa4754a22 Mon Sep 17 00:00:00 2001 +From: Jens Axboe +Date: Tue, 12 Mar 
2024 10:42:27 -0600 +Subject: io_uring/kbuf: vmap pinned buffer ring + +From: Jens Axboe + +Commit e270bfd22a2a10d1cfbaddf23e79b6d0b405d21e upstream. + +This avoids needing to care about HIGHMEM, and it makes the buffer +indexing easier as both ring provided buffer methods are now virtually +mapped in a contigious fashion. + +Signed-off-by: Jens Axboe +Signed-off-by: Greg Kroah-Hartman +--- + io_uring/kbuf.c | 39 +++++++++++++++------------------------ + 1 file changed, 15 insertions(+), 24 deletions(-) + +--- a/io_uring/kbuf.c ++++ b/io_uring/kbuf.c +@@ -7,6 +7,7 @@ + #include + #include + #include ++#include + #include + + #include +@@ -153,15 +154,7 @@ static void __user *io_ring_buffer_selec + return NULL; + + head &= bl->mask; +- /* mmaped buffers are always contig */ +- if (bl->is_mmap || head < IO_BUFFER_LIST_BUF_PER_PAGE) { +- buf = &br->bufs[head]; +- } else { +- int off = head & (IO_BUFFER_LIST_BUF_PER_PAGE - 1); +- int index = head / IO_BUFFER_LIST_BUF_PER_PAGE; +- buf = page_address(bl->buf_pages[index]); +- buf += off; +- } ++ buf = &br->bufs[head]; + if (*len == 0 || *len > buf->len) + *len = buf->len; + req->flags |= REQ_F_BUFFER_RING; +@@ -249,6 +242,7 @@ static int __io_remove_buffers(struct io + for (j = 0; j < bl->buf_nr_pages; j++) + unpin_user_page(bl->buf_pages[j]); + kvfree(bl->buf_pages); ++ vunmap(bl->buf_ring); + bl->buf_pages = NULL; + bl->buf_nr_pages = 0; + } +@@ -501,9 +495,9 @@ err: + static int io_pin_pbuf_ring(struct io_uring_buf_reg *reg, + struct io_buffer_list *bl) + { +- struct io_uring_buf_ring *br; ++ struct io_uring_buf_ring *br = NULL; ++ int nr_pages, ret, i; + struct page **pages; +- int i, nr_pages; + + pages = io_pin_pages(reg->ring_addr, + flex_array_size(br, bufs, reg->ring_entries), +@@ -511,18 +505,12 @@ static int io_pin_pbuf_ring(struct io_ur + if (IS_ERR(pages)) + return PTR_ERR(pages); + +- /* +- * Apparently some 32-bit boxes (ARM) will return highmem pages, +- * which then need to be mapped. We could support that, but it'd +- * complicate the code and slowdown the common cases quite a bit. +- * So just error out, returning -EINVAL just like we did on kernels +- * that didn't support mapped buffer rings. +- */ +- for (i = 0; i < nr_pages; i++) +- if (PageHighMem(pages[i])) +- goto error_unpin; ++ br = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL); ++ if (!br) { ++ ret = -ENOMEM; ++ goto error_unpin; ++ } + +- br = page_address(pages[0]); + #ifdef SHM_COLOUR + /* + * On platforms that have specific aliasing requirements, SHM_COLOUR +@@ -533,8 +521,10 @@ static int io_pin_pbuf_ring(struct io_ur + * should use IOU_PBUF_RING_MMAP instead, and liburing will handle + * this transparently. 
+ */ +- if ((reg->ring_addr | (unsigned long) br) & (SHM_COLOUR - 1)) ++ if ((reg->ring_addr | (unsigned long) br) & (SHM_COLOUR - 1)) { ++ ret = -EINVAL; + goto error_unpin; ++ } + #endif + bl->buf_pages = pages; + bl->buf_nr_pages = nr_pages; +@@ -546,7 +536,8 @@ error_unpin: + for (i = 0; i < nr_pages; i++) + unpin_user_page(pages[i]); + kvfree(pages); +- return -EINVAL; ++ vunmap(br); ++ return ret; + } + + /* diff --git a/queue-6.6/io_uring-return-error-pointer-from-io_mem_alloc.patch b/queue-6.6/io_uring-return-error-pointer-from-io_mem_alloc.patch new file mode 100644 index 0000000000..b3487bede6 --- /dev/null +++ b/queue-6.6/io_uring-return-error-pointer-from-io_mem_alloc.patch @@ -0,0 +1,76 @@ +From b001225fa4fe09610b35b428e46193ed2a28c95f Mon Sep 17 00:00:00 2001 +From: Jens Axboe +Date: Fri, 5 Nov 2021 17:13:52 -0600 +Subject: io_uring: return error pointer from io_mem_alloc() + +From: Jens Axboe + +Commit e27cef86a0edd4ef7f8b4670f508a03b509cbbb2 upstream. + +In preparation for having more than one time of ring allocator, make the +existing one return valid/error-pointer rather than just NULL. + +Signed-off-by: Jens Axboe +Signed-off-by: Greg Kroah-Hartman +--- + io_uring/io_uring.c | 18 ++++++++++++------ + 1 file changed, 12 insertions(+), 6 deletions(-) + +diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c +index 33597284e1cb..ebcb0680f1cc 100644 +--- a/io_uring/io_uring.c ++++ b/io_uring/io_uring.c +@@ -2528,8 +2528,12 @@ static void io_mem_free(void *ptr) + static void *io_mem_alloc(size_t size) + { + gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP; ++ void *ret; + +- return (void *) __get_free_pages(gfp, get_order(size)); ++ ret = (void *) __get_free_pages(gfp, get_order(size)); ++ if (ret) ++ return ret; ++ return ERR_PTR(-ENOMEM); + } + + static unsigned long rings_size(struct io_ring_ctx *ctx, unsigned int sq_entries, +@@ -3422,6 +3426,7 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx, + { + struct io_rings *rings; + size_t size, sq_array_offset; ++ void *ptr; + + /* make sure these are sane, as we already accounted them */ + ctx->sq_entries = p->sq_entries; +@@ -3432,8 +3437,8 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx, + return -EOVERFLOW; + + rings = io_mem_alloc(size); +- if (!rings) +- return -ENOMEM; ++ if (IS_ERR(rings)) ++ return PTR_ERR(rings); + + ctx->rings = rings; + ctx->sq_array = (u32 *)((char *)rings + sq_array_offset); +@@ -3452,13 +3457,14 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx, + return -EOVERFLOW; + } + +- ctx->sq_sqes = io_mem_alloc(size); +- if (!ctx->sq_sqes) { ++ ptr = io_mem_alloc(size); ++ if (IS_ERR(ptr)) { + io_mem_free(ctx->rings); + ctx->rings = NULL; +- return -ENOMEM; ++ return PTR_ERR(ptr); + } + ++ ctx->sq_sqes = ptr; + return 0; + } + +-- +2.47.2 + diff --git a/queue-6.6/io_uring-unify-io_pin_pages.patch b/queue-6.6/io_uring-unify-io_pin_pages.patch new file mode 100644 index 0000000000..b0088137a9 --- /dev/null +++ b/queue-6.6/io_uring-unify-io_pin_pages.patch @@ -0,0 +1,156 @@ +From dc5ec8a2f867b4211508a5ded8616103f4d67112 Mon Sep 17 00:00:00 2001 +From: Jens Axboe +Date: Wed, 13 Mar 2024 14:58:14 -0600 +Subject: io_uring: unify io_pin_pages() + +From: Jens Axboe + +Commit 1943f96b3816e0f0d3d6686374d6e1d617c8b42c upstream. + +Move it into io_uring.c where it belongs, and use it in there as well +rather than have two implementations of this. 
+ +Signed-off-by: Jens Axboe +Signed-off-by: Greg Kroah-Hartman +--- + io_uring/io_uring.c | 61 +++++++++++++++++++++++++++++++++++----------------- + io_uring/rsrc.c | 39 --------------------------------- + 2 files changed, 42 insertions(+), 58 deletions(-) + +--- a/io_uring/io_uring.c ++++ b/io_uring/io_uring.c +@@ -2738,33 +2738,57 @@ static void io_pages_free(struct page ** + *pages = NULL; + } + ++struct page **io_pin_pages(unsigned long uaddr, unsigned long len, int *npages) ++{ ++ unsigned long start, end, nr_pages; ++ struct page **pages; ++ int ret; ++ ++ end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; ++ start = uaddr >> PAGE_SHIFT; ++ nr_pages = end - start; ++ if (WARN_ON_ONCE(!nr_pages)) ++ return ERR_PTR(-EINVAL); ++ ++ pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL); ++ if (!pages) ++ return ERR_PTR(-ENOMEM); ++ ++ ret = pin_user_pages_fast(uaddr, nr_pages, FOLL_WRITE | FOLL_LONGTERM, ++ pages); ++ /* success, mapped all pages */ ++ if (ret == nr_pages) { ++ *npages = nr_pages; ++ return pages; ++ } ++ ++ /* partial map, or didn't map anything */ ++ if (ret >= 0) { ++ /* if we did partial map, release any pages we did get */ ++ if (ret) ++ unpin_user_pages(pages, ret); ++ ret = -EFAULT; ++ } ++ kvfree(pages); ++ return ERR_PTR(ret); ++} ++ + static void *__io_uaddr_map(struct page ***pages, unsigned short *npages, + unsigned long uaddr, size_t size) + { + struct page **page_array; + unsigned int nr_pages; + void *page_addr; +- int ret, pinned; + + *npages = 0; + + if (uaddr & (PAGE_SIZE - 1) || !size) + return ERR_PTR(-EINVAL); + +- nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; +- if (nr_pages > USHRT_MAX) +- return ERR_PTR(-EINVAL); +- page_array = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL); +- if (!page_array) +- return ERR_PTR(-ENOMEM); +- +- +- pinned = pin_user_pages_fast(uaddr, nr_pages, FOLL_WRITE | FOLL_LONGTERM, +- page_array); +- if (pinned != nr_pages) { +- ret = (pinned < 0) ? pinned : -EFAULT; +- goto free_pages; +- } ++ nr_pages = 0; ++ page_array = io_pin_pages(uaddr, size, &nr_pages); ++ if (IS_ERR(page_array)) ++ return page_array; + + page_addr = vmap(page_array, nr_pages, VM_MAP, PAGE_KERNEL); + if (page_addr) { +@@ -2772,10 +2796,9 @@ static void *__io_uaddr_map(struct page + *npages = nr_pages; + return page_addr; + } +- ret = -ENOMEM; +-free_pages: +- io_pages_free(&page_array, pinned > 0 ? pinned : 0); +- return ERR_PTR(ret); ++ ++ io_pages_free(&page_array, nr_pages); ++ return ERR_PTR(-ENOMEM); + } + + static void *io_rings_map(struct io_ring_ctx *ctx, unsigned long uaddr, +--- a/io_uring/rsrc.c ++++ b/io_uring/rsrc.c +@@ -873,45 +873,6 @@ static int io_buffer_account_pin(struct + return ret; + } + +-struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages) +-{ +- unsigned long start, end, nr_pages; +- struct page **pages = NULL; +- int pret, ret = -ENOMEM; +- +- end = (ubuf + len + PAGE_SIZE - 1) >> PAGE_SHIFT; +- start = ubuf >> PAGE_SHIFT; +- nr_pages = end - start; +- +- pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL); +- if (!pages) +- goto done; +- +- ret = 0; +- mmap_read_lock(current->mm); +- pret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM, +- pages); +- if (pret == nr_pages) +- *npages = nr_pages; +- else +- ret = pret < 0 ? 
pret : -EFAULT; +- +- mmap_read_unlock(current->mm); +- if (ret) { +- /* if we did partial map, release any pages we did get */ +- if (pret > 0) +- unpin_user_pages(pages, pret); +- goto done; +- } +- ret = 0; +-done: +- if (ret < 0) { +- kvfree(pages); +- pages = ERR_PTR(ret); +- } +- return pages; +-} +- + static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, + struct io_mapped_ubuf **pimu, + struct page **last_hpage) diff --git a/queue-6.6/io_uring-use-unpin_user_pages-where-appropriate.patch b/queue-6.6/io_uring-use-unpin_user_pages-where-appropriate.patch new file mode 100644 index 0000000000..06d703047c --- /dev/null +++ b/queue-6.6/io_uring-use-unpin_user_pages-where-appropriate.patch @@ -0,0 +1,40 @@ +From c8e556f54f547266d984bcffbb44279ec3884258 Mon Sep 17 00:00:00 2001 +From: Jens Axboe +Date: Wed, 13 Mar 2024 15:01:03 -0600 +Subject: io_uring: use unpin_user_pages() where appropriate + +From: Jens Axboe + +Commit 18595c0a58ae29ac6a996c5b664610119b73182d upstream. + +There are a few cases of open-rolled loops around unpin_user_page(), use +the generic helper instead. + +Signed-off-by: Jens Axboe +Signed-off-by: Greg Kroah-Hartman +--- + io_uring/kbuf.c | 5 ++--- + 1 file changed, 2 insertions(+), 3 deletions(-) + +--- a/io_uring/kbuf.c ++++ b/io_uring/kbuf.c +@@ -458,8 +458,8 @@ static int io_pin_pbuf_ring(struct io_ur + struct io_buffer_list *bl) + { + struct io_uring_buf_ring *br = NULL; +- int nr_pages, ret, i; + struct page **pages; ++ int nr_pages, ret; + + pages = io_pin_pages(reg->ring_addr, + flex_array_size(br, bufs, reg->ring_entries), +@@ -495,8 +495,7 @@ static int io_pin_pbuf_ring(struct io_ur + bl->is_mmap = 0; + return 0; + error_unpin: +- for (i = 0; i < nr_pages; i++) +- unpin_user_page(pages[i]); ++ unpin_user_pages(pages, nr_pages); + kvfree(pages); + vunmap(br); + return ret; diff --git a/queue-6.6/io_uring-use-vmap-for-ring-mapping.patch b/queue-6.6/io_uring-use-vmap-for-ring-mapping.patch new file mode 100644 index 0000000000..a0a21c9691 --- /dev/null +++ b/queue-6.6/io_uring-use-vmap-for-ring-mapping.patch @@ -0,0 +1,87 @@ +From 23cd4c4db8836b441e401328244a1864b47ac3c8 Mon Sep 17 00:00:00 2001 +From: Jens Axboe +Date: Wed, 13 Mar 2024 14:10:40 -0600 +Subject: io_uring: use vmap() for ring mapping + +From: Jens Axboe + +Commit 09fc75e0c035a2cabb8caa15cec6e85159dd94f0 upstream. + +This is the last holdout which does odd page checking, convert it to +vmap just like what is done for the non-mmap path. + +Signed-off-by: Jens Axboe +Signed-off-by: Greg Kroah-Hartman +--- + io_uring/io_uring.c | 38 +++++++++----------------------------- + 1 file changed, 9 insertions(+), 29 deletions(-) + +--- a/io_uring/io_uring.c ++++ b/io_uring/io_uring.c +@@ -64,7 +64,6 @@ + #include + #include + #include +-#include + #include + #include + #include +@@ -2745,7 +2744,7 @@ static void *__io_uaddr_map(struct page + struct page **page_array; + unsigned int nr_pages; + void *page_addr; +- int ret, i, pinned; ++ int ret, pinned; + + *npages = 0; + +@@ -2767,34 +2766,13 @@ static void *__io_uaddr_map(struct page + goto free_pages; + } + +- page_addr = page_address(page_array[0]); +- for (i = 0; i < nr_pages; i++) { +- ret = -EINVAL; +- +- /* +- * Can't support mapping user allocated ring memory on 32-bit +- * archs where it could potentially reside in highmem. Just +- * fail those with -EINVAL, just like we did on kernels that +- * didn't support this feature. 
+- */ +- if (PageHighMem(page_array[i])) +- goto free_pages; +- +- /* +- * No support for discontig pages for now, should either be a +- * single normal page, or a huge page. Later on we can add +- * support for remapping discontig pages, for now we will +- * just fail them with EINVAL. +- */ +- if (page_address(page_array[i]) != page_addr) +- goto free_pages; +- page_addr += PAGE_SIZE; ++ page_addr = vmap(page_array, nr_pages, VM_MAP, PAGE_KERNEL); ++ if (page_addr) { ++ *pages = page_array; ++ *npages = nr_pages; ++ return page_addr; + } +- +- *pages = page_array; +- *npages = nr_pages; +- return page_to_virt(page_array[0]); +- ++ ret = -ENOMEM; + free_pages: + io_pages_free(&page_array, pinned > 0 ? pinned : 0); + return ERR_PTR(ret); +@@ -2824,6 +2802,8 @@ static void io_rings_free(struct io_ring + ctx->n_ring_pages = 0; + io_pages_free(&ctx->sqe_pages, ctx->n_sqe_pages); + ctx->n_sqe_pages = 0; ++ vunmap(ctx->rings); ++ vunmap(ctx->sq_sqes); + } + + ctx->rings = NULL; diff --git a/queue-6.6/mm-add-nommu-variant-of-vm_insert_pages.patch b/queue-6.6/mm-add-nommu-variant-of-vm_insert_pages.patch new file mode 100644 index 0000000000..0a2953dd40 --- /dev/null +++ b/queue-6.6/mm-add-nommu-variant-of-vm_insert_pages.patch @@ -0,0 +1,36 @@ +From ac77b7bfe1633f5366bceb76d74d2f04846b2186 Mon Sep 17 00:00:00 2001 +From: Jens Axboe +Date: Sat, 16 Mar 2024 07:21:43 -0600 +Subject: mm: add nommu variant of vm_insert_pages() + +From: Jens Axboe + +Commit 62346c6cb28b043f2a6e95337d9081ec0b37b5f5 upstream. + +An identical one exists for vm_insert_page(), add one for +vm_insert_pages() to avoid needing to check for CONFIG_MMU in code using +it. + +Acked-by: Johannes Weiner +Signed-off-by: Jens Axboe +Signed-off-by: Greg Kroah-Hartman +--- + mm/nommu.c | 7 +++++++ + 1 file changed, 7 insertions(+) + +--- a/mm/nommu.c ++++ b/mm/nommu.c +@@ -357,6 +357,13 @@ int vm_insert_page(struct vm_area_struct + } + EXPORT_SYMBOL(vm_insert_page); + ++int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr, ++ struct page **pages, unsigned long *num) ++{ ++ return -EINVAL; ++} ++EXPORT_SYMBOL(vm_insert_pages); ++ + int vm_map_pages(struct vm_area_struct *vma, struct page **pages, + unsigned long num) + { diff --git a/queue-6.6/series b/queue-6.6/series index 97f1e6badf..0488fcb0ee 100644 --- a/queue-6.6/series +++ b/queue-6.6/series @@ -98,3 +98,15 @@ mptcp-safety-check-before-fallback.patch drm-nouveau-do-not-override-forced-connector-status.patch net-handle-napi_schedule-calls-from-non-interrupt.patch block-fix-kmem_cache-of-name-bio-108-already-exists.patch +mm-add-nommu-variant-of-vm_insert_pages.patch +io_uring-get-rid-of-remap_pfn_range-for-mapping-rings-sqes.patch +io_uring-don-t-attempt-to-mmap-larger-than-what-the-user-asks-for.patch +io_uring-fix-corner-case-forgetting-to-vunmap.patch +io_uring-use-vmap-for-ring-mapping.patch +io_uring-unify-io_pin_pages.patch +io_uring-kbuf-vmap-pinned-buffer-ring.patch +io_uring-kbuf-use-vm_insert_pages-for-mmap-ed-pbuf-ring.patch +io_uring-use-unpin_user_pages-where-appropriate.patch +io_uring-fix-error-pbuf-checking.patch +io_uring-add-ring-freeing-helper.patch +io_uring-return-error-pointer-from-io_mem_alloc.patch -- 2.47.2
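
As background for the mmap-capping fix queued above ("io_uring: don't attempt to mmap larger than what the user asks for"): the sketch below shows the legacy three-mmap userspace pattern that produces the partial mappings that patch guards against. This is an illustrative example built only from the documented io_uring_setup(2) UAPI, not code taken from any of the queued patches; the wrapper and function names are made up and error handling is minimal.

/*
 * Illustrative sketch (not part of the queued patches): map the SQ ring,
 * CQ ring and SQE array with three separate mmap() calls, as an
 * application that ignores IORING_FEAT_SINGLE_MMAP would. Each mapping
 * is only as long as that region needs to be, which is why the kernel
 * side caps the page count handed to vm_insert_pages() to the size the
 * caller actually asked for.
 */
#include <linux/io_uring.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

static int io_uring_setup(unsigned entries, struct io_uring_params *p)
{
	return syscall(__NR_io_uring_setup, entries, p);
}

int map_rings_separately(void)
{
	struct io_uring_params p = { 0 };
	size_t sq_sz, cq_sz, sqes_sz;
	void *sq, *cq, *sqes;
	int fd;

	fd = io_uring_setup(64, &p);
	if (fd < 0)
		return -1;

	/* Region sizes as documented in io_uring_setup(2) */
	sq_sz   = p.sq_off.array + p.sq_entries * sizeof(unsigned);
	cq_sz   = p.cq_off.cqes + p.cq_entries * sizeof(struct io_uring_cqe);
	sqes_sz = p.sq_entries * sizeof(struct io_uring_sqe);

	/* Three separate mappings, one per IORING_OFF_* region */
	sq   = mmap(NULL, sq_sz, PROT_READ | PROT_WRITE,
		    MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQ_RING);
	cq   = mmap(NULL, cq_sz, PROT_READ | PROT_WRITE,
		    MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_CQ_RING);
	sqes = mmap(NULL, sqes_sz, PROT_READ | PROT_WRITE,
		    MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQES);

	if (sq == MAP_FAILED || cq == MAP_FAILED || sqes == MAP_FAILED)
		return -1;
	return 0;
}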