6.6-stable patches
author Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Sun, 16 Mar 2025 06:00:40 +0000 (07:00 +0100)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Sun, 16 Mar 2025 06:00:40 +0000 (07:00 +0100)
added patches:
io_uring-add-ring-freeing-helper.patch
io_uring-don-t-attempt-to-mmap-larger-than-what-the-user-asks-for.patch
io_uring-fix-corner-case-forgetting-to-vunmap.patch
io_uring-fix-error-pbuf-checking.patch
io_uring-get-rid-of-remap_pfn_range-for-mapping-rings-sqes.patch
io_uring-kbuf-use-vm_insert_pages-for-mmap-ed-pbuf-ring.patch
io_uring-kbuf-vmap-pinned-buffer-ring.patch
io_uring-return-error-pointer-from-io_mem_alloc.patch
io_uring-unify-io_pin_pages.patch
io_uring-use-unpin_user_pages-where-appropriate.patch
io_uring-use-vmap-for-ring-mapping.patch
mm-add-nommu-variant-of-vm_insert_pages.patch

13 files changed:
queue-6.6/io_uring-add-ring-freeing-helper.patch [new file with mode: 0644]
queue-6.6/io_uring-don-t-attempt-to-mmap-larger-than-what-the-user-asks-for.patch [new file with mode: 0644]
queue-6.6/io_uring-fix-corner-case-forgetting-to-vunmap.patch [new file with mode: 0644]
queue-6.6/io_uring-fix-error-pbuf-checking.patch [new file with mode: 0644]
queue-6.6/io_uring-get-rid-of-remap_pfn_range-for-mapping-rings-sqes.patch [new file with mode: 0644]
queue-6.6/io_uring-kbuf-use-vm_insert_pages-for-mmap-ed-pbuf-ring.patch [new file with mode: 0644]
queue-6.6/io_uring-kbuf-vmap-pinned-buffer-ring.patch [new file with mode: 0644]
queue-6.6/io_uring-return-error-pointer-from-io_mem_alloc.patch [new file with mode: 0644]
queue-6.6/io_uring-unify-io_pin_pages.patch [new file with mode: 0644]
queue-6.6/io_uring-use-unpin_user_pages-where-appropriate.patch [new file with mode: 0644]
queue-6.6/io_uring-use-vmap-for-ring-mapping.patch [new file with mode: 0644]
queue-6.6/mm-add-nommu-variant-of-vm_insert_pages.patch [new file with mode: 0644]
queue-6.6/series

diff --git a/queue-6.6/io_uring-add-ring-freeing-helper.patch b/queue-6.6/io_uring-add-ring-freeing-helper.patch
new file mode 100644 (file)
index 0000000..fd3c4a2
--- /dev/null
@@ -0,0 +1,68 @@
+From 8c273186074a591cfdcd4370849676bc3eeb6ecb Mon Sep 17 00:00:00 2001
+From: Jens Axboe <axboe@kernel.dk>
+Date: Fri, 5 Nov 2021 17:15:46 -0600
+Subject: io_uring: add ring freeing helper
+
+From: Jens Axboe <axboe@kernel.dk>
+
+Commit 9c189eee73af1825ea9c895fafad469de5f82641 upstream.
+
+We free the rings and sqes separately; move them into a helper that does
+both the freeing and clearing of the memory.
+
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ io_uring/io_uring.c | 17 +++++++++++------
+ 1 file changed, 11 insertions(+), 6 deletions(-)
+
+diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
+index ebcb0680f1cc..b211feb0d2b1 100644
+--- a/io_uring/io_uring.c
++++ b/io_uring/io_uring.c
+@@ -2525,6 +2525,14 @@ static void io_mem_free(void *ptr)
+               free_compound_page(page);
+ }
++static void io_rings_free(struct io_ring_ctx *ctx)
++{
++      io_mem_free(ctx->rings);
++      io_mem_free(ctx->sq_sqes);
++      ctx->rings = NULL;
++      ctx->sq_sqes = NULL;
++}
++
+ static void *io_mem_alloc(size_t size)
+ {
+       gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP;
+@@ -2684,8 +2692,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
+               mmdrop(ctx->mm_account);
+               ctx->mm_account = NULL;
+       }
+-      io_mem_free(ctx->rings);
+-      io_mem_free(ctx->sq_sqes);
++      io_rings_free(ctx);
+       percpu_ref_exit(&ctx->refs);
+       free_uid(ctx->user);
+@@ -3452,15 +3459,13 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
+       else
+               size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
+       if (size == SIZE_MAX) {
+-              io_mem_free(ctx->rings);
+-              ctx->rings = NULL;
++              io_rings_free(ctx);
+               return -EOVERFLOW;
+       }
+       ptr = io_mem_alloc(size);
+       if (IS_ERR(ptr)) {
+-              io_mem_free(ctx->rings);
+-              ctx->rings = NULL;
++              io_rings_free(ctx);
+               return PTR_ERR(ptr);
+       }
+-- 
+2.47.2
+
diff --git a/queue-6.6/io_uring-don-t-attempt-to-mmap-larger-than-what-the-user-asks-for.patch b/queue-6.6/io_uring-don-t-attempt-to-mmap-larger-than-what-the-user-asks-for.patch
new file mode 100644 (file)
index 0000000..957322c
--- /dev/null
@@ -0,0 +1,51 @@
+From 71318baa99b6fbb65edf76dd0afaad3afd7007cc Mon Sep 17 00:00:00 2001
+From: Jens Axboe <axboe@kernel.dk>
+Date: Wed, 29 May 2024 09:38:38 -0600
+Subject: io_uring: don't attempt to mmap larger than what the user asks for
+
+From: Jens Axboe <axboe@kernel.dk>
+
+Commit 06fe9b1df1086b42718d632aa57e8f7cd1a66a21 upstream.
+
+If IORING_FEAT_SINGLE_MMAP is ignored, as can happen if an application
+uses an ancient liburing or does the setup manually, then three mmap()
+calls are required to map the ring into userspace. The kernel will still
+have collapsed the mappings; however, userspace may ask to map them
+individually. If so, then we should not use the full number of ring
+pages, as it may exceed the partial mapping. Doing so will yield an
+-EFAULT from vm_insert_pages(), as we pass in more pages than what the
+application asked for.
+
+Cap the number of pages to match what the application asked for, for
+the particular mapping operation.
+
+Reported-by: Lucas Mülling <lmulling@proton.me>
+Link: https://github.com/axboe/liburing/issues/1157
+Fixes: 3ab1db3c6039 ("io_uring: get rid of remap_pfn_range() for mapping rings/sqes")
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ io_uring/io_uring.c |    5 +++--
+ 1 file changed, 3 insertions(+), 2 deletions(-)
+
+--- a/io_uring/io_uring.c
++++ b/io_uring/io_uring.c
+@@ -3612,6 +3612,7 @@ static __cold int io_uring_mmap(struct f
+       struct io_ring_ctx *ctx = file->private_data;
+       size_t sz = vma->vm_end - vma->vm_start;
+       long offset = vma->vm_pgoff << PAGE_SHIFT;
++      unsigned int npages;
+       unsigned long pfn;
+       void *ptr;
+@@ -3622,8 +3623,8 @@ static __cold int io_uring_mmap(struct f
+       switch (offset & IORING_OFF_MMAP_MASK) {
+       case IORING_OFF_SQ_RING:
+       case IORING_OFF_CQ_RING:
+-              return io_uring_mmap_pages(ctx, vma, ctx->ring_pages,
+-                                              ctx->n_ring_pages);
++              npages = min(ctx->n_ring_pages, (sz + PAGE_SIZE - 1) >> PAGE_SHIFT);
++              return io_uring_mmap_pages(ctx, vma, ctx->ring_pages, npages);
+       case IORING_OFF_SQES:
+               return io_uring_mmap_pages(ctx, vma, ctx->sqe_pages,
+                                               ctx->n_sqe_pages);
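
For context, a minimal userspace sketch (not part of the queued patch; sizes and error handling are assumed to be precomputed elsewhere) of the legacy three-mmap setup the message above describes. Each region is mapped with its own length, so a given mapping may cover fewer pages than the kernel's combined ring allocation, which is exactly why the kernel side now caps the page count per mapping.

#include <sys/mman.h>
#include <linux/io_uring.h>

/*
 * Legacy setup path: SQ ring, CQ ring and the SQE array are mapped with
 * three separate mmap() calls, each sized for just that region.  The
 * offsets are the UAPI constants; sq_ring_sz/cq_ring_sz/sqes_sz are
 * assumed to come from the io_uring_params offsets.
 */
static int map_rings_separately(int ring_fd, size_t sq_ring_sz,
                                size_t cq_ring_sz, size_t sqes_sz,
                                void **sq, void **cq, void **sqes)
{
        *sq = mmap(NULL, sq_ring_sz, PROT_READ | PROT_WRITE,
                   MAP_SHARED | MAP_POPULATE, ring_fd, IORING_OFF_SQ_RING);
        *cq = mmap(NULL, cq_ring_sz, PROT_READ | PROT_WRITE,
                   MAP_SHARED | MAP_POPULATE, ring_fd, IORING_OFF_CQ_RING);
        *sqes = mmap(NULL, sqes_sz, PROT_READ | PROT_WRITE,
                     MAP_SHARED | MAP_POPULATE, ring_fd, IORING_OFF_SQES);
        if (*sq == MAP_FAILED || *cq == MAP_FAILED || *sqes == MAP_FAILED)
                return -1;
        return 0;
}
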
diff --git a/queue-6.6/io_uring-fix-corner-case-forgetting-to-vunmap.patch b/queue-6.6/io_uring-fix-corner-case-forgetting-to-vunmap.patch
new file mode 100644 (file)
index 0000000..3c2364f
--- /dev/null
@@ -0,0 +1,49 @@
+From fb318430c8de3dcee5727f050dfe3f3dd8c4549c Mon Sep 17 00:00:00 2001
+From: Pavel Begunkov <asml.silence@gmail.com>
+Date: Mon, 25 Nov 2024 23:10:31 +0000
+Subject: io_uring: fix corner case forgetting to vunmap
+
+From: Pavel Begunkov <asml.silence@gmail.com>
+
+Commit 43eef70e7e2ac74e7767731dd806720c7fb5e010 upstream.
+
+io_pages_unmap() is a bit tricky in trying to figure out whether the
+pages were previously vmap'ed or not. In particular, if there is just
+one page it believes there is no need to vunmap. The paired
+io_pages_map(), however, could've failed io_mem_alloc_compound() and
+fallen back to io_mem_alloc_single(), which does vmap, and that leads
+to an unpaired vmap.
+
+The solution is to fail if io_mem_alloc_compound() can't allocate a
+single page. That's the easiest way to deal with it, and those two
+functions are getting removed soon, so no need to overcomplicate it.
+
+Cc: stable@vger.kernel.org
+Fixes: 3ab1db3c6039e ("io_uring: get rid of remap_pfn_range() for mapping rings/sqes")
+Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
+Link: https://lore.kernel.org/r/477e75a3907a2fe83249e49c0a92cd480b2c60e0.1732569842.git.asml.silence@gmail.com
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ io_uring/io_uring.c |    4 +++-
+ 1 file changed, 3 insertions(+), 1 deletion(-)
+
+--- a/io_uring/io_uring.c
++++ b/io_uring/io_uring.c
+@@ -2889,6 +2889,8 @@ static void *io_pages_map(struct page **
+       ret = io_mem_alloc_compound(pages, nr_pages, size, gfp);
+       if (!IS_ERR(ret))
+               goto done;
++      if (nr_pages == 1)
++              goto fail;
+       ret = io_mem_alloc_single(pages, nr_pages, size, gfp);
+       if (!IS_ERR(ret)) {
+@@ -2897,7 +2899,7 @@ done:
+               *npages = nr_pages;
+               return ret;
+       }
+-
++fail:
+       kvfree(pages);
+       *out_pages = NULL;
+       *npages = 0;
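
A generic sketch of the hazard being closed here (illustrative helpers, not the io_uring functions): if the unmap side assumes a single page was never vmap'ed while the map side may still vmap() one page on its fallback path, that mapping is leaked. The fix above simply refuses the single-page fallback so the assumption holds.

#include <linux/mm.h>
#include <linux/vmalloc.h>

/*
 * Illustrative unmap helper mirroring the assumption in question: only
 * multi-page, non-compound allocations are treated as vmap'ed.  If the
 * matching map helper ever vmap()s a single page, that vmap is leaked.
 */
static void sketch_pages_unmap(void *ptr, struct page **pages, int npages)
{
        if (npages > 1 && !PageCompound(pages[0]))
                vunmap(ptr);
        while (npages--)
                put_page(pages[npages]);
}
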
diff --git a/queue-6.6/io_uring-fix-error-pbuf-checking.patch b/queue-6.6/io_uring-fix-error-pbuf-checking.patch
new file mode 100644 (file)
index 0000000..a6c576c
--- /dev/null
@@ -0,0 +1,52 @@
+From 55b2d61e07a887351cf2996e96f89ade5ab7f1b7 Mon Sep 17 00:00:00 2001
+From: Pavel Begunkov <asml.silence@gmail.com>
+Date: Thu, 18 Jul 2024 20:00:53 +0100
+Subject: io_uring: fix error pbuf checking
+
+From: Pavel Begunkov <asml.silence@gmail.com>
+
+Commit bcc87d978b834c298bbdd9c52454c5d0a946e97e upstream.
+
+Syz reports a problem, which boils down to NULL vs IS_ERR inconsistent
+error handling in io_alloc_pbuf_ring().
+
+KASAN: null-ptr-deref in range [0x0000000000000000-0x0000000000000007]
+RIP: 0010:__io_remove_buffers+0xac/0x700 io_uring/kbuf.c:341
+Call Trace:
+ <TASK>
+ io_put_bl io_uring/kbuf.c:378 [inline]
+ io_destroy_buffers+0x14e/0x490 io_uring/kbuf.c:392
+ io_ring_ctx_free+0xa00/0x1070 io_uring/io_uring.c:2613
+ io_ring_exit_work+0x80f/0x8a0 io_uring/io_uring.c:2844
+ process_one_work kernel/workqueue.c:3231 [inline]
+ process_scheduled_works+0xa2c/0x1830 kernel/workqueue.c:3312
+ worker_thread+0x86d/0xd40 kernel/workqueue.c:3390
+ kthread+0x2f0/0x390 kernel/kthread.c:389
+ ret_from_fork+0x4b/0x80 arch/x86/kernel/process.c:147
+ ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:244
+
+Cc: stable@vger.kernel.org
+Reported-by: syzbot+2074b1a3d447915c6f1c@syzkaller.appspotmail.com
+Fixes: 87585b05757dc ("io_uring/kbuf: use vm_insert_pages() for mmap'ed pbuf ring")
+Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
+Link: https://lore.kernel.org/r/c5f9df20560bd9830401e8e48abc029e7cfd9f5e.1721329239.git.asml.silence@gmail.com
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ io_uring/kbuf.c |    4 +++-
+ 1 file changed, 3 insertions(+), 1 deletion(-)
+
+--- a/io_uring/kbuf.c
++++ b/io_uring/kbuf.c
+@@ -510,8 +510,10 @@ static int io_alloc_pbuf_ring(struct io_
+       ring_size = reg->ring_entries * sizeof(struct io_uring_buf_ring);
+       bl->buf_ring = io_pages_map(&bl->buf_pages, &bl->buf_nr_pages, ring_size);
+-      if (!bl->buf_ring)
++      if (IS_ERR(bl->buf_ring)) {
++              bl->buf_ring = NULL;
+               return -ENOMEM;
++      }
+       bl->is_mapped = 1;
+       bl->is_mmap = 1;
+       return 0;
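
The convention at the heart of this fix, sketched with generic names (this is not the io_uring code): an allocator that reports failure via ERR_PTR() must be checked with IS_ERR(), and a caller that used to expect NULL has to clear its field on failure so later teardown doesn't dereference a poisoned pointer.

#include <linux/err.h>
#include <linux/slab.h>

/* illustrative allocator that never returns NULL, only a valid pointer
 * or an ERR_PTR()-encoded error */
static void *alloc_buffer(size_t size)
{
        void *p = kzalloc(size, GFP_KERNEL);

        return p ? p : ERR_PTR(-ENOMEM);
}

static int setup_buffer(void **slot, size_t size)
{
        void *p = alloc_buffer(size);

        /* checking "if (!p)" here would miss the error and leave a
         * poisoned pointer behind -- the bug class fixed above */
        if (IS_ERR(p)) {
                *slot = NULL;           /* keep teardown paths happy */
                return PTR_ERR(p);
        }
        *slot = p;
        return 0;
}
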
diff --git a/queue-6.6/io_uring-get-rid-of-remap_pfn_range-for-mapping-rings-sqes.patch b/queue-6.6/io_uring-get-rid-of-remap_pfn_range-for-mapping-rings-sqes.patch
new file mode 100644 (file)
index 0000000..4287909
--- /dev/null
@@ -0,0 +1,245 @@
+From bfaf932689d7e59e3558977854b74bde1b137fae Mon Sep 17 00:00:00 2001
+From: Jens Axboe <axboe@kernel.dk>
+Date: Wed, 13 Mar 2024 09:56:14 -0600
+Subject: io_uring: get rid of remap_pfn_range() for mapping rings/sqes
+
+From: Jens Axboe <axboe@kernel.dk>
+
+Commit 3ab1db3c6039e02a9deb9d5091d28d559917a645 upstream.
+
+Rather than use remap_pfn_range() for this and manually free later,
+switch to using vm_insert_pages() and have it Just Work.
+
+If possible, allocate a single compound page that covers the range that
+is needed. If that works, then we can just use page_address() on that
+page. If we fail to get a compound page, allocate single pages and use
+vmap() to map them into the kernel virtual address space.
+
+This just covers the rings/sqes; the other remaining user of the mmap
+remap_pfn_range() path will be converted separately. Once that is done,
+we can kill the old alloc/free code.
+
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ io_uring/io_uring.c |  139 +++++++++++++++++++++++++++++++++++++++++++++++++---
+ io_uring/io_uring.h |    2 
+ 2 files changed, 133 insertions(+), 8 deletions(-)
+
+--- a/io_uring/io_uring.c
++++ b/io_uring/io_uring.c
+@@ -2683,6 +2683,36 @@ static int io_cqring_wait(struct io_ring
+       return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
+ }
++static void io_pages_unmap(void *ptr, struct page ***pages,
++                         unsigned short *npages)
++{
++      bool do_vunmap = false;
++
++      if (!ptr)
++              return;
++
++      if (*npages) {
++              struct page **to_free = *pages;
++              int i;
++
++              /*
++               * Only did vmap for the non-compound multiple page case.
++               * For the compound page, we just need to put the head.
++               */
++              if (PageCompound(to_free[0]))
++                      *npages = 1;
++              else if (*npages > 1)
++                      do_vunmap = true;
++              for (i = 0; i < *npages; i++)
++                      put_page(to_free[i]);
++      }
++      if (do_vunmap)
++              vunmap(ptr);
++      kvfree(*pages);
++      *pages = NULL;
++      *npages = 0;
++}
++
+ void io_mem_free(void *ptr)
+ {
+       if (!ptr)
+@@ -2787,8 +2817,8 @@ static void *io_sqes_map(struct io_ring_
+ static void io_rings_free(struct io_ring_ctx *ctx)
+ {
+       if (!(ctx->flags & IORING_SETUP_NO_MMAP)) {
+-              io_mem_free(ctx->rings);
+-              io_mem_free(ctx->sq_sqes);
++              io_pages_unmap(ctx->rings, &ctx->ring_pages, &ctx->n_ring_pages);
++              io_pages_unmap(ctx->sq_sqes, &ctx->sqe_pages, &ctx->n_sqe_pages);
+       } else {
+               io_pages_free(&ctx->ring_pages, ctx->n_ring_pages);
+               ctx->n_ring_pages = 0;
+@@ -2800,6 +2830,80 @@ static void io_rings_free(struct io_ring
+       ctx->sq_sqes = NULL;
+ }
++static void *io_mem_alloc_compound(struct page **pages, int nr_pages,
++                                 size_t size, gfp_t gfp)
++{
++      struct page *page;
++      int i, order;
++
++      order = get_order(size);
++      if (order > 10)
++              return ERR_PTR(-ENOMEM);
++      else if (order)
++              gfp |= __GFP_COMP;
++
++      page = alloc_pages(gfp, order);
++      if (!page)
++              return ERR_PTR(-ENOMEM);
++
++      for (i = 0; i < nr_pages; i++)
++              pages[i] = page + i;
++
++      return page_address(page);
++}
++
++static void *io_mem_alloc_single(struct page **pages, int nr_pages, size_t size,
++                               gfp_t gfp)
++{
++      void *ret;
++      int i;
++
++      for (i = 0; i < nr_pages; i++) {
++              pages[i] = alloc_page(gfp);
++              if (!pages[i])
++                      goto err;
++      }
++
++      ret = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL);
++      if (ret)
++              return ret;
++err:
++      while (i--)
++              put_page(pages[i]);
++      return ERR_PTR(-ENOMEM);
++}
++
++static void *io_pages_map(struct page ***out_pages, unsigned short *npages,
++                        size_t size)
++{
++      gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN;
++      struct page **pages;
++      int nr_pages;
++      void *ret;
++
++      nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
++      pages = kvmalloc_array(nr_pages, sizeof(struct page *), gfp);
++      if (!pages)
++              return ERR_PTR(-ENOMEM);
++
++      ret = io_mem_alloc_compound(pages, nr_pages, size, gfp);
++      if (!IS_ERR(ret))
++              goto done;
++
++      ret = io_mem_alloc_single(pages, nr_pages, size, gfp);
++      if (!IS_ERR(ret)) {
++done:
++              *out_pages = pages;
++              *npages = nr_pages;
++              return ret;
++      }
++
++      kvfree(pages);
++      *out_pages = NULL;
++      *npages = 0;
++      return ret;
++}
++
+ void *io_mem_alloc(size_t size)
+ {
+       gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP;
+@@ -3463,14 +3567,12 @@ static void *io_uring_validate_mmap_requ
+               /* Don't allow mmap if the ring was setup without it */
+               if (ctx->flags & IORING_SETUP_NO_MMAP)
+                       return ERR_PTR(-EINVAL);
+-              ptr = ctx->rings;
+-              break;
++              return ctx->rings;
+       case IORING_OFF_SQES:
+               /* Don't allow mmap if the ring was setup without it */
+               if (ctx->flags & IORING_SETUP_NO_MMAP)
+                       return ERR_PTR(-EINVAL);
+-              ptr = ctx->sq_sqes;
+-              break;
++              return ctx->sq_sqes;
+       case IORING_OFF_PBUF_RING: {
+               struct io_buffer_list *bl;
+               unsigned int bgid;
+@@ -3494,11 +3596,22 @@ static void *io_uring_validate_mmap_requ
+       return ptr;
+ }
++int io_uring_mmap_pages(struct io_ring_ctx *ctx, struct vm_area_struct *vma,
++                      struct page **pages, int npages)
++{
++      unsigned long nr_pages = npages;
++
++      vm_flags_set(vma, VM_DONTEXPAND);
++      return vm_insert_pages(vma, vma->vm_start, pages, &nr_pages);
++}
++
+ #ifdef CONFIG_MMU
+ static __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
+ {
++      struct io_ring_ctx *ctx = file->private_data;
+       size_t sz = vma->vm_end - vma->vm_start;
++      long offset = vma->vm_pgoff << PAGE_SHIFT;
+       unsigned long pfn;
+       void *ptr;
+@@ -3506,6 +3619,16 @@ static __cold int io_uring_mmap(struct f
+       if (IS_ERR(ptr))
+               return PTR_ERR(ptr);
++      switch (offset & IORING_OFF_MMAP_MASK) {
++      case IORING_OFF_SQ_RING:
++      case IORING_OFF_CQ_RING:
++              return io_uring_mmap_pages(ctx, vma, ctx->ring_pages,
++                                              ctx->n_ring_pages);
++      case IORING_OFF_SQES:
++              return io_uring_mmap_pages(ctx, vma, ctx->sqe_pages,
++                                              ctx->n_sqe_pages);
++      }
++
+       pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
+       return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
+ }
+@@ -3795,7 +3918,7 @@ static __cold int io_allocate_scq_urings
+               return -EOVERFLOW;
+       if (!(ctx->flags & IORING_SETUP_NO_MMAP))
+-              rings = io_mem_alloc(size);
++              rings = io_pages_map(&ctx->ring_pages, &ctx->n_ring_pages, size);
+       else
+               rings = io_rings_map(ctx, p->cq_off.user_addr, size);
+@@ -3820,7 +3943,7 @@ static __cold int io_allocate_scq_urings
+       }
+       if (!(ctx->flags & IORING_SETUP_NO_MMAP))
+-              ptr = io_mem_alloc(size);
++              ptr = io_pages_map(&ctx->sqe_pages, &ctx->n_sqe_pages, size);
+       else
+               ptr = io_sqes_map(ctx, p->sq_off.user_addr, size);
+--- a/io_uring/io_uring.h
++++ b/io_uring/io_uring.h
+@@ -55,6 +55,8 @@ bool io_fill_cqe_req_aux(struct io_kiocb
+ void __io_commit_cqring_flush(struct io_ring_ctx *ctx);
+ struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages);
++int io_uring_mmap_pages(struct io_ring_ctx *ctx, struct vm_area_struct *vma,
++                      struct page **pages, int npages);
+ struct file *io_file_get_normal(struct io_kiocb *req, int fd);
+ struct file *io_file_get_fixed(struct io_kiocb *req, int fd,
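
For illustration, a condensed sketch of the two mapping styles (generic helper names, not the io_uring functions): the old path translated a physically contiguous kernel buffer into a PFN for remap_pfn_range(), while the new path keeps the struct page array and lets vm_insert_pages() install refcounted pages, so the memory is released naturally once the vma and the ring are gone.

#include <linux/mm.h>
#include <linux/io.h>

/* old style: map a physically contiguous kernel buffer by PFN */
static int sketch_mmap_pfn(struct vm_area_struct *vma, void *kbuf, size_t sz)
{
        unsigned long pfn = virt_to_phys(kbuf) >> PAGE_SHIFT;

        return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
}

/* new style: insert an existing page array; page refcounts keep it alive */
static int sketch_mmap_pages(struct vm_area_struct *vma,
                             struct page **pages, int npages)
{
        unsigned long nr = npages;

        /* the mapping must not grow beyond the pages we inserted */
        vm_flags_set(vma, VM_DONTEXPAND);
        return vm_insert_pages(vma, vma->vm_start, pages, &nr);
}
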
diff --git a/queue-6.6/io_uring-kbuf-use-vm_insert_pages-for-mmap-ed-pbuf-ring.patch b/queue-6.6/io_uring-kbuf-use-vm_insert_pages-for-mmap-ed-pbuf-ring.patch
new file mode 100644 (file)
index 0000000..5221441
--- /dev/null
@@ -0,0 +1,423 @@
+From bd9194ea9dc6647ca247b948aa8587fcb348ac6e Mon Sep 17 00:00:00 2001
+From: Jens Axboe <axboe@kernel.dk>
+Date: Tue, 12 Mar 2024 20:24:21 -0600
+Subject: io_uring/kbuf: use vm_insert_pages() for mmap'ed pbuf ring
+
+From: Jens Axboe <axboe@kernel.dk>
+
+Commit 87585b05757dc70545efb434669708d276125559 upstream.
+
+Rather than use remap_pfn_range() for this and manually free later,
+switch to using vm_insert_page() and have it Just Work.
+
+This requires a bit of effort on the mmap lookup side, as the ctx
+uring_lock isn't held, which otherwise protects buffer_lists from being
+torn down, and it's not safe to grab it from mmap context as that would
+introduce an ABBA deadlock between the mmap lock and the ctx uring_lock.
+Instead, look up the buffer_list under RCU, as the list is already RCU
+freed. Use the existing reference count to determine whether it's
+possible to safely grab a reference to it (e.g. if it's not zero
+already), and drop that reference when done with the mapping. If the
+mmap reference is the last one, the buffer_list and the associated
+memory can go away, since the vma insertion has references to the
+inserted pages at that point.
+
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/io_uring_types.h |    3 
+ io_uring/io_uring.c            |   58 ++++-------------
+ io_uring/io_uring.h            |    6 +
+ io_uring/kbuf.c                |  137 ++++++++---------------------------------
+ io_uring/kbuf.h                |    3 
+ 5 files changed, 48 insertions(+), 159 deletions(-)
+
+--- a/include/linux/io_uring_types.h
++++ b/include/linux/io_uring_types.h
+@@ -326,9 +326,6 @@ struct io_ring_ctx {
+       struct list_head        io_buffers_cache;
+-      /* deferred free list, protected by ->uring_lock */
+-      struct hlist_head       io_buf_list;
+-
+       /* Keep this last, we don't need it for the fast path */
+       struct wait_queue_head          poll_wq;
+       struct io_restriction           restrictions;
+--- a/io_uring/io_uring.c
++++ b/io_uring/io_uring.c
+@@ -311,7 +311,6 @@ static __cold struct io_ring_ctx *io_rin
+       INIT_LIST_HEAD(&ctx->sqd_list);
+       INIT_LIST_HEAD(&ctx->cq_overflow_list);
+       INIT_LIST_HEAD(&ctx->io_buffers_cache);
+-      INIT_HLIST_HEAD(&ctx->io_buf_list);
+       io_alloc_cache_init(&ctx->rsrc_node_cache, IO_NODE_ALLOC_CACHE_MAX,
+                           sizeof(struct io_rsrc_node));
+       io_alloc_cache_init(&ctx->apoll_cache, IO_ALLOC_CACHE_MAX,
+@@ -2682,15 +2681,15 @@ static int io_cqring_wait(struct io_ring
+       return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
+ }
+-static void io_pages_unmap(void *ptr, struct page ***pages,
+-                         unsigned short *npages)
++void io_pages_unmap(void *ptr, struct page ***pages, unsigned short *npages,
++                  bool put_pages)
+ {
+       bool do_vunmap = false;
+       if (!ptr)
+               return;
+-      if (*npages) {
++      if (put_pages && *npages) {
+               struct page **to_free = *pages;
+               int i;
+@@ -2712,14 +2711,6 @@ static void io_pages_unmap(void *ptr, st
+       *npages = 0;
+ }
+-void io_mem_free(void *ptr)
+-{
+-      if (!ptr)
+-              return;
+-
+-      folio_put(virt_to_folio(ptr));
+-}
+-
+ static void io_pages_free(struct page ***pages, int npages)
+ {
+       struct page **page_array;
+@@ -2818,8 +2809,10 @@ static void *io_sqes_map(struct io_ring_
+ static void io_rings_free(struct io_ring_ctx *ctx)
+ {
+       if (!(ctx->flags & IORING_SETUP_NO_MMAP)) {
+-              io_pages_unmap(ctx->rings, &ctx->ring_pages, &ctx->n_ring_pages);
+-              io_pages_unmap(ctx->sq_sqes, &ctx->sqe_pages, &ctx->n_sqe_pages);
++              io_pages_unmap(ctx->rings, &ctx->ring_pages, &ctx->n_ring_pages,
++                              true);
++              io_pages_unmap(ctx->sq_sqes, &ctx->sqe_pages, &ctx->n_sqe_pages,
++                              true);
+       } else {
+               io_pages_free(&ctx->ring_pages, ctx->n_ring_pages);
+               ctx->n_ring_pages = 0;
+@@ -2876,8 +2869,8 @@ err:
+       return ERR_PTR(-ENOMEM);
+ }
+-static void *io_pages_map(struct page ***out_pages, unsigned short *npages,
+-                        size_t size)
++void *io_pages_map(struct page ***out_pages, unsigned short *npages,
++                 size_t size)
+ {
+       gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN;
+       struct page **pages;
+@@ -2909,17 +2902,6 @@ fail:
+       return ret;
+ }
+-void *io_mem_alloc(size_t size)
+-{
+-      gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP;
+-      void *ret;
+-
+-      ret = (void *) __get_free_pages(gfp, get_order(size));
+-      if (ret)
+-              return ret;
+-      return ERR_PTR(-ENOMEM);
+-}
+-
+ static unsigned long rings_size(struct io_ring_ctx *ctx, unsigned int sq_entries,
+                               unsigned int cq_entries, size_t *sq_offset)
+ {
+@@ -3073,7 +3055,6 @@ static __cold void io_ring_ctx_free(stru
+               ctx->mm_account = NULL;
+       }
+       io_rings_free(ctx);
+-      io_kbuf_mmap_list_free(ctx);
+       percpu_ref_exit(&ctx->refs);
+       free_uid(ctx->user);
+@@ -3563,10 +3544,8 @@ static void *io_uring_validate_mmap_requ
+ {
+       struct io_ring_ctx *ctx = file->private_data;
+       loff_t offset = pgoff << PAGE_SHIFT;
+-      struct page *page;
+-      void *ptr;
+-      switch (offset & IORING_OFF_MMAP_MASK) {
++      switch ((pgoff << PAGE_SHIFT) & IORING_OFF_MMAP_MASK) {
+       case IORING_OFF_SQ_RING:
+       case IORING_OFF_CQ_RING:
+               /* Don't allow mmap if the ring was setup without it */
+@@ -3581,6 +3560,7 @@ static void *io_uring_validate_mmap_requ
+       case IORING_OFF_PBUF_RING: {
+               struct io_buffer_list *bl;
+               unsigned int bgid;
++              void *ptr;
+               bgid = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT;
+               bl = io_pbuf_get_bl(ctx, bgid);
+@@ -3588,17 +3568,11 @@ static void *io_uring_validate_mmap_requ
+                       return bl;
+               ptr = bl->buf_ring;
+               io_put_bl(ctx, bl);
+-              break;
++              return ptr;
+               }
+-      default:
+-              return ERR_PTR(-EINVAL);
+       }
+-      page = virt_to_head_page(ptr);
+-      if (sz > page_size(page))
+-              return ERR_PTR(-EINVAL);
+-
+-      return ptr;
++      return ERR_PTR(-EINVAL);
+ }
+ int io_uring_mmap_pages(struct io_ring_ctx *ctx, struct vm_area_struct *vma,
+@@ -3618,7 +3592,6 @@ static __cold int io_uring_mmap(struct f
+       size_t sz = vma->vm_end - vma->vm_start;
+       long offset = vma->vm_pgoff << PAGE_SHIFT;
+       unsigned int npages;
+-      unsigned long pfn;
+       void *ptr;
+       ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz);
+@@ -3633,10 +3606,11 @@ static __cold int io_uring_mmap(struct f
+       case IORING_OFF_SQES:
+               return io_uring_mmap_pages(ctx, vma, ctx->sqe_pages,
+                                               ctx->n_sqe_pages);
++      case IORING_OFF_PBUF_RING:
++              return io_pbuf_mmap(file, vma);
+       }
+-      pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
+-      return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
++      return -EINVAL;
+ }
+ static unsigned long io_uring_mmu_get_unmapped_area(struct file *filp,
+--- a/io_uring/io_uring.h
++++ b/io_uring/io_uring.h
+@@ -93,8 +93,10 @@ bool __io_alloc_req_refill(struct io_rin
+ bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task,
+                       bool cancel_all);
+-void *io_mem_alloc(size_t size);
+-void io_mem_free(void *ptr);
++void *io_pages_map(struct page ***out_pages, unsigned short *npages,
++                 size_t size);
++void io_pages_unmap(void *ptr, struct page ***pages, unsigned short *npages,
++                  bool put_pages);
+ #if defined(CONFIG_PROVE_LOCKING)
+ static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx)
+--- a/io_uring/kbuf.c
++++ b/io_uring/kbuf.c
+@@ -30,25 +30,12 @@ struct io_provide_buf {
+       __u16                           bid;
+ };
+-static inline struct io_buffer_list *__io_buffer_get_list(struct io_ring_ctx *ctx,
+-                                                        unsigned int bgid)
+-{
+-      return xa_load(&ctx->io_bl_xa, bgid);
+-}
+-
+-struct io_buf_free {
+-      struct hlist_node               list;
+-      void                            *mem;
+-      size_t                          size;
+-      int                             inuse;
+-};
+-
+ static inline struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx,
+                                                       unsigned int bgid)
+ {
+       lockdep_assert_held(&ctx->uring_lock);
+-      return __io_buffer_get_list(ctx, bgid);
++      return xa_load(&ctx->io_bl_xa, bgid);
+ }
+ static int io_buffer_add_list(struct io_ring_ctx *ctx,
+@@ -199,24 +186,6 @@ void __user *io_buffer_select(struct io_
+       return ret;
+ }
+-/*
+- * Mark the given mapped range as free for reuse
+- */
+-static void io_kbuf_mark_free(struct io_ring_ctx *ctx, struct io_buffer_list *bl)
+-{
+-      struct io_buf_free *ibf;
+-
+-      hlist_for_each_entry(ibf, &ctx->io_buf_list, list) {
+-              if (bl->buf_ring == ibf->mem) {
+-                      ibf->inuse = 0;
+-                      return;
+-              }
+-      }
+-
+-      /* can't happen... */
+-      WARN_ON_ONCE(1);
+-}
+-
+ static int __io_remove_buffers(struct io_ring_ctx *ctx,
+                              struct io_buffer_list *bl, unsigned nbufs)
+ {
+@@ -228,23 +197,16 @@ static int __io_remove_buffers(struct io
+       if (bl->is_mapped) {
+               i = bl->buf_ring->tail - bl->head;
+-              if (bl->is_mmap) {
+-                      /*
+-                       * io_kbuf_list_free() will free the page(s) at
+-                       * ->release() time.
+-                       */
+-                      io_kbuf_mark_free(ctx, bl);
+-                      bl->buf_ring = NULL;
+-                      bl->is_mmap = 0;
+-              } else if (bl->buf_nr_pages) {
++              if (bl->buf_nr_pages) {
+                       int j;
+-                      for (j = 0; j < bl->buf_nr_pages; j++)
+-                              unpin_user_page(bl->buf_pages[j]);
+-                      kvfree(bl->buf_pages);
+-                      vunmap(bl->buf_ring);
+-                      bl->buf_pages = NULL;
+-                      bl->buf_nr_pages = 0;
++                      if (!bl->is_mmap) {
++                              for (j = 0; j < bl->buf_nr_pages; j++)
++                                      unpin_user_page(bl->buf_pages[j]);
++                      }
++                      io_pages_unmap(bl->buf_ring, &bl->buf_pages,
++                                      &bl->buf_nr_pages, bl->is_mmap);
++                      bl->is_mmap = 0;
+               }
+               /* make sure it's seen as empty */
+               INIT_LIST_HEAD(&bl->buf_list);
+@@ -540,63 +502,17 @@ error_unpin:
+       return ret;
+ }
+-/*
+- * See if we have a suitable region that we can reuse, rather than allocate
+- * both a new io_buf_free and mem region again. We leave it on the list as
+- * even a reused entry will need freeing at ring release.
+- */
+-static struct io_buf_free *io_lookup_buf_free_entry(struct io_ring_ctx *ctx,
+-                                                  size_t ring_size)
+-{
+-      struct io_buf_free *ibf, *best = NULL;
+-      size_t best_dist;
+-
+-      hlist_for_each_entry(ibf, &ctx->io_buf_list, list) {
+-              size_t dist;
+-
+-              if (ibf->inuse || ibf->size < ring_size)
+-                      continue;
+-              dist = ibf->size - ring_size;
+-              if (!best || dist < best_dist) {
+-                      best = ibf;
+-                      if (!dist)
+-                              break;
+-                      best_dist = dist;
+-              }
+-      }
+-
+-      return best;
+-}
+-
+ static int io_alloc_pbuf_ring(struct io_ring_ctx *ctx,
+                             struct io_uring_buf_reg *reg,
+                             struct io_buffer_list *bl)
+ {
+-      struct io_buf_free *ibf;
+       size_t ring_size;
+-      void *ptr;
+       ring_size = reg->ring_entries * sizeof(struct io_uring_buf_ring);
+-      /* Reuse existing entry, if we can */
+-      ibf = io_lookup_buf_free_entry(ctx, ring_size);
+-      if (!ibf) {
+-              ptr = io_mem_alloc(ring_size);
+-              if (IS_ERR(ptr))
+-                      return PTR_ERR(ptr);
+-
+-              /* Allocate and store deferred free entry */
+-              ibf = kmalloc(sizeof(*ibf), GFP_KERNEL_ACCOUNT);
+-              if (!ibf) {
+-                      io_mem_free(ptr);
+-                      return -ENOMEM;
+-              }
+-              ibf->mem = ptr;
+-              ibf->size = ring_size;
+-              hlist_add_head(&ibf->list, &ctx->io_buf_list);
+-      }
+-      ibf->inuse = 1;
+-      bl->buf_ring = ibf->mem;
++      bl->buf_ring = io_pages_map(&bl->buf_pages, &bl->buf_nr_pages, ring_size);
++      if (!bl->buf_ring)
++              return -ENOMEM;
+       bl->is_mapped = 1;
+       bl->is_mmap = 1;
+       return 0;
+@@ -719,18 +635,19 @@ struct io_buffer_list *io_pbuf_get_bl(st
+       return ERR_PTR(-EINVAL);
+ }
+-/*
+- * Called at or after ->release(), free the mmap'ed buffers that we used
+- * for memory mapped provided buffer rings.
+- */
+-void io_kbuf_mmap_list_free(struct io_ring_ctx *ctx)
+-{
+-      struct io_buf_free *ibf;
+-      struct hlist_node *tmp;
+-
+-      hlist_for_each_entry_safe(ibf, tmp, &ctx->io_buf_list, list) {
+-              hlist_del(&ibf->list);
+-              io_mem_free(ibf->mem);
+-              kfree(ibf);
+-      }
++int io_pbuf_mmap(struct file *file, struct vm_area_struct *vma)
++{
++      struct io_ring_ctx *ctx = file->private_data;
++      loff_t pgoff = vma->vm_pgoff << PAGE_SHIFT;
++      struct io_buffer_list *bl;
++      int bgid, ret;
++
++      bgid = (pgoff & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT;
++      bl = io_pbuf_get_bl(ctx, bgid);
++      if (IS_ERR(bl))
++              return PTR_ERR(bl);
++
++      ret = io_uring_mmap_pages(ctx, vma, bl->buf_pages, bl->buf_nr_pages);
++      io_put_bl(ctx, bl);
++      return ret;
+ }
+--- a/io_uring/kbuf.h
++++ b/io_uring/kbuf.h
+@@ -54,8 +54,6 @@ int io_provide_buffers(struct io_kiocb *
+ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg);
+ int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg);
+-void io_kbuf_mmap_list_free(struct io_ring_ctx *ctx);
+-
+ unsigned int __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags);
+ void io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags);
+@@ -63,6 +61,7 @@ void io_kbuf_recycle_legacy(struct io_ki
+ void io_put_bl(struct io_ring_ctx *ctx, struct io_buffer_list *bl);
+ struct io_buffer_list *io_pbuf_get_bl(struct io_ring_ctx *ctx,
+                                     unsigned long bgid);
++int io_pbuf_mmap(struct file *file, struct vm_area_struct *vma);
+ static inline void io_kbuf_recycle_ring(struct io_kiocb *req)
+ {
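
The lookup scheme described above, reduced to a generic sketch (types and names invented for illustration, not the io_uring implementation): the object is found under RCU, used only if a reference can still be taken, and the reference is dropped again once the pages have been inserted into the vma.

#include <linux/rcupdate.h>
#include <linux/refcount.h>
#include <linux/xarray.h>

struct buf_list {                       /* illustrative stand-in */
        refcount_t      refs;
        struct rcu_head rcu;
};

/* mmap-side lookup: no lock held, so rely on RCU plus the refcount */
static struct buf_list *buf_list_get(struct xarray *xa, unsigned long bgid)
{
        struct buf_list *bl;

        rcu_read_lock();
        bl = xa_load(xa, bgid);
        /* only usable if it hasn't already dropped its last reference */
        if (bl && !refcount_inc_not_zero(&bl->refs))
                bl = NULL;
        rcu_read_unlock();
        return bl;
}

static void buf_list_put(struct buf_list *bl)
{
        /* last reference: readers are RCU protected, so free via RCU */
        if (refcount_dec_and_test(&bl->refs))
                kfree_rcu(bl, rcu);
}
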
diff --git a/queue-6.6/io_uring-kbuf-vmap-pinned-buffer-ring.patch b/queue-6.6/io_uring-kbuf-vmap-pinned-buffer-ring.patch
new file mode 100644 (file)
index 0000000..5b75f52
--- /dev/null
@@ -0,0 +1,112 @@
+From 432b583ab581f2c21cad164d396a8e9fa4754a22 Mon Sep 17 00:00:00 2001
+From: Jens Axboe <axboe@kernel.dk>
+Date: Tue, 12 Mar 2024 10:42:27 -0600
+Subject: io_uring/kbuf: vmap pinned buffer ring
+
+From: Jens Axboe <axboe@kernel.dk>
+
+Commit e270bfd22a2a10d1cfbaddf23e79b6d0b405d21e upstream.
+
+This avoids needing to care about HIGHMEM, and it makes the buffer
+indexing easier as both ring-provided buffer methods are now virtually
+mapped in a contiguous fashion.
+
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ io_uring/kbuf.c |   39 +++++++++++++++------------------------
+ 1 file changed, 15 insertions(+), 24 deletions(-)
+
+--- a/io_uring/kbuf.c
++++ b/io_uring/kbuf.c
+@@ -7,6 +7,7 @@
+ #include <linux/slab.h>
+ #include <linux/namei.h>
+ #include <linux/poll.h>
++#include <linux/vmalloc.h>
+ #include <linux/io_uring.h>
+ #include <uapi/linux/io_uring.h>
+@@ -153,15 +154,7 @@ static void __user *io_ring_buffer_selec
+               return NULL;
+       head &= bl->mask;
+-      /* mmaped buffers are always contig */
+-      if (bl->is_mmap || head < IO_BUFFER_LIST_BUF_PER_PAGE) {
+-              buf = &br->bufs[head];
+-      } else {
+-              int off = head & (IO_BUFFER_LIST_BUF_PER_PAGE - 1);
+-              int index = head / IO_BUFFER_LIST_BUF_PER_PAGE;
+-              buf = page_address(bl->buf_pages[index]);
+-              buf += off;
+-      }
++      buf = &br->bufs[head];
+       if (*len == 0 || *len > buf->len)
+               *len = buf->len;
+       req->flags |= REQ_F_BUFFER_RING;
+@@ -249,6 +242,7 @@ static int __io_remove_buffers(struct io
+                       for (j = 0; j < bl->buf_nr_pages; j++)
+                               unpin_user_page(bl->buf_pages[j]);
+                       kvfree(bl->buf_pages);
++                      vunmap(bl->buf_ring);
+                       bl->buf_pages = NULL;
+                       bl->buf_nr_pages = 0;
+               }
+@@ -501,9 +495,9 @@ err:
+ static int io_pin_pbuf_ring(struct io_uring_buf_reg *reg,
+                           struct io_buffer_list *bl)
+ {
+-      struct io_uring_buf_ring *br;
++      struct io_uring_buf_ring *br = NULL;
++      int nr_pages, ret, i;
+       struct page **pages;
+-      int i, nr_pages;
+       pages = io_pin_pages(reg->ring_addr,
+                            flex_array_size(br, bufs, reg->ring_entries),
+@@ -511,18 +505,12 @@ static int io_pin_pbuf_ring(struct io_ur
+       if (IS_ERR(pages))
+               return PTR_ERR(pages);
+-      /*
+-       * Apparently some 32-bit boxes (ARM) will return highmem pages,
+-       * which then need to be mapped. We could support that, but it'd
+-       * complicate the code and slowdown the common cases quite a bit.
+-       * So just error out, returning -EINVAL just like we did on kernels
+-       * that didn't support mapped buffer rings.
+-       */
+-      for (i = 0; i < nr_pages; i++)
+-              if (PageHighMem(pages[i]))
+-                      goto error_unpin;
++      br = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL);
++      if (!br) {
++              ret = -ENOMEM;
++              goto error_unpin;
++      }
+-      br = page_address(pages[0]);
+ #ifdef SHM_COLOUR
+       /*
+        * On platforms that have specific aliasing requirements, SHM_COLOUR
+@@ -533,8 +521,10 @@ static int io_pin_pbuf_ring(struct io_ur
+        * should use IOU_PBUF_RING_MMAP instead, and liburing will handle
+        * this transparently.
+        */
+-      if ((reg->ring_addr | (unsigned long) br) & (SHM_COLOUR - 1))
++      if ((reg->ring_addr | (unsigned long) br) & (SHM_COLOUR - 1)) {
++              ret = -EINVAL;
+               goto error_unpin;
++      }
+ #endif
+       bl->buf_pages = pages;
+       bl->buf_nr_pages = nr_pages;
+@@ -546,7 +536,8 @@ error_unpin:
+       for (i = 0; i < nr_pages; i++)
+               unpin_user_page(pages[i]);
+       kvfree(pages);
+-      return -EINVAL;
++      vunmap(br);
++      return ret;
+ }
+ /*
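
A generic sketch of the pin-then-vmap technique this patch adopts (an illustrative helper, not the kbuf code): pin the user pages, then give them one contiguous kernel mapping so the ring can be indexed directly, HIGHMEM or not.

#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/err.h>

/* pin "len" bytes of user memory at "uaddr", then map it contiguously */
static void *sketch_pin_and_vmap(unsigned long uaddr, unsigned long len,
                                 struct page ***pagesp, int *npagesp)
{
        unsigned long start = uaddr >> PAGE_SHIFT;
        unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
        int nr_pages = end - start;
        struct page **pages;
        void *vaddr;
        int pinned;

        pages = kvmalloc_array(nr_pages, sizeof(*pages), GFP_KERNEL);
        if (!pages)
                return ERR_PTR(-ENOMEM);

        pinned = pin_user_pages_fast(uaddr, nr_pages,
                                     FOLL_WRITE | FOLL_LONGTERM, pages);
        if (pinned != nr_pages) {
                /* partial pin: release what we did get */
                if (pinned > 0)
                        unpin_user_pages(pages, pinned);
                kvfree(pages);
                return ERR_PTR(pinned < 0 ? pinned : -EFAULT);
        }

        /* one contiguous kernel view of the pinned pages */
        vaddr = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL);
        if (!vaddr) {
                unpin_user_pages(pages, nr_pages);
                kvfree(pages);
                return ERR_PTR(-ENOMEM);
        }
        *pagesp = pages;
        *npagesp = nr_pages;
        return vaddr;
}
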
diff --git a/queue-6.6/io_uring-return-error-pointer-from-io_mem_alloc.patch b/queue-6.6/io_uring-return-error-pointer-from-io_mem_alloc.patch
new file mode 100644 (file)
index 0000000..b3487be
--- /dev/null
@@ -0,0 +1,76 @@
+From b001225fa4fe09610b35b428e46193ed2a28c95f Mon Sep 17 00:00:00 2001
+From: Jens Axboe <axboe@kernel.dk>
+Date: Fri, 5 Nov 2021 17:13:52 -0600
+Subject: io_uring: return error pointer from io_mem_alloc()
+
+From: Jens Axboe <axboe@kernel.dk>
+
+Commit e27cef86a0edd4ef7f8b4670f508a03b509cbbb2 upstream.
+
+In preparation for having more than one type of ring allocator, make the
+existing one return a valid pointer or an error pointer rather than just
+NULL.
+
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ io_uring/io_uring.c | 18 ++++++++++++------
+ 1 file changed, 12 insertions(+), 6 deletions(-)
+
+diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
+index 33597284e1cb..ebcb0680f1cc 100644
+--- a/io_uring/io_uring.c
++++ b/io_uring/io_uring.c
+@@ -2528,8 +2528,12 @@ static void io_mem_free(void *ptr)
+ static void *io_mem_alloc(size_t size)
+ {
+       gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP;
++      void *ret;
+-      return (void *) __get_free_pages(gfp, get_order(size));
++      ret = (void *) __get_free_pages(gfp, get_order(size));
++      if (ret)
++              return ret;
++      return ERR_PTR(-ENOMEM);
+ }
+ static unsigned long rings_size(struct io_ring_ctx *ctx, unsigned int sq_entries,
+@@ -3422,6 +3426,7 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
+ {
+       struct io_rings *rings;
+       size_t size, sq_array_offset;
++      void *ptr;
+       /* make sure these are sane, as we already accounted them */
+       ctx->sq_entries = p->sq_entries;
+@@ -3432,8 +3437,8 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
+               return -EOVERFLOW;
+       rings = io_mem_alloc(size);
+-      if (!rings)
+-              return -ENOMEM;
++      if (IS_ERR(rings))
++              return PTR_ERR(rings);
+       ctx->rings = rings;
+       ctx->sq_array = (u32 *)((char *)rings + sq_array_offset);
+@@ -3452,13 +3457,14 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
+               return -EOVERFLOW;
+       }
+-      ctx->sq_sqes = io_mem_alloc(size);
+-      if (!ctx->sq_sqes) {
++      ptr = io_mem_alloc(size);
++      if (IS_ERR(ptr)) {
+               io_mem_free(ctx->rings);
+               ctx->rings = NULL;
+-              return -ENOMEM;
++              return PTR_ERR(ptr);
+       }
++      ctx->sq_sqes = ptr;
+       return 0;
+ }
+-- 
+2.47.2
+
diff --git a/queue-6.6/io_uring-unify-io_pin_pages.patch b/queue-6.6/io_uring-unify-io_pin_pages.patch
new file mode 100644 (file)
index 0000000..b008813
--- /dev/null
@@ -0,0 +1,156 @@
+From dc5ec8a2f867b4211508a5ded8616103f4d67112 Mon Sep 17 00:00:00 2001
+From: Jens Axboe <axboe@kernel.dk>
+Date: Wed, 13 Mar 2024 14:58:14 -0600
+Subject: io_uring: unify io_pin_pages()
+
+From: Jens Axboe <axboe@kernel.dk>
+
+Commit 1943f96b3816e0f0d3d6686374d6e1d617c8b42c upstream.
+
+Move it into io_uring.c where it belongs, and use it there as well
+rather than having two implementations of this.
+
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ io_uring/io_uring.c |   61 +++++++++++++++++++++++++++++++++++-----------------
+ io_uring/rsrc.c     |   39 ---------------------------------
+ 2 files changed, 42 insertions(+), 58 deletions(-)
+
+--- a/io_uring/io_uring.c
++++ b/io_uring/io_uring.c
+@@ -2738,33 +2738,57 @@ static void io_pages_free(struct page **
+       *pages = NULL;
+ }
++struct page **io_pin_pages(unsigned long uaddr, unsigned long len, int *npages)
++{
++      unsigned long start, end, nr_pages;
++      struct page **pages;
++      int ret;
++
++      end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
++      start = uaddr >> PAGE_SHIFT;
++      nr_pages = end - start;
++      if (WARN_ON_ONCE(!nr_pages))
++              return ERR_PTR(-EINVAL);
++
++      pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL);
++      if (!pages)
++              return ERR_PTR(-ENOMEM);
++
++      ret = pin_user_pages_fast(uaddr, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
++                                      pages);
++      /* success, mapped all pages */
++      if (ret == nr_pages) {
++              *npages = nr_pages;
++              return pages;
++      }
++
++      /* partial map, or didn't map anything */
++      if (ret >= 0) {
++              /* if we did partial map, release any pages we did get */
++              if (ret)
++                      unpin_user_pages(pages, ret);
++              ret = -EFAULT;
++      }
++      kvfree(pages);
++      return ERR_PTR(ret);
++}
++
+ static void *__io_uaddr_map(struct page ***pages, unsigned short *npages,
+                           unsigned long uaddr, size_t size)
+ {
+       struct page **page_array;
+       unsigned int nr_pages;
+       void *page_addr;
+-      int ret, pinned;
+       *npages = 0;
+       if (uaddr & (PAGE_SIZE - 1) || !size)
+               return ERR_PTR(-EINVAL);
+-      nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
+-      if (nr_pages > USHRT_MAX)
+-              return ERR_PTR(-EINVAL);
+-      page_array = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL);
+-      if (!page_array)
+-              return ERR_PTR(-ENOMEM);
+-
+-
+-      pinned = pin_user_pages_fast(uaddr, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
+-                                   page_array);
+-      if (pinned != nr_pages) {
+-              ret = (pinned < 0) ? pinned : -EFAULT;
+-              goto free_pages;
+-      }
++      nr_pages = 0;
++      page_array = io_pin_pages(uaddr, size, &nr_pages);
++      if (IS_ERR(page_array))
++              return page_array;
+       page_addr = vmap(page_array, nr_pages, VM_MAP, PAGE_KERNEL);
+       if (page_addr) {
+@@ -2772,10 +2796,9 @@ static void *__io_uaddr_map(struct page
+               *npages = nr_pages;
+               return page_addr;
+       }
+-      ret = -ENOMEM;
+-free_pages:
+-      io_pages_free(&page_array, pinned > 0 ? pinned : 0);
+-      return ERR_PTR(ret);
++
++      io_pages_free(&page_array, nr_pages);
++      return ERR_PTR(-ENOMEM);
+ }
+ static void *io_rings_map(struct io_ring_ctx *ctx, unsigned long uaddr,
+--- a/io_uring/rsrc.c
++++ b/io_uring/rsrc.c
+@@ -873,45 +873,6 @@ static int io_buffer_account_pin(struct
+       return ret;
+ }
+-struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages)
+-{
+-      unsigned long start, end, nr_pages;
+-      struct page **pages = NULL;
+-      int pret, ret = -ENOMEM;
+-
+-      end = (ubuf + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
+-      start = ubuf >> PAGE_SHIFT;
+-      nr_pages = end - start;
+-
+-      pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL);
+-      if (!pages)
+-              goto done;
+-
+-      ret = 0;
+-      mmap_read_lock(current->mm);
+-      pret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
+-                            pages);
+-      if (pret == nr_pages)
+-              *npages = nr_pages;
+-      else
+-              ret = pret < 0 ? pret : -EFAULT;
+-
+-      mmap_read_unlock(current->mm);
+-      if (ret) {
+-              /* if we did partial map, release any pages we did get */
+-              if (pret > 0)
+-                      unpin_user_pages(pages, pret);
+-              goto done;
+-      }
+-      ret = 0;
+-done:
+-      if (ret < 0) {
+-              kvfree(pages);
+-              pages = ERR_PTR(ret);
+-      }
+-      return pages;
+-}
+-
+ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
+                                 struct io_mapped_ubuf **pimu,
+                                 struct page **last_hpage)
diff --git a/queue-6.6/io_uring-use-unpin_user_pages-where-appropriate.patch b/queue-6.6/io_uring-use-unpin_user_pages-where-appropriate.patch
new file mode 100644 (file)
index 0000000..06d7030
--- /dev/null
@@ -0,0 +1,40 @@
+From c8e556f54f547266d984bcffbb44279ec3884258 Mon Sep 17 00:00:00 2001
+From: Jens Axboe <axboe@kernel.dk>
+Date: Wed, 13 Mar 2024 15:01:03 -0600
+Subject: io_uring: use unpin_user_pages() where appropriate
+
+From: Jens Axboe <axboe@kernel.dk>
+
+Commit 18595c0a58ae29ac6a996c5b664610119b73182d upstream.
+
+There are a few cases of open-coded loops around unpin_user_page(); use
+the generic helper instead.
+
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ io_uring/kbuf.c |    5 ++---
+ 1 file changed, 2 insertions(+), 3 deletions(-)
+
+--- a/io_uring/kbuf.c
++++ b/io_uring/kbuf.c
+@@ -458,8 +458,8 @@ static int io_pin_pbuf_ring(struct io_ur
+                           struct io_buffer_list *bl)
+ {
+       struct io_uring_buf_ring *br = NULL;
+-      int nr_pages, ret, i;
+       struct page **pages;
++      int nr_pages, ret;
+       pages = io_pin_pages(reg->ring_addr,
+                            flex_array_size(br, bufs, reg->ring_entries),
+@@ -495,8 +495,7 @@ static int io_pin_pbuf_ring(struct io_ur
+       bl->is_mmap = 0;
+       return 0;
+ error_unpin:
+-      for (i = 0; i < nr_pages; i++)
+-              unpin_user_page(pages[i]);
++      unpin_user_pages(pages, nr_pages);
+       kvfree(pages);
+       vunmap(br);
+       return ret;
diff --git a/queue-6.6/io_uring-use-vmap-for-ring-mapping.patch b/queue-6.6/io_uring-use-vmap-for-ring-mapping.patch
new file mode 100644 (file)
index 0000000..a0a21c9
--- /dev/null
@@ -0,0 +1,87 @@
+From 23cd4c4db8836b441e401328244a1864b47ac3c8 Mon Sep 17 00:00:00 2001
+From: Jens Axboe <axboe@kernel.dk>
+Date: Wed, 13 Mar 2024 14:10:40 -0600
+Subject: io_uring: use vmap() for ring mapping
+
+From: Jens Axboe <axboe@kernel.dk>
+
+Commit 09fc75e0c035a2cabb8caa15cec6e85159dd94f0 upstream.
+
+This is the last holdout which does odd page checking; convert it to
+vmap() just like what is done for the non-mmap path.
+
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ io_uring/io_uring.c |   38 +++++++++-----------------------------
+ 1 file changed, 9 insertions(+), 29 deletions(-)
+
+--- a/io_uring/io_uring.c
++++ b/io_uring/io_uring.c
+@@ -64,7 +64,6 @@
+ #include <linux/sched/mm.h>
+ #include <linux/uaccess.h>
+ #include <linux/nospec.h>
+-#include <linux/highmem.h>
+ #include <linux/fsnotify.h>
+ #include <linux/fadvise.h>
+ #include <linux/task_work.h>
+@@ -2745,7 +2744,7 @@ static void *__io_uaddr_map(struct page
+       struct page **page_array;
+       unsigned int nr_pages;
+       void *page_addr;
+-      int ret, i, pinned;
++      int ret, pinned;
+       *npages = 0;
+@@ -2767,34 +2766,13 @@ static void *__io_uaddr_map(struct page
+               goto free_pages;
+       }
+-      page_addr = page_address(page_array[0]);
+-      for (i = 0; i < nr_pages; i++) {
+-              ret = -EINVAL;
+-
+-              /*
+-               * Can't support mapping user allocated ring memory on 32-bit
+-               * archs where it could potentially reside in highmem. Just
+-               * fail those with -EINVAL, just like we did on kernels that
+-               * didn't support this feature.
+-               */
+-              if (PageHighMem(page_array[i]))
+-                      goto free_pages;
+-
+-              /*
+-               * No support for discontig pages for now, should either be a
+-               * single normal page, or a huge page. Later on we can add
+-               * support for remapping discontig pages, for now we will
+-               * just fail them with EINVAL.
+-               */
+-              if (page_address(page_array[i]) != page_addr)
+-                      goto free_pages;
+-              page_addr += PAGE_SIZE;
++      page_addr = vmap(page_array, nr_pages, VM_MAP, PAGE_KERNEL);
++      if (page_addr) {
++              *pages = page_array;
++              *npages = nr_pages;
++              return page_addr;
+       }
+-
+-      *pages = page_array;
+-      *npages = nr_pages;
+-      return page_to_virt(page_array[0]);
+-
++      ret = -ENOMEM;
+ free_pages:
+       io_pages_free(&page_array, pinned > 0 ? pinned : 0);
+       return ERR_PTR(ret);
+@@ -2824,6 +2802,8 @@ static void io_rings_free(struct io_ring
+               ctx->n_ring_pages = 0;
+               io_pages_free(&ctx->sqe_pages, ctx->n_sqe_pages);
+               ctx->n_sqe_pages = 0;
++              vunmap(ctx->rings);
++              vunmap(ctx->sq_sqes);
+       }
+       ctx->rings = NULL;
diff --git a/queue-6.6/mm-add-nommu-variant-of-vm_insert_pages.patch b/queue-6.6/mm-add-nommu-variant-of-vm_insert_pages.patch
new file mode 100644 (file)
index 0000000..0a2953d
--- /dev/null
@@ -0,0 +1,36 @@
+From ac77b7bfe1633f5366bceb76d74d2f04846b2186 Mon Sep 17 00:00:00 2001
+From: Jens Axboe <axboe@kernel.dk>
+Date: Sat, 16 Mar 2024 07:21:43 -0600
+Subject: mm: add nommu variant of vm_insert_pages()
+
+From: Jens Axboe <axboe@kernel.dk>
+
+Commit 62346c6cb28b043f2a6e95337d9081ec0b37b5f5 upstream.
+
+An identical one exists for vm_insert_page(); add one for
+vm_insert_pages() to avoid needing to check for CONFIG_MMU in code
+using it.
+
+Acked-by: Johannes Weiner <hannes@cmpxchg.org>
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/nommu.c |    7 +++++++
+ 1 file changed, 7 insertions(+)
+
+--- a/mm/nommu.c
++++ b/mm/nommu.c
+@@ -357,6 +357,13 @@ int vm_insert_page(struct vm_area_struct
+ }
+ EXPORT_SYMBOL(vm_insert_page);
++int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr,
++                      struct page **pages, unsigned long *num)
++{
++      return -EINVAL;
++}
++EXPORT_SYMBOL(vm_insert_pages);
++
+ int vm_map_pages(struct vm_area_struct *vma, struct page **pages,
+                       unsigned long num)
+ {
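
What the stub buys callers, shown with a hypothetical user (not taken from this series): common code can call vm_insert_pages() unconditionally and just propagate the error, instead of wrapping the call in #ifdef CONFIG_MMU; on !MMU kernels the stub simply returns -EINVAL.

#include <linux/mm.h>

/* hypothetical shared caller; builds on both MMU and nommu kernels */
static int sketch_insert_ring_pages(struct vm_area_struct *vma,
                                    struct page **pages, unsigned long nr)
{
        unsigned long nr_left = nr;

        /* fails cleanly with -EINVAL on !MMU instead of needing an #ifdef */
        return vm_insert_pages(vma, vma->vm_start, pages, &nr_left);
}
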
diff --git a/queue-6.6/series b/queue-6.6/series
index 97f1e6badf039d70126cf24db9e033fb56d34ad5..0488fcb0ee06a29176eff8e651059c35de200a20 100644 (file)
--- a/queue-6.6/series
@@ -98,3 +98,15 @@ mptcp-safety-check-before-fallback.patch
 drm-nouveau-do-not-override-forced-connector-status.patch
 net-handle-napi_schedule-calls-from-non-interrupt.patch
 block-fix-kmem_cache-of-name-bio-108-already-exists.patch
+mm-add-nommu-variant-of-vm_insert_pages.patch
+io_uring-get-rid-of-remap_pfn_range-for-mapping-rings-sqes.patch
+io_uring-don-t-attempt-to-mmap-larger-than-what-the-user-asks-for.patch
+io_uring-fix-corner-case-forgetting-to-vunmap.patch
+io_uring-use-vmap-for-ring-mapping.patch
+io_uring-unify-io_pin_pages.patch
+io_uring-kbuf-vmap-pinned-buffer-ring.patch
+io_uring-kbuf-use-vm_insert_pages-for-mmap-ed-pbuf-ring.patch
+io_uring-use-unpin_user_pages-where-appropriate.patch
+io_uring-fix-error-pbuf-checking.patch
+io_uring-add-ring-freeing-helper.patch
+io_uring-return-error-pointer-from-io_mem_alloc.patch