git.ipfire.org Git - thirdparty/kernel/linux.git/commitdiff
ublk: eliminate permanent pages[] array from struct ublk_buf
author: Ming Lei <ming.lei@redhat.com>
Tue, 31 Mar 2026 15:31:55 +0000 (23:31 +0800)
committer: Jens Axboe <axboe@kernel.dk>
Tue, 7 Apr 2026 13:40:19 +0000 (07:40 -0600)
The pages[] array (kvmalloc'd, 8 bytes per page = 2MB for a 1GB buffer)
was stored permanently in struct ublk_buf but only needed during
pin_user_pages_fast() and maple tree construction. Since the maple tree
already stores PFN ranges via ublk_buf_range, struct page pointers can
be recovered via pfn_to_page() during unregistration.

Make pages[] a temporary allocation in ublk_ctrl_reg_buf(), freed
immediately after the maple tree is built. Rewrite __ublk_ctrl_unreg_buf()
to iterate the maple tree for matching buf_index entries, recovering
struct page pointers via pfn_to_page() and unpinning in batches of 32.
Simplify ublk_buf_erase_ranges() to iterate the maple tree by buf_index
instead of walking the now-removed pages[] array.

Signed-off-by: Ming Lei <ming.lei@redhat.com>
Link: https://patch.msgid.link/20260331153207.3635125-5-ming.lei@redhat.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
drivers/block/ublk_drv.c

index bdb1de41d526a52fa34bb4df6b07ed82eef09c9c..1af42850f5b109586914820da5a4c388452c70f2 100644 (file)
@@ -296,7 +296,6 @@ struct ublk_queue {
 
 /* Per-registered shared memory buffer */
 struct ublk_buf {
-       struct page **pages;
        unsigned int nr_pages;
 };
 
@@ -5261,27 +5260,25 @@ static void ublk_unquiesce_and_resume(struct gendisk *disk)
        blk_mq_unquiesce_queue(disk->queue);
 }
 
-/* Erase coalesced PFN ranges from the maple tree for pages [0, nr_pages) */
-static void ublk_buf_erase_ranges(struct ublk_device *ub,
-                                 struct ublk_buf *ubuf,
-                                 unsigned long nr_pages)
+/* Erase coalesced PFN ranges from the maple tree matching buf_index */
+static void ublk_buf_erase_ranges(struct ublk_device *ub, int buf_index)
 {
-       unsigned long i;
-
-       for (i = 0; i < nr_pages; ) {
-               unsigned long pfn = page_to_pfn(ubuf->pages[i]);
-               unsigned long start = i;
+       MA_STATE(mas, &ub->buf_tree, 0, ULONG_MAX);
+       struct ublk_buf_range *range;
 
-               while (i + 1 < nr_pages &&
-                      page_to_pfn(ubuf->pages[i + 1]) == pfn + (i - start) + 1)
-                       i++;
-               i++;
-               kfree(mtree_erase(&ub->buf_tree, pfn));
+       mas_lock(&mas);
+       mas_for_each(&mas, range, ULONG_MAX) {
+               if (range->buf_index == buf_index) {
+                       mas_erase(&mas);
+                       kfree(range);
+               }
        }
+       mas_unlock(&mas);
 }
 
 static int __ublk_ctrl_reg_buf(struct ublk_device *ub,
-                              struct ublk_buf *ubuf, int index,
+                              struct ublk_buf *ubuf,
+                              struct page **pages, int index,
                               unsigned short flags)
 {
        unsigned long nr_pages = ubuf->nr_pages;
@@ -5289,13 +5286,13 @@ static int __ublk_ctrl_reg_buf(struct ublk_device *ub,
        int ret;
 
        for (i = 0; i < nr_pages; ) {
-               unsigned long pfn = page_to_pfn(ubuf->pages[i]);
+               unsigned long pfn = page_to_pfn(pages[i]);
                unsigned long start = i;
                struct ublk_buf_range *range;
 
                /* Find run of consecutive PFNs */
                while (i + 1 < nr_pages &&
-                      page_to_pfn(ubuf->pages[i + 1]) == pfn + (i - start) + 1)
+                      page_to_pfn(pages[i + 1]) == pfn + (i - start) + 1)
                        i++;
                i++;    /* past the last page in this run */
 
@@ -5320,7 +5317,7 @@ static int __ublk_ctrl_reg_buf(struct ublk_device *ub,
        return 0;
 
 unwind:
-       ublk_buf_erase_ranges(ub, ubuf, i);
+       ublk_buf_erase_ranges(ub, index);
        return ret;
 }
 
@@ -5335,6 +5332,7 @@ static int ublk_ctrl_reg_buf(struct ublk_device *ub,
        void __user *argp = (void __user *)(unsigned long)header->addr;
        struct ublk_shmem_buf_reg buf_reg;
        unsigned long addr, size, nr_pages;
+       struct page **pages = NULL;
        unsigned int gup_flags;
        struct gendisk *disk;
        struct ublk_buf *ubuf;
@@ -5371,9 +5369,8 @@ static int ublk_ctrl_reg_buf(struct ublk_device *ub,
                goto put_disk;
        }
 
-       ubuf->pages = kvmalloc_array(nr_pages, sizeof(*ubuf->pages),
-                                    GFP_KERNEL);
-       if (!ubuf->pages) {
+       pages = kvmalloc_array(nr_pages, sizeof(*pages), GFP_KERNEL);
+       if (!pages) {
                ret = -ENOMEM;
                goto err_free;
        }
@@ -5382,7 +5379,7 @@ static int ublk_ctrl_reg_buf(struct ublk_device *ub,
        if (!(buf_reg.flags & UBLK_SHMEM_BUF_READ_ONLY))
                gup_flags |= FOLL_WRITE;
 
-       pinned = pin_user_pages_fast(addr, nr_pages, gup_flags, ubuf->pages);
+       pinned = pin_user_pages_fast(addr, nr_pages, gup_flags, pages);
        if (pinned < 0) {
                ret = pinned;
                goto err_free_pages;
@@ -5406,7 +5403,7 @@ static int ublk_ctrl_reg_buf(struct ublk_device *ub,
        if (ret)
                goto err_unlock;
 
-       ret = __ublk_ctrl_reg_buf(ub, ubuf, index, buf_reg.flags);
+       ret = __ublk_ctrl_reg_buf(ub, ubuf, pages, index, buf_reg.flags);
        if (ret) {
                xa_erase(&ub->bufs_xa, index);
                goto err_unlock;
@@ -5414,6 +5411,7 @@ static int ublk_ctrl_reg_buf(struct ublk_device *ub,
 
        mutex_unlock(&ub->mutex);
 
+       kvfree(pages);
        ublk_unquiesce_and_resume(disk);
        ublk_put_disk(disk);
        return index;
@@ -5422,9 +5420,9 @@ err_unlock:
        mutex_unlock(&ub->mutex);
        ublk_unquiesce_and_resume(disk);
 err_unpin:
-       unpin_user_pages(ubuf->pages, pinned);
+       unpin_user_pages(pages, pinned);
 err_free_pages:
-       kvfree(ubuf->pages);
+       kvfree(pages);
 err_free:
        kfree(ubuf);
 put_disk:
@@ -5433,11 +5431,36 @@ put_disk:
 }
 
 static void __ublk_ctrl_unreg_buf(struct ublk_device *ub,
-                                 struct ublk_buf *ubuf)
+                                 struct ublk_buf *ubuf, int buf_index)
 {
-       ublk_buf_erase_ranges(ub, ubuf, ubuf->nr_pages);
-       unpin_user_pages(ubuf->pages, ubuf->nr_pages);
-       kvfree(ubuf->pages);
+       MA_STATE(mas, &ub->buf_tree, 0, ULONG_MAX);
+       struct ublk_buf_range *range;
+       struct page *pages[32];
+
+       mas_lock(&mas);
+       mas_for_each(&mas, range, ULONG_MAX) {
+               unsigned long base, nr, off;
+
+               if (range->buf_index != buf_index)
+                       continue;
+
+               base = range->base_pfn;
+               nr = mas.last - mas.index + 1;
+               mas_erase(&mas);
+
+               for (off = 0; off < nr; ) {
+                       unsigned int batch = min_t(unsigned long,
+                                                  nr - off, 32);
+                       unsigned int j;
+
+                       for (j = 0; j < batch; j++)
+                               pages[j] = pfn_to_page(base + off + j);
+                       unpin_user_pages(pages, batch);
+                       off += batch;
+               }
+               kfree(range);
+       }
+       mas_unlock(&mas);
        kfree(ubuf);
 }
 
@@ -5468,7 +5491,7 @@ static int ublk_ctrl_unreg_buf(struct ublk_device *ub,
                return -ENOENT;
        }
 
-       __ublk_ctrl_unreg_buf(ub, ubuf);
+       __ublk_ctrl_unreg_buf(ub, ubuf, index);
 
        mutex_unlock(&ub->mutex);
 
@@ -5483,7 +5506,7 @@ static void ublk_buf_cleanup(struct ublk_device *ub)
        unsigned long index;
 
        xa_for_each(&ub->bufs_xa, index, ubuf)
-               __ublk_ctrl_unreg_buf(ub, ubuf);
+               __ublk_ctrl_unreg_buf(ub, ubuf, index);
        xa_destroy(&ub->bufs_xa);
        mtree_destroy(&ub->buf_tree);
 }