// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/namei.h>
#include <linux/poll.h>
#include <linux/io_uring.h>

#include <uapi/linux/io_uring.h>

#include "io_uring.h"
#include "opdef.h"
#include "kbuf.h"

#define IO_BUFFER_LIST_BUF_PER_PAGE (PAGE_SIZE / sizeof(struct io_uring_buf))

#define BGID_ARRAY      64
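/*
 * E.g. with 4 KiB pages and the 16-byte struct io_uring_buf,
 * IO_BUFFER_LIST_BUF_PER_PAGE works out to 4096 / 16 = 256 entries per
 * page; io_ring_buffer_select() relies on this when deciding whether a
 * ring head index still falls within the first pinned page.
 */
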
struct io_provide_buf {
        struct file                     *file;
        __u64                           addr;
        __u32                           len;
        __u32                           bgid;
        __u16                           nbufs;
        __u16                           bid;
};

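/*
 * Look up the buffer list for a buffer group ID. Low group IDs
 * (bgid < BGID_ARRAY) live in the flat ctx->io_bl array for cheap lookup;
 * anything above that is tracked in the ctx->io_bl_xa xarray.
 */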
static inline struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx,
                                                         unsigned int bgid)
{
        if (ctx->io_bl && bgid < BGID_ARRAY)
                return &ctx->io_bl[bgid];

        return xa_load(&ctx->io_bl_xa, bgid);
}

static int io_buffer_add_list(struct io_ring_ctx *ctx,
                              struct io_buffer_list *bl, unsigned int bgid)
{
        bl->bgid = bgid;
        if (bgid < BGID_ARRAY)
                return 0;

        return xa_err(xa_store(&ctx->io_bl_xa, bgid, bl, GFP_KERNEL));
}

void io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags)
{
        struct io_ring_ctx *ctx = req->ctx;
        struct io_buffer_list *bl;
        struct io_buffer *buf;

        /*
         * For legacy provided buffer mode, don't recycle if we already did
         * IO to this buffer. For ring-mapped provided buffer mode, we should
         * increment ring->head to explicitly monopolize the buffer to avoid
         * multiple use.
         */
        if (req->flags & REQ_F_PARTIAL_IO)
                return;

        io_ring_submit_lock(ctx, issue_flags);

        buf = req->kbuf;
        bl = io_buffer_get_list(ctx, buf->bgid);
        list_add(&buf->list, &bl->buf_list);
        req->flags &= ~REQ_F_BUFFER_SELECTED;
        req->buf_index = buf->bgid;

        io_ring_submit_unlock(ctx, issue_flags);
}

unsigned int __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags)
{
        unsigned int cflags;

        /*
         * We can add this buffer back to two lists:
         *
         * 1) The io_buffers_cache list. This one is protected by the
         *    ctx->uring_lock. If we already hold this lock, add back to this
         *    list as we can grab it from issue as well.
         * 2) The io_buffers_comp list. This one is protected by the
         *    ctx->completion_lock.
         *
         * We migrate buffers from the comp_list to the issue cache list
         * when we need one.
         */
        if (req->flags & REQ_F_BUFFER_RING) {
                /* no buffers to recycle for this case */
                cflags = __io_put_kbuf_list(req, NULL);
        } else if (issue_flags & IO_URING_F_UNLOCKED) {
                struct io_ring_ctx *ctx = req->ctx;

                spin_lock(&ctx->completion_lock);
                cflags = __io_put_kbuf_list(req, &ctx->io_buffers_comp);
                spin_unlock(&ctx->completion_lock);
        } else {
                lockdep_assert_held(&req->ctx->uring_lock);

                cflags = __io_put_kbuf_list(req, &req->ctx->io_buffers_cache);
        }
        return cflags;
}

static void __user *io_provided_buffer_select(struct io_kiocb *req, size_t *len,
                                              struct io_buffer_list *bl)
{
        if (!list_empty(&bl->buf_list)) {
                struct io_buffer *kbuf;

                kbuf = list_first_entry(&bl->buf_list, struct io_buffer, list);
                list_del(&kbuf->list);
                if (*len == 0 || *len > kbuf->len)
                        *len = kbuf->len;
                req->flags |= REQ_F_BUFFER_SELECTED;
                req->kbuf = kbuf;
                req->buf_index = kbuf->bid;
                return u64_to_user_ptr(kbuf->addr);
        }
        return NULL;
}

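/*
 * Pick a buffer from a ring-mapped group. The application publishes new
 * entries by advancing the ring tail; the kernel reads the tail with
 * smp_load_acquire() and only consumes an entry (advances bl->head) once
 * it can no longer rely on the caller committing the buffer later.
 */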
static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len,
                                          struct io_buffer_list *bl,
                                          unsigned int issue_flags)
{
        struct io_uring_buf_ring *br = bl->buf_ring;
        struct io_uring_buf *buf;
        __u16 head = bl->head;

        if (unlikely(smp_load_acquire(&br->tail) == head))
                return NULL;

        head &= bl->mask;
        /* mmaped buffers are always contig */
        if (bl->is_mmap || head < IO_BUFFER_LIST_BUF_PER_PAGE) {
                buf = &br->bufs[head];
        } else {
                int off = head & (IO_BUFFER_LIST_BUF_PER_PAGE - 1);
                int index = head / IO_BUFFER_LIST_BUF_PER_PAGE;

                buf = page_address(bl->buf_pages[index]);
                buf += off;
        }
        if (*len == 0 || *len > buf->len)
                *len = buf->len;
        req->flags |= REQ_F_BUFFER_RING;
        req->buf_list = bl;
        req->buf_index = buf->bid;

        if (issue_flags & IO_URING_F_UNLOCKED || !file_can_poll(req->file)) {
                /*
                 * If we came in unlocked, we have no choice but to consume the
                 * buffer here, otherwise nothing ensures that the buffer won't
                 * get used by others. This does mean it'll be pinned until the
                 * IO completes, coming in unlocked means we're being called from
                 * io-wq context and there may be further retries in async hybrid
                 * mode. For the locked case, the caller must call commit when
                 * the transfer completes (or if we get -EAGAIN and must poll or
                 * retry).
                 */
                req->buf_list = NULL;
                bl->head++;
        }
        return u64_to_user_ptr(buf->addr);
}

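/*
 * Entry point for buffer selection (IOSQE_BUFFER_SELECT): take the
 * submission lock as needed, then dispatch to the ring-mapped or legacy
 * provided-buffer path depending on how the group was set up.
 */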
void __user *io_buffer_select(struct io_kiocb *req, size_t *len,
                              unsigned int issue_flags)
{
        struct io_ring_ctx *ctx = req->ctx;
        struct io_buffer_list *bl;
        void __user *ret = NULL;

        io_ring_submit_lock(req->ctx, issue_flags);

        bl = io_buffer_get_list(ctx, req->buf_index);
        if (likely(bl)) {
                if (bl->is_mapped)
                        ret = io_ring_buffer_select(req, len, bl, issue_flags);
                else
                        ret = io_provided_buffer_select(req, len, bl);
        }
        io_ring_submit_unlock(req->ctx, issue_flags);
        return ret;
}

static __cold int io_init_bl_list(struct io_ring_ctx *ctx)
{
        int i;

        ctx->io_bl = kcalloc(BGID_ARRAY, sizeof(struct io_buffer_list),
                             GFP_KERNEL);
        if (!ctx->io_bl)
                return -ENOMEM;

        for (i = 0; i < BGID_ARRAY; i++) {
                INIT_LIST_HEAD(&ctx->io_bl[i].buf_list);
                ctx->io_bl[i].bgid = i;
        }

        return 0;
}

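/*
 * Tear down up to @nbufs buffers from a group. Ring-mapped groups are
 * unmapped or unpinned wholesale and the number of unconsumed entries is
 * returned; legacy groups have their buffers moved back to the ctx cache
 * one at a time.
 */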
static int __io_remove_buffers(struct io_ring_ctx *ctx,
                               struct io_buffer_list *bl, unsigned nbufs)
{
        unsigned i = 0;

        /* shouldn't happen */
        if (!nbufs)
                return 0;

        if (bl->is_mapped) {
                i = bl->buf_ring->tail - bl->head;
                if (bl->is_mmap) {
                        folio_put(virt_to_folio(bl->buf_ring));
                        bl->buf_ring = NULL;
                        bl->is_mmap = 0;
                } else if (bl->buf_nr_pages) {
                        int j;

                        for (j = 0; j < bl->buf_nr_pages; j++)
                                unpin_user_page(bl->buf_pages[j]);
                        kvfree(bl->buf_pages);
                        bl->buf_pages = NULL;
                        bl->buf_nr_pages = 0;
                }
                /* make sure it's seen as empty */
                INIT_LIST_HEAD(&bl->buf_list);
                bl->is_mapped = 0;
                return i;
        }

        /* protects io_buffers_cache */
        lockdep_assert_held(&ctx->uring_lock);

        while (!list_empty(&bl->buf_list)) {
                struct io_buffer *nxt;

                nxt = list_first_entry(&bl->buf_list, struct io_buffer, list);
                list_move(&nxt->list, &ctx->io_buffers_cache);
                if (++i == nbufs)
                        return i;
                cond_resched();
        }

        return i;
}

void io_destroy_buffers(struct io_ring_ctx *ctx)
{
        struct io_buffer_list *bl;
        unsigned long index;
        int i;

        for (i = 0; i < BGID_ARRAY; i++) {
                if (!ctx->io_bl)
                        break;
                __io_remove_buffers(ctx, &ctx->io_bl[i], -1U);
        }

        xa_for_each(&ctx->io_bl_xa, index, bl) {
                xa_erase(&ctx->io_bl_xa, bl->bgid);
                __io_remove_buffers(ctx, bl, -1U);
                kfree(bl);
        }

        while (!list_empty(&ctx->io_buffers_pages)) {
                struct page *page;

                page = list_first_entry(&ctx->io_buffers_pages, struct page, lru);
                list_del_init(&page->lru);
                __free_page(page);
        }
}

int io_remove_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
        struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf);
        u64 tmp;

        if (sqe->rw_flags || sqe->addr || sqe->len || sqe->off ||
            sqe->splice_fd_in)
                return -EINVAL;

        tmp = READ_ONCE(sqe->fd);
        if (!tmp || tmp > USHRT_MAX)
                return -EINVAL;

        memset(p, 0, sizeof(*p));
        p->nbufs = tmp;
        p->bgid = READ_ONCE(sqe->buf_group);
        return 0;
}

int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags)
{
        struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf);
        struct io_ring_ctx *ctx = req->ctx;
        struct io_buffer_list *bl;
        int ret = 0;

        io_ring_submit_lock(ctx, issue_flags);

        ret = -ENOENT;
        bl = io_buffer_get_list(ctx, p->bgid);
        if (bl) {
                ret = -EINVAL;
                /* can't use provide/remove buffers command on mapped buffers */
                if (!bl->is_mapped)
                        ret = __io_remove_buffers(ctx, bl, p->nbufs);
        }
        io_ring_submit_unlock(ctx, issue_flags);
        if (ret < 0)
                req_set_fail(req);
        io_req_set_res(req, ret, 0);
        return IOU_OK;
}

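/*
 * IORING_OP_PROVIDE_BUFFERS packs its arguments into the SQE as follows:
 * sqe->fd holds the number of buffers, sqe->addr the base address,
 * sqe->len the per-buffer length, sqe->buf_group the group ID and
 * sqe->off the starting buffer ID.
 */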
int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
        unsigned long size, tmp_check;
        struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf);
        u64 tmp;

        if (sqe->rw_flags || sqe->splice_fd_in)
                return -EINVAL;

        tmp = READ_ONCE(sqe->fd);
        if (!tmp || tmp > USHRT_MAX)
                return -E2BIG;
        p->nbufs = tmp;
        p->addr = READ_ONCE(sqe->addr);
        p->len = READ_ONCE(sqe->len);

        if (check_mul_overflow((unsigned long)p->len, (unsigned long)p->nbufs,
                               &size))
                return -EOVERFLOW;
        if (check_add_overflow((unsigned long)p->addr, size, &tmp_check))
                return -EOVERFLOW;

        size = (unsigned long)p->len * p->nbufs;
        if (!access_ok(u64_to_user_ptr(p->addr), size))
                return -EFAULT;

        p->bgid = READ_ONCE(sqe->buf_group);
        tmp = READ_ONCE(sqe->off);
        if (tmp > USHRT_MAX)
                return -E2BIG;
        if (tmp + p->nbufs >= USHRT_MAX)
                return -EINVAL;
        p->bid = tmp;
        return 0;
}

static int io_refill_buffer_cache(struct io_ring_ctx *ctx)
{
        struct io_buffer *buf;
        struct page *page;
        int bufs_in_page;

        /*
         * Completions that don't happen inline (eg not under uring_lock) will
         * add to ->io_buffers_comp. If we don't have any free buffers, check
         * the completion list and splice those entries first.
         */
        if (!list_empty_careful(&ctx->io_buffers_comp)) {
                spin_lock(&ctx->completion_lock);
                if (!list_empty(&ctx->io_buffers_comp)) {
                        list_splice_init(&ctx->io_buffers_comp,
                                         &ctx->io_buffers_cache);
                        spin_unlock(&ctx->completion_lock);
                        return 0;
                }
                spin_unlock(&ctx->completion_lock);
        }

        /*
         * No free buffers and no completion entries either. Allocate a new
         * page worth of buffer entries and add those to our freelist.
         */
        page = alloc_page(GFP_KERNEL_ACCOUNT);
        if (!page)
                return -ENOMEM;

        list_add(&page->lru, &ctx->io_buffers_pages);

        buf = page_address(page);
        bufs_in_page = PAGE_SIZE / sizeof(*buf);
        while (bufs_in_page) {
                list_add_tail(&buf->list, &ctx->io_buffers_cache);
                buf++;
                bufs_in_page--;
        }

        return 0;
}

static int io_add_buffers(struct io_ring_ctx *ctx, struct io_provide_buf *pbuf,
                          struct io_buffer_list *bl)
{
        struct io_buffer *buf;
        u64 addr = pbuf->addr;
        int i, bid = pbuf->bid;

        for (i = 0; i < pbuf->nbufs; i++) {
                if (list_empty(&ctx->io_buffers_cache) &&
                    io_refill_buffer_cache(ctx))
                        break;
                buf = list_first_entry(&ctx->io_buffers_cache, struct io_buffer,
                                       list);
                list_move_tail(&buf->list, &bl->buf_list);
                buf->addr = addr;
                buf->len = min_t(__u32, pbuf->len, MAX_RW_COUNT);
                buf->bid = bid;
                buf->bgid = pbuf->bgid;
                addr += pbuf->len;
                bid++;
                cond_resched();
        }

        return i ? 0 : -ENOMEM;
}

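/*
 * Handler for IORING_OP_PROVIDE_BUFFERS: add p->nbufs legacy provided
 * buffers to group p->bgid, allocating the group on first use.
 */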
int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
{
        struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf);
        struct io_ring_ctx *ctx = req->ctx;
        struct io_buffer_list *bl;
        int ret = 0;

        io_ring_submit_lock(ctx, issue_flags);

        if (unlikely(p->bgid < BGID_ARRAY && !ctx->io_bl)) {
                ret = io_init_bl_list(ctx);
                if (ret)
                        goto err;
        }

        bl = io_buffer_get_list(ctx, p->bgid);
        if (unlikely(!bl)) {
                bl = kzalloc(sizeof(*bl), GFP_KERNEL_ACCOUNT);
                if (!bl) {
                        ret = -ENOMEM;
                        goto err;
                }
                INIT_LIST_HEAD(&bl->buf_list);
                ret = io_buffer_add_list(ctx, bl, p->bgid);
                if (ret) {
                        kfree(bl);
                        goto err;
                }
        }
        /* can't add buffers via this command for a mapped buffer ring */
        if (bl->is_mapped) {
                ret = -EINVAL;
                goto err;
        }

        ret = io_add_buffers(ctx, p, bl);
err:
        io_ring_submit_unlock(ctx, issue_flags);

        if (ret < 0)
                req_set_fail(req);
        io_req_set_res(req, ret, 0);
        return IOU_OK;
}

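/*
 * For an application-allocated ring (IOU_PBUF_RING_MMAP not set), pin the
 * user pages backing the ring so the kernel can address the entries
 * directly.
 */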
static int io_pin_pbuf_ring(struct io_uring_buf_reg *reg,
                            struct io_buffer_list *bl)
{
        struct io_uring_buf_ring *br;
        struct page **pages;
        int nr_pages;

        pages = io_pin_pages(reg->ring_addr,
                             flex_array_size(br, bufs, reg->ring_entries),
                             &nr_pages);
        if (IS_ERR(pages))
                return PTR_ERR(pages);

        br = page_address(pages[0]);
#ifdef SHM_COLOUR
        /*
         * On platforms that have specific aliasing requirements, SHM_COLOUR
         * is set and we must guarantee that the kernel and user side align
         * nicely. We cannot do that if IOU_PBUF_RING_MMAP isn't set and
         * the application mmap's the provided ring buffer. Fail the request
         * if we, by chance, don't end up with aligned addresses. The app
         * should use IOU_PBUF_RING_MMAP instead, and liburing will handle
         * this transparently.
         */
        if ((reg->ring_addr | (unsigned long) br) & (SHM_COLOUR - 1)) {
                int i;

                for (i = 0; i < nr_pages; i++)
                        unpin_user_page(pages[i]);
                return -EINVAL;
        }
#endif
        bl->buf_pages = pages;
        bl->buf_nr_pages = nr_pages;
        bl->buf_ring = br;
        bl->is_mapped = 1;
        bl->is_mmap = 0;
        return 0;
}

static int io_alloc_pbuf_ring(struct io_uring_buf_reg *reg,
                              struct io_buffer_list *bl)
{
        gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP;
        size_t ring_size;
        void *ptr;

        ring_size = reg->ring_entries * sizeof(struct io_uring_buf_ring);
        ptr = (void *) __get_free_pages(gfp, get_order(ring_size));
        if (!ptr)
                return -ENOMEM;

        bl->buf_ring = ptr;
        bl->is_mapped = 1;
        bl->is_mmap = 1;
        return 0;
}

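/*
 * Registration comes in via io_uring_register(2) with
 * IORING_REGISTER_PBUF_RING and a struct io_uring_buf_reg. A rough,
 * illustrative sketch of the caller side (normally done by liburing), where
 * ring_mem, ring_fd and the chosen values are examples only; ring_addr must
 * be page aligned and ring_entries a power of two below 65536:
 *
 *      struct io_uring_buf_reg reg = {
 *              .ring_addr      = (unsigned long) ring_mem,
 *              .ring_entries   = 8,
 *              .bgid           = 1,
 *      };
 *      io_uring_register(ring_fd, IORING_REGISTER_PBUF_RING, &reg, 1);
 */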
int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
{
        struct io_uring_buf_reg reg;
        struct io_buffer_list *bl, *free_bl = NULL;
        int ret;

        if (copy_from_user(&reg, arg, sizeof(reg)))
                return -EFAULT;

        if (reg.resv[0] || reg.resv[1] || reg.resv[2])
                return -EINVAL;
        if (reg.flags & ~IOU_PBUF_RING_MMAP)
                return -EINVAL;
        if (!(reg.flags & IOU_PBUF_RING_MMAP)) {
                if (!reg.ring_addr)
                        return -EFAULT;
                if (reg.ring_addr & ~PAGE_MASK)
                        return -EINVAL;
        } else {
                if (reg.ring_addr)
                        return -EINVAL;
        }

        if (!is_power_of_2(reg.ring_entries))
                return -EINVAL;

        /* cannot disambiguate full vs empty due to head/tail size */
        if (reg.ring_entries >= 65536)
                return -EINVAL;

        if (unlikely(reg.bgid < BGID_ARRAY && !ctx->io_bl)) {
                int ret = io_init_bl_list(ctx);

                if (ret)
                        return ret;
        }

        bl = io_buffer_get_list(ctx, reg.bgid);
        if (bl) {
                /* if mapped buffer ring OR classic exists, don't allow */
                if (bl->is_mapped || !list_empty(&bl->buf_list))
                        return -EEXIST;
        } else {
                free_bl = bl = kzalloc(sizeof(*bl), GFP_KERNEL);
                if (!bl)
                        return -ENOMEM;
        }

        if (!(reg.flags & IOU_PBUF_RING_MMAP))
                ret = io_pin_pbuf_ring(&reg, bl);
        else
                ret = io_alloc_pbuf_ring(&reg, bl);

        if (!ret) {
                bl->nr_entries = reg.ring_entries;
                bl->mask = reg.ring_entries - 1;

                io_buffer_add_list(ctx, bl, reg.bgid);
                return 0;
        }

        kfree(free_bl);
        return ret;
}

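/*
 * After registration the application hands buffers to the kernel by filling
 * the next struct io_uring_buf slot (addr/len/bid) and publishing it with a
 * release store to the ring tail; io_ring_buffer_select() pairs with that
 * via smp_load_acquire() on the tail.
 */
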
int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
{
        struct io_uring_buf_reg reg;
        struct io_buffer_list *bl;

        if (copy_from_user(&reg, arg, sizeof(reg)))
                return -EFAULT;
        if (reg.resv[0] || reg.resv[1] || reg.resv[2])
                return -EINVAL;
        if (reg.flags)
                return -EINVAL;

        bl = io_buffer_get_list(ctx, reg.bgid);
        if (!bl)
                return -ENOENT;
        if (!bl->is_mapped)
                return -EINVAL;

        __io_remove_buffers(ctx, bl, -1U);
        if (bl->bgid >= BGID_ARRAY) {
                xa_erase(&ctx->io_bl_xa, bl->bgid);
                kfree(bl);
        }
        return 0;
}

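/*
 * mmap() support for kernel-allocated rings: return the ring address for
 * the given group so it can be mapped into the application, or NULL if the
 * group isn't backed by a kernel-allocated (IOU_PBUF_RING_MMAP) ring.
 */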
void *io_pbuf_get_address(struct io_ring_ctx *ctx, unsigned long bgid)
{
        struct io_buffer_list *bl;

        bl = io_buffer_get_list(ctx, bgid);
        if (!bl || !bl->is_mmap)
                return NULL;

        return bl->buf_ring;
}
)