// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/namei.h>
#include <linux/poll.h>
#include <linux/io_uring.h>

#include <uapi/linux/io_uring.h>

#include "io_uring.h"
#include "opdef.h"
#include "kbuf.h"

/* buffer group ids below this live in a flat array, the rest in an xarray */
#define BGID_ARRAY	64

#define IO_BUFFER_LIST_BUF_PER_PAGE (PAGE_SIZE / sizeof(struct io_uring_buf))
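/*
 * Sizing example (illustrative, not from the original source): struct
 * io_uring_buf is 16 bytes (__u64 addr, __u32 len, __u16 bid, __u16 resv),
 * so with a typical 4 KiB PAGE_SIZE a single page holds 4096 / 16 == 256
 * entries and IO_BUFFER_LIST_BUF_PER_PAGE evaluates to 256.
 */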
struct io_provide_buf {
	struct file			*file;
	__u64				addr;
	__u32				len;
	__u32				bgid;
	__u16				nbufs;
	__u16				bid;
};
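/*
 * Buffer group IDs below BGID_ARRAY are served from the flat ctx->io_bl
 * array; higher group IDs fall back to the ctx->io_bl_xa xarray. The two
 * helpers below implement that split for lookup and insertion.
 */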
static inline struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx,
							 unsigned int bgid)
{
	if (ctx->io_bl && bgid < BGID_ARRAY)
		return &ctx->io_bl[bgid];

	return xa_load(&ctx->io_bl_xa, bgid);
}
static int io_buffer_add_list(struct io_ring_ctx *ctx,
			      struct io_buffer_list *bl, unsigned int bgid)
{
	bl->bgid = bgid;
	if (bgid < BGID_ARRAY)
		return 0;

	return xa_err(xa_store(&ctx->io_bl_xa, bgid, bl, GFP_KERNEL));
}
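/*
 * Called when a request that had picked a legacy provided buffer ends up not
 * consuming it (eg it has to be retried before any data was transferred).
 * The buffer is handed back to its group so another request can select it.
 */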
void io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags)
{
	struct io_ring_ctx *ctx = req->ctx;
	struct io_buffer_list *bl;
	struct io_buffer *buf;

	/*
	 * For legacy provided buffer mode, don't recycle if we already did
	 * IO to this buffer. For ring-mapped provided buffer mode, we should
	 * increment ring->head to explicitly monopolize the buffer to avoid
	 * multiple use.
	 */
	if (req->flags & REQ_F_PARTIAL_IO)
		return;

	io_ring_submit_lock(ctx, issue_flags);

	buf = req->kbuf;
	bl = io_buffer_get_list(ctx, buf->bgid);
	list_add(&buf->list, &bl->buf_list);
	req->flags &= ~REQ_F_BUFFER_SELECTED;
	req->buf_index = buf->bgid;

	io_ring_submit_unlock(ctx, issue_flags);
}
unsigned int __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags)
{
	unsigned int cflags;

	/*
	 * We can add this buffer back to two lists:
	 *
	 * 1) The io_buffers_cache list. This one is protected by the
	 *    ctx->uring_lock. If we already hold this lock, add back to this
	 *    list as we can grab it from issue as well.
	 * 2) The io_buffers_comp list. This one is protected by the
	 *    ctx->completion_lock.
	 *
	 * We migrate buffers from the comp_list to the issue cache list
	 * when we need one.
	 */
	if (req->flags & REQ_F_BUFFER_RING) {
		/* no buffers to recycle for this case */
		cflags = __io_put_kbuf_list(req, NULL);
	} else if (issue_flags & IO_URING_F_UNLOCKED) {
		struct io_ring_ctx *ctx = req->ctx;

		spin_lock(&ctx->completion_lock);
		cflags = __io_put_kbuf_list(req, &ctx->io_buffers_comp);
		spin_unlock(&ctx->completion_lock);
	} else {
		lockdep_assert_held(&req->ctx->uring_lock);

		cflags = __io_put_kbuf_list(req, &req->ctx->io_buffers_cache);
	}
	return cflags;
}
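/*
 * Buffer selection comes in two flavours: the legacy path below pops the
 * first entry off the group's linked list, while io_ring_buffer_select()
 * consumes entries from a user-visible ring mapped into the kernel.
 */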
static void __user *io_provided_buffer_select(struct io_kiocb *req, size_t *len,
					      struct io_buffer_list *bl)
{
	if (!list_empty(&bl->buf_list)) {
		struct io_buffer *kbuf;

		kbuf = list_first_entry(&bl->buf_list, struct io_buffer, list);
		list_del(&kbuf->list);
		if (*len == 0 || *len > kbuf->len)
			*len = kbuf->len;
		req->flags |= REQ_F_BUFFER_SELECTED;
		req->kbuf = kbuf;
		req->buf_index = kbuf->bid;
		return u64_to_user_ptr(kbuf->addr);
	}

	return NULL;
}
static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len,
					  struct io_buffer_list *bl,
					  unsigned int issue_flags)
{
	struct io_uring_buf_ring *br = bl->buf_ring;
	struct io_uring_buf *buf;
	__u16 head = bl->head;

	if (unlikely(smp_load_acquire(&br->tail) == head))
		return NULL;

	head &= bl->mask;
	/* mmaped buffers are always contig */
	if (bl->is_mmap || head < IO_BUFFER_LIST_BUF_PER_PAGE) {
		buf = &br->bufs[head];
	} else {
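		/*
		 * Worked example (illustrative, assuming 4 KiB pages and a
		 * 16-byte struct io_uring_buf, ie 256 entries per page): a
		 * masked head of 300 lands in pinned page 300 / 256 == 1,
		 * at entry offset 300 & 255 == 44 within that page.
		 */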
		int off = head & (IO_BUFFER_LIST_BUF_PER_PAGE - 1);
		int index = head / IO_BUFFER_LIST_BUF_PER_PAGE;

		buf = page_address(bl->buf_pages[index]);
		buf += off;
	}
	if (*len == 0 || *len > buf->len)
		*len = buf->len;
	req->flags |= REQ_F_BUFFER_RING;
	req->buf_list = bl;
	req->buf_index = buf->bid;

	if (issue_flags & IO_URING_F_UNLOCKED || !file_can_poll(req->file)) {
		/*
		 * If we came in unlocked, we have no choice but to consume the
		 * buffer here, otherwise nothing ensures that the buffer won't
		 * get used by others. This does mean it'll be pinned until the
		 * IO completes, coming in unlocked means we're being called from
		 * io-wq context and there may be further retries in async hybrid
		 * mode. For the locked case, the caller must call commit when
		 * the transfer completes (or if we get -EAGAIN and must poll or
		 * retry).
		 */
		req->buf_list = NULL;
		bl->head++;
	}
	return u64_to_user_ptr(buf->addr);
}
void __user *io_buffer_select(struct io_kiocb *req, size_t *len,
			      unsigned int issue_flags)
{
	struct io_ring_ctx *ctx = req->ctx;
	struct io_buffer_list *bl;
	void __user *ret = NULL;

	io_ring_submit_lock(req->ctx, issue_flags);

	bl = io_buffer_get_list(ctx, req->buf_index);
	if (likely(bl)) {
		if (bl->is_mapped)
			ret = io_ring_buffer_select(req, len, bl, issue_flags);
		else
			ret = io_provided_buffer_select(req, len, bl);
	}
	io_ring_submit_unlock(req->ctx, issue_flags);
	return ret;
}
static __cold int io_init_bl_list(struct io_ring_ctx *ctx)
{
	int i;

	ctx->io_bl = kcalloc(BGID_ARRAY, sizeof(struct io_buffer_list),
				GFP_KERNEL);
	if (!ctx->io_bl)
		return -ENOMEM;

	for (i = 0; i < BGID_ARRAY; i++) {
		INIT_LIST_HEAD(&ctx->io_bl[i].buf_list);
		ctx->io_bl[i].bgid = i;
	}

	return 0;
}
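/*
 * Tear down a buffer group, returning how many buffers were released. For a
 * ring-mapped group that is the distance between the ring tail and the
 * kernel's head; for a legacy group it is the number of list entries moved
 * back to the ctx-wide cache, capped at @nbufs.
 */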
static int __io_remove_buffers(struct io_ring_ctx *ctx,
			       struct io_buffer_list *bl, unsigned nbufs)
{
	unsigned i = 0;

	/* shouldn't happen */
	if (!nbufs)
		return 0;

	if (bl->is_mapped) {
		i = bl->buf_ring->tail - bl->head;
		if (bl->is_mmap) {
			struct page *page;

			page = virt_to_head_page(bl->buf_ring);
			if (put_page_testzero(page))
				free_compound_page(page);
			bl->buf_ring = NULL;
			bl->is_mmap = 0;
		} else if (bl->buf_nr_pages) {
			int j;

			for (j = 0; j < bl->buf_nr_pages; j++)
				unpin_user_page(bl->buf_pages[j]);
			kvfree(bl->buf_pages);
			bl->buf_pages = NULL;
			bl->buf_nr_pages = 0;
		}
		/* make sure it's seen as empty */
		INIT_LIST_HEAD(&bl->buf_list);
		bl->is_mapped = 0;
		return i;
	}

	/* protects io_buffers_cache */
	lockdep_assert_held(&ctx->uring_lock);

	while (!list_empty(&bl->buf_list)) {
		struct io_buffer *nxt;

		nxt = list_first_entry(&bl->buf_list, struct io_buffer, list);
		list_move(&nxt->list, &ctx->io_buffers_cache);
		if (++i == nbufs)
			return i;
		cond_resched();
	}

	return i;
}
void io_destroy_buffers(struct io_ring_ctx *ctx)
{
	struct io_buffer_list *bl;
	unsigned long index;
	int i;

	for (i = 0; i < BGID_ARRAY; i++) {
		if (!ctx->io_bl)
			break;
		__io_remove_buffers(ctx, &ctx->io_bl[i], -1U);
	}

	xa_for_each(&ctx->io_bl_xa, index, bl) {
		xa_erase(&ctx->io_bl_xa, bl->bgid);
		__io_remove_buffers(ctx, bl, -1U);
		kfree(bl);
	}

	while (!list_empty(&ctx->io_buffers_pages)) {
		struct page *page;

		page = list_first_entry(&ctx->io_buffers_pages, struct page, lru);
		list_del_init(&page->lru);
		__free_page(page);
	}
}
int io_remove_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf);
	u64 tmp;

	if (sqe->rw_flags || sqe->addr || sqe->len || sqe->off ||
	    sqe->splice_fd_in)
		return -EINVAL;

	tmp = READ_ONCE(sqe->fd);
	if (!tmp || tmp > USHRT_MAX)
		return -EINVAL;

	memset(p, 0, sizeof(*p));
	p->nbufs = tmp;
	p->bgid = READ_ONCE(sqe->buf_group);
	return 0;
}
int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf);
	struct io_ring_ctx *ctx = req->ctx;
	struct io_buffer_list *bl;
	int ret = 0;

	io_ring_submit_lock(ctx, issue_flags);

	ret = -ENOENT;
	bl = io_buffer_get_list(ctx, p->bgid);
	if (bl) {
		ret = -EINVAL;
		/* can't use provide/remove buffers command on mapped buffers */
		if (!bl->is_mapped)
			ret = __io_remove_buffers(ctx, bl, p->nbufs);
	}
	io_ring_submit_unlock(ctx, issue_flags);
	if (ret < 0)
		req_set_fail(req);
	io_req_set_res(req, ret, 0);
	return IOU_OK;
}
int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	unsigned long size, tmp_check;
	struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf);
	u64 tmp;

	if (sqe->rw_flags || sqe->splice_fd_in)
		return -EINVAL;

	tmp = READ_ONCE(sqe->fd);
	if (!tmp || tmp > USHRT_MAX)
		return -E2BIG;
	p->nbufs = tmp;
	p->addr = READ_ONCE(sqe->addr);
	p->len = READ_ONCE(sqe->len);

	if (check_mul_overflow((unsigned long)p->len, (unsigned long)p->nbufs,
				&size))
		return -EOVERFLOW;
	if (check_add_overflow((unsigned long)p->addr, size, &tmp_check))
		return -EOVERFLOW;

	size = (unsigned long)p->len * p->nbufs;
	if (!access_ok(u64_to_user_ptr(p->addr), size))
		return -EFAULT;

	p->bgid = READ_ONCE(sqe->buf_group);
	tmp = READ_ONCE(sqe->off);
	if (tmp > USHRT_MAX)
		return -E2BIG;
	if (tmp + p->nbufs >= USHRT_MAX)
		return -EINVAL;
	p->bid = tmp;
	return 0;
}
static int io_refill_buffer_cache(struct io_ring_ctx *ctx)
{
	struct io_buffer *buf;
	struct page *page;
	int bufs_in_page;

	/*
	 * Completions that don't happen inline (eg not under uring_lock) will
	 * add to ->io_buffers_comp. If we don't have any free buffers, check
	 * the completion list and splice those entries first.
	 */
	if (!list_empty_careful(&ctx->io_buffers_comp)) {
		spin_lock(&ctx->completion_lock);
		if (!list_empty(&ctx->io_buffers_comp)) {
			list_splice_init(&ctx->io_buffers_comp,
						&ctx->io_buffers_cache);
			spin_unlock(&ctx->completion_lock);
			return 0;
		}
		spin_unlock(&ctx->completion_lock);
	}

	/*
	 * No free buffers and no completion entries either. Allocate a new
	 * page worth of buffer entries and add those to our freelist.
	 */
	page = alloc_page(GFP_KERNEL_ACCOUNT);
	if (!page)
		return -ENOMEM;

	list_add(&page->lru, &ctx->io_buffers_pages);

	buf = page_address(page);
	bufs_in_page = PAGE_SIZE / sizeof(*buf);
	while (bufs_in_page) {
		list_add_tail(&buf->list, &ctx->io_buffers_cache);
		buf++;
		bufs_in_page--;
	}

	return 0;
}
static int io_add_buffers(struct io_ring_ctx *ctx, struct io_provide_buf *pbuf,
			  struct io_buffer_list *bl)
{
	struct io_buffer *buf;
	u64 addr = pbuf->addr;
	int i, bid = pbuf->bid;

	for (i = 0; i < pbuf->nbufs; i++) {
		if (list_empty(&ctx->io_buffers_cache) &&
		    io_refill_buffer_cache(ctx))
			break;
		buf = list_first_entry(&ctx->io_buffers_cache, struct io_buffer,
					list);
		list_move_tail(&buf->list, &bl->buf_list);
		buf->addr = addr;
		buf->len = min_t(__u32, pbuf->len, MAX_RW_COUNT);
		buf->bid = bid;
		buf->bgid = pbuf->bgid;
		addr += pbuf->len;
		bid++;
		cond_resched();
	}

	return i ? 0 : -ENOMEM;
}
int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf);
	struct io_ring_ctx *ctx = req->ctx;
	struct io_buffer_list *bl;
	int ret = 0;

	io_ring_submit_lock(ctx, issue_flags);

	if (unlikely(p->bgid < BGID_ARRAY && !ctx->io_bl)) {
		ret = io_init_bl_list(ctx);
		if (ret)
			goto err;
	}

	bl = io_buffer_get_list(ctx, p->bgid);
	if (unlikely(!bl)) {
		bl = kzalloc(sizeof(*bl), GFP_KERNEL_ACCOUNT);
		if (!bl) {
			ret = -ENOMEM;
			goto err;
		}
		INIT_LIST_HEAD(&bl->buf_list);
		ret = io_buffer_add_list(ctx, bl, p->bgid);
		if (ret) {
			kfree(bl);
			goto err;
		}
	}
	/* can't add buffers via this command for a mapped buffer ring */
	if (bl->is_mapped) {
		ret = -EINVAL;
		goto err;
	}

	ret = io_add_buffers(ctx, p, bl);
err:
	io_ring_submit_unlock(ctx, issue_flags);

	if (ret < 0)
		req_set_fail(req);
	io_req_set_res(req, ret, 0);
	return IOU_OK;
}
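/*
 * A registered buffer ring's memory comes from one of two places: with
 * IOU_PBUF_RING_MMAP clear the application allocates it and the kernel pins
 * those user pages (io_pin_pbuf_ring); with the flag set the kernel
 * allocates the pages itself and the application mmap()s them afterwards
 * (io_alloc_pbuf_ring).
 */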
static int io_pin_pbuf_ring(struct io_uring_buf_reg *reg,
			    struct io_buffer_list *bl)
{
	struct io_uring_buf_ring *br;
	struct page **pages;
	int i, nr_pages;

	pages = io_pin_pages(reg->ring_addr,
			     flex_array_size(br, bufs, reg->ring_entries),
			     &nr_pages);
	if (IS_ERR(pages))
		return PTR_ERR(pages);

	br = page_address(pages[0]);
#ifdef SHM_COLOUR
	/*
	 * On platforms that have specific aliasing requirements, SHM_COLOUR
	 * is set and we must guarantee that the kernel and user side align
	 * nicely. We cannot do that if IOU_PBUF_RING_MMAP isn't set and
	 * the application mmap's the provided ring buffer. Fail the request
	 * if we, by chance, don't end up with aligned addresses. The app
	 * should use IOU_PBUF_RING_MMAP instead, and liburing will handle
	 * this transparently.
	 */
	if ((reg->ring_addr | (unsigned long) br) & (SHM_COLOUR - 1)) {
		for (i = 0; i < nr_pages; i++)
			unpin_user_page(pages[i]);
		return -EINVAL;
	}
#endif
	bl->buf_pages = pages;
	bl->buf_nr_pages = nr_pages;
	bl->buf_ring = br;
	bl->is_mapped = 1;
	bl->is_mmap = 0;
	return 0;
}
static int io_alloc_pbuf_ring(struct io_uring_buf_reg *reg,
			      struct io_buffer_list *bl)
{
	gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP;
	size_t ring_size;
	void *ptr;

	ring_size = reg->ring_entries * sizeof(struct io_uring_buf_ring);
	ptr = (void *) __get_free_pages(gfp, get_order(ring_size));
	if (!ptr)
		return -ENOMEM;

	bl->buf_ring = ptr;
	bl->is_mapped = 1;
	bl->is_mmap = 1;
	return 0;
}
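/*
 * Illustrative userspace flow (not part of this file, assuming raw syscalls
 * rather than liburing's buffer-ring helpers):
 *
 *	struct io_uring_buf_reg reg = {
 *		.ring_addr	= (unsigned long) ring_mem, // page aligned
 *		.ring_entries	= 128,			    // power of 2
 *		.bgid		= 7,
 *	};
 *	io_uring_register(ring_fd, IORING_REGISTER_PBUF_RING, &reg, 1);
 *
 * With IOU_PBUF_RING_MMAP set in reg.flags, ring_addr is left at zero and
 * the application instead mmap()s the kernel-allocated ring using the
 * IORING_OFF_PBUF_RING offset convention after registration succeeds.
 */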
int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
{
	struct io_uring_buf_reg reg;
	struct io_buffer_list *bl, *free_bl = NULL;
	int ret;

	if (copy_from_user(&reg, arg, sizeof(reg)))
		return -EFAULT;

	if (reg.resv[0] || reg.resv[1] || reg.resv[2])
		return -EINVAL;
	if (reg.flags & ~IOU_PBUF_RING_MMAP)
		return -EINVAL;
	if (!(reg.flags & IOU_PBUF_RING_MMAP)) {
		if (!reg.ring_addr)
			return -EFAULT;
		if (reg.ring_addr & ~PAGE_MASK)
			return -EINVAL;
	} else {
		if (reg.ring_addr)
			return -EINVAL;
	}

	if (!is_power_of_2(reg.ring_entries))
		return -EINVAL;

	/* cannot disambiguate full vs empty due to head/tail size */
	if (reg.ring_entries >= 65536)
		return -EINVAL;

	if (unlikely(reg.bgid < BGID_ARRAY && !ctx->io_bl)) {
		int ret = io_init_bl_list(ctx);
		if (ret)
			return ret;
	}

	bl = io_buffer_get_list(ctx, reg.bgid);
	if (bl) {
		/* if mapped buffer ring OR classic exists, don't allow */
		if (bl->is_mapped || !list_empty(&bl->buf_list))
			return -EEXIST;
	} else {
		free_bl = bl = kzalloc(sizeof(*bl), GFP_KERNEL);
		if (!bl)
			return -ENOMEM;
	}

	if (!(reg.flags & IOU_PBUF_RING_MMAP))
		ret = io_pin_pbuf_ring(&reg, bl);
	else
		ret = io_alloc_pbuf_ring(&reg, bl);

	if (!ret) {
		bl->nr_entries = reg.ring_entries;
		bl->mask = reg.ring_entries - 1;

		io_buffer_add_list(ctx, bl, reg.bgid);
		return 0;
	}

	kfree(free_bl);
	return ret;
}
int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
{
	struct io_uring_buf_reg reg;
	struct io_buffer_list *bl;

	if (copy_from_user(&reg, arg, sizeof(reg)))
		return -EFAULT;
	if (reg.resv[0] || reg.resv[1] || reg.resv[2])
		return -EINVAL;
	if (reg.flags)
		return -EINVAL;

	bl = io_buffer_get_list(ctx, reg.bgid);
	if (!bl)
		return -ENOENT;
	if (!bl->is_mapped)
		return -EINVAL;

	__io_remove_buffers(ctx, bl, -1U);
	if (bl->bgid >= BGID_ARRAY) {
		xa_erase(&ctx->io_bl_xa, bl->bgid);
		kfree(bl);
	}
	return 0;
}
void *io_pbuf_get_address(struct io_ring_ctx *ctx, unsigned long bgid)
{
	struct io_buffer_list *bl;

	bl = io_buffer_get_list(ctx, bgid);
	if (!bl || !bl->is_mmap)
		return NULL;

	return bl->buf_ring;
}