// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/nospec.h>
#include <linux/hugetlb.h>
#include <linux/compat.h>
#include <linux/io_uring.h>

#include <uapi/linux/io_uring.h>

#include "io_uring.h"
#include "openclose.h"
#include "rsrc.h"
struct io_rsrc_update {
	struct file			*file;
	u64				arg;
	u32				nr_args;
	u32				offset;
};
static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc);
static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc);
static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
				  struct io_mapped_ubuf **pimu,
				  struct page **last_hpage);
#define IORING_MAX_FIXED_FILES	(1U << 20)
#define IORING_MAX_REG_BUFFERS	(1U << 14)
int __io_account_mem(struct user_struct *user, unsigned long nr_pages)
{
	unsigned long page_limit, cur_pages, new_pages;

	if (!nr_pages)
		return 0;

	/* Don't allow more pages than we can safely lock */
	page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;

	cur_pages = atomic_long_read(&user->locked_vm);
	do {
		new_pages = cur_pages + nr_pages;
		if (new_pages > page_limit)
			return -ENOMEM;
	} while (!atomic_long_try_cmpxchg(&user->locked_vm,
					  &cur_pages, new_pages));
	return 0;
}
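/*
 * Worked example (illustrative values, assuming 4 KiB pages): with
 * RLIMIT_MEMLOCK = 8 MiB, page_limit is 2048 pages. If locked_vm is
 * currently 1500 pages and nr_pages is 600, new_pages is 2100 > 2048 and
 * the charge is rejected with -ENOMEM. The try_cmpxchg loop simply retries
 * the addition if another task changed locked_vm in the meantime.
 */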
static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
{
	if (ctx->user)
		__io_unaccount_mem(ctx->user, nr_pages);

	if (ctx->mm_account)
		atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm);
}
static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
{
	int ret;

	if (ctx->user) {
		ret = __io_account_mem(ctx->user, nr_pages);
		if (ret)
			return ret;
	}

	if (ctx->mm_account)
		atomic64_add(nr_pages, &ctx->mm_account->pinned_vm);

	return 0;
}
static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
		       void __user *arg, unsigned index)
{
	struct iovec __user *src;

#ifdef CONFIG_COMPAT
	if (ctx->compat) {
		struct compat_iovec __user *ciovs;
		struct compat_iovec ciov;

		ciovs = (struct compat_iovec __user *) arg;
		if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov)))
			return -EFAULT;

		dst->iov_base = u64_to_user_ptr((u64)ciov.iov_base);
		dst->iov_len = ciov.iov_len;
		return 0;
	}
#endif
	src = (struct iovec __user *) arg;
	if (copy_from_user(dst, &src[index], sizeof(*dst)))
		return -EFAULT;
	return 0;
}
static int io_buffer_validate(struct iovec *iov)
{
	unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1);

	/*
	 * Don't impose further limits on the size and buffer
	 * constraints here, we'll -EINVAL later when IO is
	 * submitted if they are wrong.
	 */
	if (!iov->iov_base)
		return iov->iov_len ? -EFAULT : 0;
	if (!iov->iov_len)
		return -EFAULT;

	/* arbitrary limit, but we need something */
	if (iov->iov_len > SZ_1G)
		return -EFAULT;

	if (check_add_overflow((unsigned long)iov->iov_base, acct_len, &tmp))
		return -EOVERFLOW;

	return 0;
}
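/*
 * Example of the overflow check above (illustrative, 4 KiB pages): for
 * iov_len = 5000, acct_len is 5000 + 4095 = 9095. check_add_overflow()
 * then verifies that iov_base + 9095 does not wrap the address space,
 * i.e. that the page-rounded range io_pin_pages() will later pin is
 * representable.
 */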
static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slot)
{
	struct io_mapped_ubuf *imu = *slot;
	unsigned int i;

	if (imu != ctx->dummy_ubuf) {
		for (i = 0; i < imu->nr_bvecs; i++)
			unpin_user_page(imu->bvec[i].bv_page);
		if (imu->acct_pages)
			io_unaccount_mem(ctx, imu->acct_pages);
		kvfree(imu);
	}
	*slot = NULL;
}
static void io_rsrc_put_work(struct io_rsrc_node *node)
{
	struct io_rsrc_put *prsrc = &node->item;

	if (prsrc->tag)
		io_post_aux_cqe(node->ctx, prsrc->tag, 0, 0);

	switch (node->type) {
	case IORING_RSRC_FILE:
		io_rsrc_file_put(node->ctx, prsrc);
		break;
	case IORING_RSRC_BUFFER:
		io_rsrc_buf_put(node->ctx, prsrc);
		break;
	default:
		WARN_ON_ONCE(1);
		break;
	}
}
void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
{
	if (!io_alloc_cache_put(&ctx->rsrc_node_cache, &node->cache))
		kfree(node);
}
void io_rsrc_node_ref_zero(struct io_rsrc_node *node)
	__must_hold(&node->ctx->uring_lock)
{
	struct io_ring_ctx *ctx = node->ctx;

	while (!list_empty(&ctx->rsrc_ref_list)) {
		node = list_first_entry(&ctx->rsrc_ref_list,
					struct io_rsrc_node, node);
		/* recycle ref nodes in order */
		if (node->refs)
			break;
		list_del(&node->node);

		if (likely(!node->empty))
			io_rsrc_put_work(node);
		io_rsrc_node_destroy(ctx, node);
	}
	if (list_empty(&ctx->rsrc_ref_list) && unlikely(ctx->rsrc_quiesce))
		wake_up_all(&ctx->rsrc_quiesce_wq);
}
struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx)
{
	struct io_rsrc_node *ref_node;
	struct io_cache_entry *entry;

	entry = io_alloc_cache_get(&ctx->rsrc_node_cache);
	if (entry) {
		ref_node = container_of(entry, struct io_rsrc_node, cache);
	} else {
		ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL);
		if (!ref_node)
			return NULL;
	}

	ref_node->ctx = ctx;
	ref_node->empty = 0;
	ref_node->refs = 1;
	return ref_node;
}
__cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data,
				      struct io_ring_ctx *ctx)
{
	struct io_rsrc_node *backup;
	DEFINE_WAIT(we);
	int ret;

	/* As we may drop ->uring_lock, other task may have started quiesce */
	if (data->quiesce)
		return -ENXIO;

	backup = io_rsrc_node_alloc(ctx);
	if (!backup)
		return -ENOMEM;
	ctx->rsrc_node->empty = true;
	ctx->rsrc_node->type = -1;
	list_add_tail(&ctx->rsrc_node->node, &ctx->rsrc_ref_list);
	io_put_rsrc_node(ctx, ctx->rsrc_node);
	ctx->rsrc_node = backup;

	if (list_empty(&ctx->rsrc_ref_list))
		return 0;

	if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
		atomic_set(&ctx->cq_wait_nr, 1);
		smp_mb();
	}

	ctx->rsrc_quiesce++;
	data->quiesce = true;
	do {
		prepare_to_wait(&ctx->rsrc_quiesce_wq, &we, TASK_INTERRUPTIBLE);
		mutex_unlock(&ctx->uring_lock);

		ret = io_run_task_work_sig(ctx);
		if (ret < 0) {
			mutex_lock(&ctx->uring_lock);
			if (list_empty(&ctx->rsrc_ref_list))
				ret = 0;
			break;
		}

		schedule();
		__set_current_state(TASK_RUNNING);
		mutex_lock(&ctx->uring_lock);
		ret = 0;
	} while (!list_empty(&ctx->rsrc_ref_list));

	finish_wait(&ctx->rsrc_quiesce_wq, &we);
	data->quiesce = false;
	ctx->rsrc_quiesce--;

	if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
		atomic_set(&ctx->cq_wait_nr, 0);
		smp_mb();
	}
	return ret;
}
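/*
 * Quiesce in a nutshell: the current rsrc_node is marked empty, parked on
 * rsrc_ref_list and replaced with a freshly allocated backup node. The
 * caller then sleeps on rsrc_quiesce_wq (dropping ->uring_lock around the
 * wait) until io_rsrc_node_ref_zero() has drained every node on the list.
 */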
static void io_free_page_table(void **table, size_t size)
{
	unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);

	for (i = 0; i < nr_tables; i++)
		kfree(table[i]);
	kfree(table);
}
static void io_rsrc_data_free(struct io_rsrc_data *data)
{
	size_t size = data->nr * sizeof(data->tags[0][0]);

	if (data->tags)
		io_free_page_table((void **)data->tags, size);
	kfree(data);
}
static __cold void **io_alloc_page_table(size_t size)
{
	unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
	size_t init_size = size;
	void **table;

	table = kcalloc(nr_tables, sizeof(*table), GFP_KERNEL_ACCOUNT);
	if (!table)
		return NULL;

	for (i = 0; i < nr_tables; i++) {
		unsigned int this_size = min_t(size_t, size, PAGE_SIZE);

		table[i] = kzalloc(this_size, GFP_KERNEL_ACCOUNT);
		if (!table[i]) {
			io_free_page_table(table, init_size);
			return NULL;
		}
		size -= this_size;
	}
	return table;
}
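/*
 * Example (illustrative): for size = 10000 bytes with PAGE_SIZE = 4096,
 * nr_tables is DIV_ROUND_UP(10000, 4096) = 3 and the chunks allocated are
 * 4096, 4096 and 1808 bytes, i.e. the table is split into page-sized
 * pieces with a short tail.
 */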
__cold static int io_rsrc_data_alloc(struct io_ring_ctx *ctx, int type,
				     u64 __user *utags,
				     unsigned nr, struct io_rsrc_data **pdata)
{
	struct io_rsrc_data *data;
	int ret = 0;
	unsigned i;

	data = kzalloc(sizeof(*data), GFP_KERNEL);
	if (!data)
		return -ENOMEM;
	data->tags = (u64 **)io_alloc_page_table(nr * sizeof(data->tags[0][0]));
	if (!data->tags) {
		kfree(data);
		return -ENOMEM;
	}

	data->nr = nr;
	data->ctx = ctx;
	data->rsrc_type = type;
	if (utags) {
		ret = -EFAULT;
		for (i = 0; i < nr; i++) {
			u64 *tag_slot = io_get_tag_slot(data, i);

			if (copy_from_user(tag_slot, &utags[i],
					   sizeof(*tag_slot)))
				goto fail;
		}
	}
	*pdata = data;
	return 0;
fail:
	io_rsrc_data_free(data);
	return ret;
}
static int __io_sqe_files_update(struct io_ring_ctx *ctx,
				 struct io_uring_rsrc_update2 *up,
				 unsigned nr_args)
{
	u64 __user *tags = u64_to_user_ptr(up->tags);
	__s32 __user *fds = u64_to_user_ptr(up->data);
	struct io_rsrc_data *data = ctx->file_data;
	struct io_fixed_file *file_slot;
	int fd, i, err = 0;
	unsigned int done;

	if (!ctx->file_data)
		return -ENXIO;
	if (up->offset + nr_args > ctx->nr_user_files)
		return -EINVAL;

	for (done = 0; done < nr_args; done++) {
		u64 tag = 0;

		if ((tags && copy_from_user(&tag, &tags[done], sizeof(tag))) ||
		    copy_from_user(&fd, &fds[done], sizeof(fd))) {
			err = -EFAULT;
			break;
		}
		if ((fd == IORING_REGISTER_FILES_SKIP || fd == -1) && tag) {
			err = -EINVAL;
			break;
		}
		if (fd == IORING_REGISTER_FILES_SKIP)
			continue;

		i = array_index_nospec(up->offset + done, ctx->nr_user_files);
		file_slot = io_fixed_file_slot(&ctx->file_table, i);

		if (file_slot->file_ptr) {
			err = io_queue_rsrc_removal(data, i,
						    io_slot_file(file_slot));
			if (err)
				break;
			file_slot->file_ptr = 0;
			io_file_bitmap_clear(&ctx->file_table, i);
		}
		if (fd != -1) {
			struct file *file = fget(fd);

			if (!file) {
				err = -EBADF;
				break;
			}
			/*
			 * Don't allow io_uring instances to be registered. If
			 * UNIX isn't enabled, then this causes a reference
			 * cycle and this instance can never get freed. If UNIX
			 * is enabled we'll handle it just fine, but there's
			 * still no point in allowing a ring fd as it doesn't
			 * support regular read/write anyway.
			 */
			if (io_is_uring_fops(file)) {
				fput(file);
				err = -EBADF;
				break;
			}
			err = io_scm_file_account(ctx, file);
			if (err) {
				fput(file);
				break;
			}
			*io_get_tag_slot(data, i) = tag;
			io_fixed_file_set(file_slot, file);
			io_file_bitmap_set(&ctx->file_table, i);
		}
	}
	return done ? done : err;
}
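/*
 * Note on the return convention above: like the other update helpers, a
 * partially applied update reports the number of slots that were updated
 * before the failure ("done"), and only returns the error code if nothing
 * was updated at all.
 */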
static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
				   struct io_uring_rsrc_update2 *up,
				   unsigned int nr_args)
{
	u64 __user *tags = u64_to_user_ptr(up->tags);
	struct iovec iov, __user *iovs = u64_to_user_ptr(up->data);
	struct page *last_hpage = NULL;
	__u32 done;
	int i, err;

	if (!ctx->buf_data)
		return -ENXIO;
	if (up->offset + nr_args > ctx->nr_user_bufs)
		return -EINVAL;

	for (done = 0; done < nr_args; done++) {
		struct io_mapped_ubuf *imu;
		u64 tag = 0;

		err = io_copy_iov(ctx, &iov, iovs, done);
		if (err)
			break;
		if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) {
			err = -EFAULT;
			break;
		}
		err = io_buffer_validate(&iov);
		if (err)
			break;
		if (!iov.iov_base && tag) {
			err = -EINVAL;
			break;
		}
		err = io_sqe_buffer_register(ctx, &iov, &imu, &last_hpage);
		if (err)
			break;

		i = array_index_nospec(up->offset + done, ctx->nr_user_bufs);
		if (ctx->user_bufs[i] != ctx->dummy_ubuf) {
			err = io_queue_rsrc_removal(ctx->buf_data, i,
						    ctx->user_bufs[i]);
			if (unlikely(err)) {
				io_buffer_unmap(ctx, &imu);
				break;
			}
			ctx->user_bufs[i] = ctx->dummy_ubuf;
		}

		ctx->user_bufs[i] = imu;
		*io_get_tag_slot(ctx->buf_data, i) = tag;
	}
	return done ? done : err;
}
static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
				     struct io_uring_rsrc_update2 *up,
				     unsigned nr_args)
{
	__u32 tmp;

	lockdep_assert_held(&ctx->uring_lock);

	if (check_add_overflow(up->offset, nr_args, &tmp))
		return -EOVERFLOW;

	switch (type) {
	case IORING_RSRC_FILE:
		return __io_sqe_files_update(ctx, up, nr_args);
	case IORING_RSRC_BUFFER:
		return __io_sqe_buffers_update(ctx, up, nr_args);
	}
	return -EINVAL;
}
int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg,
			     unsigned nr_args)
{
	struct io_uring_rsrc_update2 up;

	if (!nr_args)
		return -EINVAL;
	memset(&up, 0, sizeof(up));
	if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update)))
		return -EFAULT;
	if (up.resv || up.resv2)
		return -EINVAL;
	return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args);
}
int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
			    unsigned size, unsigned type)
{
	struct io_uring_rsrc_update2 up;

	if (size != sizeof(up))
		return -EINVAL;
	if (copy_from_user(&up, arg, sizeof(up)))
		return -EFAULT;
	if (!up.nr || up.resv || up.resv2)
		return -EINVAL;
	return __io_register_rsrc_update(ctx, type, &up, up.nr);
}
__cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
			    unsigned int size, unsigned int type)
{
	struct io_uring_rsrc_register rr;

	/* keep it extendible */
	if (size != sizeof(rr))
		return -EINVAL;

	memset(&rr, 0, sizeof(rr));
	if (copy_from_user(&rr, arg, size))
		return -EFAULT;
	if (!rr.nr || rr.resv2)
		return -EINVAL;
	if (rr.flags & ~IORING_RSRC_REGISTER_SPARSE)
		return -EINVAL;

	switch (type) {
	case IORING_RSRC_FILE:
		if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data)
			break;
		return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data),
					     rr.nr, u64_to_user_ptr(rr.tags));
	case IORING_RSRC_BUFFER:
		if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data)
			break;
		return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data),
					       rr.nr, u64_to_user_ptr(rr.tags));
	}
	return -EINVAL;
}
int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);

	if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
		return -EINVAL;
	if (sqe->rw_flags || sqe->splice_fd_in)
		return -EINVAL;

	up->offset = READ_ONCE(sqe->off);
	up->nr_args = READ_ONCE(sqe->len);
	if (!up->nr_args)
		return -EINVAL;
	up->arg = READ_ONCE(sqe->addr);
	return 0;
}
static int io_files_update_with_index_alloc(struct io_kiocb *req,
					    unsigned int issue_flags)
{
	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
	__s32 __user *fds = u64_to_user_ptr(up->arg);
	unsigned int done;
	struct file *file;
	int ret, fd;

	if (!req->ctx->file_data)
		return -ENXIO;

	for (done = 0; done < up->nr_args; done++) {
		if (copy_from_user(&fd, &fds[done], sizeof(fd))) {
			ret = -EFAULT;
			break;
		}

		file = fget(fd);
		if (!file) {
			ret = -EBADF;
			break;
		}
		ret = io_fixed_fd_install(req, issue_flags, file,
					  IORING_FILE_INDEX_ALLOC);
		if (ret < 0)
			break;
		if (copy_to_user(&fds[done], &ret, sizeof(ret))) {
			__io_close_fixed(req->ctx, issue_flags, ret);
			ret = -EFAULT;
			break;
		}
	}

	if (done)
		return done;
	return ret;
}
int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
	struct io_ring_ctx *ctx = req->ctx;
	struct io_uring_rsrc_update2 up2;
	int ret;

	up2.offset = up->offset;
	up2.data = up->arg;
	up2.nr = 0;
	up2.tags = 0;
	up2.resv = 0;
	up2.resv2 = 0;

	if (up->offset == IORING_FILE_INDEX_ALLOC) {
		ret = io_files_update_with_index_alloc(req, issue_flags);
	} else {
		io_ring_submit_lock(ctx, issue_flags);
		ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE,
						&up2, up->nr_args);
		io_ring_submit_unlock(ctx, issue_flags);
	}

	if (ret < 0)
		req_set_fail(req);
	io_req_set_res(req, ret, 0);
	return IOU_OK;
}
int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, void *rsrc)
{
	struct io_ring_ctx *ctx = data->ctx;
	struct io_rsrc_node *node = ctx->rsrc_node;
	u64 *tag_slot = io_get_tag_slot(data, idx);

	ctx->rsrc_node = io_rsrc_node_alloc(ctx);
	if (unlikely(!ctx->rsrc_node)) {
		ctx->rsrc_node = node;
		return -ENOMEM;
	}

	node->item.rsrc = rsrc;
	node->type = data->rsrc_type;
	node->item.tag = *tag_slot;
	*tag_slot = 0;
	list_add_tail(&node->node, &ctx->rsrc_ref_list);
	io_put_rsrc_node(ctx, node);
	return 0;
}
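/*
 * In short: the resource being dropped rides out on the current rsrc_node
 * (together with its user tag), the node is parked on rsrc_ref_list, and a
 * fresh node becomes ctx->rsrc_node for subsequent requests. The parked
 * node's CQE (if tagged) and the actual put happen later, from
 * io_rsrc_node_ref_zero() -> io_rsrc_put_work().
 */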
void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
{
	int i;

	for (i = 0; i < ctx->nr_user_files; i++) {
		struct file *file = io_file_from_index(&ctx->file_table, i);

		/* skip scm accounted files, they'll be freed by ->ring_sock */
		if (!file || io_file_need_scm(file))
			continue;

		io_file_bitmap_clear(&ctx->file_table, i);
		fput(file);
	}

#if defined(CONFIG_UNIX)
	if (ctx->ring_sock) {
		struct sock *sock = ctx->ring_sock->sk;
		struct sk_buff *skb;

		while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL)
			kfree_skb(skb);
	}
#endif
	io_free_file_tables(&ctx->file_table);
	io_file_table_set_alloc_range(ctx, 0, 0);
	io_rsrc_data_free(ctx->file_data);
	ctx->file_data = NULL;
	ctx->nr_user_files = 0;
}
int io_sqe_files_unregister(struct io_ring_ctx *ctx)
{
	unsigned nr = ctx->nr_user_files;
	int ret;

	if (!ctx->file_data)
		return -ENXIO;

	/*
	 * Quiesce may unlock ->uring_lock, and while it's not held
	 * prevent new requests using the table.
	 */
	ctx->nr_user_files = 0;
	ret = io_rsrc_ref_quiesce(ctx->file_data, ctx);
	ctx->nr_user_files = nr;
	if (!ret)
		__io_sqe_files_unregister(ctx);
	return ret;
}
/*
 * Ensure the UNIX gc is aware of our file set, so we are certain that
 * the io_uring can be safely unregistered on process exit, even if we have
 * loops in the file referencing. We account only files that can hold other
 * files because otherwise they can't form a loop and so are not interesting
 * for GC.
 */
int __io_scm_file_account(struct io_ring_ctx *ctx, struct file *file)
{
#if defined(CONFIG_UNIX)
	struct sock *sk = ctx->ring_sock->sk;
	struct sk_buff_head *head = &sk->sk_receive_queue;
	struct scm_fp_list *fpl;
	struct sk_buff *skb;

	if (likely(!io_file_need_scm(file)))
		return 0;

	/*
	 * See if we can merge this file into an existing skb SCM_RIGHTS
	 * file set. If there's no room, fall back to allocating a new skb
	 * and filling it in.
	 */
	spin_lock_irq(&head->lock);
	skb = skb_peek(head);
	if (skb && UNIXCB(skb).fp->count < SCM_MAX_FD)
		__skb_unlink(skb, head);
	else
		skb = NULL;
	spin_unlock_irq(&head->lock);

	if (!skb) {
		fpl = kzalloc(sizeof(*fpl), GFP_KERNEL);
		if (!fpl)
			return -ENOMEM;

		skb = alloc_skb(0, GFP_KERNEL);
		if (!skb) {
			kfree(fpl);
			return -ENOMEM;
		}

		fpl->user = get_uid(current_user());
		fpl->max = SCM_MAX_FD;
		fpl->count = 0;

		UNIXCB(skb).fp = fpl;
		skb->sk = sk;
		skb->destructor = io_uring_destruct_scm;
		refcount_add(skb->truesize, &sk->sk_wmem_alloc);
	}

	fpl = UNIXCB(skb).fp;
	fpl->fp[fpl->count++] = get_file(file);
	unix_inflight(fpl->user, file);
	skb_queue_head(head, skb);
	fput(file);
#endif
	return 0;
}
static __cold void io_rsrc_file_scm_put(struct io_ring_ctx *ctx, struct file *file)
{
#if defined(CONFIG_UNIX)
	struct sock *sock = ctx->ring_sock->sk;
	struct sk_buff_head list, *head = &sock->sk_receive_queue;
	struct sk_buff *skb;
	int i;

	__skb_queue_head_init(&list);

	/*
	 * Find the skb that holds this file in its SCM_RIGHTS. When found,
	 * remove this entry and rearrange the file array.
	 */
	skb = skb_dequeue(head);
	while (skb) {
		struct scm_fp_list *fp;

		fp = UNIXCB(skb).fp;
		for (i = 0; i < fp->count; i++) {
			int left;

			if (fp->fp[i] != file)
				continue;

			unix_notinflight(fp->user, fp->fp[i]);
			left = fp->count - 1 - i;
			if (left) {
				memmove(&fp->fp[i], &fp->fp[i + 1],
						left * sizeof(struct file *));
			}
			fp->count--;
			if (!fp->count) {
				kfree_skb(skb);
				skb = NULL;
			} else {
				__skb_queue_tail(&list, skb);
			}
			fput(file);
			file = NULL;
			break;
		}

		if (!file)
			break;

		__skb_queue_tail(&list, skb);

		skb = skb_dequeue(head);
	}

	if (skb_peek(&list)) {
		spin_lock_irq(&head->lock);
		while ((skb = __skb_dequeue(&list)) != NULL)
			__skb_queue_tail(head, skb);
		spin_unlock_irq(&head->lock);
	}
#endif
}
static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
{
	struct file *file = prsrc->file;

	if (likely(!io_file_need_scm(file)))
		fput(file);
	else
		io_rsrc_file_scm_put(ctx, file);
}
int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
			  unsigned nr_args, u64 __user *tags)
{
	__s32 __user *fds = (__s32 __user *) arg;
	struct file *file;
	int fd, ret;
	unsigned i;

	if (ctx->file_data)
		return -EBUSY;
	if (!nr_args)
		return -EINVAL;
	if (nr_args > IORING_MAX_FIXED_FILES)
		return -EMFILE;
	if (nr_args > rlimit(RLIMIT_NOFILE))
		return -EMFILE;
	ret = io_rsrc_data_alloc(ctx, IORING_RSRC_FILE, tags, nr_args,
				 &ctx->file_data);
	if (ret)
		return ret;

	if (!io_alloc_file_tables(&ctx->file_table, nr_args)) {
		io_rsrc_data_free(ctx->file_data);
		ctx->file_data = NULL;
		return -ENOMEM;
	}

	for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
		struct io_fixed_file *file_slot;

		if (fds && copy_from_user(&fd, &fds[i], sizeof(fd))) {
			ret = -EFAULT;
			goto fail;
		}
		/* allow sparse sets */
		if (!fds || fd == -1) {
			ret = -EINVAL;
			if (unlikely(*io_get_tag_slot(ctx->file_data, i)))
				goto fail;
			continue;
		}

		file = fget(fd);
		ret = -EBADF;
		if (unlikely(!file))
			goto fail;

		/*
		 * Don't allow io_uring instances to be registered. If UNIX
		 * isn't enabled, then this causes a reference cycle and this
		 * instance can never get freed. If UNIX is enabled we'll
		 * handle it just fine, but there's still no point in allowing
		 * a ring fd as it doesn't support regular read/write anyway.
		 */
		if (io_is_uring_fops(file)) {
			fput(file);
			goto fail;
		}
		ret = io_scm_file_account(ctx, file);
		if (ret) {
			fput(file);
			goto fail;
		}
		file_slot = io_fixed_file_slot(&ctx->file_table, i);
		io_fixed_file_set(file_slot, file);
		io_file_bitmap_set(&ctx->file_table, i);
	}

	/* default it to the whole table */
	io_file_table_set_alloc_range(ctx, 0, ctx->nr_user_files);
	return 0;
fail:
	__io_sqe_files_unregister(ctx);
	return ret;
}
static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
{
	io_buffer_unmap(ctx, &prsrc->buf);
}
void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
{
	unsigned int i;

	for (i = 0; i < ctx->nr_user_bufs; i++)
		io_buffer_unmap(ctx, &ctx->user_bufs[i]);
	kfree(ctx->user_bufs);
	io_rsrc_data_free(ctx->buf_data);
	ctx->user_bufs = NULL;
	ctx->buf_data = NULL;
	ctx->nr_user_bufs = 0;
}
int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
{
	unsigned nr = ctx->nr_user_bufs;
	int ret;

	if (!ctx->buf_data)
		return -ENXIO;

	/*
	 * Quiesce may unlock ->uring_lock, and while it's not held
	 * prevent new requests using the table.
	 */
	ctx->nr_user_bufs = 0;
	ret = io_rsrc_ref_quiesce(ctx->buf_data, ctx);
	ctx->nr_user_bufs = nr;
	if (!ret)
		__io_sqe_buffers_unregister(ctx);
	return ret;
}
/*
 * Not super efficient, but this is just done at registration time. And we
 * do cache the last compound head, so generally we'll only do a full search
 * if we don't match that one.
 *
 * We check if the given compound head page has already been accounted, to
 * avoid double accounting it. This allows us to account the full size of the
 * page, not just the constituent pages of a huge page.
 */
static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages,
				  int nr_pages, struct page *hpage)
{
	int i, j;

	/* check current page array */
	for (i = 0; i < nr_pages; i++) {
		if (!PageCompound(pages[i]))
			continue;
		if (compound_head(pages[i]) == hpage)
			return true;
	}

	/* check previously registered pages */
	for (i = 0; i < ctx->nr_user_bufs; i++) {
		struct io_mapped_ubuf *imu = ctx->user_bufs[i];

		for (j = 0; j < imu->nr_bvecs; j++) {
			if (!PageCompound(imu->bvec[j].bv_page))
				continue;
			if (compound_head(imu->bvec[j].bv_page) == hpage)
				return true;
		}
	}

	return false;
}
static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
				 int nr_pages, struct io_mapped_ubuf *imu,
				 struct page **last_hpage)
{
	int i, ret;

	imu->acct_pages = 0;
	for (i = 0; i < nr_pages; i++) {
		if (!PageCompound(pages[i])) {
			imu->acct_pages++;
		} else {
			struct page *hpage;

			hpage = compound_head(pages[i]);
			if (hpage == *last_hpage)
				continue;
			*last_hpage = hpage;
			if (headpage_already_acct(ctx, pages, i, hpage))
				continue;
			imu->acct_pages += page_size(hpage) >> PAGE_SHIFT;
		}
	}

	if (!imu->acct_pages)
		return 0;

	ret = io_account_mem(ctx, imu->acct_pages);
	if (ret)
		imu->acct_pages = 0;
	return ret;
}
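/*
 * Worked example (illustrative, assuming 4 KiB base pages): a buffer
 * backed by one 2 MiB compound page charges page_size(hpage) >> PAGE_SHIFT
 * = 512 pages once, even if only part of the huge page is covered by the
 * pinned range, while last_hpage/headpage_already_acct() prevent the same
 * head page from being charged twice.
 */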
struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages)
{
	unsigned long start, end, nr_pages;
	struct page **pages = NULL;
	int pret, ret = -ENOMEM;

	end = (ubuf + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
	start = ubuf >> PAGE_SHIFT;
	nr_pages = end - start;

	pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL);
	if (!pages)
		goto done;

	ret = 0;
	mmap_read_lock(current->mm);
	pret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
			      pages);
	if (pret == nr_pages)
		*npages = nr_pages;
	else
		ret = pret < 0 ? pret : -EFAULT;

	mmap_read_unlock(current->mm);
	if (ret) {
		/* if we did partial map, release any pages we did get */
		if (pret > 0)
			unpin_user_pages(pages, pret);
		goto done;
	}
	ret = 0;
done:
	if (ret < 0) {
		kvfree(pages);
		pages = ERR_PTR(ret);
	}
	return pages;
}
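/*
 * Worked example (illustrative, 4 KiB pages): for ubuf = 0x10000800 and
 * len = 0x3000, start = 0x10000 and end = 0x10004, so nr_pages = 4 - a
 * 12 KiB buffer that starts mid-page straddles four pages.
 */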
static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
				  struct io_mapped_ubuf **pimu,
				  struct page **last_hpage)
{
	struct io_mapped_ubuf *imu = NULL;
	struct page **pages = NULL;
	unsigned long off;
	size_t size;
	int ret, nr_pages, i;
	struct folio *folio = NULL;

	*pimu = ctx->dummy_ubuf;
	if (!iov->iov_base)
		return 0;

	ret = -ENOMEM;
	pages = io_pin_pages((unsigned long) iov->iov_base, iov->iov_len,
				&nr_pages);
	if (IS_ERR(pages)) {
		ret = PTR_ERR(pages);
		pages = NULL;
		goto done;
	}

	/* If it's a huge page, try to coalesce them into a single bvec entry */
	if (nr_pages > 1) {
		folio = page_folio(pages[0]);
		for (i = 1; i < nr_pages; i++) {
			/*
			 * Pages must be consecutive and on the same folio for
			 * this to work
			 */
			if (page_folio(pages[i]) != folio ||
			    pages[i] != pages[i - 1] + 1) {
				folio = NULL;
				break;
			}
		}
		if (folio) {
			/*
			 * The pages are bound to the folio, it doesn't
			 * actually unpin them but drops all but one reference,
			 * which is usually put down by io_buffer_unmap().
			 * Note, needs a better helper.
			 */
			unpin_user_pages(&pages[1], nr_pages - 1);
			nr_pages = 1;
		}
	}

	imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);
	if (!imu)
		goto done;

	ret = io_buffer_account_pin(ctx, pages, nr_pages, imu, last_hpage);
	if (ret) {
		unpin_user_pages(pages, nr_pages);
		goto done;
	}

	off = (unsigned long) iov->iov_base & ~PAGE_MASK;
	size = iov->iov_len;
	/* store original address for later verification */
	imu->ubuf = (unsigned long) iov->iov_base;
	imu->ubuf_end = imu->ubuf + iov->iov_len;
	imu->nr_bvecs = nr_pages;
	*pimu = imu;
	ret = 0;

	if (folio) {
		bvec_set_page(&imu->bvec[0], pages[0], size, off);
		goto done;
	}
	for (i = 0; i < nr_pages; i++) {
		size_t vec_len;

		vec_len = min_t(size_t, size, PAGE_SIZE - off);
		bvec_set_page(&imu->bvec[i], pages[i], vec_len, off);
		off = 0;
		size -= vec_len;
	}
done:
	if (ret)
		kvfree(imu);
	if (pages)
		kvfree(pages);
	return ret;
}
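/*
 * Example of the folio coalescing above (illustrative): a 2 MiB buffer
 * backed by a single huge-page folio is pinned as 512 pages, the extra 511
 * page references are dropped, and the whole mapping is described by one
 * bvec entry instead of 512.
 */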
static int io_buffers_map_alloc(struct io_ring_ctx *ctx, unsigned int nr_args)
{
	ctx->user_bufs = kcalloc(nr_args, sizeof(*ctx->user_bufs), GFP_KERNEL);
	return ctx->user_bufs ? 0 : -ENOMEM;
}
int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
			    unsigned int nr_args, u64 __user *tags)
{
	struct page *last_hpage = NULL;
	struct io_rsrc_data *data;
	int i, ret;
	struct iovec iov;

	BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16));

	if (ctx->user_bufs)
		return -EBUSY;
	if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS)
		return -EINVAL;
	ret = io_rsrc_data_alloc(ctx, IORING_RSRC_BUFFER, tags, nr_args, &data);
	if (ret)
		return ret;
	ret = io_buffers_map_alloc(ctx, nr_args);
	if (ret) {
		io_rsrc_data_free(data);
		return ret;
	}

	for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) {
		if (arg) {
			ret = io_copy_iov(ctx, &iov, arg, i);
			if (ret)
				break;
			ret = io_buffer_validate(&iov);
			if (ret)
				break;
		} else {
			memset(&iov, 0, sizeof(iov));
		}

		if (!iov.iov_base && *io_get_tag_slot(data, i)) {
			ret = -EINVAL;
			break;
		}

		ret = io_sqe_buffer_register(ctx, &iov, &ctx->user_bufs[i],
					     &last_hpage);
		if (ret)
			break;
	}

	WARN_ON_ONCE(ctx->buf_data);

	ctx->buf_data = data;
	if (ret)
		__io_sqe_buffers_unregister(ctx);
	return ret;
}
int io_import_fixed(int ddir, struct iov_iter *iter,
		    struct io_mapped_ubuf *imu,
		    u64 buf_addr, size_t len)
{
	u64 buf_end;
	size_t offset;

	if (WARN_ON_ONCE(!imu))
		return -EFAULT;
	if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end)))
		return -EFAULT;
	/* not inside the mapped region */
	if (unlikely(buf_addr < imu->ubuf || buf_end > imu->ubuf_end))
		return -EFAULT;

	/*
	 * Might not be a start of buffer, set size appropriately
	 * and advance us to the beginning.
	 */
	offset = buf_addr - imu->ubuf;
	iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, offset + len);

	if (offset) {
		/*
		 * Don't use iov_iter_advance() here, as it's really slow for
		 * using the latter parts of a big fixed buffer - it iterates
		 * over each segment manually. We can cheat a bit here, because
		 * we know that:
		 *
		 * 1) it's a BVEC iter, we set it up
		 * 2) all bvecs are PAGE_SIZE in size, except potentially the
		 *    first and last bvec
		 *
		 * So just find our index, and adjust the iterator afterwards.
		 * If the offset is within the first bvec (or the whole first
		 * bvec), just use iov_iter_advance(). This makes it easier
		 * since we can just skip the first segment, which may not
		 * be PAGE_SIZE aligned.
		 */
		const struct bio_vec *bvec = imu->bvec;

		if (offset <= bvec->bv_len) {
			/*
			 * Note, huge page buffers consist of one large
			 * bvec entry and should always go this way. The other
			 * branch doesn't expect non PAGE_SIZE'd chunks.
			 */
			iter->bvec = bvec;
			iter->nr_segs = bvec->bv_len;
			iter->count -= offset;
			iter->iov_offset = offset;
		} else {
			unsigned long seg_skip;

			/* skip first vec */
			offset -= bvec->bv_len;
			seg_skip = 1 + (offset >> PAGE_SHIFT);

			iter->bvec = bvec + seg_skip;
			iter->nr_segs -= seg_skip;
			iter->count -= bvec->bv_len + offset;
			iter->iov_offset = offset & ~PAGE_MASK;
		}
	}

	return 0;
}
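/*
 * Worked example of the fast-forward above (illustrative, 4 KiB pages):
 * with a registered buffer whose first bvec holds 0x800 bytes (an
 * unaligned start) and an import offset of 0x2800, the first branch is
 * skipped: offset becomes 0x2000, seg_skip = 1 + (0x2000 >> 12) = 3, so
 * the iterator starts at bvec[3] with iov_offset = 0.
 */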
;