// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/nospec.h>
#include <linux/hugetlb.h>
#include <linux/compat.h>
#include <linux/io_uring.h>

#include <uapi/linux/io_uring.h>

#include "io_uring.h"
#include "openclose.h"
#include "rsrc.h"
#include "notif.h"

struct io_rsrc_update {
	struct file			*file;
	u64				arg;
	u32				nr_args;
	u32				offset;
	int				type;
};

static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
				  struct io_mapped_ubuf **pimu,
				  struct page **last_hpage);

#define IO_RSRC_REF_BATCH	100

#define IORING_MAX_FIXED_FILES	(1U << 20)
#define IORING_MAX_REG_BUFFERS	(1U << 14)

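/*
 * Resource node references are taken in batches: io_rsrc_refs_refill() grabs
 * IO_RSRC_REF_BATCH percpu refs at once and caches the count in
 * ->rsrc_cached_refs, so refs can be handed out under ->uring_lock without
 * touching the percpu counter each time; io_rsrc_refs_drop() returns whatever
 * is left in the cache before the node is switched out.
 */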
void io_rsrc_refs_drop(struct io_ring_ctx *ctx)
	__must_hold(&ctx->uring_lock)
{
	if (ctx->rsrc_cached_refs) {
		io_rsrc_put_node(ctx->rsrc_node, ctx->rsrc_cached_refs);
		ctx->rsrc_cached_refs = 0;
	}
}

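/*
 * Registered buffer pages are charged to the user's RLIMIT_MEMLOCK. As an
 * example, with an 8 MiB RLIMIT_MEMLOCK and 4K pages, page_limit below is
 * 2048 pages, and a registration that would push ->locked_vm past that fails
 * with -ENOMEM. The try_cmpxchg loop lets concurrent accounters race on
 * ->locked_vm without taking a lock.
 */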
int __io_account_mem(struct user_struct *user, unsigned long nr_pages)
{
	unsigned long page_limit, cur_pages, new_pages;

	if (!nr_pages)
		return 0;

	/* Don't allow more pages than we can safely lock */
	page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;

	cur_pages = atomic_long_read(&user->locked_vm);
	do {
		new_pages = cur_pages + nr_pages;
		if (new_pages > page_limit)
			return -ENOMEM;
	} while (!atomic_long_try_cmpxchg(&user->locked_vm,
					  &cur_pages, new_pages));
	return 0;
}

static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
{
	if (ctx->user)
		__io_unaccount_mem(ctx->user, nr_pages);

	if (ctx->mm_account)
		atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm);
}

static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
{
	int ret;

	if (ctx->user) {
		ret = __io_account_mem(ctx->user, nr_pages);
		if (ret)
			return ret;
	}

	if (ctx->mm_account)
		atomic64_add(nr_pages, &ctx->mm_account->pinned_vm);

	return 0;
}

static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
		       void __user *arg, unsigned index)
{
	struct iovec __user *src;

#ifdef CONFIG_COMPAT
	if (ctx->compat) {
		struct compat_iovec __user *ciovs;
		struct compat_iovec ciov;

		ciovs = (struct compat_iovec __user *) arg;
		if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov)))
			return -EFAULT;

		dst->iov_base = u64_to_user_ptr((u64)ciov.iov_base);
		dst->iov_len = ciov.iov_len;
		return 0;
	}
#endif
	src = (struct iovec __user *) arg;
	if (copy_from_user(dst, &src[index], sizeof(*dst)))
		return -EFAULT;
	return 0;
}

static int io_buffer_validate(struct iovec *iov)
{
	unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1);

	/*
	 * Don't impose further limits on the size and buffer
	 * constraints here, we'll -EINVAL later when IO is
	 * submitted if they are wrong.
	 */
	if (!iov->iov_base)
		return iov->iov_len ? -EFAULT : 0;
	if (!iov->iov_len)
		return -EFAULT;

	/* arbitrary limit, but we need something */
	if (iov->iov_len > SZ_1G)
		return -EFAULT;

	if (check_add_overflow((unsigned long)iov->iov_base, acct_len, &tmp))
		return -EOVERFLOW;

	return 0;
}

static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slot)
{
	struct io_mapped_ubuf *imu = *slot;
	unsigned int i;

	if (imu != ctx->dummy_ubuf) {
		for (i = 0; i < imu->nr_bvecs; i++)
			unpin_user_page(imu->bvec[i].bv_page);
		if (imu->acct_pages)
			io_unaccount_mem(ctx, imu->acct_pages);
		kvfree(imu);
	}
	*slot = NULL;
}

void io_rsrc_refs_refill(struct io_ring_ctx *ctx)
	__must_hold(&ctx->uring_lock)
{
	ctx->rsrc_cached_refs += IO_RSRC_REF_BATCH;
	percpu_ref_get_many(&ctx->rsrc_node->refs, IO_RSRC_REF_BATCH);
}

static void __io_rsrc_put_work(struct io_rsrc_node *ref_node)
{
	struct io_rsrc_data *rsrc_data = ref_node->rsrc_data;
	struct io_ring_ctx *ctx = rsrc_data->ctx;
	struct io_rsrc_put *prsrc, *tmp;

	list_for_each_entry_safe(prsrc, tmp, &ref_node->rsrc_list, list) {
		list_del(&prsrc->list);

		if (prsrc->tag) {
			if (ctx->flags & IORING_SETUP_IOPOLL) {
				mutex_lock(&ctx->uring_lock);
				io_post_aux_cqe(ctx, prsrc->tag, 0, 0, true);
				mutex_unlock(&ctx->uring_lock);
			} else {
				io_post_aux_cqe(ctx, prsrc->tag, 0, 0, true);
			}
		}

		rsrc_data->do_put(ctx, prsrc);
		kfree(prsrc);
	}

	io_rsrc_node_destroy(ref_node);
	if (atomic_dec_and_test(&rsrc_data->refs))
		complete(&rsrc_data->done);
}

void io_rsrc_put_work(struct work_struct *work)
{
	struct io_ring_ctx *ctx;
	struct llist_node *node;

	ctx = container_of(work, struct io_ring_ctx, rsrc_put_work.work);
	node = llist_del_all(&ctx->rsrc_put_llist);

	while (node) {
		struct io_rsrc_node *ref_node;
		struct llist_node *next = node->next;

		ref_node = llist_entry(node, struct io_rsrc_node, llist);
		__io_rsrc_put_work(ref_node);
		node = next;
	}
}

void io_wait_rsrc_data(struct io_rsrc_data *data)
{
	if (data && !atomic_dec_and_test(&data->refs))
		wait_for_completion(&data->done);
}

void io_rsrc_node_destroy(struct io_rsrc_node *ref_node)
{
	percpu_ref_exit(&ref_node->refs);
	kfree(ref_node);
}

static __cold void io_rsrc_node_ref_zero(struct percpu_ref *ref)
{
	struct io_rsrc_node *node = container_of(ref, struct io_rsrc_node, refs);
	struct io_ring_ctx *ctx = node->rsrc_data->ctx;
	unsigned long flags;
	bool first_add = false;
	unsigned long delay = HZ;

	spin_lock_irqsave(&ctx->rsrc_ref_lock, flags);
	node->done = true;

	/* if we are mid-quiesce then do not delay */
	if (node->rsrc_data->quiesce)
		delay = 0;

	while (!list_empty(&ctx->rsrc_ref_list)) {
		node = list_first_entry(&ctx->rsrc_ref_list,
					struct io_rsrc_node, node);
		/* recycle ref nodes in order */
		if (!node->done)
			break;
		list_del(&node->node);
		first_add |= llist_add(&node->llist, &ctx->rsrc_put_llist);
	}
	spin_unlock_irqrestore(&ctx->rsrc_ref_lock, flags);

	if (first_add)
		mod_delayed_work(system_wq, &ctx->rsrc_put_work, delay);
}

static struct io_rsrc_node *io_rsrc_node_alloc(void)
{
	struct io_rsrc_node *ref_node;

	ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL);
	if (!ref_node)
		return NULL;

	if (percpu_ref_init(&ref_node->refs, io_rsrc_node_ref_zero,
			    0, GFP_KERNEL)) {
		kfree(ref_node);
		return NULL;
	}
	INIT_LIST_HEAD(&ref_node->node);
	INIT_LIST_HEAD(&ref_node->rsrc_list);
	ref_node->done = false;
	return ref_node;
}

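/*
 * Switch in the pre-allocated backup node as the current rsrc node. If
 * @data_to_kill is given, the old node is queued on ->rsrc_ref_list and its
 * percpu ref is killed, so io_rsrc_node_ref_zero() will eventually move it
 * to the put workqueue once every request that took a ref has dropped it.
 */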
void io_rsrc_node_switch(struct io_ring_ctx *ctx,
			 struct io_rsrc_data *data_to_kill)
	__must_hold(&ctx->uring_lock)
{
	WARN_ON_ONCE(!ctx->rsrc_backup_node);
	WARN_ON_ONCE(data_to_kill && !ctx->rsrc_node);

	io_rsrc_refs_drop(ctx);
	if (data_to_kill) {
		struct io_rsrc_node *rsrc_node = ctx->rsrc_node;

		rsrc_node->rsrc_data = data_to_kill;
		spin_lock_irq(&ctx->rsrc_ref_lock);
		list_add_tail(&rsrc_node->node, &ctx->rsrc_ref_list);
		spin_unlock_irq(&ctx->rsrc_ref_lock);

		atomic_inc(&data_to_kill->refs);
		percpu_ref_kill(&rsrc_node->refs);
		ctx->rsrc_node = NULL;
	}

	if (!ctx->rsrc_node) {
		ctx->rsrc_node = ctx->rsrc_backup_node;
		ctx->rsrc_backup_node = NULL;
	}
}

int io_rsrc_node_switch_start(struct io_ring_ctx *ctx)
{
	if (ctx->rsrc_backup_node)
		return 0;
	ctx->rsrc_backup_node = io_rsrc_node_alloc();
	return ctx->rsrc_backup_node ? 0 : -ENOMEM;
}

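/*
 * Quiesce waits for all rsrc nodes that reference @data to go away: switch
 * out the current node, drop the initial data ref, then sleep on ->done while
 * flushing the put work and running task_work, retrying if the data was
 * revived while ->uring_lock was dropped.
 */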
__cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data,
				      struct io_ring_ctx *ctx)
{
	int ret;

	/* As we may drop ->uring_lock, other task may have started quiesce */
	if (data->quiesce)
		return -ENXIO;

	data->quiesce = true;
	do {
		ret = io_rsrc_node_switch_start(ctx);
		if (ret)
			break;
		io_rsrc_node_switch(ctx, data);

		/* kill initial ref, already quiesced if zero */
		if (atomic_dec_and_test(&data->refs))
			break;
		mutex_unlock(&ctx->uring_lock);
		flush_delayed_work(&ctx->rsrc_put_work);
		ret = wait_for_completion_interruptible(&data->done);
		if (!ret) {
			mutex_lock(&ctx->uring_lock);
			if (atomic_read(&data->refs) > 0) {
				/*
				 * it has been revived by another thread while
				 * we were unlocked
				 */
				mutex_unlock(&ctx->uring_lock);
			} else {
				break;
			}
		}

		atomic_inc(&data->refs);
		/* wait for all works potentially completing data->done */
		flush_delayed_work(&ctx->rsrc_put_work);
		reinit_completion(&data->done);

		ret = io_run_task_work_sig();
		mutex_lock(&ctx->uring_lock);
	} while (ret >= 0);
	data->quiesce = false;

	return ret;
}

static void io_free_page_table(void **table, size_t size)
{
	unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);

	for (i = 0; i < nr_tables; i++)
		kfree(table[i]);
	kfree(table);
}

static void io_rsrc_data_free(struct io_rsrc_data *data)
{
	size_t size = data->nr * sizeof(data->tags[0][0]);

	if (data->tags)
		io_free_page_table((void **)data->tags, size);
	kfree(data);
}

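/*
 * Tag tables are built as an array of PAGE_SIZE chunks rather than one large
 * allocation. With 4K pages each chunk holds 512 u64 tags; registering e.g.
 * 10000 files needs 80000 bytes of tag space, split across
 * DIV_ROUND_UP(80000, 4096) == 20 chunks.
 */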
static __cold void **io_alloc_page_table(size_t size)
{
	unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
	size_t init_size = size;
	void **table;

	table = kcalloc(nr_tables, sizeof(*table), GFP_KERNEL_ACCOUNT);
	if (!table)
		return NULL;

	for (i = 0; i < nr_tables; i++) {
		unsigned int this_size = min_t(size_t, size, PAGE_SIZE);

		table[i] = kzalloc(this_size, GFP_KERNEL_ACCOUNT);
		if (!table[i]) {
			io_free_page_table(table, init_size);
			return NULL;
		}
		size -= this_size;
	}
	return table;
}

__cold static int io_rsrc_data_alloc(struct io_ring_ctx *ctx,
				     rsrc_put_fn *do_put, u64 __user *utags,
				     unsigned nr, struct io_rsrc_data **pdata)
{
	struct io_rsrc_data *data;
	int ret = -ENOMEM;
	unsigned i;

	data = kzalloc(sizeof(*data), GFP_KERNEL);
	if (!data)
		return -ENOMEM;
	data->tags = (u64 **)io_alloc_page_table(nr * sizeof(data->tags[0][0]));
	if (!data->tags) {
		kfree(data);
		return -ENOMEM;
	}

	data->nr = nr;
	data->ctx = ctx;
	data->do_put = do_put;
	if (utags) {
		ret = -EFAULT;
		for (i = 0; i < nr; i++) {
			u64 *tag_slot = io_get_tag_slot(data, i);

			if (copy_from_user(tag_slot, &utags[i],
					   sizeof(*tag_slot)))
				goto fail;
		}
	}

	atomic_set(&data->refs, 1);
	init_completion(&data->done);
	*pdata = data;
	return 0;
fail:
	io_rsrc_data_free(data);
	return ret;
}

static int __io_sqe_files_update(struct io_ring_ctx *ctx,
				 struct io_uring_rsrc_update2 *up,
				 unsigned nr_args)
{
	u64 __user *tags = u64_to_user_ptr(up->tags);
	__s32 __user *fds = u64_to_user_ptr(up->data);
	struct io_rsrc_data *data = ctx->file_data;
	struct io_fixed_file *file_slot;
	struct file *file;
	int fd, i, err = 0;
	unsigned int done;
	bool needs_switch = false;

	if (!ctx->file_data)
		return -ENXIO;
	if (up->offset + nr_args > ctx->nr_user_files)
		return -EINVAL;

	for (done = 0; done < nr_args; done++) {
		u64 tag = 0;

		if ((tags && copy_from_user(&tag, &tags[done], sizeof(tag))) ||
		    copy_from_user(&fd, &fds[done], sizeof(fd))) {
			err = -EFAULT;
			break;
		}
		if ((fd == IORING_REGISTER_FILES_SKIP || fd == -1) && tag) {
			err = -EINVAL;
			break;
		}
		if (fd == IORING_REGISTER_FILES_SKIP)
			continue;

		i = array_index_nospec(up->offset + done, ctx->nr_user_files);
		file_slot = io_fixed_file_slot(&ctx->file_table, i);

		if (file_slot->file_ptr) {
			file = (struct file *)(file_slot->file_ptr & FFS_MASK);
			err = io_queue_rsrc_removal(data, i, ctx->rsrc_node, file);
			if (err)
				break;
			file_slot->file_ptr = 0;
			io_file_bitmap_clear(&ctx->file_table, i);
			needs_switch = true;
		}
		if (fd != -1) {
			file = fget(fd);
			if (!file) {
				err = -EBADF;
				break;
			}
			/*
			 * Don't allow io_uring instances to be registered. If
			 * UNIX isn't enabled, then this causes a reference
			 * cycle and this instance can never get freed. If UNIX
			 * is enabled we'll handle it just fine, but there's
			 * still no point in allowing a ring fd as it doesn't
			 * support regular read/write anyway.
			 */
			if (io_is_uring_fops(file)) {
				fput(file);
				err = -EBADF;
				break;
			}
			err = io_scm_file_account(ctx, file);
			if (err) {
				fput(file);
				break;
			}
			*io_get_tag_slot(data, i) = tag;
			io_fixed_file_set(file_slot, file);
			io_file_bitmap_set(&ctx->file_table, i);
		}
	}

	if (needs_switch)
		io_rsrc_node_switch(ctx, data);
	return done ? done : err;
}

static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
				   struct io_uring_rsrc_update2 *up,
				   unsigned int nr_args)
{
	u64 __user *tags = u64_to_user_ptr(up->tags);
	struct iovec iov, __user *iovs = u64_to_user_ptr(up->data);
	struct page *last_hpage = NULL;
	bool needs_switch = false;
	__u32 done;
	int i, err;

	if (!ctx->buf_data)
		return -ENXIO;
	if (up->offset + nr_args > ctx->nr_user_bufs)
		return -EINVAL;

	for (done = 0; done < nr_args; done++) {
		struct io_mapped_ubuf *imu;
		int offset = up->offset + done;
		u64 tag = 0;

		err = io_copy_iov(ctx, &iov, iovs, done);
		if (err)
			break;
		if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) {
			err = -EFAULT;
			break;
		}
		err = io_buffer_validate(&iov);
		if (err)
			break;
		if (!iov.iov_base && tag) {
			err = -EINVAL;
			break;
		}
		err = io_sqe_buffer_register(ctx, &iov, &imu, &last_hpage);
		if (err)
			break;

		i = array_index_nospec(offset, ctx->nr_user_bufs);
		if (ctx->user_bufs[i] != ctx->dummy_ubuf) {
			err = io_queue_rsrc_removal(ctx->buf_data, i,
						    ctx->rsrc_node, ctx->user_bufs[i]);
			if (unlikely(err)) {
				io_buffer_unmap(ctx, &imu);
				break;
			}
			ctx->user_bufs[i] = ctx->dummy_ubuf;
			needs_switch = true;
		}

		ctx->user_bufs[i] = imu;
		*io_get_tag_slot(ctx->buf_data, offset) = tag;
	}

	if (needs_switch)
		io_rsrc_node_switch(ctx, ctx->buf_data);
	return done ? done : err;
}

static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
				     struct io_uring_rsrc_update2 *up,
				     unsigned nr_args)
{
	__u32 tmp;
	int err;

	if (check_add_overflow(up->offset, nr_args, &tmp))
		return -EOVERFLOW;
	err = io_rsrc_node_switch_start(ctx);
	if (err)
		return err;

	switch (type) {
	case IORING_RSRC_FILE:
		return __io_sqe_files_update(ctx, up, nr_args);
	case IORING_RSRC_BUFFER:
		return __io_sqe_buffers_update(ctx, up, nr_args);
	}
	return -EINVAL;
}

int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg,
			     unsigned nr_args)
{
	struct io_uring_rsrc_update2 up;

	if (!nr_args)
		return -EINVAL;
	memset(&up, 0, sizeof(up));
	if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update)))
		return -EFAULT;
	if (up.resv || up.resv2)
		return -EINVAL;
	return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args);
}

int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
			    unsigned size, unsigned type)
{
	struct io_uring_rsrc_update2 up;

	if (size != sizeof(up))
		return -EINVAL;
	if (copy_from_user(&up, arg, sizeof(up)))
		return -EFAULT;
	if (!up.nr || up.resv || up.resv2)
		return -EINVAL;
	return __io_register_rsrc_update(ctx, type, &up, up.nr);
}

__cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
			    unsigned int size, unsigned int type)
{
	struct io_uring_rsrc_register rr;

	/* keep it extendible */
	if (size != sizeof(rr))
		return -EINVAL;

	memset(&rr, 0, sizeof(rr));
	if (copy_from_user(&rr, arg, size))
		return -EFAULT;
	if (!rr.nr || rr.resv2)
		return -EINVAL;
	if (rr.flags & ~IORING_RSRC_REGISTER_SPARSE)
		return -EINVAL;

	switch (type) {
	case IORING_RSRC_FILE:
		if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data)
			break;
		return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data),
					     rr.nr, u64_to_user_ptr(rr.tags));
	case IORING_RSRC_BUFFER:
		if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data)
			break;
		return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data),
					       rr.nr, u64_to_user_ptr(rr.tags));
	}
	return -EINVAL;
}

int io_rsrc_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);

	if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
		return -EINVAL;
	if (sqe->rw_flags || sqe->splice_fd_in)
		return -EINVAL;

	up->offset = READ_ONCE(sqe->off);
	up->nr_args = READ_ONCE(sqe->len);
	if (!up->nr_args)
		return -EINVAL;
	up->arg = READ_ONCE(sqe->addr);
	up->type = READ_ONCE(sqe->ioprio);
	return 0;
}

static int io_files_update_with_index_alloc(struct io_kiocb *req,
					    unsigned int issue_flags)
{
	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
	__s32 __user *fds = u64_to_user_ptr(up->arg);
	unsigned int done;
	struct file *file;
	int ret, fd;

	if (!req->ctx->file_data)
		return -ENXIO;

	for (done = 0; done < up->nr_args; done++) {
		if (copy_from_user(&fd, &fds[done], sizeof(fd))) {
			ret = -EFAULT;
			break;
		}

		file = fget(fd);
		if (!file) {
			ret = -EBADF;
			break;
		}
		ret = io_fixed_fd_install(req, issue_flags, file,
					  IORING_FILE_INDEX_ALLOC);
		if (ret < 0)
			break;
		if (copy_to_user(&fds[done], &ret, sizeof(ret))) {
			__io_close_fixed(req->ctx, issue_flags, ret);
			ret = -EFAULT;
			break;
		}
	}

	if (done)
		return done;
	return ret;
}

static int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
	struct io_ring_ctx *ctx = req->ctx;
	struct io_uring_rsrc_update2 up2;
	int ret;

	up2.offset = up->offset;
	up2.data = up->arg;
	up2.nr = 0;
	up2.tags = 0;
	up2.resv = 0;
	up2.resv2 = 0;

	if (up->offset == IORING_FILE_INDEX_ALLOC) {
		ret = io_files_update_with_index_alloc(req, issue_flags);
	} else {
		io_ring_submit_lock(ctx, issue_flags);
		ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE,
						&up2, up->nr_args);
		io_ring_submit_unlock(ctx, issue_flags);
	}

	if (ret < 0)
		req_set_fail(req);
	io_req_set_res(req, ret, 0);
	return IOU_OK;
}

static int io_notif_update(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
	struct io_ring_ctx *ctx = req->ctx;
	unsigned len = up->nr_args;
	unsigned idx_end, idx = up->offset;
	int ret = 0;

	io_ring_submit_lock(ctx, issue_flags);
	if (unlikely(check_add_overflow(idx, len, &idx_end))) {
		ret = -EOVERFLOW;
		goto out;
	}
	if (unlikely(idx_end > ctx->nr_notif_slots)) {
		ret = -EINVAL;
		goto out;
	}

	for (; idx < idx_end; idx++) {
		struct io_notif_slot *slot = &ctx->notif_slots[idx];

		if (!slot->notif)
			continue;
		if (up->arg)
			slot->tag = up->arg;
		io_notif_slot_flush_submit(slot, issue_flags);
	}
out:
	io_ring_submit_unlock(ctx, issue_flags);
	if (ret < 0)
		req_set_fail(req);
	io_req_set_res(req, ret, 0);
	return IOU_OK;
}

int io_rsrc_update(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);

	switch (up->type) {
	case IORING_RSRC_UPDATE_FILES:
		return io_files_update(req, issue_flags);
	case IORING_RSRC_UPDATE_NOTIF:
		return io_notif_update(req, issue_flags);
	}
	return -EINVAL;
}

int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx,
			  struct io_rsrc_node *node, void *rsrc)
{
	u64 *tag_slot = io_get_tag_slot(data, idx);
	struct io_rsrc_put *prsrc;

	prsrc = kzalloc(sizeof(*prsrc), GFP_KERNEL);
	if (!prsrc)
		return -ENOMEM;

	prsrc->tag = *tag_slot;
	*tag_slot = 0;
	prsrc->rsrc = rsrc;
	list_add(&prsrc->list, &node->rsrc_list);
	return 0;
}

void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
{
#if !defined(IO_URING_SCM_ALL)
	int i;

	for (i = 0; i < ctx->nr_user_files; i++) {
		struct file *file = io_file_from_index(&ctx->file_table, i);

		if (!file)
			continue;
		if (io_fixed_file_slot(&ctx->file_table, i)->file_ptr & FFS_SCM)
			continue;
		io_file_bitmap_clear(&ctx->file_table, i);
		fput(file);
	}
#endif

#if defined(CONFIG_UNIX)
	if (ctx->ring_sock) {
		struct sock *sock = ctx->ring_sock->sk;
		struct sk_buff *skb;

		while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL)
			kfree_skb(skb);
	}
#endif
	io_free_file_tables(&ctx->file_table);
	io_rsrc_data_free(ctx->file_data);
	ctx->file_data = NULL;
	ctx->nr_user_files = 0;
}

int io_sqe_files_unregister(struct io_ring_ctx *ctx)
{
	unsigned nr = ctx->nr_user_files;
	int ret;

	if (!ctx->file_data)
		return -ENXIO;

	/*
	 * Quiesce may unlock ->uring_lock, and while it's not held
	 * prevent new requests using the table.
	 */
	ctx->nr_user_files = 0;
	ret = io_rsrc_ref_quiesce(ctx->file_data, ctx);
	ctx->nr_user_files = nr;
	if (!ret)
		__io_sqe_files_unregister(ctx);
	return ret;
}

/*
 * Ensure the UNIX gc is aware of our file set, so we are certain that
 * the io_uring can be safely unregistered on process exit, even if we have
 * loops in the file referencing. We account only files that can hold other
 * files because otherwise they can't form a loop and so are not interesting
 * for the garbage collection.
 */
int __io_scm_file_account(struct io_ring_ctx *ctx, struct file *file)
{
#if defined(CONFIG_UNIX)
	struct sock *sk = ctx->ring_sock->sk;
	struct sk_buff_head *head = &sk->sk_receive_queue;
	struct scm_fp_list *fpl;
	struct sk_buff *skb;

	if (likely(!io_file_need_scm(file)))
		return 0;

	/*
	 * See if we can merge this file into an existing skb SCM_RIGHTS
	 * file set. If there's no room, fall back to allocating a new skb
	 * and filling it in.
	 */
	spin_lock_irq(&head->lock);
	skb = skb_peek(head);
	if (skb && UNIXCB(skb).fp->count < SCM_MAX_FD)
		__skb_unlink(skb, head);
	else
		skb = NULL;
	spin_unlock_irq(&head->lock);

	if (!skb) {
		fpl = kzalloc(sizeof(*fpl), GFP_KERNEL);
		if (!fpl)
			return -ENOMEM;

		skb = alloc_skb(0, GFP_KERNEL);
		if (!skb) {
			kfree(fpl);
			return -ENOMEM;
		}

		fpl->user = get_uid(current_user());
		fpl->max = SCM_MAX_FD;

		UNIXCB(skb).fp = fpl;
		skb->destructor = unix_destruct_scm;
		refcount_add(skb->truesize, &sk->sk_wmem_alloc);
	}

	fpl = UNIXCB(skb).fp;
	fpl->fp[fpl->count++] = get_file(file);
	unix_inflight(fpl->user, file);
	skb_queue_head(head, skb);

	return 0;
#else
	return 0;
#endif
}

static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
{
	struct file *file = prsrc->file;
#if defined(CONFIG_UNIX)
	struct sock *sock = ctx->ring_sock->sk;
	struct sk_buff_head list, *head = &sock->sk_receive_queue;
	struct sk_buff *skb;
	int i;

	if (!io_file_need_scm(file)) {
		fput(file);
		return;
	}

	__skb_queue_head_init(&list);

	/*
	 * Find the skb that holds this file in its SCM_RIGHTS. When found,
	 * remove this entry and rearrange the file array.
	 */
	skb = skb_dequeue(head);
	while (skb) {
		struct scm_fp_list *fp;

		fp = UNIXCB(skb).fp;
		for (i = 0; i < fp->count; i++) {
			int left;

			if (fp->fp[i] != file)
				continue;

			unix_notinflight(fp->user, fp->fp[i]);
			left = fp->count - 1 - i;
			if (left) {
				memmove(&fp->fp[i], &fp->fp[i + 1],
					left * sizeof(struct file *));
			}
			fp->count--;
			if (!fp->count) {
				kfree_skb(skb);
				skb = NULL;
			} else {
				__skb_queue_tail(&list, skb);
			}
			fput(file);
			file = NULL;
			break;
		}

		if (!file)
			break;

		__skb_queue_tail(&list, skb);

		skb = skb_dequeue(head);
	}

	if (skb_peek(&list)) {
		spin_lock_irq(&head->lock);
		while ((skb = __skb_dequeue(&list)) != NULL)
			__skb_queue_tail(head, skb);
		spin_unlock_irq(&head->lock);
	}
#else
	fput(file);
#endif
}

int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
			  unsigned nr_args, u64 __user *tags)
{
	__s32 __user *fds = (__s32 __user *) arg;
	struct file *file;
	int fd, ret;
	unsigned i;

	if (ctx->file_data)
		return -EBUSY;
	if (!nr_args)
		return -EINVAL;
	if (nr_args > IORING_MAX_FIXED_FILES)
		return -EMFILE;
	if (nr_args > rlimit(RLIMIT_NOFILE))
		return -EMFILE;
	ret = io_rsrc_node_switch_start(ctx);
	if (ret)
		return ret;
	ret = io_rsrc_data_alloc(ctx, io_rsrc_file_put, tags, nr_args,
				 &ctx->file_data);
	if (ret)
		return ret;

	if (!io_alloc_file_tables(&ctx->file_table, nr_args)) {
		io_rsrc_data_free(ctx->file_data);
		ctx->file_data = NULL;
		return -ENOMEM;
	}

	for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
		struct io_fixed_file *file_slot;

		if (fds && copy_from_user(&fd, &fds[i], sizeof(fd))) {
			ret = -EFAULT;
			goto fail;
		}
		/* allow sparse sets */
		if (!fds || fd == -1) {
			ret = -EINVAL;
			if (unlikely(*io_get_tag_slot(ctx->file_data, i)))
				goto fail;
			continue;
		}

		file = fget(fd);
		ret = -EBADF;
		if (unlikely(!file))
			goto fail;

		/*
		 * Don't allow io_uring instances to be registered. If UNIX
		 * isn't enabled, then this causes a reference cycle and this
		 * instance can never get freed. If UNIX is enabled we'll
		 * handle it just fine, but there's still no point in allowing
		 * a ring fd as it doesn't support regular read/write anyway.
		 */
		if (io_is_uring_fops(file)) {
			fput(file);
			goto fail;
		}
		ret = io_scm_file_account(ctx, file);
		if (ret) {
			fput(file);
			goto fail;
		}
		file_slot = io_fixed_file_slot(&ctx->file_table, i);
		io_fixed_file_set(file_slot, file);
		io_file_bitmap_set(&ctx->file_table, i);
	}

	/* default it to the whole table */
	io_file_table_set_alloc_range(ctx, 0, ctx->nr_user_files);
	io_rsrc_node_switch(ctx, NULL);
	return 0;
fail:
	__io_sqe_files_unregister(ctx);
	return ret;
}

*ctx
, struct io_rsrc_put
*prsrc
)
1072 io_buffer_unmap(ctx
, &prsrc
->buf
);
void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
{
	unsigned int i;

	for (i = 0; i < ctx->nr_user_bufs; i++)
		io_buffer_unmap(ctx, &ctx->user_bufs[i]);
	kfree(ctx->user_bufs);
	io_rsrc_data_free(ctx->buf_data);
	ctx->user_bufs = NULL;
	ctx->buf_data = NULL;
	ctx->nr_user_bufs = 0;
}

int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
{
	unsigned nr = ctx->nr_user_bufs;
	int ret;

	if (!ctx->buf_data)
		return -ENXIO;

	/*
	 * Quiesce may unlock ->uring_lock, and while it's not held
	 * prevent new requests using the table.
	 */
	ctx->nr_user_bufs = 0;
	ret = io_rsrc_ref_quiesce(ctx->buf_data, ctx);
	ctx->nr_user_bufs = nr;
	if (!ret)
		__io_sqe_buffers_unregister(ctx);
	return ret;
}

/*
 * Not super efficient, but this is just a registration time. And we do cache
 * the last compound head, so generally we'll only do a full search if we don't
 * match that one.
 *
 * We check if the given compound head page has already been accounted, to
 * avoid double accounting it. This allows us to account the full size of the
 * page, not just the constituent pages of a huge page.
 */
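/*
 * For example, a 2 MiB hugepage backing part of a registered buffer is
 * charged as 512 pages (page_size(hpage) >> PAGE_SHIFT with 4K pages) the
 * first time its compound head is seen; later pages or buffers that map the
 * same hugepage find the head via this check and are not charged again.
 */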
static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages,
				  int nr_pages, struct page *hpage)
{
	int i, j;

	/* check current page array */
	for (i = 0; i < nr_pages; i++) {
		if (!PageCompound(pages[i]))
			continue;
		if (compound_head(pages[i]) == hpage)
			return true;
	}

	/* check previously registered pages */
	for (i = 0; i < ctx->nr_user_bufs; i++) {
		struct io_mapped_ubuf *imu = ctx->user_bufs[i];

		for (j = 0; j < imu->nr_bvecs; j++) {
			if (!PageCompound(imu->bvec[j].bv_page))
				continue;
			if (compound_head(imu->bvec[j].bv_page) == hpage)
				return true;
		}
	}

	return false;
}

static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
				 int nr_pages, struct io_mapped_ubuf *imu,
				 struct page **last_hpage)
{
	int i, ret;

	imu->acct_pages = 0;
	for (i = 0; i < nr_pages; i++) {
		if (!PageCompound(pages[i])) {
			imu->acct_pages++;
		} else {
			struct page *hpage;

			hpage = compound_head(pages[i]);
			if (hpage == *last_hpage)
				continue;
			*last_hpage = hpage;
			if (headpage_already_acct(ctx, pages, i, hpage))
				continue;
			imu->acct_pages += page_size(hpage) >> PAGE_SHIFT;
		}
	}

	if (!imu->acct_pages)
		return 0;

	ret = io_account_mem(ctx, imu->acct_pages);
	if (ret)
		imu->acct_pages = 0;
	return ret;
}

**io_pin_pages(unsigned long ubuf
, unsigned long len
, int *npages
)
1180 unsigned long start
, end
, nr_pages
;
1181 struct vm_area_struct
**vmas
= NULL
;
1182 struct page
**pages
= NULL
;
1183 int i
, pret
, ret
= -ENOMEM
;
1185 end
= (ubuf
+ len
+ PAGE_SIZE
- 1) >> PAGE_SHIFT
;
1186 start
= ubuf
>> PAGE_SHIFT
;
1187 nr_pages
= end
- start
;
	pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL);
	if (!pages)
		goto done;

	vmas = kvmalloc_array(nr_pages, sizeof(struct vm_area_struct *),
			      GFP_KERNEL);
	if (!vmas)
		goto done;

	ret = 0;
	mmap_read_lock(current->mm);
	pret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
			      pages, vmas);
	if (pret == nr_pages) {
		/* don't support file backed memory */
		for (i = 0; i < nr_pages; i++) {
			struct vm_area_struct *vma = vmas[i];

			if (vma_is_shmem(vma))
				continue;
			if (vma->vm_file &&
			    !is_file_hugepages(vma->vm_file)) {
				ret = -EOPNOTSUPP;
				break;
			}
		}
		*npages = pret;
	} else {
		ret = pret < 0 ? pret : -EFAULT;
	}
	mmap_read_unlock(current->mm);
	if (ret) {
		/*
		 * if we did partial map, or found file backed vmas,
		 * release any pages we did get
		 */
		if (pret > 0)
			unpin_user_pages(pages, pret);
		goto done;
	}
	ret = 0;
done:
	kvfree(vmas);
	if (ret < 0) {
		kvfree(pages);
		pages = ERR_PTR(ret);
	}
	return pages;
}

static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
				  struct io_mapped_ubuf **pimu,
				  struct page **last_hpage)
{
	struct io_mapped_ubuf *imu = NULL;
	struct page **pages = NULL;
	unsigned long off;
	size_t size;
	int ret, nr_pages, i;

	*pimu = ctx->dummy_ubuf;
	if (!iov->iov_base)
		return 0;

	ret = -ENOMEM;
	pages = io_pin_pages((unsigned long) iov->iov_base, iov->iov_len,
			     &nr_pages);
	if (IS_ERR(pages)) {
		ret = PTR_ERR(pages);
		pages = NULL;
		goto done;
	}

	imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);
	if (!imu)
		goto done;

	ret = io_buffer_account_pin(ctx, pages, nr_pages, imu, last_hpage);
	if (ret) {
		unpin_user_pages(pages, nr_pages);
		goto done;
	}

	off = (unsigned long) iov->iov_base & ~PAGE_MASK;
	size = iov->iov_len;
	for (i = 0; i < nr_pages; i++) {
		size_t vec_len;

		vec_len = min_t(size_t, size, PAGE_SIZE - off);
		imu->bvec[i].bv_page = pages[i];
		imu->bvec[i].bv_len = vec_len;
		imu->bvec[i].bv_offset = off;
		off = 0;
		size -= vec_len;
	}
	/* store original address for later verification */
	imu->ubuf = (unsigned long) iov->iov_base;
	imu->ubuf_end = imu->ubuf + iov->iov_len;
	imu->nr_bvecs = nr_pages;
	*pimu = imu;
	ret = 0;
done:
	if (ret)
		kvfree(imu);
	kvfree(pages);
	return ret;
}

static int io_buffers_map_alloc(struct io_ring_ctx *ctx, unsigned int nr_args)
{
	ctx->user_bufs = kcalloc(nr_args, sizeof(*ctx->user_bufs), GFP_KERNEL);
	return ctx->user_bufs ? 0 : -ENOMEM;
}

int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
			    unsigned int nr_args, u64 __user *tags)
{
	struct page *last_hpage = NULL;
	struct io_rsrc_data *data;
	struct iovec iov;
	int i, ret;

	BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16));

	if (ctx->user_bufs)
		return -EBUSY;
	if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS)
		return -EINVAL;
	ret = io_rsrc_node_switch_start(ctx);
	if (ret)
		return ret;
	ret = io_rsrc_data_alloc(ctx, io_rsrc_buf_put, tags, nr_args, &data);
	if (ret)
		return ret;
	ret = io_buffers_map_alloc(ctx, nr_args);
	if (ret) {
		io_rsrc_data_free(data);
		return ret;
	}

	for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) {
		if (arg) {
			ret = io_copy_iov(ctx, &iov, arg, i);
			if (ret)
				break;
			ret = io_buffer_validate(&iov);
			if (ret)
				break;
		} else {
			memset(&iov, 0, sizeof(iov));
		}

		if (!iov.iov_base && *io_get_tag_slot(data, i)) {
			ret = -EINVAL;
			break;
		}

		ret = io_sqe_buffer_register(ctx, &iov, &ctx->user_bufs[i],
					     &last_hpage);
		if (ret)
			break;
	}

	WARN_ON_ONCE(ctx->buf_data);

	ctx->buf_data = data;
	if (ret)
		__io_sqe_buffers_unregister(ctx);
	else
		io_rsrc_node_switch(ctx, NULL);
	return ret;
}

int io_import_fixed(int ddir, struct iov_iter *iter,
		    struct io_mapped_ubuf *imu,
		    u64 buf_addr, size_t len)
{
	u64 buf_end;
	size_t offset;

	if (WARN_ON_ONCE(!imu))
		return -EFAULT;
	if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end)))
		return -EFAULT;
	/* not inside the mapped region */
	if (unlikely(buf_addr < imu->ubuf || buf_end > imu->ubuf_end))
		return -EFAULT;

	/*
	 * May not be a start of buffer, set size appropriately
	 * and advance us to the beginning.
	 */
	offset = buf_addr - imu->ubuf;
	iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, offset + len);

	if (offset) {
		/*
		 * Don't use iov_iter_advance() here, as it's really slow for
		 * using the latter parts of a big fixed buffer - it iterates
		 * over each segment manually. We can cheat a bit here, because
		 * we know that:
		 *
		 * 1) it's a BVEC iter, we set it up
		 * 2) all bvecs are PAGE_SIZE in size, except potentially the
		 *    first and last bvec
		 *
		 * So just find our index, and adjust the iterator afterwards.
		 * If the offset is within the first bvec (or the whole first
		 * bvec, just use iov_iter_advance(). This makes it easier
		 * since we can just skip the first segment, which may not
		 * be PAGE_SIZE aligned.
		 */
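		/*
		 * E.g. with 4K pages, a first bvec of 1024 bytes and
		 * offset == 10240: the first segment covers 1024 bytes,
		 * leaving offset == 9216, so seg_skip == 1 + (9216 >> 12) == 3
		 * and iov_offset == 9216 & 4095 == 1024, i.e. iteration
		 * resumes 1024 bytes into the fourth bvec.
		 */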
		const struct bio_vec *bvec = imu->bvec;

		if (offset <= bvec->bv_len) {
			iov_iter_advance(iter, offset);
		} else {
			unsigned long seg_skip;

			/* skip first vec */
			offset -= bvec->bv_len;
			seg_skip = 1 + (offset >> PAGE_SHIFT);

			iter->bvec = bvec + seg_skip;
			iter->nr_segs -= seg_skip;
			iter->count -= bvec->bv_len + offset;
			iter->iov_offset = offset & ~PAGE_MASK;
		}
	}

	return 0;
}