// SPDX-License-Identifier: GPL-2.0
/*
 * Shared application/kernel submission and completion ring pairs, for
 * supporting fast/efficient IO.
 *
 * A note on the read/write ordering memory barriers that are matched between
 * the application and kernel side.
 *
 * After the application reads the CQ ring tail, it must use an
 * appropriate smp_rmb() to pair with the smp_wmb() the kernel uses
 * before writing the tail (using smp_load_acquire to read the tail will
 * do). It also needs a smp_mb() before updating CQ head (ordering the
 * entry load(s) with the head store), pairing with an implicit barrier
 * through a control-dependency in io_get_cqe (smp_store_release to
 * store head will do). Failure to do so could lead to reading invalid
 * CQ entries.
 *
 * Likewise, the application must use an appropriate smp_wmb() before
 * writing the SQ tail (ordering SQ entry stores with the tail store),
 * which pairs with smp_load_acquire in io_get_sqring (smp_store_release
 * to store the tail will do). And it needs a barrier ordering the SQ
 * head load before writing new SQ entries (smp_load_acquire to read
 * head will do).
 *
 * When using the SQ poll thread (IORING_SETUP_SQPOLL), the application
 * needs to check the SQ flags for IORING_SQ_NEED_WAKEUP *after*
 * updating the SQ tail; a full memory barrier smp_mb() is needed
 * between.
 *
 * Also see the examples in the liburing library:
 *
 *	git://git.kernel.dk/liburing
 *
 * io_uring also uses READ/WRITE_ONCE() for _any_ store or load that happens
 * from data shared between the kernel and application. This is done both
 * for ordering purposes, but also to ensure that once a value is loaded from
 * data that the application could potentially modify, it remains stable.
 *
 * Copyright (C) 2018-2019 Jens Axboe
 * Copyright (c) 2018-2019 Christoph Hellwig
 */
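/*
 * Illustrative userspace-side sketch of the CQ consumption pattern the
 * barrier rules above describe. This is an assumption-laden example, not
 * part of this file; the khead/ktail/ring_mask names follow liburing-style
 * conventions rather than anything defined here:
 *
 *	unsigned head = *cq_khead;
 *	// acquire-load of the tail pairs with the kernel's release store
 *	unsigned tail = smp_load_acquire(cq_ktail);
 *	while (head != tail) {
 *		struct io_uring_cqe *cqe = &cqes[head & cq_ring_mask];
 *		handle_cqe(cqe);
 *		head++;
 *	}
 *	// release-store of head pairs with the kernel's check in io_get_cqe()
 *	smp_store_release(cq_khead, head);
 */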
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/errno.h>
#include <linux/syscalls.h>
#include <linux/compat.h>
#include <net/compat.h>
#include <linux/refcount.h>
#include <linux/uio.h>
#include <linux/bits.h>

#include <linux/sched/signal.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/mman.h>
#include <linux/percpu.h>
#include <linux/slab.h>
#include <linux/blk-mq.h>
#include <linux/bvec.h>
#include <linux/net.h>
#include <net/af_unix.h>
#include <linux/anon_inodes.h>
#include <linux/sched/mm.h>
#include <linux/uaccess.h>
#include <linux/nospec.h>
#include <linux/sizes.h>
#include <linux/hugetlb.h>
#include <linux/highmem.h>
#include <linux/namei.h>
#include <linux/fsnotify.h>
#include <linux/fadvise.h>
#include <linux/eventpoll.h>
#include <linux/splice.h>
#include <linux/task_work.h>
#include <linux/pagemap.h>
#include <linux/io_uring.h>
#include <linux/audit.h>
#include <linux/security.h>

#define CREATE_TRACE_POINTS
#include <trace/events/io_uring.h>

#include <uapi/linux/io_uring.h>

#include "../fs/internal.h"
#define IORING_MAX_ENTRIES	32768
#define IORING_MAX_CQ_ENTRIES	(2 * IORING_MAX_ENTRIES)
#define IORING_SQPOLL_CAP_ENTRIES_VALUE 8

#define IORING_MAX_FIXED_FILES	(1U << 15)
#define IORING_MAX_RESTRICTIONS	(IORING_RESTRICTION_LAST + \
				 IORING_REGISTER_LAST + IORING_OP_LAST)

#define IO_RSRC_TAG_TABLE_SHIFT	(PAGE_SHIFT - 3)
#define IO_RSRC_TAG_TABLE_MAX	(1U << IO_RSRC_TAG_TABLE_SHIFT)
#define IO_RSRC_TAG_TABLE_MASK	(IO_RSRC_TAG_TABLE_MAX - 1)

#define IORING_MAX_REG_BUFFERS	(1U << 14)

#define SQE_COMMON_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_LINK | \
			  IOSQE_IO_HARDLINK | IOSQE_ASYNC)

#define SQE_VALID_FLAGS	(SQE_COMMON_FLAGS | IOSQE_BUFFER_SELECT | \
			IOSQE_IO_DRAIN | IOSQE_CQE_SKIP_SUCCESS)

#define IO_REQ_CLEAN_FLAGS (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | \
				REQ_F_POLLED | REQ_F_INFLIGHT | REQ_F_CREDS | \
				REQ_F_ASYNC_DATA)

#define IO_TCTX_REFS_CACHE_NR	(1U << 10)
struct io_uring {
	u32 head ____cacheline_aligned_in_smp;
	u32 tail ____cacheline_aligned_in_smp;
};

/*
 * This data is shared with the application through the mmap at offsets
 * IORING_OFF_SQ_RING and IORING_OFF_CQ_RING.
 *
 * The offsets to the member fields are published through struct
 * io_sqring_offsets when calling io_uring_setup.
 */
struct io_rings {
	/*
	 * Head and tail offsets into the ring; the offsets need to be
	 * masked to get valid indices.
	 *
	 * The kernel controls head of the sq ring and the tail of the cq ring,
	 * and the application controls tail of the sq ring and the head of the
	 * cq ring.
	 */
	struct io_uring		sq, cq;
	/*
	 * Bitmasks to apply to head and tail offsets (constant, equals
	 * ring_entries - 1)
	 */
	u32			sq_ring_mask, cq_ring_mask;
	/* Ring sizes (constant, power of 2) */
	u32			sq_ring_entries, cq_ring_entries;
	/*
	 * Number of invalid entries dropped by the kernel due to
	 * invalid index stored in array
	 *
	 * Written by the kernel, shouldn't be modified by the
	 * application (i.e. get number of "new events" by comparing to
	 * cached value).
	 *
	 * After a new SQ head value was read by the application this
	 * counter includes all submissions that were dropped reaching
	 * the new SQ head (and possibly more).
	 */
	u32			sq_dropped;
	/*
	 * Runtime SQ flags
	 *
	 * Written by the kernel, shouldn't be modified by the
	 * application.
	 *
	 * The application needs a full memory barrier before checking
	 * for IORING_SQ_NEED_WAKEUP after updating the sq tail.
	 */
	u32			sq_flags;
	/*
	 * Runtime CQ flags
	 *
	 * Written by the application, shouldn't be modified by the
	 * kernel.
	 */
	u32			cq_flags;
	/*
	 * Number of completion events lost because the queue was full;
	 * this should be avoided by the application by making sure
	 * there are not more requests pending than there is space in
	 * the completion queue.
	 *
	 * Written by the kernel, shouldn't be modified by the
	 * application (i.e. get number of "new events" by comparing to
	 * cached value).
	 *
	 * As completion events come in out of order this counter is not
	 * ordered with any other data.
	 */
	u32			cq_overflow;
	/*
	 * Ring buffer of completion events.
	 *
	 * The kernel writes completion events fresh every time they are
	 * produced, so the application is allowed to modify pending
	 * entries.
	 */
	struct io_uring_cqe	cqes[] ____cacheline_aligned_in_smp;
};
enum io_uring_cmd_flags {
	IO_URING_F_COMPLETE_DEFER	= 1,
	IO_URING_F_UNLOCKED		= 2,
	/* int's last bit, sign checks are usually faster than a bit test */
	IO_URING_F_NONBLOCK		= INT_MIN,
};
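/*
 * Illustrative note (not from the original source): because
 * IO_URING_F_NONBLOCK is INT_MIN, a check such as "(int) issue_flags < 0"
 * is equivalent to "issue_flags & IO_URING_F_NONBLOCK" and typically
 * compiles to a cheaper sign test, which is the point of the comment above.
 */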
struct io_mapped_ubuf {
	u64		ubuf;
	u64		ubuf_end;
	unsigned int	nr_bvecs;
	unsigned long	acct_pages;
	struct bio_vec	bvec[];
};

struct io_ring_ctx;

struct io_overflow_cqe {
	struct io_uring_cqe cqe;
	struct list_head list;
};

struct io_fixed_file {
	/* file * with additional FFS_* flags */
	unsigned long file_ptr;
};

struct io_rsrc_put {
	struct list_head list;
	u64 tag;
	union {
		void *rsrc;
		struct file *file;
		struct io_mapped_ubuf *buf;
	};
};

struct io_file_table {
	struct io_fixed_file *files;
};

struct io_rsrc_node {
	struct percpu_ref		refs;
	struct list_head		node;
	struct list_head		rsrc_list;
	struct io_rsrc_data		*rsrc_data;
	struct llist_node		llist;
	bool				done;
};

typedef void (rsrc_put_fn)(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc);

struct io_rsrc_data {
	struct io_ring_ctx		*ctx;

	u64				**tags;
	unsigned int			nr;
	rsrc_put_fn			*do_put;
	atomic_t			refs;
	struct completion		done;
	bool				quiesce;
};
struct io_buffer_list {
	struct list_head list;
	struct list_head buf_list;
	__u32 bgid;
};

struct io_buffer {
	struct list_head list;
	__u64 addr;
	__u32 len;
	__u16 bid;
	__u16 bgid;
};

struct io_restriction {
	DECLARE_BITMAP(register_op, IORING_REGISTER_LAST);
	DECLARE_BITMAP(sqe_op, IORING_OP_LAST);
	u8 sqe_flags_allowed;
	u8 sqe_flags_required;
	bool registered;
};

enum {
	IO_SQ_THREAD_SHOULD_STOP = 0,
	IO_SQ_THREAD_SHOULD_PARK,
};

struct io_sq_data {
	refcount_t		refs;
	atomic_t		park_pending;
	struct mutex		lock;

	/* ctx's that are using this sqd */
	struct list_head	ctx_list;

	struct task_struct	*thread;
	struct wait_queue_head	wait;

	unsigned		sq_thread_idle;
	int			sq_cpu;
	pid_t			task_pid;
	pid_t			task_tgid;

	unsigned long		state;
	struct completion	exited;
};
#define IO_COMPL_BATCH			32
#define IO_REQ_CACHE_SIZE		32
#define IO_REQ_ALLOC_BATCH		8

struct io_submit_link {
	struct io_kiocb		*head;
	struct io_kiocb		*last;
};

struct io_submit_state {
	/* inline/task_work completion list, under ->uring_lock */
	struct io_wq_work_node	free_list;
	/* batch completion logic */
	struct io_wq_work_list	compl_reqs;
	struct io_submit_link	link;

	bool			plug_started;
	bool			need_plug;
	bool			flush_cqes;
	unsigned short		submit_nr;
	struct blk_plug		plug;
};

struct io_ev_fd {
	struct eventfd_ctx	*cq_ev_fd;
	unsigned int		eventfd_async: 1;
	struct rcu_head		rcu;
};
#define IO_BUFFERS_HASH_BITS	5

struct io_ring_ctx {
	/* const or read-mostly hot data */
	struct {
		struct percpu_ref	refs;

		struct io_rings		*rings;
		unsigned int		flags;
		unsigned int		compat: 1;
		unsigned int		drain_next: 1;
		unsigned int		restricted: 1;
		unsigned int		off_timeout_used: 1;
		unsigned int		drain_active: 1;
		unsigned int		drain_disabled: 1;
		unsigned int		has_evfd: 1;
	} ____cacheline_aligned_in_smp;

	/* submission data */
	struct {
		struct mutex		uring_lock;

		/*
		 * Ring buffer of indices into array of io_uring_sqe, which is
		 * mmapped by the application using the IORING_OFF_SQES offset.
		 *
		 * This indirection could e.g. be used to assign fixed
		 * io_uring_sqe entries to operations and only submit them to
		 * the queue when needed.
		 *
		 * The kernel modifies neither the indices array nor the entries
		 * array.
		 */
		u32			*sq_array;
		struct io_uring_sqe	*sq_sqes;
		unsigned		cached_sq_head;
		unsigned		sq_entries;
		struct list_head	defer_list;

		/*
		 * Fixed resources fast path, should be accessed only under
		 * uring_lock, and updated through io_uring_register(2)
		 */
		struct io_rsrc_node	*rsrc_node;
		int			rsrc_cached_refs;
		struct io_file_table	file_table;
		unsigned		nr_user_files;
		unsigned		nr_user_bufs;
		struct io_mapped_ubuf	**user_bufs;

		struct io_submit_state	submit_state;
		struct list_head	timeout_list;
		struct list_head	ltimeout_list;
		struct list_head	cq_overflow_list;
		struct list_head	*io_buffers;
		struct list_head	io_buffers_cache;
		struct list_head	apoll_cache;
		struct xarray		personalities;
		u32			pers_next;
		unsigned		sq_thread_idle;
	} ____cacheline_aligned_in_smp;
	/* IRQ completion list, under ->completion_lock */
	struct io_wq_work_list	locked_free_list;
	unsigned int		locked_free_nr;

	const struct cred	*sq_creds;	/* cred used for __io_sq_thread() */
	struct io_sq_data	*sq_data;	/* if using sq thread polling */

	struct wait_queue_head	sqo_sq_wait;
	struct list_head	sqd_list;

	unsigned long		check_cq_overflow;

	struct {
		unsigned		cached_cq_tail;
		unsigned		cq_entries;
		struct io_ev_fd	__rcu	*io_ev_fd;
		struct wait_queue_head	cq_wait;
		unsigned		cq_extra;
		atomic_t		cq_timeouts;
		unsigned		cq_last_tm_flush;
	} ____cacheline_aligned_in_smp;

	struct {
		spinlock_t		completion_lock;

		spinlock_t		timeout_lock;

		/*
		 * ->iopoll_list is protected by the ctx->uring_lock for
		 * io_uring instances that don't use IORING_SETUP_SQPOLL.
		 * For SQPOLL, only the single threaded io_sq_thread() will
		 * manipulate the list, hence no extra locking is needed there.
		 */
		struct io_wq_work_list	iopoll_list;
		struct hlist_head	*cancel_hash;
		unsigned		cancel_hash_bits;
		bool			poll_multi_queue;

		struct list_head	io_buffers_comp;
	} ____cacheline_aligned_in_smp;
	struct io_restriction		restrictions;

	/* slow path rsrc auxiliary data, used by update/register */
	struct {
		struct io_rsrc_node		*rsrc_backup_node;
		struct io_mapped_ubuf		*dummy_ubuf;
		struct io_rsrc_data		*file_data;
		struct io_rsrc_data		*buf_data;

		struct delayed_work		rsrc_put_work;
		struct llist_head		rsrc_put_llist;
		struct list_head		rsrc_ref_list;
		spinlock_t			rsrc_ref_lock;

		struct list_head	io_buffers_pages;
	};

	/* Keep this last, we don't need it for the fast path */
	struct {
		#if defined(CONFIG_UNIX)
			struct socket		*ring_sock;
		#endif
		/* hashed buffered write serialization */
		struct io_wq_hash		*hash_map;

		/* Only used for accounting purposes */
		struct user_struct		*user;
		struct mm_struct		*mm_account;

		/* ctx exit and cancelation */
		struct llist_head		fallback_llist;
		struct delayed_work		fallback_work;
		struct work_struct		exit_work;
		struct list_head		tctx_list;
		struct completion		ref_comp;

		u32				iowq_limits[2];
		bool				iowq_limits_set;
	};
};
/*
 * Arbitrary limit, can be raised if need be
 */
#define IO_RINGFD_REG_MAX 16

struct io_uring_task {
	/* submission side */
	int			cached_refs;
	struct xarray		xa;
	struct wait_queue_head	wait;
	const struct io_ring_ctx *last;
	struct io_wq		*io_wq;
	struct percpu_counter	inflight;
	atomic_t		inflight_tracked;
	atomic_t		in_idle;

	spinlock_t		task_lock;
	struct io_wq_work_list	task_list;
	struct io_wq_work_list	prior_task_list;
	struct callback_head	task_work;
	struct file		**registered_rings;
	bool			task_running;
};
/*
 * First field must be the file pointer in all the
 * iocb unions! See also 'struct kiocb' in <linux/fs.h>
 */
struct io_poll_iocb {
	struct file			*file;
	struct wait_queue_head		*head;
	__poll_t			events;
	struct wait_queue_entry		wait;
};

struct io_poll_update {
	struct file			*file;
	u64				old_user_data;
	u64				new_user_data;
	__poll_t			events;
	bool				update_events;
	bool				update_user_data;
};

struct io_timeout_data {
	struct io_kiocb			*req;
	struct hrtimer			timer;
	struct timespec64		ts;
	enum hrtimer_mode		mode;
	u32				flags;
};

struct io_accept {
	struct file			*file;
	struct sockaddr __user		*addr;
	int __user			*addr_len;
	int				flags;
	u32				file_slot;
	unsigned long			nofile;
};

struct io_timeout {
	struct file			*file;
	u32				off;
	u32				target_seq;
	struct list_head		list;
	/* head of the link, used by linked timeouts only */
	struct io_kiocb			*head;
	/* for linked completions */
	struct io_kiocb			*prev;
};

struct io_timeout_rem {
	struct file			*file;
	u64				addr;

	/* timeout update */
	struct timespec64		ts;
	u32				flags;
	bool				ltimeout;
};
struct io_rw {
	/* NOTE: kiocb has the file as the first member, so don't do it here */
	struct kiocb			kiocb;
	u64				addr;
	u32				len;
	u32				flags;
};

struct io_connect {
	struct file			*file;
	struct sockaddr __user		*addr;
	int				addr_len;
};

struct io_sr_msg {
	struct file			*file;
	union {
		struct compat_msghdr __user	*umsg_compat;
		struct user_msghdr __user	*umsg;
		void __user			*buf;
	};
	int				msg_flags;
	int				bgid;
	size_t				len;
	int				done_io;
};

struct io_open {
	struct file			*file;
	int				dfd;
	u32				file_slot;
	struct filename			*filename;
	struct open_how			how;
	unsigned long			nofile;
};

struct io_rsrc_update {
	struct file			*file;
	u64				arg;
	u32				nr_args;
	u32				offset;
};

struct io_epoll {
	struct file			*file;
	int				epfd;
	int				op;
	int				fd;
	struct epoll_event		event;
};

struct io_splice {
	struct file			*file_out;
	loff_t				off_out;
	loff_t				off_in;
	u64				len;
	int				splice_fd_in;
	unsigned int			flags;
};

struct io_provide_buf {
	struct file			*file;
	__u64				addr;
	__u32				len;
	__u32				bgid;
	__u16				nbufs;
	__u16				bid;
};

struct io_statx {
	struct file			*file;
	int				dfd;
	unsigned int			mask;
	unsigned int			flags;
	struct filename			*filename;
	struct statx __user		*buffer;
};

struct io_rename {
	struct file			*file;
	int				old_dfd;
	int				new_dfd;
	struct filename			*oldpath;
	struct filename			*newpath;
	int				flags;
};

struct io_unlink {
	struct file			*file;
	int				dfd;
	int				flags;
	struct filename			*filename;
};

struct io_mkdir {
	struct file			*file;
	int				dfd;
	umode_t				mode;
	struct filename			*filename;
};

struct io_symlink {
	struct file			*file;
	int				new_dfd;
	struct filename			*oldpath;
	struct filename			*newpath;
};

struct io_hardlink {
	struct file			*file;
	int				old_dfd;
	int				new_dfd;
	struct filename			*oldpath;
	struct filename			*newpath;
	int				flags;
};
struct io_async_connect {
	struct sockaddr_storage		address;
};

struct io_async_msghdr {
	struct iovec			fast_iov[UIO_FASTIOV];
	/* points to an allocated iov, if NULL we use fast_iov instead */
	struct iovec			*free_iov;
	struct sockaddr __user		*uaddr;
	struct msghdr			msg;
	struct sockaddr_storage		addr;
};

struct io_rw_state {
	struct iov_iter			iter;
	struct iov_iter_state		iter_state;
	struct iovec			fast_iov[UIO_FASTIOV];
};

struct io_async_rw {
	struct io_rw_state		s;
	const struct iovec		*free_iovec;
	size_t				bytes_done;
	struct wait_page_queue		wpq;
};
enum {
	REQ_F_FIXED_FILE_BIT	= IOSQE_FIXED_FILE_BIT,
	REQ_F_IO_DRAIN_BIT	= IOSQE_IO_DRAIN_BIT,
	REQ_F_LINK_BIT		= IOSQE_IO_LINK_BIT,
	REQ_F_HARDLINK_BIT	= IOSQE_IO_HARDLINK_BIT,
	REQ_F_FORCE_ASYNC_BIT	= IOSQE_ASYNC_BIT,
	REQ_F_BUFFER_SELECT_BIT	= IOSQE_BUFFER_SELECT_BIT,
	REQ_F_CQE_SKIP_BIT	= IOSQE_CQE_SKIP_SUCCESS_BIT,

	/* first byte is taken by user flags, shift it to not overlap */
	REQ_F_FAIL_BIT		= 8,
	REQ_F_INFLIGHT_BIT,
	REQ_F_CUR_POS_BIT,
	REQ_F_NOWAIT_BIT,
	REQ_F_LINK_TIMEOUT_BIT,
	REQ_F_NEED_CLEANUP_BIT,
	REQ_F_POLLED_BIT,
	REQ_F_BUFFER_SELECTED_BIT,
	REQ_F_COMPLETE_INLINE_BIT,
	REQ_F_REISSUE_BIT,
	REQ_F_CREDS_BIT,
	REQ_F_REFCOUNT_BIT,
	REQ_F_ARM_LTIMEOUT_BIT,
	REQ_F_ASYNC_DATA_BIT,
	REQ_F_SKIP_LINK_CQES_BIT,
	REQ_F_SINGLE_POLL_BIT,
	REQ_F_DOUBLE_POLL_BIT,
	REQ_F_PARTIAL_IO_BIT,
	/* keep async read/write and isreg together and in order */
	REQ_F_SUPPORT_NOWAIT_BIT,
	REQ_F_ISREG_BIT,

	/* not a real bit, just to check we're not overflowing the space */
	__REQ_F_LAST_BIT,
};
enum {
	/* IOSQE_FIXED_FILE */
	REQ_F_FIXED_FILE	= BIT(REQ_F_FIXED_FILE_BIT),
	/* drain existing IO first */
	REQ_F_IO_DRAIN		= BIT(REQ_F_IO_DRAIN_BIT),
	/* linked sqes */
	REQ_F_LINK		= BIT(REQ_F_LINK_BIT),
	/* doesn't sever on completion < 0 */
	REQ_F_HARDLINK		= BIT(REQ_F_HARDLINK_BIT),
	/* IOSQE_ASYNC */
	REQ_F_FORCE_ASYNC	= BIT(REQ_F_FORCE_ASYNC_BIT),
	/* IOSQE_BUFFER_SELECT */
	REQ_F_BUFFER_SELECT	= BIT(REQ_F_BUFFER_SELECT_BIT),
	/* IOSQE_CQE_SKIP_SUCCESS */
	REQ_F_CQE_SKIP		= BIT(REQ_F_CQE_SKIP_BIT),

	/* fail rest of links */
	REQ_F_FAIL		= BIT(REQ_F_FAIL_BIT),
	/* on inflight list, should be cancelled and waited on exit reliably */
	REQ_F_INFLIGHT		= BIT(REQ_F_INFLIGHT_BIT),
	/* read/write uses file position */
	REQ_F_CUR_POS		= BIT(REQ_F_CUR_POS_BIT),
	/* must not punt to workers */
	REQ_F_NOWAIT		= BIT(REQ_F_NOWAIT_BIT),
	/* has or had linked timeout */
	REQ_F_LINK_TIMEOUT	= BIT(REQ_F_LINK_TIMEOUT_BIT),
	/* needs cleanup */
	REQ_F_NEED_CLEANUP	= BIT(REQ_F_NEED_CLEANUP_BIT),
	/* already went through poll handler */
	REQ_F_POLLED		= BIT(REQ_F_POLLED_BIT),
	/* buffer already selected */
	REQ_F_BUFFER_SELECTED	= BIT(REQ_F_BUFFER_SELECTED_BIT),
	/* completion is deferred through io_comp_state */
	REQ_F_COMPLETE_INLINE	= BIT(REQ_F_COMPLETE_INLINE_BIT),
	/* caller should reissue async */
	REQ_F_REISSUE		= BIT(REQ_F_REISSUE_BIT),
	/* supports async reads/writes */
	REQ_F_SUPPORT_NOWAIT	= BIT(REQ_F_SUPPORT_NOWAIT_BIT),
	/* regular file */
	REQ_F_ISREG		= BIT(REQ_F_ISREG_BIT),
	/* has creds assigned */
	REQ_F_CREDS		= BIT(REQ_F_CREDS_BIT),
	/* skip refcounting if not set */
	REQ_F_REFCOUNT		= BIT(REQ_F_REFCOUNT_BIT),
	/* there is a linked timeout that has to be armed */
	REQ_F_ARM_LTIMEOUT	= BIT(REQ_F_ARM_LTIMEOUT_BIT),
	/* ->async_data allocated */
	REQ_F_ASYNC_DATA	= BIT(REQ_F_ASYNC_DATA_BIT),
	/* don't post CQEs while failing linked requests */
	REQ_F_SKIP_LINK_CQES	= BIT(REQ_F_SKIP_LINK_CQES_BIT),
	/* single poll may be active */
	REQ_F_SINGLE_POLL	= BIT(REQ_F_SINGLE_POLL_BIT),
	/* double poll may be active */
	REQ_F_DOUBLE_POLL	= BIT(REQ_F_DOUBLE_POLL_BIT),
	/* request has already done partial IO */
	REQ_F_PARTIAL_IO	= BIT(REQ_F_PARTIAL_IO_BIT),
};
struct async_poll {
	struct io_poll_iocb	poll;
	struct io_poll_iocb	*double_poll;
};

typedef void (*io_req_tw_func_t)(struct io_kiocb *req, bool *locked);

struct io_task_work {
	union {
		struct io_wq_work_node	node;
		struct llist_node	fallback_node;
	};
	io_req_tw_func_t		func;
};

enum {
	IORING_RSRC_FILE		= 0,
	IORING_RSRC_BUFFER		= 1,
};
/*
 * NOTE! Each of the iocb union members has the file pointer
 * as the first entry in their struct definition. So you can
 * access the file pointer through any of the sub-structs,
 * or directly as just 'file' in this struct.
 */
struct io_kiocb {
	union {
		struct file		*file;
		struct io_rw		rw;
		struct io_poll_iocb	poll;
		struct io_poll_update	poll_update;
		struct io_accept	accept;
		struct io_cancel	cancel;
		struct io_timeout	timeout;
		struct io_timeout_rem	timeout_rem;
		struct io_connect	connect;
		struct io_sr_msg	sr_msg;
		struct io_open		open;
		struct io_close		close;
		struct io_rsrc_update	rsrc_update;
		struct io_fadvise	fadvise;
		struct io_madvise	madvise;
		struct io_epoll		epoll;
		struct io_splice	splice;
		struct io_provide_buf	pbuf;
		struct io_statx		statx;
		struct io_shutdown	shutdown;
		struct io_rename	rename;
		struct io_unlink	unlink;
		struct io_mkdir		mkdir;
		struct io_symlink	symlink;
		struct io_hardlink	hardlink;
	};

	u8				opcode;
	/* polled IO has completed */
	u8				iopoll_completed;
	u16				buf_index;
	unsigned int			flags;

	u64				user_data;
	s32				result;
	/* fd initially, then cflags for completion */
	union {
		u32			cflags;
		int			fd;
	};

	struct io_ring_ctx		*ctx;
	struct task_struct		*task;

	struct percpu_ref		*fixed_rsrc_refs;
	/* store used ubuf, so we can prevent reloading */
	struct io_mapped_ubuf		*imu;

	union {
		/* used by request caches, completion batching and iopoll */
		struct io_wq_work_node	comp_list;
		/* cache ->apoll->events */
		__poll_t		apoll_events;
	};
	atomic_t			refs;
	atomic_t			poll_refs;
	struct io_task_work		io_task_work;
	/* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
	struct hlist_node		hash_node;
	/* internal polling, see IORING_FEAT_FAST_POLL */
	struct async_poll		*apoll;
	/* opcode allocated if it needs to store data for async defer */
	void				*async_data;
	/* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */
	struct io_buffer		*kbuf;
	/* linked requests, IFF REQ_F_HARDLINK or REQ_F_LINK are set */
	struct io_kiocb			*link;
	/* custom credentials, valid IFF REQ_F_CREDS is set */
	const struct cred		*creds;
	struct io_wq_work		work;
};
struct io_tctx_node {
	struct list_head	ctx_node;
	struct task_struct	*task;
	struct io_ring_ctx	*ctx;
};

struct io_defer_entry {
	struct list_head	list;
	struct io_kiocb		*req;
	u32			seq;
};

struct io_op_def {
	/* needs req->file assigned */
	unsigned		needs_file : 1;
	/* should block plug */
	unsigned		plug : 1;
	/* hash wq insertion if file is a regular file */
	unsigned		hash_reg_file : 1;
	/* unbound wq insertion if file is a non-regular file */
	unsigned		unbound_nonreg_file : 1;
	/* set if opcode supports polled "wait" */
	unsigned		pollin : 1;
	unsigned		pollout : 1;
	unsigned		poll_exclusive : 1;
	/* op supports buffer selection */
	unsigned		buffer_select : 1;
	/* do prep async if is going to be punted */
	unsigned		needs_async_setup : 1;
	/* opcode is not supported by this kernel */
	unsigned		not_supported : 1;
	/* skip auditing */
	unsigned		audit_skip : 1;
	/* size of async data needed, if any */
	unsigned short		async_size;
};
static const struct io_op_def io_op_defs[] = {
	[IORING_OP_NOP] = {},
	[IORING_OP_READV] = {
		.unbound_nonreg_file	= 1,
		.needs_async_setup	= 1,
		.async_size		= sizeof(struct io_async_rw),
	},
	[IORING_OP_WRITEV] = {
		.unbound_nonreg_file	= 1,
		.needs_async_setup	= 1,
		.async_size		= sizeof(struct io_async_rw),
	},
	[IORING_OP_FSYNC] = {
	},
	[IORING_OP_READ_FIXED] = {
		.unbound_nonreg_file	= 1,
		.async_size		= sizeof(struct io_async_rw),
	},
	[IORING_OP_WRITE_FIXED] = {
		.unbound_nonreg_file	= 1,
		.async_size		= sizeof(struct io_async_rw),
	},
	[IORING_OP_POLL_ADD] = {
		.unbound_nonreg_file	= 1,
	},
	[IORING_OP_POLL_REMOVE] = {
	},
	[IORING_OP_SYNC_FILE_RANGE] = {
	},
	[IORING_OP_SENDMSG] = {
		.unbound_nonreg_file	= 1,
		.needs_async_setup	= 1,
		.async_size		= sizeof(struct io_async_msghdr),
	},
	[IORING_OP_RECVMSG] = {
		.unbound_nonreg_file	= 1,
		.needs_async_setup	= 1,
		.async_size		= sizeof(struct io_async_msghdr),
	},
	[IORING_OP_TIMEOUT] = {
		.async_size		= sizeof(struct io_timeout_data),
	},
	[IORING_OP_TIMEOUT_REMOVE] = {
		/* used by timeout updates' prep() */
	},
	[IORING_OP_ACCEPT] = {
		.unbound_nonreg_file	= 1,
		.poll_exclusive		= 1,
	},
	[IORING_OP_ASYNC_CANCEL] = {
	},
	[IORING_OP_LINK_TIMEOUT] = {
		.async_size		= sizeof(struct io_timeout_data),
	},
	[IORING_OP_CONNECT] = {
		.unbound_nonreg_file	= 1,
		.needs_async_setup	= 1,
		.async_size		= sizeof(struct io_async_connect),
	},
	[IORING_OP_FALLOCATE] = {
	},
	[IORING_OP_OPENAT] = {},
	[IORING_OP_CLOSE] = {},
	[IORING_OP_FILES_UPDATE] = {
	},
	[IORING_OP_STATX] = {
	},
	[IORING_OP_READ] = {
		.unbound_nonreg_file	= 1,
		.async_size		= sizeof(struct io_async_rw),
	},
	[IORING_OP_WRITE] = {
		.unbound_nonreg_file	= 1,
		.async_size		= sizeof(struct io_async_rw),
	},
	[IORING_OP_FADVISE] = {
	},
	[IORING_OP_MADVISE] = {},
	[IORING_OP_SEND] = {
		.unbound_nonreg_file	= 1,
	},
	[IORING_OP_RECV] = {
		.unbound_nonreg_file	= 1,
	},
	[IORING_OP_OPENAT2] = {
	},
	[IORING_OP_EPOLL_CTL] = {
		.unbound_nonreg_file	= 1,
	},
	[IORING_OP_SPLICE] = {
		.unbound_nonreg_file	= 1,
	},
	[IORING_OP_PROVIDE_BUFFERS] = {
	},
	[IORING_OP_REMOVE_BUFFERS] = {
	},
	[IORING_OP_TEE] = {
		.unbound_nonreg_file	= 1,
	},
	[IORING_OP_SHUTDOWN] = {
	},
	[IORING_OP_RENAMEAT] = {},
	[IORING_OP_UNLINKAT] = {},
	[IORING_OP_MKDIRAT] = {},
	[IORING_OP_SYMLINKAT] = {},
	[IORING_OP_LINKAT] = {},
	[IORING_OP_MSG_RING] = {
	},
};
/* requests with any of those set should undergo io_disarm_next() */
#define IO_DISARM_MASK (REQ_F_ARM_LTIMEOUT | REQ_F_LINK_TIMEOUT | REQ_F_FAIL)

static bool io_disarm_next(struct io_kiocb *req);
static void io_uring_del_tctx_node(unsigned long index);
static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
					 struct task_struct *task,
					 bool cancel_all);
static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd);

static void io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags);

static void io_put_req(struct io_kiocb *req);
static void io_put_req_deferred(struct io_kiocb *req);
static void io_dismantle_req(struct io_kiocb *req);
static void io_queue_linked_timeout(struct io_kiocb *req);
static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
				     struct io_uring_rsrc_update2 *up,
				     unsigned nr_args);
static void io_clean_op(struct io_kiocb *req);
static inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd,
					     unsigned issue_flags);
static inline struct file *io_file_get_normal(struct io_kiocb *req, int fd);
static void __io_queue_sqe(struct io_kiocb *req);
static void io_rsrc_put_work(struct work_struct *work);

static void io_req_task_queue(struct io_kiocb *req);
static void __io_submit_flush_completions(struct io_ring_ctx *ctx);
static int io_req_prep_async(struct io_kiocb *req);

static int io_install_fixed_file(struct io_kiocb *req, struct file *file,
				 unsigned int issue_flags, u32 slot_index);
static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags);

static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer);
static void io_eventfd_signal(struct io_ring_ctx *ctx);

static struct kmem_cache *req_cachep;

static const struct file_operations io_uring_fops;
struct sock *io_uring_get_socket(struct file *file)
{
#if defined(CONFIG_UNIX)
	if (file->f_op == &io_uring_fops) {
		struct io_ring_ctx *ctx = file->private_data;

		return ctx->ring_sock->sk;
	}
#endif
	return NULL;
}
EXPORT_SYMBOL(io_uring_get_socket);

static inline void io_tw_lock(struct io_ring_ctx *ctx, bool *locked)
{
	if (!*locked) {
		mutex_lock(&ctx->uring_lock);
		*locked = true;
	}
}

#define io_for_each_link(pos, head) \
	for (pos = (head); pos; pos = pos->link)

/*
 * Shamelessly stolen from the mm implementation of page reference checking,
 * see commit f958d7b528b1 for details.
 */
#define req_ref_zero_or_close_to_overflow(req)	\
	((unsigned int) atomic_read(&(req->refs)) + 127u <= 127u)
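/*
 * Worked example of the check above (illustrative only, not from the
 * original source): with refs == 0 the unsigned sum is 127, so the macro is
 * true; with refs == 1 the sum is 128 and it is false; with refs ==
 * 0xffffffff (i.e. within 127 of wrapping) the sum wraps around to 126 and
 * the macro is true again, catching both underflow and near-overflow.
 */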
static inline bool req_ref_inc_not_zero(struct io_kiocb *req)
{
	WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT));
	return atomic_inc_not_zero(&req->refs);
}

static inline bool req_ref_put_and_test(struct io_kiocb *req)
{
	if (likely(!(req->flags & REQ_F_REFCOUNT)))
		return true;

	WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req));
	return atomic_dec_and_test(&req->refs);
}

static inline void req_ref_get(struct io_kiocb *req)
{
	WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT));
	WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req));
	atomic_inc(&req->refs);
}

static inline void io_submit_flush_completions(struct io_ring_ctx *ctx)
{
	if (!wq_list_empty(&ctx->submit_state.compl_reqs))
		__io_submit_flush_completions(ctx);
}

static inline void __io_req_set_refcount(struct io_kiocb *req, int nr)
{
	if (!(req->flags & REQ_F_REFCOUNT)) {
		req->flags |= REQ_F_REFCOUNT;
		atomic_set(&req->refs, nr);
	}
}

static inline void io_req_set_refcount(struct io_kiocb *req)
{
	__io_req_set_refcount(req, 1);
}
#define IO_RSRC_REF_BATCH	100

static inline void io_req_put_rsrc_locked(struct io_kiocb *req,
					  struct io_ring_ctx *ctx)
	__must_hold(&ctx->uring_lock)
{
	struct percpu_ref *ref = req->fixed_rsrc_refs;

	if (ref) {
		if (ref == &ctx->rsrc_node->refs)
			ctx->rsrc_cached_refs++;
		else
			percpu_ref_put(ref);
	}
}

static inline void io_req_put_rsrc(struct io_kiocb *req, struct io_ring_ctx *ctx)
{
	if (req->fixed_rsrc_refs)
		percpu_ref_put(req->fixed_rsrc_refs);
}

static __cold void io_rsrc_refs_drop(struct io_ring_ctx *ctx)
	__must_hold(&ctx->uring_lock)
{
	if (ctx->rsrc_cached_refs) {
		percpu_ref_put_many(&ctx->rsrc_node->refs, ctx->rsrc_cached_refs);
		ctx->rsrc_cached_refs = 0;
	}
}

static void io_rsrc_refs_refill(struct io_ring_ctx *ctx)
	__must_hold(&ctx->uring_lock)
{
	ctx->rsrc_cached_refs += IO_RSRC_REF_BATCH;
	percpu_ref_get_many(&ctx->rsrc_node->refs, IO_RSRC_REF_BATCH);
}

static inline void io_req_set_rsrc_node(struct io_kiocb *req,
					struct io_ring_ctx *ctx,
					unsigned int issue_flags)
{
	if (!req->fixed_rsrc_refs) {
		req->fixed_rsrc_refs = &ctx->rsrc_node->refs;

		if (!(issue_flags & IO_URING_F_UNLOCKED)) {
			lockdep_assert_held(&ctx->uring_lock);
			ctx->rsrc_cached_refs--;
			if (unlikely(ctx->rsrc_cached_refs < 0))
				io_rsrc_refs_refill(ctx);
		} else {
			percpu_ref_get(req->fixed_rsrc_refs);
		}
	}
}
static unsigned int __io_put_kbuf(struct io_kiocb *req, struct list_head *list)
{
	struct io_buffer *kbuf = req->kbuf;
	unsigned int cflags;

	cflags = IORING_CQE_F_BUFFER | (kbuf->bid << IORING_CQE_BUFFER_SHIFT);
	req->flags &= ~REQ_F_BUFFER_SELECTED;
	list_add(&kbuf->list, list);
	req->kbuf = NULL;
	return cflags;
}

static inline unsigned int io_put_kbuf_comp(struct io_kiocb *req)
{
	lockdep_assert_held(&req->ctx->completion_lock);

	if (likely(!(req->flags & REQ_F_BUFFER_SELECTED)))
		return 0;
	return __io_put_kbuf(req, &req->ctx->io_buffers_comp);
}

static inline unsigned int io_put_kbuf(struct io_kiocb *req,
				       unsigned issue_flags)
{
	unsigned int cflags;

	if (likely(!(req->flags & REQ_F_BUFFER_SELECTED)))
		return 0;

	/*
	 * We can add this buffer back to two lists:
	 *
	 * 1) The io_buffers_cache list. This one is protected by the
	 *    ctx->uring_lock. If we already hold this lock, add back to this
	 *    list as we can grab it from issue as well.
	 * 2) The io_buffers_comp list. This one is protected by the
	 *    ctx->completion_lock.
	 *
	 * We migrate buffers from the comp_list to the issue cache list
	 * when we need one.
	 */
	if (issue_flags & IO_URING_F_UNLOCKED) {
		struct io_ring_ctx *ctx = req->ctx;

		spin_lock(&ctx->completion_lock);
		cflags = __io_put_kbuf(req, &ctx->io_buffers_comp);
		spin_unlock(&ctx->completion_lock);
	} else {
		lockdep_assert_held(&req->ctx->uring_lock);

		cflags = __io_put_kbuf(req, &req->ctx->io_buffers_cache);
	}

	return cflags;
}
static struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx,
						 unsigned int bgid)
{
	struct list_head *hash_list;
	struct io_buffer_list *bl;

	hash_list = &ctx->io_buffers[hash_32(bgid, IO_BUFFERS_HASH_BITS)];
	list_for_each_entry(bl, hash_list, list)
		if (bl->bgid == bgid || bgid == -1U)
			return bl;

	return NULL;
}

static void io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags)
{
	struct io_ring_ctx *ctx = req->ctx;
	struct io_buffer_list *bl;
	struct io_buffer *buf;

	if (likely(!(req->flags & REQ_F_BUFFER_SELECTED)))
		return;
	/* don't recycle if we already did IO to this buffer */
	if (req->flags & REQ_F_PARTIAL_IO)
		return;

	if (issue_flags & IO_URING_F_UNLOCKED)
		mutex_lock(&ctx->uring_lock);

	lockdep_assert_held(&ctx->uring_lock);

	buf = req->kbuf;
	bl = io_buffer_get_list(ctx, buf->bgid);
	list_add(&buf->list, &bl->buf_list);
	req->flags &= ~REQ_F_BUFFER_SELECTED;
	req->kbuf = NULL;

	if (issue_flags & IO_URING_F_UNLOCKED)
		mutex_unlock(&ctx->uring_lock);
}
static bool io_match_task(struct io_kiocb *head, struct task_struct *task,
			  bool cancel_all)
	__must_hold(&req->ctx->timeout_lock)
{
	struct io_kiocb *req;

	if (task && head->task != task)
		return false;
	if (cancel_all)
		return true;

	io_for_each_link(req, head) {
		if (req->flags & REQ_F_INFLIGHT)
			return true;
	}
	return false;
}

static bool io_match_linked(struct io_kiocb *head)
{
	struct io_kiocb *req;

	io_for_each_link(req, head) {
		if (req->flags & REQ_F_INFLIGHT)
			return true;
	}
	return false;
}

/*
 * As io_match_task() but protected against racing with linked timeouts.
 * User must not hold timeout_lock.
 */
static bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task,
			       bool cancel_all)
{
	bool matched;

	if (task && head->task != task)
		return false;
	if (cancel_all)
		return true;

	if (head->flags & REQ_F_LINK_TIMEOUT) {
		struct io_ring_ctx *ctx = head->ctx;

		/* protect against races with linked timeouts */
		spin_lock_irq(&ctx->timeout_lock);
		matched = io_match_linked(head);
		spin_unlock_irq(&ctx->timeout_lock);
	} else {
		matched = io_match_linked(head);
	}
	return matched;
}
static inline bool req_has_async_data(struct io_kiocb *req)
{
	return req->flags & REQ_F_ASYNC_DATA;
}

static inline void req_set_fail(struct io_kiocb *req)
{
	req->flags |= REQ_F_FAIL;
	if (req->flags & REQ_F_CQE_SKIP) {
		req->flags &= ~REQ_F_CQE_SKIP;
		req->flags |= REQ_F_SKIP_LINK_CQES;
	}
}

static inline void req_fail_link_node(struct io_kiocb *req, int res)
{
	req_set_fail(req);
	req->result = res;
}

static __cold void io_ring_ctx_ref_free(struct percpu_ref *ref)
{
	struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);

	complete(&ctx->ref_comp);
}

static inline bool io_is_timeout_noseq(struct io_kiocb *req)
{
	return !req->timeout.off;
}
static __cold void io_fallback_req_func(struct work_struct *work)
{
	struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx,
						fallback_work.work);
	struct llist_node *node = llist_del_all(&ctx->fallback_llist);
	struct io_kiocb *req, *tmp;
	bool locked = false;

	percpu_ref_get(&ctx->refs);
	llist_for_each_entry_safe(req, tmp, node, io_task_work.fallback_node)
		req->io_task_work.func(req, &locked);

	if (locked) {
		io_submit_flush_completions(ctx);
		mutex_unlock(&ctx->uring_lock);
	}
	percpu_ref_put(&ctx->refs);
}
static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
{
	struct io_ring_ctx *ctx;
	int i, hash_bits;

	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
	if (!ctx)
		return NULL;

	/*
	 * Use 5 bits less than the max cq entries, that should give us around
	 * 32 entries per hash list if totally full and uniformly spread.
	 */
	hash_bits = ilog2(p->cq_entries);
	hash_bits -= 5;
	if (hash_bits <= 0)
		hash_bits = 1;
	ctx->cancel_hash_bits = hash_bits;
	ctx->cancel_hash = kmalloc((1U << hash_bits) * sizeof(struct hlist_head),
					GFP_KERNEL);
	if (!ctx->cancel_hash)
		goto err;
	__hash_init(ctx->cancel_hash, 1U << hash_bits);

	ctx->dummy_ubuf = kzalloc(sizeof(*ctx->dummy_ubuf), GFP_KERNEL);
	if (!ctx->dummy_ubuf)
		goto err;
	/* set invalid range, so io_import_fixed() fails meeting it */
	ctx->dummy_ubuf->ubuf = -1UL;

	ctx->io_buffers = kcalloc(1U << IO_BUFFERS_HASH_BITS,
					sizeof(struct list_head), GFP_KERNEL);
	if (!ctx->io_buffers)
		goto err;
	for (i = 0; i < (1U << IO_BUFFERS_HASH_BITS); i++)
		INIT_LIST_HEAD(&ctx->io_buffers[i]);

	if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free,
			    PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
		goto err;

	ctx->flags = p->flags;
	init_waitqueue_head(&ctx->sqo_sq_wait);
	INIT_LIST_HEAD(&ctx->sqd_list);
	INIT_LIST_HEAD(&ctx->cq_overflow_list);
	INIT_LIST_HEAD(&ctx->io_buffers_cache);
	INIT_LIST_HEAD(&ctx->apoll_cache);
	init_completion(&ctx->ref_comp);
	xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1);
	mutex_init(&ctx->uring_lock);
	init_waitqueue_head(&ctx->cq_wait);
	spin_lock_init(&ctx->completion_lock);
	spin_lock_init(&ctx->timeout_lock);
	INIT_WQ_LIST(&ctx->iopoll_list);
	INIT_LIST_HEAD(&ctx->io_buffers_pages);
	INIT_LIST_HEAD(&ctx->io_buffers_comp);
	INIT_LIST_HEAD(&ctx->defer_list);
	INIT_LIST_HEAD(&ctx->timeout_list);
	INIT_LIST_HEAD(&ctx->ltimeout_list);
	spin_lock_init(&ctx->rsrc_ref_lock);
	INIT_LIST_HEAD(&ctx->rsrc_ref_list);
	INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work);
	init_llist_head(&ctx->rsrc_put_llist);
	INIT_LIST_HEAD(&ctx->tctx_list);
	ctx->submit_state.free_list.next = NULL;
	INIT_WQ_LIST(&ctx->locked_free_list);
	INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func);
	INIT_WQ_LIST(&ctx->submit_state.compl_reqs);
	return ctx;
err:
	kfree(ctx->dummy_ubuf);
	kfree(ctx->cancel_hash);
	kfree(ctx->io_buffers);
	kfree(ctx);
	return NULL;
}
static void io_account_cq_overflow(struct io_ring_ctx *ctx)
{
	struct io_rings *r = ctx->rings;

	WRITE_ONCE(r->cq_overflow, READ_ONCE(r->cq_overflow) + 1);
	ctx->cq_extra--;
}

static bool req_need_defer(struct io_kiocb *req, u32 seq)
{
	if (unlikely(req->flags & REQ_F_IO_DRAIN)) {
		struct io_ring_ctx *ctx = req->ctx;

		return seq + READ_ONCE(ctx->cq_extra) != ctx->cached_cq_tail;
	}

	return false;
}
#define FFS_NOWAIT		0x1UL
#define FFS_ISREG		0x2UL
#define FFS_MASK		~(FFS_NOWAIT|FFS_ISREG)

static inline bool io_req_ffs_set(struct io_kiocb *req)
{
	return req->flags & REQ_F_FIXED_FILE;
}

static inline void io_req_track_inflight(struct io_kiocb *req)
{
	if (!(req->flags & REQ_F_INFLIGHT)) {
		req->flags |= REQ_F_INFLIGHT;
		atomic_inc(&req->task->io_uring->inflight_tracked);
	}
}
static struct io_kiocb *__io_prep_linked_timeout(struct io_kiocb *req)
{
	if (WARN_ON_ONCE(!req->link))
		return NULL;

	req->flags &= ~REQ_F_ARM_LTIMEOUT;
	req->flags |= REQ_F_LINK_TIMEOUT;

	/* linked timeouts should have two refs once prep'ed */
	io_req_set_refcount(req);
	__io_req_set_refcount(req->link, 2);
	return req->link;
}

static inline struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
{
	if (likely(!(req->flags & REQ_F_ARM_LTIMEOUT)))
		return NULL;
	return __io_prep_linked_timeout(req);
}
static void io_prep_async_work(struct io_kiocb *req)
{
	const struct io_op_def *def = &io_op_defs[req->opcode];
	struct io_ring_ctx *ctx = req->ctx;

	if (!(req->flags & REQ_F_CREDS)) {
		req->flags |= REQ_F_CREDS;
		req->creds = get_current_cred();
	}

	req->work.list.next = NULL;
	req->work.flags = 0;
	if (req->flags & REQ_F_FORCE_ASYNC)
		req->work.flags |= IO_WQ_WORK_CONCURRENT;

	if (req->flags & REQ_F_ISREG) {
		if (def->hash_reg_file || (ctx->flags & IORING_SETUP_IOPOLL))
			io_wq_hash_work(&req->work, file_inode(req->file));
	} else if (!req->file || !S_ISBLK(file_inode(req->file)->i_mode)) {
		if (def->unbound_nonreg_file)
			req->work.flags |= IO_WQ_WORK_UNBOUND;
	}
}
static void io_prep_async_link(struct io_kiocb *req)
{
	struct io_kiocb *cur;

	if (req->flags & REQ_F_LINK_TIMEOUT) {
		struct io_ring_ctx *ctx = req->ctx;

		spin_lock_irq(&ctx->timeout_lock);
		io_for_each_link(cur, req)
			io_prep_async_work(cur);
		spin_unlock_irq(&ctx->timeout_lock);
	} else {
		io_for_each_link(cur, req)
			io_prep_async_work(cur);
	}
}

static inline void io_req_add_compl_list(struct io_kiocb *req)
{
	struct io_ring_ctx *ctx = req->ctx;
	struct io_submit_state *state = &ctx->submit_state;

	if (!(req->flags & REQ_F_CQE_SKIP))
		ctx->submit_state.flush_cqes = true;
	wq_list_add_tail(&req->comp_list, &state->compl_reqs);
}
static void io_queue_async_work(struct io_kiocb *req, bool *dont_use)
{
	struct io_ring_ctx *ctx = req->ctx;
	struct io_kiocb *link = io_prep_linked_timeout(req);
	struct io_uring_task *tctx = req->task->io_uring;

	BUG_ON(!tctx->io_wq);

	/* init ->work of the whole link before punting */
	io_prep_async_link(req);

	/*
	 * Not expected to happen, but if we do have a bug where this _can_
	 * happen, catch it here and ensure the request is marked as
	 * canceled. That will make io-wq go through the usual work cancel
	 * procedure rather than attempt to run this request (or create a new
	 * worker for it).
	 */
	if (WARN_ON_ONCE(!same_thread_group(req->task, current)))
		req->work.flags |= IO_WQ_WORK_CANCEL;

	trace_io_uring_queue_async_work(ctx, req, req->user_data, req->opcode, req->flags,
					&req->work, io_wq_is_hashed(&req->work));
	io_wq_enqueue(tctx->io_wq, &req->work);
	if (link)
		io_queue_linked_timeout(link);
}
static void io_kill_timeout(struct io_kiocb *req, int status)
	__must_hold(&req->ctx->completion_lock)
	__must_hold(&req->ctx->timeout_lock)
{
	struct io_timeout_data *io = req->async_data;

	if (hrtimer_try_to_cancel(&io->timer) != -1) {
		if (status)
			req_set_fail(req);
		atomic_set(&req->ctx->cq_timeouts,
			atomic_read(&req->ctx->cq_timeouts) + 1);
		list_del_init(&req->timeout.list);
		io_fill_cqe_req(req, status, 0);
		io_put_req_deferred(req);
	}
}
static __cold void io_queue_deferred(struct io_ring_ctx *ctx)
{
	while (!list_empty(&ctx->defer_list)) {
		struct io_defer_entry *de = list_first_entry(&ctx->defer_list,
						struct io_defer_entry, list);

		if (req_need_defer(de->req, de->seq))
			break;
		list_del_init(&de->list);
		io_req_task_queue(de->req);
		kfree(de);
	}
}
static __cold void io_flush_timeouts(struct io_ring_ctx *ctx)
	__must_hold(&ctx->completion_lock)
{
	u32 seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);
	struct io_kiocb *req, *tmp;

	spin_lock_irq(&ctx->timeout_lock);
	list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) {
		u32 events_needed, events_got;

		if (io_is_timeout_noseq(req))
			continue;

		/*
		 * Since seq can easily wrap around over time, subtract
		 * the last seq at which timeouts were flushed before comparing.
		 * Assuming not more than 2^31-1 events have happened since,
		 * these subtractions won't have wrapped, so we can check if
		 * target is in [last_seq, current_seq] by comparing the two.
		 */
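		/*
		 * Illustrative numbers (not from the original source): with
		 * cq_last_tm_flush == 0xfffffff0, target_seq == 0x10 and
		 * seq == 0x12, events_needed is 0x20 and events_got is 0x22,
		 * so the timeout still fires correctly even though both raw
		 * counters have wrapped past zero.
		 */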
		events_needed = req->timeout.target_seq - ctx->cq_last_tm_flush;
		events_got = seq - ctx->cq_last_tm_flush;
		if (events_got < events_needed)
			break;

		io_kill_timeout(req, 0);
	}
	ctx->cq_last_tm_flush = seq;
	spin_unlock_irq(&ctx->timeout_lock);
}
static inline void io_commit_cqring(struct io_ring_ctx *ctx)
{
	/* order cqe stores with ring update */
	smp_store_release(&ctx->rings->cq.tail, ctx->cached_cq_tail);
}

static void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
{
	if (ctx->off_timeout_used || ctx->drain_active) {
		spin_lock(&ctx->completion_lock);
		if (ctx->off_timeout_used)
			io_flush_timeouts(ctx);
		if (ctx->drain_active)
			io_queue_deferred(ctx);
		io_commit_cqring(ctx);
		spin_unlock(&ctx->completion_lock);
	}
	if (ctx->has_evfd)
		io_eventfd_signal(ctx);
}
static inline bool io_sqring_full(struct io_ring_ctx *ctx)
{
	struct io_rings *r = ctx->rings;

	return READ_ONCE(r->sq.tail) - ctx->cached_sq_head == ctx->sq_entries;
}

static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx)
{
	return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head);
}

static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx)
{
	struct io_rings *rings = ctx->rings;
	unsigned tail, mask = ctx->cq_entries - 1;

	/*
	 * writes to the cq entry need to come after reading head; the
	 * control dependency is enough as we're using WRITE_ONCE to
	 * fill the cq entry
	 */
	if (__io_cqring_events(ctx) == ctx->cq_entries)
		return NULL;

	tail = ctx->cached_cq_tail++;
	return &rings->cqes[tail & mask];
}
static void io_eventfd_signal(struct io_ring_ctx *ctx)
{
	struct io_ev_fd *ev_fd;

	rcu_read_lock();
	/*
	 * rcu_dereference ctx->io_ev_fd once and use it both for checking
	 * and eventfd_signal
	 */
	ev_fd = rcu_dereference(ctx->io_ev_fd);

	/*
	 * Check again if ev_fd exists in case an io_eventfd_unregister call
	 * completed between the NULL check of ctx->io_ev_fd at the start of
	 * the function and rcu_read_lock.
	 */
	if (unlikely(!ev_fd))
		goto out;
	if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
		goto out;

	if (!ev_fd->eventfd_async || io_wq_current_is_worker())
		eventfd_signal(ev_fd->cq_ev_fd, 1);
out:
	rcu_read_unlock();
}
static inline void io_cqring_wake(struct io_ring_ctx *ctx)
{
	/*
	 * wake_up_all() may seem excessive, but io_wake_function() and
	 * io_should_wake() handle the termination of the loop and only
	 * wake as many waiters as we need to.
	 */
	if (wq_has_sleeper(&ctx->cq_wait))
		wake_up_all(&ctx->cq_wait);
}

/*
 * This should only get called when at least one event has been posted.
 * Some applications rely on the eventfd notification count only changing
 * IFF a new CQE has been added to the CQ ring. There's no dependency on
 * 1:1 relationship between how many times this function is called (and
 * hence the eventfd count) and number of CQEs posted to the CQ ring.
 */
static inline void io_cqring_ev_posted(struct io_ring_ctx *ctx)
{
	if (unlikely(ctx->off_timeout_used || ctx->drain_active ||
		     ctx->has_evfd))
		__io_commit_cqring_flush(ctx);

	io_cqring_wake(ctx);
}

static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx)
{
	if (unlikely(ctx->off_timeout_used || ctx->drain_active ||
		     ctx->has_evfd))
		__io_commit_cqring_flush(ctx);

	if (ctx->flags & IORING_SETUP_SQPOLL)
		io_cqring_wake(ctx);
}
/* Returns true if there are no backlogged entries after the flush */
static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
{
	bool all_flushed, posted;

	if (!force && __io_cqring_events(ctx) == ctx->cq_entries)
		return false;

	posted = false;
	spin_lock(&ctx->completion_lock);
	while (!list_empty(&ctx->cq_overflow_list)) {
		struct io_uring_cqe *cqe = io_get_cqe(ctx);
		struct io_overflow_cqe *ocqe;

		if (!cqe && !force)
			break;
		ocqe = list_first_entry(&ctx->cq_overflow_list,
					struct io_overflow_cqe, list);
		if (cqe)
			memcpy(cqe, &ocqe->cqe, sizeof(*cqe));
		else
			io_account_cq_overflow(ctx);

		posted = true;
		list_del(&ocqe->list);
		kfree(ocqe);
	}

	all_flushed = list_empty(&ctx->cq_overflow_list);
	if (all_flushed) {
		clear_bit(0, &ctx->check_cq_overflow);
		WRITE_ONCE(ctx->rings->sq_flags,
			   ctx->rings->sq_flags & ~IORING_SQ_CQ_OVERFLOW);
	}

	if (posted)
		io_commit_cqring(ctx);
	spin_unlock(&ctx->completion_lock);
	if (posted)
		io_cqring_ev_posted(ctx);
	return all_flushed;
}

static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx)
{
	bool ret = true;

	if (test_bit(0, &ctx->check_cq_overflow)) {
		/* iopoll syncs against uring_lock, not completion_lock */
		if (ctx->flags & IORING_SETUP_IOPOLL)
			mutex_lock(&ctx->uring_lock);
		ret = __io_cqring_overflow_flush(ctx, false);
		if (ctx->flags & IORING_SETUP_IOPOLL)
			mutex_unlock(&ctx->uring_lock);
	}

	return ret;
}
/* must be called somewhat shortly after putting a request */
static inline void io_put_task(struct task_struct *task, int nr)
{
	struct io_uring_task *tctx = task->io_uring;

	if (likely(task == current)) {
		tctx->cached_refs += nr;
	} else {
		percpu_counter_sub(&tctx->inflight, nr);
		if (unlikely(atomic_read(&tctx->in_idle)))
			wake_up(&tctx->wait);
		put_task_struct_many(task, nr);
	}
}
static void io_task_refs_refill(struct io_uring_task *tctx)
{
	unsigned int refill = -tctx->cached_refs + IO_TCTX_REFS_CACHE_NR;

	percpu_counter_add(&tctx->inflight, refill);
	refcount_add(refill, &current->usage);
	tctx->cached_refs += refill;
}

static inline void io_get_task_refs(int nr)
{
	struct io_uring_task *tctx = current->io_uring;

	tctx->cached_refs -= nr;
	if (unlikely(tctx->cached_refs < 0))
		io_task_refs_refill(tctx);
}

static __cold void io_uring_drop_tctx_refs(struct task_struct *task)
{
	struct io_uring_task *tctx = task->io_uring;
	unsigned int refs = tctx->cached_refs;

	if (refs) {
		tctx->cached_refs = 0;
		percpu_counter_sub(&tctx->inflight, refs);
		put_task_struct_many(task, refs);
	}
}
static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
				     s32 res, u32 cflags)
{
	struct io_overflow_cqe *ocqe;

	ocqe = kmalloc(sizeof(*ocqe), GFP_ATOMIC | __GFP_ACCOUNT);
	if (!ocqe) {
		/*
		 * If we're in ring overflow flush mode, or in task cancel mode,
		 * or cannot allocate an overflow entry, then we need to drop it
		 * on the floor.
		 */
		io_account_cq_overflow(ctx);
		return false;
	}
	if (list_empty(&ctx->cq_overflow_list)) {
		set_bit(0, &ctx->check_cq_overflow);
		WRITE_ONCE(ctx->rings->sq_flags,
			   ctx->rings->sq_flags | IORING_SQ_CQ_OVERFLOW);

	}
	ocqe->cqe.user_data = user_data;
	ocqe->cqe.res = res;
	ocqe->cqe.flags = cflags;
	list_add_tail(&ocqe->list, &ctx->cq_overflow_list);
	return true;
}
static inline bool __io_fill_cqe(struct io_ring_ctx *ctx, u64 user_data,
				 s32 res, u32 cflags)
{
	struct io_uring_cqe *cqe;

	/*
	 * If we can't get a cq entry, userspace overflowed the
	 * submission (by quite a lot). Increment the overflow count in
	 * the ring.
	 */
	cqe = io_get_cqe(ctx);
	if (likely(cqe)) {
		WRITE_ONCE(cqe->user_data, user_data);
		WRITE_ONCE(cqe->res, res);
		WRITE_ONCE(cqe->flags, cflags);
		return true;
	}
	return io_cqring_event_overflow(ctx, user_data, res, cflags);
}

static inline bool __io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags)
{
	trace_io_uring_complete(req->ctx, req, req->user_data, res, cflags);
	return __io_fill_cqe(req->ctx, req->user_data, res, cflags);
}

static noinline void io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags)
{
	if (!(req->flags & REQ_F_CQE_SKIP))
		__io_fill_cqe_req(req, res, cflags);
}

static noinline bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data,
				     s32 res, u32 cflags)
{
	ctx->cq_extra++;
	trace_io_uring_complete(ctx, NULL, user_data, res, cflags);
	return __io_fill_cqe(ctx, user_data, res, cflags);
}
static void __io_req_complete_post(struct io_kiocb *req, s32 res,
				   u32 cflags)
{
	struct io_ring_ctx *ctx = req->ctx;

	if (!(req->flags & REQ_F_CQE_SKIP))
		__io_fill_cqe_req(req, res, cflags);
	/*
	 * If we're the last reference to this request, add to our locked
	 * free_list cache.
	 */
	if (req_ref_put_and_test(req)) {
		if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) {
			if (req->flags & IO_DISARM_MASK)
				io_disarm_next(req);
			if (req->link) {
				io_req_task_queue(req->link);
				req->link = NULL;
			}
		}
		io_req_put_rsrc(req, ctx);
		/*
		 * Selected buffer deallocation in io_clean_op() assumes that
		 * we don't hold ->completion_lock. Clean them here to avoid
		 * deadlocks.
		 */
		io_put_kbuf_comp(req);
		io_dismantle_req(req);
		io_put_task(req->task, 1);
		wq_list_add_head(&req->comp_list, &ctx->locked_free_list);
		ctx->locked_free_nr++;
	}
}

static void io_req_complete_post(struct io_kiocb *req, s32 res,
				 u32 cflags)
{
	struct io_ring_ctx *ctx = req->ctx;

	spin_lock(&ctx->completion_lock);
	__io_req_complete_post(req, res, cflags);
	io_commit_cqring(ctx);
	spin_unlock(&ctx->completion_lock);
	io_cqring_ev_posted(ctx);
}
static inline void io_req_complete_state(struct io_kiocb *req, s32 res,
					 u32 cflags)
{
	req->result = res;
	req->cflags = cflags;
	req->flags |= REQ_F_COMPLETE_INLINE;
}

static inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags,
				     s32 res, u32 cflags)
{
	if (issue_flags & IO_URING_F_COMPLETE_DEFER)
		io_req_complete_state(req, res, cflags);
	else
		io_req_complete_post(req, res, cflags);
}

static inline void io_req_complete(struct io_kiocb *req, s32 res)
{
	__io_req_complete(req, 0, res, 0);
}

static void io_req_complete_failed(struct io_kiocb *req, s32 res)
{
	req_set_fail(req);
	io_req_complete_post(req, res, io_put_kbuf(req, IO_URING_F_UNLOCKED));
}

static void io_req_complete_fail_submit(struct io_kiocb *req)
{
	/*
	 * We don't submit, fail them all, for that replace hardlinks with
	 * normal links. Extra REQ_F_LINK is tolerated.
	 */
	req->flags &= ~REQ_F_HARDLINK;
	req->flags |= REQ_F_LINK;
	io_req_complete_failed(req, req->result);
}
/*
 * Don't initialise the fields below on every allocation, but do that in
 * advance and keep them valid across allocations.
 */
static void io_preinit_req(struct io_kiocb *req, struct io_ring_ctx *ctx)
{
	req->ctx = ctx;
	req->link = NULL;
	req->async_data = NULL;
	/* not necessary, but safer to zero */
	req->result = 0;
}

static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx,
					struct io_submit_state *state)
{
	spin_lock(&ctx->completion_lock);
	wq_list_splice(&ctx->locked_free_list, &state->free_list);
	ctx->locked_free_nr = 0;
	spin_unlock(&ctx->completion_lock);
}

/* Returns true IFF there are requests in the cache */
static bool io_flush_cached_reqs(struct io_ring_ctx *ctx)
{
	struct io_submit_state *state = &ctx->submit_state;

	/*
	 * If we have more than a batch's worth of requests in our IRQ side
	 * locked cache, grab the lock and move them over to our submission
	 * side cache.
	 */
	if (READ_ONCE(ctx->locked_free_nr) > IO_COMPL_BATCH)
		io_flush_cached_locked_reqs(ctx, state);
	return !!state->free_list.next;
}
/*
 * A request might get retired back into the request caches even before opcode
 * handlers and io_issue_sqe() are done with it, e.g. inline completion path.
 * Because of that, io_alloc_req() should be called only under ->uring_lock
 * and with extra caution to not get a request that is still worked on.
 */
static __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx)
	__must_hold(&ctx->uring_lock)
{
	struct io_submit_state *state = &ctx->submit_state;
	gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
	void *reqs[IO_REQ_ALLOC_BATCH];
	struct io_kiocb *req;
	int ret, i;

	if (likely(state->free_list.next || io_flush_cached_reqs(ctx)))
		return true;

	ret = kmem_cache_alloc_bulk(req_cachep, gfp, ARRAY_SIZE(reqs), reqs);

	/*
	 * Bulk alloc is all-or-nothing. If we fail to get a batch,
	 * retry single alloc to be on the safe side.
	 */
	if (unlikely(ret <= 0)) {
		reqs[0] = kmem_cache_alloc(req_cachep, gfp);
		if (!reqs[0])
			return false;
		ret = 1;
	}

	percpu_ref_get_many(&ctx->refs, ret);
	for (i = 0; i < ret; i++) {
		req = reqs[i];

		io_preinit_req(req, ctx);
		wq_stack_add_head(&req->comp_list, &state->free_list);
	}
	return true;
}

static inline bool io_alloc_req_refill(struct io_ring_ctx *ctx)
{
	if (unlikely(!ctx->submit_state.free_list.next))
		return __io_alloc_req_refill(ctx);
	return true;
}

static inline struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx)
{
	struct io_wq_work_node *node;

	node = wq_stack_extract(&ctx->submit_state.free_list);
	return container_of(node, struct io_kiocb, comp_list);
}
static inline void io_put_file(struct file *file)
{
	if (file)
		fput(file);
}

static inline void io_dismantle_req(struct io_kiocb *req)
{
	unsigned int flags = req->flags;

	if (unlikely(flags & IO_REQ_CLEAN_FLAGS))
		io_clean_op(req);
	if (!(flags & REQ_F_FIXED_FILE))
		io_put_file(req->file);
}

static __cold void __io_free_req(struct io_kiocb *req)
{
	struct io_ring_ctx *ctx = req->ctx;

	io_req_put_rsrc(req, ctx);
	io_dismantle_req(req);
	io_put_task(req->task, 1);

	spin_lock(&ctx->completion_lock);
	wq_list_add_head(&req->comp_list, &ctx->locked_free_list);
	ctx->locked_free_nr++;
	spin_unlock(&ctx->completion_lock);
}
static inline void io_remove_next_linked(struct io_kiocb *req)
{
	struct io_kiocb *nxt = req->link;

	req->link = nxt->link;
	nxt->link = NULL;
}

static bool io_kill_linked_timeout(struct io_kiocb *req)
	__must_hold(&req->ctx->completion_lock)
	__must_hold(&req->ctx->timeout_lock)
{
	struct io_kiocb *link = req->link;

	if (link && link->opcode == IORING_OP_LINK_TIMEOUT) {
		struct io_timeout_data *io = link->async_data;

		io_remove_next_linked(req);
		link->timeout.head = NULL;
		if (hrtimer_try_to_cancel(&io->timer) != -1) {
			list_del(&link->timeout.list);
			/* leave REQ_F_CQE_SKIP to io_fill_cqe_req */
			io_fill_cqe_req(link, -ECANCELED, 0);
			io_put_req_deferred(link);
			return true;
		}
	}
	return false;
}
static void io_fail_links(struct io_kiocb *req)
	__must_hold(&req->ctx->completion_lock)
{
	struct io_kiocb *nxt, *link = req->link;
	bool ignore_cqes = req->flags & REQ_F_SKIP_LINK_CQES;

	req->link = NULL;
	while (link) {
		long res = -ECANCELED;

		if (link->flags & REQ_F_FAIL)
			res = link->result;

		nxt = link->link;
		link->link = NULL;

		trace_io_uring_fail_link(req->ctx, req, req->user_data,
					req->opcode, link);

		if (!ignore_cqes) {
			link->flags &= ~REQ_F_CQE_SKIP;
			io_fill_cqe_req(link, res, 0);
		}
		io_put_req_deferred(link);
		link = nxt;
	}
}

static bool io_disarm_next(struct io_kiocb *req)
	__must_hold(&req->ctx->completion_lock)
{
	bool posted = false;

	if (req->flags & REQ_F_ARM_LTIMEOUT) {
		struct io_kiocb *link = req->link;

		req->flags &= ~REQ_F_ARM_LTIMEOUT;
		if (link && link->opcode == IORING_OP_LINK_TIMEOUT) {
			io_remove_next_linked(req);
			/* leave REQ_F_CQE_SKIP to io_fill_cqe_req */
			io_fill_cqe_req(link, -ECANCELED, 0);
			io_put_req_deferred(link);
			posted = true;
		}
	} else if (req->flags & REQ_F_LINK_TIMEOUT) {
		struct io_ring_ctx *ctx = req->ctx;

		spin_lock_irq(&ctx->timeout_lock);
		posted = io_kill_linked_timeout(req);
		spin_unlock_irq(&ctx->timeout_lock);
	}
	if (unlikely((req->flags & REQ_F_FAIL) &&
		     !(req->flags & REQ_F_HARDLINK))) {
		posted |= (req->link != NULL);
		io_fail_links(req);
	}
	return posted;
}
static void __io_req_find_next_prep(struct io_kiocb *req)
{
	struct io_ring_ctx *ctx = req->ctx;
	bool posted;

	spin_lock(&ctx->completion_lock);
	posted = io_disarm_next(req);
	if (posted)
		io_commit_cqring(ctx);
	spin_unlock(&ctx->completion_lock);
	if (posted)
		io_cqring_ev_posted(ctx);
}

static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req)
{
	struct io_kiocb *nxt;

	if (likely(!(req->flags & (REQ_F_LINK|REQ_F_HARDLINK))))
		return NULL;
	/*
	 * If LINK is set, we have dependent requests in this chain. If we
	 * didn't fail this request, queue the first one up, moving any other
	 * dependencies to the next request. In case of failure, fail the rest
	 * of the chain.
	 */
	if (unlikely(req->flags & IO_DISARM_MASK))
		__io_req_find_next_prep(req);
	nxt = req->link;
	req->link = NULL;
	return nxt;
}
static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked)
{
	if (!ctx)
		return;
	if (*locked) {
		io_submit_flush_completions(ctx);
		mutex_unlock(&ctx->uring_lock);
		*locked = false;
	}
	percpu_ref_put(&ctx->refs);
}

static inline void ctx_commit_and_unlock(struct io_ring_ctx *ctx)
{
	io_commit_cqring(ctx);
	spin_unlock(&ctx->completion_lock);
	io_cqring_ev_posted(ctx);
}
static void handle_prev_tw_list(struct io_wq_work_node *node,
				struct io_ring_ctx **ctx, bool *uring_locked)
{
	if (*ctx && !*uring_locked)
		spin_lock(&(*ctx)->completion_lock);

	do {
		struct io_wq_work_node *next = node->next;
		struct io_kiocb *req = container_of(node, struct io_kiocb,
						    io_task_work.node);

		prefetch(container_of(next, struct io_kiocb, io_task_work.node));

		if (req->ctx != *ctx) {
			if (unlikely(!*uring_locked && *ctx))
				ctx_commit_and_unlock(*ctx);

			ctx_flush_and_put(*ctx, uring_locked);
			*ctx = req->ctx;
			/* if not contended, grab and improve batching */
			*uring_locked = mutex_trylock(&(*ctx)->uring_lock);
			percpu_ref_get(&(*ctx)->refs);
			if (unlikely(!*uring_locked))
				spin_lock(&(*ctx)->completion_lock);
		}
		if (likely(*uring_locked))
			req->io_task_work.func(req, uring_locked);
		else
			__io_req_complete_post(req, req->result,
						io_put_kbuf_comp(req));
		node = next;
	} while (node);

	if (unlikely(!*uring_locked))
		ctx_commit_and_unlock(*ctx);
}

static void handle_tw_list(struct io_wq_work_node *node,
			   struct io_ring_ctx **ctx, bool *locked)
{
	do {
		struct io_wq_work_node *next = node->next;
		struct io_kiocb *req = container_of(node, struct io_kiocb,
						    io_task_work.node);

		prefetch(container_of(next, struct io_kiocb, io_task_work.node));

		if (req->ctx != *ctx) {
			ctx_flush_and_put(*ctx, locked);
			*ctx = req->ctx;
			/* if not contended, grab and improve batching */
			*locked = mutex_trylock(&(*ctx)->uring_lock);
			percpu_ref_get(&(*ctx)->refs);
		}
		req->io_task_work.func(req, locked);
		node = next;
	} while (node);
}
static void tctx_task_work(struct callback_head *cb)
{
	bool uring_locked = false;
	struct io_ring_ctx *ctx = NULL;
	struct io_uring_task *tctx = container_of(cb, struct io_uring_task,
						  task_work);

	while (1) {
		struct io_wq_work_node *node1, *node2;

		if (!tctx->task_list.first &&
		    !tctx->prior_task_list.first && uring_locked)
			io_submit_flush_completions(ctx);

		spin_lock_irq(&tctx->task_lock);
		node1 = tctx->prior_task_list.first;
		node2 = tctx->task_list.first;
		INIT_WQ_LIST(&tctx->task_list);
		INIT_WQ_LIST(&tctx->prior_task_list);
		if (!node2 && !node1)
			tctx->task_running = false;
		spin_unlock_irq(&tctx->task_lock);
		if (!node2 && !node1)
			break;

		if (node1)
			handle_prev_tw_list(node1, &ctx, &uring_locked);
		if (node2)
			handle_tw_list(node2, &ctx, &uring_locked);
		cond_resched();
	}

	ctx_flush_and_put(ctx, &uring_locked);

	/* relaxed read is enough as only the task itself sets ->in_idle */
	if (unlikely(atomic_read(&tctx->in_idle)))
		io_uring_drop_tctx_refs(current);
}
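
/*
 * Summary of the loop above (descriptive, derived from the surrounding code):
 * two lists are drained because io_req_task_work_add() can queue work as
 * "priority". Priority entries land on ->prior_task_list and are handled by
 * handle_prev_tw_list(), which may post completions directly under
 * ->completion_lock when the ring mutex is contended; everything else goes
 * through handle_tw_list().
 */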
static void io_req_task_work_add(struct io_kiocb *req, bool priority)
{
	struct task_struct *tsk = req->task;
	struct io_uring_task *tctx = tsk->io_uring;
	enum task_work_notify_mode notify;
	struct io_wq_work_node *node;
	unsigned long flags;
	bool running;

	WARN_ON_ONCE(!tctx);

	spin_lock_irqsave(&tctx->task_lock, flags);
	if (priority)
		wq_list_add_tail(&req->io_task_work.node, &tctx->prior_task_list);
	else
		wq_list_add_tail(&req->io_task_work.node, &tctx->task_list);
	running = tctx->task_running;
	if (!running)
		tctx->task_running = true;
	spin_unlock_irqrestore(&tctx->task_lock, flags);

	/* task_work already pending, we're done */
	if (running)
		return;

	/*
	 * SQPOLL kernel thread doesn't need notification, just a wakeup. For
	 * all other cases, use TWA_SIGNAL unconditionally to ensure we're
	 * processing task_work. There's no reliable way to tell if TWA_RESUME
	 * will do the job.
	 */
	notify = (req->ctx->flags & IORING_SETUP_SQPOLL) ? TWA_NONE : TWA_SIGNAL;
	if (likely(!task_work_add(tsk, &tctx->task_work, notify))) {
		if (notify == TWA_NONE)
			wake_up_process(tsk);
		return;
	}

	spin_lock_irqsave(&tctx->task_lock, flags);
	tctx->task_running = false;
	node = wq_list_merge(&tctx->prior_task_list, &tctx->task_list);
	spin_unlock_irqrestore(&tctx->task_lock, flags);

	while (node) {
		req = container_of(node, struct io_kiocb, io_task_work.node);
		node = node->next;
		if (llist_add(&req->io_task_work.fallback_node,
			      &req->ctx->fallback_llist))
			schedule_delayed_work(&req->ctx->fallback_work, 1);
	}
}
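
/*
 * If task_work_add() fails above (the task is exiting and no longer runs
 * task_work), the merged list is drained onto the per-ring ->fallback_llist
 * and ->fallback_work is scheduled, so the requests are still completed, just
 * from a workqueue instead of the owning task.
 */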
static void io_req_task_cancel(struct io_kiocb *req, bool *locked)
{
	struct io_ring_ctx *ctx = req->ctx;

	/* not needed for normal modes, but SQPOLL depends on it */
	io_tw_lock(ctx, locked);
	io_req_complete_failed(req, req->result);
}

static void io_req_task_submit(struct io_kiocb *req, bool *locked)
{
	struct io_ring_ctx *ctx = req->ctx;

	io_tw_lock(ctx, locked);
	/* req->task == current here, checking PF_EXITING is safe */
	if (likely(!(req->task->flags & PF_EXITING)))
		__io_queue_sqe(req);
	else
		io_req_complete_failed(req, -EFAULT);
}

static void io_req_task_queue_fail(struct io_kiocb *req, int ret)
{
	req->result = ret;
	req->io_task_work.func = io_req_task_cancel;
	io_req_task_work_add(req, false);
}

static void io_req_task_queue(struct io_kiocb *req)
{
	req->io_task_work.func = io_req_task_submit;
	io_req_task_work_add(req, false);
}

static void io_req_task_queue_reissue(struct io_kiocb *req)
{
	req->io_task_work.func = io_queue_async_work;
	io_req_task_work_add(req, false);
}

static inline void io_queue_next(struct io_kiocb *req)
{
	struct io_kiocb *nxt = io_req_find_next(req);

	if (nxt)
		io_req_task_queue(nxt);
}

static void io_free_req(struct io_kiocb *req)
{
	io_queue_next(req);
	__io_free_req(req);
}

static void io_free_req_work(struct io_kiocb *req, bool *locked)
{
	io_free_req(req);
}
static void io_free_batch_list(struct io_ring_ctx *ctx,
				struct io_wq_work_node *node)
	__must_hold(&ctx->uring_lock)
{
	struct task_struct *task = NULL;
	int task_refs = 0;

	do {
		struct io_kiocb *req = container_of(node, struct io_kiocb,
						    comp_list);

		if (unlikely(req->flags & REQ_F_REFCOUNT)) {
			node = req->comp_list.next;
			if (!req_ref_put_and_test(req))
				continue;
		}

		io_req_put_rsrc_locked(req, ctx);
		io_queue_next(req);
		io_dismantle_req(req);

		if (req->task != task) {
			if (task)
				io_put_task(task, task_refs);
			task = req->task;
			task_refs = 0;
		}
		task_refs++;
		node = req->comp_list.next;
		wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list);
	} while (node);

	if (task)
		io_put_task(task, task_refs);
}
static void __io_submit_flush_completions(struct io_ring_ctx *ctx)
	__must_hold(&ctx->uring_lock)
{
	struct io_wq_work_node *node, *prev;
	struct io_submit_state *state = &ctx->submit_state;

	if (state->flush_cqes) {
		spin_lock(&ctx->completion_lock);
		wq_list_for_each(node, prev, &state->compl_reqs) {
			struct io_kiocb *req = container_of(node, struct io_kiocb,
							    comp_list);

			if (!(req->flags & REQ_F_CQE_SKIP))
				__io_fill_cqe_req(req, req->result, req->cflags);
			if ((req->flags & REQ_F_POLLED) && req->apoll) {
				struct async_poll *apoll = req->apoll;

				if (apoll->double_poll)
					kfree(apoll->double_poll);
				list_add(&apoll->poll.wait.entry,
						&ctx->apoll_cache);
				req->flags &= ~REQ_F_POLLED;
			}
		}

		io_commit_cqring(ctx);
		spin_unlock(&ctx->completion_lock);
		io_cqring_ev_posted(ctx);
		state->flush_cqes = false;
	}

	io_free_batch_list(ctx, state->compl_reqs.first);
	INIT_WQ_LIST(&state->compl_reqs);
}
/*
 * Drop reference to request, return next in chain (if there is one) if this
 * was the last reference to this request.
 */
static inline struct io_kiocb *io_put_req_find_next(struct io_kiocb *req)
{
	struct io_kiocb *nxt = NULL;

	if (req_ref_put_and_test(req)) {
		nxt = io_req_find_next(req);
		__io_free_req(req);
	}
	return nxt;
}

static inline void io_put_req(struct io_kiocb *req)
{
	if (req_ref_put_and_test(req))
		io_free_req(req);
}

static inline void io_put_req_deferred(struct io_kiocb *req)
{
	if (req_ref_put_and_test(req)) {
		req->io_task_work.func = io_free_req_work;
		io_req_task_work_add(req, false);
	}
}

static unsigned io_cqring_events(struct io_ring_ctx *ctx)
{
	/* See comment at the top of this file */
	smp_rmb();
	return __io_cqring_events(ctx);
}

static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
{
	struct io_rings *rings = ctx->rings;

	/* make sure SQ entry isn't read before tail */
	return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head;
}

static inline bool io_run_task_work(void)
{
	if (test_thread_flag(TIF_NOTIFY_SIGNAL) || task_work_pending(current)) {
		__set_current_state(TASK_RUNNING);
		clear_notify_signal();
		if (task_work_pending(current))
			task_work_run();
		return true;
	}

	return false;
}
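
/*
 * Worked example for io_sqring_entries() (illustrative numbers, not from the
 * original source): the SQ tail and cached head are free-running u32 indices,
 * so if the shared tail has wrapped around to 3 while cached_sq_head is
 * 0xfffffffe, the unsigned subtraction 3 - 0xfffffffe == 5 still yields the
 * number of new entries; only the low bits (masked by the ring mask) are used
 * to index the SQ array.
 */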
static int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
{
	struct io_wq_work_node *pos, *start, *prev;
	unsigned int poll_flags = BLK_POLL_NOSLEEP;
	DEFINE_IO_COMP_BATCH(iob);
	int nr_events = 0;

	/*
	 * Only spin for completions if we don't have multiple devices hanging
	 * off our complete list.
	 */
	if (ctx->poll_multi_queue || force_nonspin)
		poll_flags |= BLK_POLL_ONESHOT;

	wq_list_for_each(pos, start, &ctx->iopoll_list) {
		struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list);
		struct kiocb *kiocb = &req->rw.kiocb;
		int ret;

		/*
		 * Move completed and retryable entries to our local lists.
		 * If we find a request that requires polling, break out
		 * and complete those lists first, if we have entries there.
		 */
		if (READ_ONCE(req->iopoll_completed))
			break;

		ret = kiocb->ki_filp->f_op->iopoll(kiocb, &iob, poll_flags);
		if (unlikely(ret < 0))
			return ret;
		else if (ret)
			poll_flags |= BLK_POLL_ONESHOT;

		/* iopoll may have completed current req */
		if (!rq_list_empty(iob.req_list) ||
		    READ_ONCE(req->iopoll_completed))
			break;
	}

	if (!rq_list_empty(iob.req_list))
		iob.complete(&iob);
	else if (!pos)
		return 0;

	prev = start;
	wq_list_for_each_resume(pos, prev) {
		struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list);

		/* order with io_complete_rw_iopoll(), e.g. ->result updates */
		if (!smp_load_acquire(&req->iopoll_completed))
			break;
		nr_events++;
		if (unlikely(req->flags & REQ_F_CQE_SKIP))
			continue;
		__io_fill_cqe_req(req, req->result, io_put_kbuf(req, 0));
	}

	if (unlikely(!nr_events))
		return 0;

	io_commit_cqring(ctx);
	io_cqring_ev_posted_iopoll(ctx);
	pos = start ? start->next : ctx->iopoll_list.first;
	wq_list_cut(&ctx->iopoll_list, prev, start);
	io_free_batch_list(ctx, pos);
	return nr_events;
}
/*
 * We can't just wait for polled events to come to us, we have to actively
 * find and complete them.
 */
static __cold void io_iopoll_try_reap_events(struct io_ring_ctx *ctx)
{
	if (!(ctx->flags & IORING_SETUP_IOPOLL))
		return;

	mutex_lock(&ctx->uring_lock);
	while (!wq_list_empty(&ctx->iopoll_list)) {
		/* let it sleep and repeat later if can't complete a request */
		if (io_do_iopoll(ctx, true) == 0)
			break;
		/*
		 * Ensure we allow local-to-the-cpu processing to take place,
		 * in this case we need to ensure that we reap all events.
		 * Also let task_work, etc. to progress by releasing the mutex
		 */
		if (need_resched()) {
			mutex_unlock(&ctx->uring_lock);
			cond_resched();
			mutex_lock(&ctx->uring_lock);
		}
	}
	mutex_unlock(&ctx->uring_lock);
}
static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
{
	unsigned int nr_events = 0;
	int ret = 0;

	/*
	 * We disallow the app entering submit/complete with polling, but we
	 * still need to lock the ring to prevent racing with polled issue
	 * that got punted to a workqueue.
	 */
	mutex_lock(&ctx->uring_lock);
	/*
	 * Don't enter poll loop if we already have events pending.
	 * If we do, we can potentially be spinning for commands that
	 * already triggered a CQE (eg in error).
	 */
	if (test_bit(0, &ctx->check_cq_overflow))
		__io_cqring_overflow_flush(ctx, false);
	if (io_cqring_events(ctx))
		goto out;
	do {
		/*
		 * If a submit got punted to a workqueue, we can have the
		 * application entering polling for a command before it gets
		 * issued. That app will hold the uring_lock for the duration
		 * of the poll right here, so we need to take a breather every
		 * now and then to ensure that the issue has a chance to add
		 * the poll to the issued list. Otherwise we can spin here
		 * forever, while the workqueue is stuck trying to acquire the
		 * lock.
		 */
		if (wq_list_empty(&ctx->iopoll_list)) {
			u32 tail = ctx->cached_cq_tail;

			mutex_unlock(&ctx->uring_lock);
			io_run_task_work();
			mutex_lock(&ctx->uring_lock);

			/* some requests don't go through iopoll_list */
			if (tail != ctx->cached_cq_tail ||
			    wq_list_empty(&ctx->iopoll_list))
				break;
		}
		ret = io_do_iopoll(ctx, !min);
		if (ret < 0)
			break;
		nr_events += ret;
		ret = 0;
	} while (nr_events < min && !need_resched());
out:
	mutex_unlock(&ctx->uring_lock);
	return ret;
}
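
/*
 * Usage sketch for the IOPOLL path (hypothetical, liburing-style userspace,
 * not part of this file): a ring created with IORING_SETUP_IOPOLL and used
 * against an O_DIRECT file does not rely on completion interrupts; instead
 * io_uring_enter() with IORING_ENTER_GETEVENTS ends up in io_iopoll_check()
 * -> io_do_iopoll(), actively polling the block layer until at least 'min'
 * completions have been reaped.
 */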
static void kiocb_end_write(struct io_kiocb *req)
{
	/*
	 * Tell lockdep we inherited freeze protection from submission
	 * thread.
	 */
	if (req->flags & REQ_F_ISREG) {
		struct super_block *sb = file_inode(req->file)->i_sb;

		__sb_writers_acquired(sb, SB_FREEZE_WRITE);
		sb_end_write(sb);
	}
}

#ifdef CONFIG_BLOCK
static bool io_resubmit_prep(struct io_kiocb *req)
{
	struct io_async_rw *rw = req->async_data;

	if (!req_has_async_data(req))
		return !io_req_prep_async(req);
	iov_iter_restore(&rw->s.iter, &rw->s.iter_state);
	return true;
}

static bool io_rw_should_reissue(struct io_kiocb *req)
{
	umode_t mode = file_inode(req->file)->i_mode;
	struct io_ring_ctx *ctx = req->ctx;

	if (!S_ISBLK(mode) && !S_ISREG(mode))
		return false;
	if ((req->flags & REQ_F_NOWAIT) || (io_wq_current_is_worker() &&
	    !(ctx->flags & IORING_SETUP_IOPOLL)))
		return false;
	/*
	 * If ref is dying, we might be running poll reap from the exit work.
	 * Don't attempt to reissue from that path, just let it fail with
	 * -EAGAIN.
	 */
	if (percpu_ref_is_dying(&ctx->refs))
		return false;
	/*
	 * Play it safe and assume not safe to re-import and reissue if we're
	 * not in the original thread group (or in task context).
	 */
	if (!same_thread_group(req->task, current) || !in_task())
		return false;
	return true;
}
#else
static bool io_resubmit_prep(struct io_kiocb *req)
{
	return false;
}
static bool io_rw_should_reissue(struct io_kiocb *req)
{
	return false;
}
#endif
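
/*
 * Context for the helpers above (descriptive, derived from the surrounding
 * code): a request that already left the submission path can still get
 * -EAGAIN from the block layer (e.g. on tag shortage). Rather than surfacing
 * that to userspace, io_rw_should_reissue() decides whether a retry via io-wq
 * is safe, and io_resubmit_prep() restores the saved iov_iter state so the
 * retry does not need to re-import anything from userspace.
 */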
static bool __io_complete_rw_common(struct io_kiocb *req, long res)
{
	if (req->rw.kiocb.ki_flags & IOCB_WRITE) {
		kiocb_end_write(req);
		fsnotify_modify(req->file);
	} else {
		fsnotify_access(req->file);
	}
	if (unlikely(res != req->result)) {
		if ((res == -EAGAIN || res == -EOPNOTSUPP) &&
		    io_rw_should_reissue(req)) {
			req->flags |= REQ_F_REISSUE;
			return true;
		}
		req_set_fail(req);
		req->result = res;
	}
	return false;
}

static inline void io_req_task_complete(struct io_kiocb *req, bool *locked)
{
	int res = req->result;

	if (*locked) {
		io_req_complete_state(req, res, io_put_kbuf(req, 0));
		io_req_add_compl_list(req);
	} else {
		io_req_complete_post(req, res,
					io_put_kbuf(req, IO_URING_F_UNLOCKED));
	}
}

static void __io_complete_rw(struct io_kiocb *req, long res,
			     unsigned int issue_flags)
{
	if (__io_complete_rw_common(req, res))
		return;
	__io_req_complete(req, issue_flags, req->result,
				io_put_kbuf(req, issue_flags));
}

static void io_complete_rw(struct kiocb *kiocb, long res)
{
	struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);

	if (__io_complete_rw_common(req, res))
		return;
	req->result = res;
	req->io_task_work.func = io_req_task_complete;
	io_req_task_work_add(req, !!(req->ctx->flags & IORING_SETUP_SQPOLL));
}

static void io_complete_rw_iopoll(struct kiocb *kiocb, long res)
{
	struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);

	if (kiocb->ki_flags & IOCB_WRITE)
		kiocb_end_write(req);
	if (unlikely(res != req->result)) {
		if (res == -EAGAIN && io_rw_should_reissue(req)) {
			req->flags |= REQ_F_REISSUE;
			return;
		}
		req->result = res;
	}

	/* order with io_iopoll_complete() checking ->iopoll_completed */
	smp_store_release(&req->iopoll_completed, 1);
}
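
/*
 * Completion-context note (descriptive, derived from the code above):
 * io_complete_rw() can be invoked from block layer completion context, so it
 * only records the result and bounces the actual CQE posting to task context
 * via io_req_task_complete(). The IOPOLL variant instead publishes
 * ->iopoll_completed with a release store, paired with the acquire load in
 * io_do_iopoll().
 */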
/*
 * After the iocb has been issued, it's safe to be found on the poll list.
 * Adding the kiocb to the list AFTER submission ensures that we don't
 * find it from a io_do_iopoll() thread before the issuer is done
 * accessing the kiocb cookie.
 */
static void io_iopoll_req_issued(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_ring_ctx *ctx = req->ctx;
	const bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;

	/* workqueue context doesn't hold uring_lock, grab it now */
	if (unlikely(needs_lock))
		mutex_lock(&ctx->uring_lock);

	/*
	 * Track whether we have multiple files in our lists. This will impact
	 * how we do polling eventually, not spinning if we're on potentially
	 * different devices.
	 */
	if (wq_list_empty(&ctx->iopoll_list)) {
		ctx->poll_multi_queue = false;
	} else if (!ctx->poll_multi_queue) {
		struct io_kiocb *list_req;

		list_req = container_of(ctx->iopoll_list.first, struct io_kiocb,
					comp_list);
		if (list_req->file != req->file)
			ctx->poll_multi_queue = true;
	}

	/*
	 * For fast devices, IO may have already completed. If it has, add
	 * it to the front so we find it first.
	 */
	if (READ_ONCE(req->iopoll_completed))
		wq_list_add_head(&req->comp_list, &ctx->iopoll_list);
	else
		wq_list_add_tail(&req->comp_list, &ctx->iopoll_list);

	if (unlikely(needs_lock)) {
		/*
		 * If IORING_SETUP_SQPOLL is enabled, sqes are either handled
		 * in sq thread task context or in io worker task context. If
		 * current task context is sq thread, we don't need to check
		 * whether should wake up sq thread.
		 */
		if ((ctx->flags & IORING_SETUP_SQPOLL) &&
		    wq_has_sleeper(&ctx->sq_data->wait))
			wake_up(&ctx->sq_data->wait);

		mutex_unlock(&ctx->uring_lock);
	}
}
static bool io_bdev_nowait(struct block_device *bdev)
{
	return !bdev || blk_queue_nowait(bdev_get_queue(bdev));
}

/*
 * If we tracked the file through the SCM inflight mechanism, we could support
 * any file. For now, just ensure that anything potentially problematic is done
 * inline.
 */
static bool __io_file_supports_nowait(struct file *file, umode_t mode)
{
	if (S_ISBLK(mode)) {
		if (IS_ENABLED(CONFIG_BLOCK) &&
		    io_bdev_nowait(I_BDEV(file->f_mapping->host)))
			return true;
		return false;
	}
	if (S_ISSOCK(mode))
		return true;
	if (S_ISREG(mode)) {
		if (IS_ENABLED(CONFIG_BLOCK) &&
		    io_bdev_nowait(file->f_inode->i_sb->s_bdev) &&
		    file->f_op != &io_uring_fops)
			return true;
		return false;
	}

	/* any ->read/write should understand O_NONBLOCK */
	if (file->f_flags & O_NONBLOCK)
		return true;
	return file->f_mode & FMODE_NOWAIT;
}

/*
 * If we tracked the file through the SCM inflight mechanism, we could support
 * any file. For now, just ensure that anything potentially problematic is done
 * inline.
 */
static unsigned int io_file_get_flags(struct file *file)
{
	umode_t mode = file_inode(file)->i_mode;
	unsigned int res = 0;

	if (S_ISREG(mode))
		res |= FFS_ISREG;
	if (__io_file_supports_nowait(file, mode))
		res |= FFS_NOWAIT;
	return res;
}

static inline bool io_file_supports_nowait(struct io_kiocb *req)
{
	return req->flags & REQ_F_SUPPORT_NOWAIT;
}
static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct kiocb *kiocb = &req->rw.kiocb;
	unsigned ioprio;
	int ret;

	kiocb->ki_pos = READ_ONCE(sqe->off);
	/* used for fixed read/write too - just read unconditionally */
	req->buf_index = READ_ONCE(sqe->buf_index);

	if (req->opcode == IORING_OP_READ_FIXED ||
	    req->opcode == IORING_OP_WRITE_FIXED) {
		struct io_ring_ctx *ctx = req->ctx;
		u16 index;

		if (unlikely(req->buf_index >= ctx->nr_user_bufs))
			return -EFAULT;
		index = array_index_nospec(req->buf_index, ctx->nr_user_bufs);
		req->imu = ctx->user_bufs[index];
		io_req_set_rsrc_node(req, ctx, 0);
	}

	ioprio = READ_ONCE(sqe->ioprio);
	if (ioprio) {
		ret = ioprio_check_cap(ioprio);
		if (ret)
			return ret;

		kiocb->ki_ioprio = ioprio;
	} else {
		kiocb->ki_ioprio = get_current_ioprio();
	}

	req->rw.addr = READ_ONCE(sqe->addr);
	req->rw.len = READ_ONCE(sqe->len);
	req->rw.flags = READ_ONCE(sqe->rw_flags);
	return 0;
}
static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
{
	switch (ret) {
	case -EIOCBQUEUED:
		break;
	case -ERESTARTSYS:
	case -ERESTARTNOINTR:
	case -ERESTARTNOHAND:
	case -ERESTART_RESTARTBLOCK:
		/*
		 * We can't just restart the syscall, since previously
		 * submitted sqes may already be in progress. Just fail this
		 * IO with EINTR.
		 */
		ret = -EINTR;
		fallthrough;
	default:
		kiocb->ki_complete(kiocb, ret);
	}
}

static inline loff_t *io_kiocb_update_pos(struct io_kiocb *req)
{
	struct kiocb *kiocb = &req->rw.kiocb;

	if (kiocb->ki_pos != -1)
		return &kiocb->ki_pos;

	if (!(req->file->f_mode & FMODE_STREAM)) {
		req->flags |= REQ_F_CUR_POS;
		kiocb->ki_pos = req->file->f_pos;
		return &kiocb->ki_pos;
	}

	kiocb->ki_pos = 0;
	return NULL;
}
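
/*
 * Illustration (assumed semantics of the SQE 'off' field, matching the
 * io_uring UAPI): an offset of -1 requests read(2)/write(2)-like behaviour,
 * i.e. use and update the file position. io_kiocb_update_pos() implements
 * that by flagging REQ_F_CUR_POS and seeding ki_pos from file->f_pos for
 * non-stream files; kiocb_done() later writes the final position back.
 */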
static void kiocb_done(struct io_kiocb *req, ssize_t ret,
		       unsigned int issue_flags)
{
	struct io_async_rw *io = req->async_data;

	/* add previously done IO, if any */
	if (req_has_async_data(req) && io->bytes_done > 0) {
		if (ret < 0)
			ret = io->bytes_done;
		else
			ret += io->bytes_done;
	}

	if (req->flags & REQ_F_CUR_POS)
		req->file->f_pos = req->rw.kiocb.ki_pos;
	if (ret >= 0 && (req->rw.kiocb.ki_complete == io_complete_rw))
		__io_complete_rw(req, ret, issue_flags);
	else
		io_rw_done(&req->rw.kiocb, ret);

	if (req->flags & REQ_F_REISSUE) {
		req->flags &= ~REQ_F_REISSUE;
		if (io_resubmit_prep(req))
			io_req_task_queue_reissue(req);
		else
			io_req_task_queue_fail(req, ret);
	}
}
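
/*
 * Worked example for the bytes_done accounting above (illustrative numbers,
 * not from the original source): a 64KiB buffered read that returns 40960
 * bytes and is then retried has io->bytes_done == 40960 on the retry; when
 * the remainder completes, the final CQE reports the summed 65536 rather than
 * just the last chunk.
 */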
static int __io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter,
			     struct io_mapped_ubuf *imu)
{
	size_t len = req->rw.len;
	u64 buf_end, buf_addr = req->rw.addr;
	size_t offset;

	if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end)))
		return -EFAULT;
	/* not inside the mapped region */
	if (unlikely(buf_addr < imu->ubuf || buf_end > imu->ubuf_end))
		return -EFAULT;

	/*
	 * May not be a start of buffer, set size appropriately
	 * and advance us to the beginning.
	 */
	offset = buf_addr - imu->ubuf;
	iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len);

	if (offset) {
		/*
		 * Don't use iov_iter_advance() here, as it's really slow for
		 * using the latter parts of a big fixed buffer - it iterates
		 * over each segment manually. We can cheat a bit here, because
		 * we know that:
		 *
		 * 1) it's a BVEC iter, we set it up
		 * 2) all bvecs are PAGE_SIZE in size, except potentially the
		 *    first and last bvec
		 *
		 * So just find our index, and adjust the iterator afterwards.
		 * If the offset is within the first bvec (or the whole first
		 * bvec, just use iov_iter_advance(). This makes it easier
		 * since we can just skip the first segment, which may not
		 * be PAGE_SIZE aligned.
		 */
		const struct bio_vec *bvec = imu->bvec;

		if (offset <= bvec->bv_len) {
			iov_iter_advance(iter, offset);
		} else {
			unsigned long seg_skip;

			/* skip first vec */
			offset -= bvec->bv_len;
			seg_skip = 1 + (offset >> PAGE_SHIFT);

			iter->bvec = bvec + seg_skip;
			iter->nr_segs -= seg_skip;
			iter->count -= bvec->bv_len + offset;
			iter->iov_offset = offset & ~PAGE_MASK;
		}
	}

	return 0;
}
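
/*
 * Worked example for the bvec skip above (illustrative numbers, assuming 4K
 * pages; not part of the original source): with a registered buffer whose
 * first bvec holds 0x800 bytes and a request starting 0x2900 bytes into the
 * buffer, offset becomes 0x2900 - 0x800 = 0x2100 after skipping the first
 * vec, seg_skip = 1 + (0x2100 >> 12) = 3, and iov_offset = 0x2100 & ~PAGE_MASK
 * = 0x100, i.e. the iterator starts 0x100 bytes into the fourth bvec.
 */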
static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter,
			   unsigned int issue_flags)
{
	if (WARN_ON_ONCE(!req->imu))
		return -EFAULT;
	return __io_import_fixed(req, rw, iter, req->imu);
}

static void io_ring_submit_unlock(struct io_ring_ctx *ctx, bool needs_lock)
{
	if (needs_lock)
		mutex_unlock(&ctx->uring_lock);
}

static void io_ring_submit_lock(struct io_ring_ctx *ctx, bool needs_lock)
{
	/*
	 * "Normal" inline submissions always hold the uring_lock, since we
	 * grab it from the system call. Same is true for the SQPOLL offload.
	 * The only exception is when we've detached the request and issue it
	 * from an async worker thread, grab the lock for that case.
	 */
	if (needs_lock)
		mutex_lock(&ctx->uring_lock);
}

static void io_buffer_add_list(struct io_ring_ctx *ctx,
			       struct io_buffer_list *bl, unsigned int bgid)
{
	struct list_head *list;

	list = &ctx->io_buffers[hash_32(bgid, IO_BUFFERS_HASH_BITS)];
	INIT_LIST_HEAD(&bl->buf_list);
	bl->bgid = bgid;
	list_add(&bl->list, list);
}
static struct io_buffer *io_buffer_select(struct io_kiocb *req, size_t *len,
					  int bgid, unsigned int issue_flags)
{
	struct io_buffer *kbuf = req->kbuf;
	bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
	struct io_ring_ctx *ctx = req->ctx;
	struct io_buffer_list *bl;

	if (req->flags & REQ_F_BUFFER_SELECTED)
		return kbuf;

	io_ring_submit_lock(ctx, needs_lock);

	lockdep_assert_held(&ctx->uring_lock);

	bl = io_buffer_get_list(ctx, bgid);
	if (bl && !list_empty(&bl->buf_list)) {
		kbuf = list_first_entry(&bl->buf_list, struct io_buffer, list);
		list_del(&kbuf->list);
		if (*len > kbuf->len)
			*len = kbuf->len;
		req->flags |= REQ_F_BUFFER_SELECTED;
		req->kbuf = kbuf;
	} else {
		kbuf = ERR_PTR(-ENOBUFS);
	}

	io_ring_submit_unlock(req->ctx, needs_lock);
	return kbuf;
}

static void __user *io_rw_buffer_select(struct io_kiocb *req, size_t *len,
					unsigned int issue_flags)
{
	struct io_buffer *kbuf;
	u16 bgid;

	bgid = req->buf_index;
	kbuf = io_buffer_select(req, len, bgid, issue_flags);
	if (IS_ERR(kbuf))
		return kbuf;
	return u64_to_user_ptr(kbuf->addr);
}
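
/*
 * Usage sketch for buffer selection (hypothetical, liburing-style userspace;
 * not part of this file): the application first submits
 * IORING_OP_PROVIDE_BUFFERS with a buffer group id, then issues reads with
 * IOSQE_BUFFER_SELECT and sqe->buf_group set to that id. io_buffer_select()
 * above pops one buffer from the group at issue time, and the CQE reports
 * which buffer was consumed via IORING_CQE_F_BUFFER with the buffer id in the
 * upper bits of cqe->flags.
 */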
3429 #ifdef CONFIG_COMPAT
3430 static ssize_t
io_compat_import(struct io_kiocb
*req
, struct iovec
*iov
,
3431 unsigned int issue_flags
)
3433 struct compat_iovec __user
*uiov
;
3434 compat_ssize_t clen
;
3438 uiov
= u64_to_user_ptr(req
->rw
.addr
);
3439 if (!access_ok(uiov
, sizeof(*uiov
)))
3441 if (__get_user(clen
, &uiov
->iov_len
))
3447 buf
= io_rw_buffer_select(req
, &len
, issue_flags
);
3449 return PTR_ERR(buf
);
3450 iov
[0].iov_base
= buf
;
3451 iov
[0].iov_len
= (compat_size_t
) len
;
3456 static ssize_t
__io_iov_buffer_select(struct io_kiocb
*req
, struct iovec
*iov
,
3457 unsigned int issue_flags
)
3459 struct iovec __user
*uiov
= u64_to_user_ptr(req
->rw
.addr
);
3463 if (copy_from_user(iov
, uiov
, sizeof(*uiov
)))
3466 len
= iov
[0].iov_len
;
3469 buf
= io_rw_buffer_select(req
, &len
, issue_flags
);
3471 return PTR_ERR(buf
);
3472 iov
[0].iov_base
= buf
;
3473 iov
[0].iov_len
= len
;
3477 static ssize_t
io_iov_buffer_select(struct io_kiocb
*req
, struct iovec
*iov
,
3478 unsigned int issue_flags
)
3480 if (req
->flags
& REQ_F_BUFFER_SELECTED
) {
3481 struct io_buffer
*kbuf
= req
->kbuf
;
3483 iov
[0].iov_base
= u64_to_user_ptr(kbuf
->addr
);
3484 iov
[0].iov_len
= kbuf
->len
;
3487 if (req
->rw
.len
!= 1)
3490 #ifdef CONFIG_COMPAT
3491 if (req
->ctx
->compat
)
3492 return io_compat_import(req
, iov
, issue_flags
);
3495 return __io_iov_buffer_select(req
, iov
, issue_flags
);
3498 static inline bool io_do_buffer_select(struct io_kiocb
*req
)
3500 if (!(req
->flags
& REQ_F_BUFFER_SELECT
))
3502 return !(req
->flags
& REQ_F_BUFFER_SELECTED
);
3505 static struct iovec
*__io_import_iovec(int rw
, struct io_kiocb
*req
,
3506 struct io_rw_state
*s
,
3507 unsigned int issue_flags
)
3509 struct iov_iter
*iter
= &s
->iter
;
3510 u8 opcode
= req
->opcode
;
3511 struct iovec
*iovec
;
3516 if (opcode
== IORING_OP_READ_FIXED
|| opcode
== IORING_OP_WRITE_FIXED
) {
3517 ret
= io_import_fixed(req
, rw
, iter
, issue_flags
);
3519 return ERR_PTR(ret
);
3523 /* buffer index only valid with fixed read/write, or buffer select */
3524 if (unlikely(req
->buf_index
&& !(req
->flags
& REQ_F_BUFFER_SELECT
)))
3525 return ERR_PTR(-EINVAL
);
3527 buf
= u64_to_user_ptr(req
->rw
.addr
);
3528 sqe_len
= req
->rw
.len
;
3530 if (opcode
== IORING_OP_READ
|| opcode
== IORING_OP_WRITE
) {
3531 if (req
->flags
& REQ_F_BUFFER_SELECT
) {
3532 buf
= io_rw_buffer_select(req
, &sqe_len
, issue_flags
);
3534 return ERR_CAST(buf
);
3535 req
->rw
.len
= sqe_len
;
3538 ret
= import_single_range(rw
, buf
, sqe_len
, s
->fast_iov
, iter
);
3540 return ERR_PTR(ret
);
3544 iovec
= s
->fast_iov
;
3545 if (req
->flags
& REQ_F_BUFFER_SELECT
) {
3546 ret
= io_iov_buffer_select(req
, iovec
, issue_flags
);
3548 return ERR_PTR(ret
);
3549 iov_iter_init(iter
, rw
, iovec
, 1, iovec
->iov_len
);
3553 ret
= __import_iovec(rw
, buf
, sqe_len
, UIO_FASTIOV
, &iovec
, iter
,
3555 if (unlikely(ret
< 0))
3556 return ERR_PTR(ret
);
3560 static inline int io_import_iovec(int rw
, struct io_kiocb
*req
,
3561 struct iovec
**iovec
, struct io_rw_state
*s
,
3562 unsigned int issue_flags
)
3564 *iovec
= __io_import_iovec(rw
, req
, s
, issue_flags
);
3565 if (unlikely(IS_ERR(*iovec
)))
3566 return PTR_ERR(*iovec
);
3568 iov_iter_save_state(&s
->iter
, &s
->iter_state
);
3572 static inline loff_t
*io_kiocb_ppos(struct kiocb
*kiocb
)
3574 return (kiocb
->ki_filp
->f_mode
& FMODE_STREAM
) ? NULL
: &kiocb
->ki_pos
;
3578 * For files that don't have ->read_iter() and ->write_iter(), handle them
3579 * by looping over ->read() or ->write() manually.
3581 static ssize_t
loop_rw_iter(int rw
, struct io_kiocb
*req
, struct iov_iter
*iter
)
3583 struct kiocb
*kiocb
= &req
->rw
.kiocb
;
3584 struct file
*file
= req
->file
;
3589 * Don't support polled IO through this interface, and we can't
3590 * support non-blocking either. For the latter, this just causes
3591 * the kiocb to be handled from an async context.
3593 if (kiocb
->ki_flags
& IOCB_HIPRI
)
3595 if ((kiocb
->ki_flags
& IOCB_NOWAIT
) &&
3596 !(kiocb
->ki_filp
->f_flags
& O_NONBLOCK
))
3599 ppos
= io_kiocb_ppos(kiocb
);
3601 while (iov_iter_count(iter
)) {
3605 if (!iov_iter_is_bvec(iter
)) {
3606 iovec
= iov_iter_iovec(iter
);
3608 iovec
.iov_base
= u64_to_user_ptr(req
->rw
.addr
);
3609 iovec
.iov_len
= req
->rw
.len
;
3613 nr
= file
->f_op
->read(file
, iovec
.iov_base
,
3614 iovec
.iov_len
, ppos
);
3616 nr
= file
->f_op
->write(file
, iovec
.iov_base
,
3617 iovec
.iov_len
, ppos
);
3626 if (!iov_iter_is_bvec(iter
)) {
3627 iov_iter_advance(iter
, nr
);
3634 if (nr
!= iovec
.iov_len
)
3641 static void io_req_map_rw(struct io_kiocb
*req
, const struct iovec
*iovec
,
3642 const struct iovec
*fast_iov
, struct iov_iter
*iter
)
3644 struct io_async_rw
*rw
= req
->async_data
;
3646 memcpy(&rw
->s
.iter
, iter
, sizeof(*iter
));
3647 rw
->free_iovec
= iovec
;
3649 /* can only be fixed buffers, no need to do anything */
3650 if (iov_iter_is_bvec(iter
))
3653 unsigned iov_off
= 0;
3655 rw
->s
.iter
.iov
= rw
->s
.fast_iov
;
3656 if (iter
->iov
!= fast_iov
) {
3657 iov_off
= iter
->iov
- fast_iov
;
3658 rw
->s
.iter
.iov
+= iov_off
;
3660 if (rw
->s
.fast_iov
!= fast_iov
)
3661 memcpy(rw
->s
.fast_iov
+ iov_off
, fast_iov
+ iov_off
,
3662 sizeof(struct iovec
) * iter
->nr_segs
);
3664 req
->flags
|= REQ_F_NEED_CLEANUP
;
3668 static inline bool io_alloc_async_data(struct io_kiocb
*req
)
3670 WARN_ON_ONCE(!io_op_defs
[req
->opcode
].async_size
);
3671 req
->async_data
= kmalloc(io_op_defs
[req
->opcode
].async_size
, GFP_KERNEL
);
3672 if (req
->async_data
) {
3673 req
->flags
|= REQ_F_ASYNC_DATA
;
3679 static int io_setup_async_rw(struct io_kiocb
*req
, const struct iovec
*iovec
,
3680 struct io_rw_state
*s
, bool force
)
3682 if (!force
&& !io_op_defs
[req
->opcode
].needs_async_setup
)
3684 if (!req_has_async_data(req
)) {
3685 struct io_async_rw
*iorw
;
3687 if (io_alloc_async_data(req
)) {
3692 io_req_map_rw(req
, iovec
, s
->fast_iov
, &s
->iter
);
3693 iorw
= req
->async_data
;
3694 /* we've copied and mapped the iter, ensure state is saved */
3695 iov_iter_save_state(&iorw
->s
.iter
, &iorw
->s
.iter_state
);
3700 static inline int io_rw_prep_async(struct io_kiocb
*req
, int rw
)
3702 struct io_async_rw
*iorw
= req
->async_data
;
3706 /* submission path, ->uring_lock should already be taken */
3707 ret
= io_import_iovec(rw
, req
, &iov
, &iorw
->s
, 0);
3708 if (unlikely(ret
< 0))
3711 iorw
->bytes_done
= 0;
3712 iorw
->free_iovec
= iov
;
3714 req
->flags
|= REQ_F_NEED_CLEANUP
;
3719 * This is our waitqueue callback handler, registered through __folio_lock_async()
3720 * when we initially tried to do the IO with the iocb armed our waitqueue.
3721 * This gets called when the page is unlocked, and we generally expect that to
3722 * happen when the page IO is completed and the page is now uptodate. This will
3723 * queue a task_work based retry of the operation, attempting to copy the data
3724 * again. If the latter fails because the page was NOT uptodate, then we will
3725 * do a thread based blocking retry of the operation. That's the unexpected
3728 static int io_async_buf_func(struct wait_queue_entry
*wait
, unsigned mode
,
3729 int sync
, void *arg
)
3731 struct wait_page_queue
*wpq
;
3732 struct io_kiocb
*req
= wait
->private;
3733 struct wait_page_key
*key
= arg
;
3735 wpq
= container_of(wait
, struct wait_page_queue
, wait
);
3737 if (!wake_page_match(wpq
, key
))
3740 req
->rw
.kiocb
.ki_flags
&= ~IOCB_WAITQ
;
3741 list_del_init(&wait
->entry
);
3742 io_req_task_queue(req
);
3747 * This controls whether a given IO request should be armed for async page
3748 * based retry. If we return false here, the request is handed to the async
3749 * worker threads for retry. If we're doing buffered reads on a regular file,
3750 * we prepare a private wait_page_queue entry and retry the operation. This
3751 * will either succeed because the page is now uptodate and unlocked, or it
3752 * will register a callback when the page is unlocked at IO completion. Through
3753 * that callback, io_uring uses task_work to setup a retry of the operation.
3754 * That retry will attempt the buffered read again. The retry will generally
3755 * succeed, or in rare cases where it fails, we then fall back to using the
3756 * async worker threads for a blocking retry.
3758 static bool io_rw_should_retry(struct io_kiocb
*req
)
3760 struct io_async_rw
*rw
= req
->async_data
;
3761 struct wait_page_queue
*wait
= &rw
->wpq
;
3762 struct kiocb
*kiocb
= &req
->rw
.kiocb
;
3764 /* never retry for NOWAIT, we just complete with -EAGAIN */
3765 if (req
->flags
& REQ_F_NOWAIT
)
3768 /* Only for buffered IO */
3769 if (kiocb
->ki_flags
& (IOCB_DIRECT
| IOCB_HIPRI
))
3773 * just use poll if we can, and don't attempt if the fs doesn't
3774 * support callback based unlocks
3776 if (file_can_poll(req
->file
) || !(req
->file
->f_mode
& FMODE_BUF_RASYNC
))
3779 wait
->wait
.func
= io_async_buf_func
;
3780 wait
->wait
.private = req
;
3781 wait
->wait
.flags
= 0;
3782 INIT_LIST_HEAD(&wait
->wait
.entry
);
3783 kiocb
->ki_flags
|= IOCB_WAITQ
;
3784 kiocb
->ki_flags
&= ~IOCB_NOWAIT
;
3785 kiocb
->ki_waitq
= wait
;
3789 static inline int io_iter_do_read(struct io_kiocb
*req
, struct iov_iter
*iter
)
3791 if (likely(req
->file
->f_op
->read_iter
))
3792 return call_read_iter(req
->file
, &req
->rw
.kiocb
, iter
);
3793 else if (req
->file
->f_op
->read
)
3794 return loop_rw_iter(READ
, req
, iter
);
3799 static bool need_read_all(struct io_kiocb
*req
)
3801 return req
->flags
& REQ_F_ISREG
||
3802 S_ISBLK(file_inode(req
->file
)->i_mode
);
3805 static int io_rw_init_file(struct io_kiocb
*req
, fmode_t mode
)
3807 struct kiocb
*kiocb
= &req
->rw
.kiocb
;
3808 struct io_ring_ctx
*ctx
= req
->ctx
;
3809 struct file
*file
= req
->file
;
3812 if (unlikely(!file
|| !(file
->f_mode
& mode
)))
3815 if (!io_req_ffs_set(req
))
3816 req
->flags
|= io_file_get_flags(file
) << REQ_F_SUPPORT_NOWAIT_BIT
;
3818 kiocb
->ki_flags
= iocb_flags(file
);
3819 ret
= kiocb_set_rw_flags(kiocb
, req
->rw
.flags
);
3824 * If the file is marked O_NONBLOCK, still allow retry for it if it
3825 * supports async. Otherwise it's impossible to use O_NONBLOCK files
3826 * reliably. If not, or it IOCB_NOWAIT is set, don't retry.
3828 if ((kiocb
->ki_flags
& IOCB_NOWAIT
) ||
3829 ((file
->f_flags
& O_NONBLOCK
) && !io_file_supports_nowait(req
)))
3830 req
->flags
|= REQ_F_NOWAIT
;
3832 if (ctx
->flags
& IORING_SETUP_IOPOLL
) {
3833 if (!(kiocb
->ki_flags
& IOCB_DIRECT
) || !file
->f_op
->iopoll
)
3836 kiocb
->private = NULL
;
3837 kiocb
->ki_flags
|= IOCB_HIPRI
| IOCB_ALLOC_CACHE
;
3838 kiocb
->ki_complete
= io_complete_rw_iopoll
;
3839 req
->iopoll_completed
= 0;
3841 if (kiocb
->ki_flags
& IOCB_HIPRI
)
3843 kiocb
->ki_complete
= io_complete_rw
;
3849 static int io_read(struct io_kiocb
*req
, unsigned int issue_flags
)
3851 struct io_rw_state __s
, *s
= &__s
;
3852 struct iovec
*iovec
;
3853 struct kiocb
*kiocb
= &req
->rw
.kiocb
;
3854 bool force_nonblock
= issue_flags
& IO_URING_F_NONBLOCK
;
3855 struct io_async_rw
*rw
;
3859 if (!req_has_async_data(req
)) {
3860 ret
= io_import_iovec(READ
, req
, &iovec
, s
, issue_flags
);
3861 if (unlikely(ret
< 0))
3864 rw
= req
->async_data
;
3868 * Safe and required to re-import if we're using provided
3869 * buffers, as we dropped the selected one before retry.
3871 if (io_do_buffer_select(req
)) {
3872 ret
= io_import_iovec(READ
, req
, &iovec
, s
, issue_flags
);
3873 if (unlikely(ret
< 0))
3878 * We come here from an earlier attempt, restore our state to
3879 * match in case it doesn't. It's cheap enough that we don't
3880 * need to make this conditional.
3882 iov_iter_restore(&s
->iter
, &s
->iter_state
);
3885 ret
= io_rw_init_file(req
, FMODE_READ
);
3886 if (unlikely(ret
)) {
3890 req
->result
= iov_iter_count(&s
->iter
);
3892 if (force_nonblock
) {
3893 /* If the file doesn't support async, just async punt */
3894 if (unlikely(!io_file_supports_nowait(req
))) {
3895 ret
= io_setup_async_rw(req
, iovec
, s
, true);
3896 return ret
?: -EAGAIN
;
3898 kiocb
->ki_flags
|= IOCB_NOWAIT
;
3900 /* Ensure we clear previously set non-block flag */
3901 kiocb
->ki_flags
&= ~IOCB_NOWAIT
;
3904 ppos
= io_kiocb_update_pos(req
);
3906 ret
= rw_verify_area(READ
, req
->file
, ppos
, req
->result
);
3907 if (unlikely(ret
)) {
3912 ret
= io_iter_do_read(req
, &s
->iter
);
3914 if (ret
== -EAGAIN
|| (req
->flags
& REQ_F_REISSUE
)) {
3915 req
->flags
&= ~REQ_F_REISSUE
;
3916 /* if we can poll, just do that */
3917 if (req
->opcode
== IORING_OP_READ
&& file_can_poll(req
->file
))
3919 /* IOPOLL retry should happen for io-wq threads */
3920 if (!force_nonblock
&& !(req
->ctx
->flags
& IORING_SETUP_IOPOLL
))
3922 /* no retry on NONBLOCK nor RWF_NOWAIT */
3923 if (req
->flags
& REQ_F_NOWAIT
)
3926 } else if (ret
== -EIOCBQUEUED
) {
3928 } else if (ret
== req
->result
|| ret
<= 0 || !force_nonblock
||
3929 (req
->flags
& REQ_F_NOWAIT
) || !need_read_all(req
)) {
3930 /* read all, failed, already did sync or don't want to retry */
3935 * Don't depend on the iter state matching what was consumed, or being
3936 * untouched in case of error. Restore it and we'll advance it
3937 * manually if we need to.
3939 iov_iter_restore(&s
->iter
, &s
->iter_state
);
3941 ret2
= io_setup_async_rw(req
, iovec
, s
, true);
3946 rw
= req
->async_data
;
3949 * Now use our persistent iterator and state, if we aren't already.
3950 * We've restored and mapped the iter to match.
3955 * We end up here because of a partial read, either from
3956 * above or inside this loop. Advance the iter by the bytes
3957 * that were consumed.
3959 iov_iter_advance(&s
->iter
, ret
);
3960 if (!iov_iter_count(&s
->iter
))
3962 rw
->bytes_done
+= ret
;
3963 iov_iter_save_state(&s
->iter
, &s
->iter_state
);
3965 /* if we can retry, do so with the callbacks armed */
3966 if (!io_rw_should_retry(req
)) {
3967 kiocb
->ki_flags
&= ~IOCB_WAITQ
;
3972 * Now retry read with the IOCB_WAITQ parts set in the iocb. If
3973 * we get -EIOCBQUEUED, then we'll get a notification when the
3974 * desired page gets unlocked. We can also get a partial read
3975 * here, and if we do, then just retry at the new offset.
3977 ret
= io_iter_do_read(req
, &s
->iter
);
3978 if (ret
== -EIOCBQUEUED
)
3980 /* we got some bytes, but not all. retry. */
3981 kiocb
->ki_flags
&= ~IOCB_WAITQ
;
3982 iov_iter_restore(&s
->iter
, &s
->iter_state
);
3985 kiocb_done(req
, ret
, issue_flags
);
3987 /* it's faster to check here then delegate to kfree */
3993 static int io_write(struct io_kiocb
*req
, unsigned int issue_flags
)
3995 struct io_rw_state __s
, *s
= &__s
;
3996 struct iovec
*iovec
;
3997 struct kiocb
*kiocb
= &req
->rw
.kiocb
;
3998 bool force_nonblock
= issue_flags
& IO_URING_F_NONBLOCK
;
4002 if (!req_has_async_data(req
)) {
4003 ret
= io_import_iovec(WRITE
, req
, &iovec
, s
, issue_flags
);
4004 if (unlikely(ret
< 0))
4007 struct io_async_rw
*rw
= req
->async_data
;
4010 iov_iter_restore(&s
->iter
, &s
->iter_state
);
4013 ret
= io_rw_init_file(req
, FMODE_WRITE
);
4014 if (unlikely(ret
)) {
4018 req
->result
= iov_iter_count(&s
->iter
);
4020 if (force_nonblock
) {
4021 /* If the file doesn't support async, just async punt */
4022 if (unlikely(!io_file_supports_nowait(req
)))
4025 /* file path doesn't support NOWAIT for non-direct_IO */
4026 if (force_nonblock
&& !(kiocb
->ki_flags
& IOCB_DIRECT
) &&
4027 (req
->flags
& REQ_F_ISREG
))
4030 kiocb
->ki_flags
|= IOCB_NOWAIT
;
4032 /* Ensure we clear previously set non-block flag */
4033 kiocb
->ki_flags
&= ~IOCB_NOWAIT
;
4036 ppos
= io_kiocb_update_pos(req
);
4038 ret
= rw_verify_area(WRITE
, req
->file
, ppos
, req
->result
);
4043 * Open-code file_start_write here to grab freeze protection,
4044 * which will be released by another thread in
4045 * io_complete_rw(). Fool lockdep by telling it the lock got
4046 * released so that it doesn't complain about the held lock when
4047 * we return to userspace.
4049 if (req
->flags
& REQ_F_ISREG
) {
4050 sb_start_write(file_inode(req
->file
)->i_sb
);
4051 __sb_writers_release(file_inode(req
->file
)->i_sb
,
4054 kiocb
->ki_flags
|= IOCB_WRITE
;
4056 if (likely(req
->file
->f_op
->write_iter
))
4057 ret2
= call_write_iter(req
->file
, kiocb
, &s
->iter
);
4058 else if (req
->file
->f_op
->write
)
4059 ret2
= loop_rw_iter(WRITE
, req
, &s
->iter
);
4063 if (req
->flags
& REQ_F_REISSUE
) {
4064 req
->flags
&= ~REQ_F_REISSUE
;
4069 * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just
4070 * retry them without IOCB_NOWAIT.
4072 if (ret2
== -EOPNOTSUPP
&& (kiocb
->ki_flags
& IOCB_NOWAIT
))
4074 /* no retry on NONBLOCK nor RWF_NOWAIT */
4075 if (ret2
== -EAGAIN
&& (req
->flags
& REQ_F_NOWAIT
))
4077 if (!force_nonblock
|| ret2
!= -EAGAIN
) {
4078 /* IOPOLL retry should happen for io-wq threads */
4079 if (ret2
== -EAGAIN
&& (req
->ctx
->flags
& IORING_SETUP_IOPOLL
))
4082 kiocb_done(req
, ret2
, issue_flags
);
4085 iov_iter_restore(&s
->iter
, &s
->iter_state
);
4086 ret
= io_setup_async_rw(req
, iovec
, s
, false);
4087 return ret
?: -EAGAIN
;
4090 /* it's reportedly faster than delegating the null check to kfree() */
4096 static int io_renameat_prep(struct io_kiocb
*req
,
4097 const struct io_uring_sqe
*sqe
)
4099 struct io_rename
*ren
= &req
->rename
;
4100 const char __user
*oldf
, *newf
;
4102 if (unlikely(req
->ctx
->flags
& IORING_SETUP_IOPOLL
))
4104 if (sqe
->ioprio
|| sqe
->buf_index
|| sqe
->splice_fd_in
)
4106 if (unlikely(req
->flags
& REQ_F_FIXED_FILE
))
4109 ren
->old_dfd
= READ_ONCE(sqe
->fd
);
4110 oldf
= u64_to_user_ptr(READ_ONCE(sqe
->addr
));
4111 newf
= u64_to_user_ptr(READ_ONCE(sqe
->addr2
));
4112 ren
->new_dfd
= READ_ONCE(sqe
->len
);
4113 ren
->flags
= READ_ONCE(sqe
->rename_flags
);
4115 ren
->oldpath
= getname(oldf
);
4116 if (IS_ERR(ren
->oldpath
))
4117 return PTR_ERR(ren
->oldpath
);
4119 ren
->newpath
= getname(newf
);
4120 if (IS_ERR(ren
->newpath
)) {
4121 putname(ren
->oldpath
);
4122 return PTR_ERR(ren
->newpath
);
4125 req
->flags
|= REQ_F_NEED_CLEANUP
;
4129 static int io_renameat(struct io_kiocb
*req
, unsigned int issue_flags
)
4131 struct io_rename
*ren
= &req
->rename
;
4134 if (issue_flags
& IO_URING_F_NONBLOCK
)
4137 ret
= do_renameat2(ren
->old_dfd
, ren
->oldpath
, ren
->new_dfd
,
4138 ren
->newpath
, ren
->flags
);
4140 req
->flags
&= ~REQ_F_NEED_CLEANUP
;
4143 io_req_complete(req
, ret
);
4147 static int io_unlinkat_prep(struct io_kiocb
*req
,
4148 const struct io_uring_sqe
*sqe
)
4150 struct io_unlink
*un
= &req
->unlink
;
4151 const char __user
*fname
;
4153 if (unlikely(req
->ctx
->flags
& IORING_SETUP_IOPOLL
))
4155 if (sqe
->ioprio
|| sqe
->off
|| sqe
->len
|| sqe
->buf_index
||
4158 if (unlikely(req
->flags
& REQ_F_FIXED_FILE
))
4161 un
->dfd
= READ_ONCE(sqe
->fd
);
4163 un
->flags
= READ_ONCE(sqe
->unlink_flags
);
4164 if (un
->flags
& ~AT_REMOVEDIR
)
4167 fname
= u64_to_user_ptr(READ_ONCE(sqe
->addr
));
4168 un
->filename
= getname(fname
);
4169 if (IS_ERR(un
->filename
))
4170 return PTR_ERR(un
->filename
);
4172 req
->flags
|= REQ_F_NEED_CLEANUP
;
4176 static int io_unlinkat(struct io_kiocb
*req
, unsigned int issue_flags
)
4178 struct io_unlink
*un
= &req
->unlink
;
4181 if (issue_flags
& IO_URING_F_NONBLOCK
)
4184 if (un
->flags
& AT_REMOVEDIR
)
4185 ret
= do_rmdir(un
->dfd
, un
->filename
);
4187 ret
= do_unlinkat(un
->dfd
, un
->filename
);
4189 req
->flags
&= ~REQ_F_NEED_CLEANUP
;
4192 io_req_complete(req
, ret
);
4196 static int io_mkdirat_prep(struct io_kiocb
*req
,
4197 const struct io_uring_sqe
*sqe
)
4199 struct io_mkdir
*mkd
= &req
->mkdir
;
4200 const char __user
*fname
;
4202 if (unlikely(req
->ctx
->flags
& IORING_SETUP_IOPOLL
))
4204 if (sqe
->ioprio
|| sqe
->off
|| sqe
->rw_flags
|| sqe
->buf_index
||
4207 if (unlikely(req
->flags
& REQ_F_FIXED_FILE
))
4210 mkd
->dfd
= READ_ONCE(sqe
->fd
);
4211 mkd
->mode
= READ_ONCE(sqe
->len
);
4213 fname
= u64_to_user_ptr(READ_ONCE(sqe
->addr
));
4214 mkd
->filename
= getname(fname
);
4215 if (IS_ERR(mkd
->filename
))
4216 return PTR_ERR(mkd
->filename
);
4218 req
->flags
|= REQ_F_NEED_CLEANUP
;
4222 static int io_mkdirat(struct io_kiocb
*req
, unsigned int issue_flags
)
4224 struct io_mkdir
*mkd
= &req
->mkdir
;
4227 if (issue_flags
& IO_URING_F_NONBLOCK
)
4230 ret
= do_mkdirat(mkd
->dfd
, mkd
->filename
, mkd
->mode
);
4232 req
->flags
&= ~REQ_F_NEED_CLEANUP
;
4235 io_req_complete(req
, ret
);
4239 static int io_symlinkat_prep(struct io_kiocb
*req
,
4240 const struct io_uring_sqe
*sqe
)
4242 struct io_symlink
*sl
= &req
->symlink
;
4243 const char __user
*oldpath
, *newpath
;
4245 if (unlikely(req
->ctx
->flags
& IORING_SETUP_IOPOLL
))
4247 if (sqe
->ioprio
|| sqe
->len
|| sqe
->rw_flags
|| sqe
->buf_index
||
4250 if (unlikely(req
->flags
& REQ_F_FIXED_FILE
))
4253 sl
->new_dfd
= READ_ONCE(sqe
->fd
);
4254 oldpath
= u64_to_user_ptr(READ_ONCE(sqe
->addr
));
4255 newpath
= u64_to_user_ptr(READ_ONCE(sqe
->addr2
));
4257 sl
->oldpath
= getname(oldpath
);
4258 if (IS_ERR(sl
->oldpath
))
4259 return PTR_ERR(sl
->oldpath
);
4261 sl
->newpath
= getname(newpath
);
4262 if (IS_ERR(sl
->newpath
)) {
4263 putname(sl
->oldpath
);
4264 return PTR_ERR(sl
->newpath
);
4267 req
->flags
|= REQ_F_NEED_CLEANUP
;
4271 static int io_symlinkat(struct io_kiocb
*req
, unsigned int issue_flags
)
4273 struct io_symlink
*sl
= &req
->symlink
;
4276 if (issue_flags
& IO_URING_F_NONBLOCK
)
4279 ret
= do_symlinkat(sl
->oldpath
, sl
->new_dfd
, sl
->newpath
);
4281 req
->flags
&= ~REQ_F_NEED_CLEANUP
;
4284 io_req_complete(req
, ret
);
4288 static int io_linkat_prep(struct io_kiocb
*req
,
4289 const struct io_uring_sqe
*sqe
)
4291 struct io_hardlink
*lnk
= &req
->hardlink
;
4292 const char __user
*oldf
, *newf
;
4294 if (unlikely(req
->ctx
->flags
& IORING_SETUP_IOPOLL
))
4296 if (sqe
->ioprio
|| sqe
->rw_flags
|| sqe
->buf_index
|| sqe
->splice_fd_in
)
4298 if (unlikely(req
->flags
& REQ_F_FIXED_FILE
))
4301 lnk
->old_dfd
= READ_ONCE(sqe
->fd
);
4302 lnk
->new_dfd
= READ_ONCE(sqe
->len
);
4303 oldf
= u64_to_user_ptr(READ_ONCE(sqe
->addr
));
4304 newf
= u64_to_user_ptr(READ_ONCE(sqe
->addr2
));
4305 lnk
->flags
= READ_ONCE(sqe
->hardlink_flags
);
4307 lnk
->oldpath
= getname(oldf
);
4308 if (IS_ERR(lnk
->oldpath
))
4309 return PTR_ERR(lnk
->oldpath
);
4311 lnk
->newpath
= getname(newf
);
4312 if (IS_ERR(lnk
->newpath
)) {
4313 putname(lnk
->oldpath
);
4314 return PTR_ERR(lnk
->newpath
);
4317 req
->flags
|= REQ_F_NEED_CLEANUP
;
4321 static int io_linkat(struct io_kiocb
*req
, unsigned int issue_flags
)
4323 struct io_hardlink
*lnk
= &req
->hardlink
;
4326 if (issue_flags
& IO_URING_F_NONBLOCK
)
4329 ret
= do_linkat(lnk
->old_dfd
, lnk
->oldpath
, lnk
->new_dfd
,
4330 lnk
->newpath
, lnk
->flags
);
4332 req
->flags
&= ~REQ_F_NEED_CLEANUP
;
4335 io_req_complete(req
, ret
);
4339 static int io_shutdown_prep(struct io_kiocb
*req
,
4340 const struct io_uring_sqe
*sqe
)
4342 #if defined(CONFIG_NET)
4343 if (unlikely(req
->ctx
->flags
& IORING_SETUP_IOPOLL
))
4345 if (unlikely(sqe
->ioprio
|| sqe
->off
|| sqe
->addr
|| sqe
->rw_flags
||
4346 sqe
->buf_index
|| sqe
->splice_fd_in
))
4349 req
->shutdown
.how
= READ_ONCE(sqe
->len
);
4356 static int io_shutdown(struct io_kiocb
*req
, unsigned int issue_flags
)
4358 #if defined(CONFIG_NET)
4359 struct socket
*sock
;
4362 if (issue_flags
& IO_URING_F_NONBLOCK
)
4365 sock
= sock_from_file(req
->file
);
4366 if (unlikely(!sock
))
4369 ret
= __sys_shutdown_sock(sock
, req
->shutdown
.how
);
4372 io_req_complete(req
, ret
);
4379 static int __io_splice_prep(struct io_kiocb
*req
,
4380 const struct io_uring_sqe
*sqe
)
4382 struct io_splice
*sp
= &req
->splice
;
4383 unsigned int valid_flags
= SPLICE_F_FD_IN_FIXED
| SPLICE_F_ALL
;
4385 if (unlikely(req
->ctx
->flags
& IORING_SETUP_IOPOLL
))
4388 sp
->len
= READ_ONCE(sqe
->len
);
4389 sp
->flags
= READ_ONCE(sqe
->splice_flags
);
4390 if (unlikely(sp
->flags
& ~valid_flags
))
4392 sp
->splice_fd_in
= READ_ONCE(sqe
->splice_fd_in
);
4396 static int io_tee_prep(struct io_kiocb
*req
,
4397 const struct io_uring_sqe
*sqe
)
4399 if (READ_ONCE(sqe
->splice_off_in
) || READ_ONCE(sqe
->off
))
4401 return __io_splice_prep(req
, sqe
);
4404 static int io_tee(struct io_kiocb
*req
, unsigned int issue_flags
)
4406 struct io_splice
*sp
= &req
->splice
;
4407 struct file
*out
= sp
->file_out
;
4408 unsigned int flags
= sp
->flags
& ~SPLICE_F_FD_IN_FIXED
;
4412 if (issue_flags
& IO_URING_F_NONBLOCK
)
4415 if (sp
->flags
& SPLICE_F_FD_IN_FIXED
)
4416 in
= io_file_get_fixed(req
, sp
->splice_fd_in
, issue_flags
);
4418 in
= io_file_get_normal(req
, sp
->splice_fd_in
);
4425 ret
= do_tee(in
, out
, sp
->len
, flags
);
4427 if (!(sp
->flags
& SPLICE_F_FD_IN_FIXED
))
4432 io_req_complete(req
, ret
);
4436 static int io_splice_prep(struct io_kiocb
*req
, const struct io_uring_sqe
*sqe
)
4438 struct io_splice
*sp
= &req
->splice
;
4440 sp
->off_in
= READ_ONCE(sqe
->splice_off_in
);
4441 sp
->off_out
= READ_ONCE(sqe
->off
);
4442 return __io_splice_prep(req
, sqe
);
4445 static int io_splice(struct io_kiocb
*req
, unsigned int issue_flags
)
4447 struct io_splice
*sp
= &req
->splice
;
4448 struct file
*out
= sp
->file_out
;
4449 unsigned int flags
= sp
->flags
& ~SPLICE_F_FD_IN_FIXED
;
4450 loff_t
*poff_in
, *poff_out
;
4454 if (issue_flags
& IO_URING_F_NONBLOCK
)
4457 if (sp
->flags
& SPLICE_F_FD_IN_FIXED
)
4458 in
= io_file_get_fixed(req
, sp
->splice_fd_in
, issue_flags
);
4460 in
= io_file_get_normal(req
, sp
->splice_fd_in
);
4466 poff_in
= (sp
->off_in
== -1) ? NULL
: &sp
->off_in
;
4467 poff_out
= (sp
->off_out
== -1) ? NULL
: &sp
->off_out
;
4470 ret
= do_splice(in
, poff_in
, out
, poff_out
, sp
->len
, flags
);
4472 if (!(sp
->flags
& SPLICE_F_FD_IN_FIXED
))
4477 io_req_complete(req
, ret
);
4482 * IORING_OP_NOP just posts a completion event, nothing else.
4484 static int io_nop(struct io_kiocb
*req
, unsigned int issue_flags
)
4486 struct io_ring_ctx
*ctx
= req
->ctx
;
4488 if (unlikely(ctx
->flags
& IORING_SETUP_IOPOLL
))
4491 __io_req_complete(req
, issue_flags
, 0, 0);
4495 static int io_msg_ring_prep(struct io_kiocb
*req
,
4496 const struct io_uring_sqe
*sqe
)
4498 if (unlikely(sqe
->addr
|| sqe
->ioprio
|| sqe
->rw_flags
||
4499 sqe
->splice_fd_in
|| sqe
->buf_index
|| sqe
->personality
))
4502 req
->msg
.user_data
= READ_ONCE(sqe
->off
);
4503 req
->msg
.len
= READ_ONCE(sqe
->len
);
4507 static int io_msg_ring(struct io_kiocb
*req
, unsigned int issue_flags
)
4509 struct io_ring_ctx
*target_ctx
;
4510 struct io_msg
*msg
= &req
->msg
;
4515 if (req
->file
->f_op
!= &io_uring_fops
)
4519 target_ctx
= req
->file
->private_data
;
4521 spin_lock(&target_ctx
->completion_lock
);
4522 filled
= io_fill_cqe_aux(target_ctx
, msg
->user_data
, msg
->len
, 0);
4523 io_commit_cqring(target_ctx
);
4524 spin_unlock(&target_ctx
->completion_lock
);
4527 io_cqring_ev_posted(target_ctx
);
4534 __io_req_complete(req
, issue_flags
, ret
, 0);
4535 /* put file to avoid an attempt to IOPOLL the req */
4536 io_put_file(req
->file
);
4541 static int io_fsync_prep(struct io_kiocb
*req
, const struct io_uring_sqe
*sqe
)
4543 struct io_ring_ctx
*ctx
= req
->ctx
;
4545 if (unlikely(ctx
->flags
& IORING_SETUP_IOPOLL
))
4547 if (unlikely(sqe
->addr
|| sqe
->ioprio
|| sqe
->buf_index
||
4551 req
->sync
.flags
= READ_ONCE(sqe
->fsync_flags
);
4552 if (unlikely(req
->sync
.flags
& ~IORING_FSYNC_DATASYNC
))
4555 req
->sync
.off
= READ_ONCE(sqe
->off
);
4556 req
->sync
.len
= READ_ONCE(sqe
->len
);
4560 static int io_fsync(struct io_kiocb
*req
, unsigned int issue_flags
)
4562 loff_t end
= req
->sync
.off
+ req
->sync
.len
;
4565 /* fsync always requires a blocking context */
4566 if (issue_flags
& IO_URING_F_NONBLOCK
)
4569 ret
= vfs_fsync_range(req
->file
, req
->sync
.off
,
4570 end
> 0 ? end
: LLONG_MAX
,
4571 req
->sync
.flags
& IORING_FSYNC_DATASYNC
);
4574 io_req_complete(req
, ret
);
4578 static int io_fallocate_prep(struct io_kiocb
*req
,
4579 const struct io_uring_sqe
*sqe
)
4581 if (sqe
->ioprio
|| sqe
->buf_index
|| sqe
->rw_flags
||
4584 if (unlikely(req
->ctx
->flags
& IORING_SETUP_IOPOLL
))
4587 req
->sync
.off
= READ_ONCE(sqe
->off
);
4588 req
->sync
.len
= READ_ONCE(sqe
->addr
);
4589 req
->sync
.mode
= READ_ONCE(sqe
->len
);
4593 static int io_fallocate(struct io_kiocb
*req
, unsigned int issue_flags
)
4597 /* fallocate always requiring blocking context */
4598 if (issue_flags
& IO_URING_F_NONBLOCK
)
4600 ret
= vfs_fallocate(req
->file
, req
->sync
.mode
, req
->sync
.off
,
4605 fsnotify_modify(req
->file
);
4606 io_req_complete(req
, ret
);
4610 static int __io_openat_prep(struct io_kiocb
*req
, const struct io_uring_sqe
*sqe
)
4612 const char __user
*fname
;
4615 if (unlikely(req
->ctx
->flags
& IORING_SETUP_IOPOLL
))
4617 if (unlikely(sqe
->ioprio
|| sqe
->buf_index
))
4619 if (unlikely(req
->flags
& REQ_F_FIXED_FILE
))
4622 /* open.how should be already initialised */
4623 if (!(req
->open
.how
.flags
& O_PATH
) && force_o_largefile())
4624 req
->open
.how
.flags
|= O_LARGEFILE
;
4626 req
->open
.dfd
= READ_ONCE(sqe
->fd
);
4627 fname
= u64_to_user_ptr(READ_ONCE(sqe
->addr
));
4628 req
->open
.filename
= getname(fname
);
4629 if (IS_ERR(req
->open
.filename
)) {
4630 ret
= PTR_ERR(req
->open
.filename
);
4631 req
->open
.filename
= NULL
;
4635 req
->open
.file_slot
= READ_ONCE(sqe
->file_index
);
4636 if (req
->open
.file_slot
&& (req
->open
.how
.flags
& O_CLOEXEC
))
4639 req
->open
.nofile
= rlimit(RLIMIT_NOFILE
);
4640 req
->flags
|= REQ_F_NEED_CLEANUP
;
4644 static int io_openat_prep(struct io_kiocb
*req
, const struct io_uring_sqe
*sqe
)
4646 u64 mode
= READ_ONCE(sqe
->len
);
4647 u64 flags
= READ_ONCE(sqe
->open_flags
);
4649 req
->open
.how
= build_open_how(flags
, mode
);
4650 return __io_openat_prep(req
, sqe
);
4653 static int io_openat2_prep(struct io_kiocb
*req
, const struct io_uring_sqe
*sqe
)
4655 struct open_how __user
*how
;
4659 how
= u64_to_user_ptr(READ_ONCE(sqe
->addr2
));
4660 len
= READ_ONCE(sqe
->len
);
4661 if (len
< OPEN_HOW_SIZE_VER0
)
4664 ret
= copy_struct_from_user(&req
->open
.how
, sizeof(req
->open
.how
), how
,
4669 return __io_openat_prep(req
, sqe
);
4672 static int io_openat2(struct io_kiocb
*req
, unsigned int issue_flags
)
4674 struct open_flags op
;
4676 bool resolve_nonblock
, nonblock_set
;
4677 bool fixed
= !!req
->open
.file_slot
;
4680 ret
= build_open_flags(&req
->open
.how
, &op
);
4683 nonblock_set
= op
.open_flag
& O_NONBLOCK
;
4684 resolve_nonblock
= req
->open
.how
.resolve
& RESOLVE_CACHED
;
4685 if (issue_flags
& IO_URING_F_NONBLOCK
) {
4687 * Don't bother trying for O_TRUNC, O_CREAT, or O_TMPFILE open,
4688 * it'll always -EAGAIN
4690 if (req
->open
.how
.flags
& (O_TRUNC
| O_CREAT
| O_TMPFILE
))
4692 op
.lookup_flags
|= LOOKUP_CACHED
;
4693 op
.open_flag
|= O_NONBLOCK
;
4697 ret
= __get_unused_fd_flags(req
->open
.how
.flags
, req
->open
.nofile
);
4702 file
= do_filp_open(req
->open
.dfd
, req
->open
.filename
, &op
);
4705 * We could hang on to this 'fd' on retrying, but seems like
4706 * marginal gain for something that is now known to be a slower
4707 * path. So just put it, and we'll get a new one when we retry.
4712 ret
= PTR_ERR(file
);
4713 /* only retry if RESOLVE_CACHED wasn't already set by application */
4714 if (ret
== -EAGAIN
&&
4715 (!resolve_nonblock
&& (issue_flags
& IO_URING_F_NONBLOCK
)))
4720 if ((issue_flags
& IO_URING_F_NONBLOCK
) && !nonblock_set
)
4721 file
->f_flags
&= ~O_NONBLOCK
;
4722 fsnotify_open(file
);
4725 fd_install(ret
, file
);
4727 ret
= io_install_fixed_file(req
, file
, issue_flags
,
4728 req
->open
.file_slot
- 1);
4730 putname(req
->open
.filename
);
4731 req
->flags
&= ~REQ_F_NEED_CLEANUP
;
4734 __io_req_complete(req
, issue_flags
, ret
, 0);
4738 static int io_openat(struct io_kiocb
*req
, unsigned int issue_flags
)
4740 return io_openat2(req
, issue_flags
);
static int io_remove_buffers_prep(struct io_kiocb *req,
				  const struct io_uring_sqe *sqe)
{
	struct io_provide_buf *p = &req->pbuf;
	u64 tmp;

	if (sqe->ioprio || sqe->rw_flags || sqe->addr || sqe->len || sqe->off ||
	    sqe->splice_fd_in)
		return -EINVAL;

	tmp = READ_ONCE(sqe->fd);
	if (!tmp || tmp > USHRT_MAX)
		return -EINVAL;

	memset(p, 0, sizeof(*p));
	p->nbufs = tmp;
	p->bgid = READ_ONCE(sqe->buf_group);
	return 0;
}
4763 static int __io_remove_buffers(struct io_ring_ctx
*ctx
,
4764 struct io_buffer_list
*bl
, unsigned nbufs
)
4768 /* shouldn't happen */
4772 /* the head kbuf is the list itself */
4773 while (!list_empty(&bl
->buf_list
)) {
4774 struct io_buffer
*nxt
;
4776 nxt
= list_first_entry(&bl
->buf_list
, struct io_buffer
, list
);
4777 list_del(&nxt
->list
);
static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_provide_buf *p = &req->pbuf;
	struct io_ring_ctx *ctx = req->ctx;
	struct io_buffer_list *bl;
	int ret = 0;
	bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;

	io_ring_submit_lock(ctx, needs_lock);

	lockdep_assert_held(&ctx->uring_lock);

	ret = -ENOENT;
	bl = io_buffer_get_list(ctx, p->bgid);
	if (bl)
		ret = __io_remove_buffers(ctx, bl, p->nbufs);
	if (ret < 0)
		req_set_fail(req);

	/* complete before unlock, IOPOLL may need the lock */
	__io_req_complete(req, issue_flags, ret, 0);
	io_ring_submit_unlock(ctx, needs_lock);
	return 0;
}
4812 static int io_provide_buffers_prep(struct io_kiocb
*req
,
4813 const struct io_uring_sqe
*sqe
)
4815 unsigned long size
, tmp_check
;
4816 struct io_provide_buf
*p
= &req
->pbuf
;
4819 if (sqe
->ioprio
|| sqe
->rw_flags
|| sqe
->splice_fd_in
)
4822 tmp
= READ_ONCE(sqe
->fd
);
4823 if (!tmp
|| tmp
> USHRT_MAX
)
4826 p
->addr
= READ_ONCE(sqe
->addr
);
4827 p
->len
= READ_ONCE(sqe
->len
);
4829 if (check_mul_overflow((unsigned long)p
->len
, (unsigned long)p
->nbufs
,
4832 if (check_add_overflow((unsigned long)p
->addr
, size
, &tmp_check
))
4835 size
= (unsigned long)p
->len
* p
->nbufs
;
4836 if (!access_ok(u64_to_user_ptr(p
->addr
), size
))
4839 p
->bgid
= READ_ONCE(sqe
->buf_group
);
4840 tmp
= READ_ONCE(sqe
->off
);
4841 if (tmp
> USHRT_MAX
)
static int io_refill_buffer_cache(struct io_ring_ctx *ctx)
{
	struct io_buffer *buf;
	struct page *page;
	int bufs_in_page;

	/*
	 * Completions that don't happen inline (eg not under uring_lock) will
	 * add to ->io_buffers_comp. If we don't have any free buffers, check
	 * the completion list and splice those entries first.
	 */
	if (!list_empty_careful(&ctx->io_buffers_comp)) {
		spin_lock(&ctx->completion_lock);
		if (!list_empty(&ctx->io_buffers_comp)) {
			list_splice_init(&ctx->io_buffers_comp,
						&ctx->io_buffers_cache);
			spin_unlock(&ctx->completion_lock);
			return 0;
		}
		spin_unlock(&ctx->completion_lock);
	}

	/*
	 * No free buffers and no completion entries either. Allocate a new
	 * page worth of buffer entries and add those to our freelist.
	 */
	page = alloc_page(GFP_KERNEL_ACCOUNT);
	if (!page)
		return -ENOMEM;

	list_add(&page->lru, &ctx->io_buffers_pages);

	buf = page_address(page);
	bufs_in_page = PAGE_SIZE / sizeof(*buf);
	while (bufs_in_page) {
		list_add_tail(&buf->list, &ctx->io_buffers_cache);
		buf++;
		bufs_in_page--;
	}

	return 0;
}
static int io_add_buffers(struct io_ring_ctx *ctx, struct io_provide_buf *pbuf,
			  struct io_buffer_list *bl)
{
	struct io_buffer *buf;
	u64 addr = pbuf->addr;
	int i, bid = pbuf->bid;

	for (i = 0; i < pbuf->nbufs; i++) {
		if (list_empty(&ctx->io_buffers_cache) &&
		    io_refill_buffer_cache(ctx))
			break;
		buf = list_first_entry(&ctx->io_buffers_cache, struct io_buffer,
					list);
		list_move_tail(&buf->list, &bl->buf_list);
		buf->addr = addr;
		buf->len = min_t(__u32, pbuf->len, MAX_RW_COUNT);
		buf->bid = bid;
		buf->bgid = pbuf->bgid;
		addr += pbuf->len;
		bid++;
	}

	return i ? 0 : -ENOMEM;
}
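/*
 * Partial success is still success here: if at least one buffer was queued
 * before the cache ran dry, the operation completes with 0 and the buffers
 * already moved onto the buffer list stay available; only a completely empty
 * batch reports -ENOMEM.
 */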
static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_provide_buf *p = &req->pbuf;
	struct io_ring_ctx *ctx = req->ctx;
	struct io_buffer_list *bl;
	int ret = 0;
	bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;

	io_ring_submit_lock(ctx, needs_lock);

	lockdep_assert_held(&ctx->uring_lock);

	bl = io_buffer_get_list(ctx, p->bgid);
	if (unlikely(!bl)) {
		bl = kmalloc(sizeof(*bl), GFP_KERNEL);
		if (!bl) {
			ret = -ENOMEM;
			goto err;
		}
		io_buffer_add_list(ctx, bl, p->bgid);
	}

	ret = io_add_buffers(ctx, p, bl);
err:
	if (ret < 0)
		req_set_fail(req);
	/* complete before unlock, IOPOLL may need the lock */
	__io_req_complete(req, issue_flags, ret, 0);
	io_ring_submit_unlock(ctx, needs_lock);
	return 0;
}
4948 static int io_epoll_ctl_prep(struct io_kiocb
*req
,
4949 const struct io_uring_sqe
*sqe
)
4951 #if defined(CONFIG_EPOLL)
4952 if (sqe
->ioprio
|| sqe
->buf_index
|| sqe
->splice_fd_in
)
4954 if (unlikely(req
->ctx
->flags
& IORING_SETUP_IOPOLL
))
4957 req
->epoll
.epfd
= READ_ONCE(sqe
->fd
);
4958 req
->epoll
.op
= READ_ONCE(sqe
->len
);
4959 req
->epoll
.fd
= READ_ONCE(sqe
->off
);
4961 if (ep_op_has_event(req
->epoll
.op
)) {
4962 struct epoll_event __user
*ev
;
4964 ev
= u64_to_user_ptr(READ_ONCE(sqe
->addr
));
4965 if (copy_from_user(&req
->epoll
.event
, ev
, sizeof(*ev
)))
4975 static int io_epoll_ctl(struct io_kiocb
*req
, unsigned int issue_flags
)
4977 #if defined(CONFIG_EPOLL)
4978 struct io_epoll
*ie
= &req
->epoll
;
4980 bool force_nonblock
= issue_flags
& IO_URING_F_NONBLOCK
;
4982 ret
= do_epoll_ctl(ie
->epfd
, ie
->op
, ie
->fd
, &ie
->event
, force_nonblock
);
4983 if (force_nonblock
&& ret
== -EAGAIN
)
4988 __io_req_complete(req
, issue_flags
, ret
, 0);
4995 static int io_madvise_prep(struct io_kiocb
*req
, const struct io_uring_sqe
*sqe
)
4997 #if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
4998 if (sqe
->ioprio
|| sqe
->buf_index
|| sqe
->off
|| sqe
->splice_fd_in
)
5000 if (unlikely(req
->ctx
->flags
& IORING_SETUP_IOPOLL
))
5003 req
->madvise
.addr
= READ_ONCE(sqe
->addr
);
5004 req
->madvise
.len
= READ_ONCE(sqe
->len
);
5005 req
->madvise
.advice
= READ_ONCE(sqe
->fadvise_advice
);
static int io_madvise(struct io_kiocb *req, unsigned int issue_flags)
{
#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
	struct io_madvise *ma = &req->madvise;
	int ret;

	if (issue_flags & IO_URING_F_NONBLOCK)
		return -EAGAIN;

	ret = do_madvise(current->mm, ma->addr, ma->len, ma->advice);
	if (ret < 0)
		req_set_fail(req);
	io_req_complete(req, ret);
	return 0;
#else
	return -EOPNOTSUPP;
#endif
}
static int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	if (sqe->ioprio || sqe->buf_index || sqe->addr || sqe->splice_fd_in)
		return -EINVAL;
	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
		return -EINVAL;

	req->fadvise.offset = READ_ONCE(sqe->off);
	req->fadvise.len = READ_ONCE(sqe->len);
	req->fadvise.advice = READ_ONCE(sqe->fadvise_advice);
	return 0;
}

static int io_fadvise(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_fadvise *fa = &req->fadvise;
	int ret;

	if (issue_flags & IO_URING_F_NONBLOCK) {
		switch (fa->advice) {
		case POSIX_FADV_NORMAL:
		case POSIX_FADV_RANDOM:
		case POSIX_FADV_SEQUENTIAL:
			break;
		default:
			return -EAGAIN;
		}
	}

	ret = vfs_fadvise(req->file, fa->offset, fa->len, fa->advice);
	if (ret < 0)
		req_set_fail(req);
	__io_req_complete(req, issue_flags, ret, 0);
	return 0;
}
5067 static int io_statx_prep(struct io_kiocb
*req
, const struct io_uring_sqe
*sqe
)
5069 const char __user
*path
;
5071 if (unlikely(req
->ctx
->flags
& IORING_SETUP_IOPOLL
))
5073 if (sqe
->ioprio
|| sqe
->buf_index
|| sqe
->splice_fd_in
)
5075 if (req
->flags
& REQ_F_FIXED_FILE
)
5078 req
->statx
.dfd
= READ_ONCE(sqe
->fd
);
5079 req
->statx
.mask
= READ_ONCE(sqe
->len
);
5080 path
= u64_to_user_ptr(READ_ONCE(sqe
->addr
));
5081 req
->statx
.buffer
= u64_to_user_ptr(READ_ONCE(sqe
->addr2
));
5082 req
->statx
.flags
= READ_ONCE(sqe
->statx_flags
);
5084 req
->statx
.filename
= getname_flags(path
,
5085 getname_statx_lookup_flags(req
->statx
.flags
),
5088 if (IS_ERR(req
->statx
.filename
)) {
5089 int ret
= PTR_ERR(req
->statx
.filename
);
5091 req
->statx
.filename
= NULL
;
5095 req
->flags
|= REQ_F_NEED_CLEANUP
;
static int io_statx(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_statx *ctx = &req->statx;
	int ret;

	if (issue_flags & IO_URING_F_NONBLOCK)
		return -EAGAIN;

	ret = do_statx(ctx->dfd, ctx->filename, ctx->flags, ctx->mask,
		       ctx->buffer);
	io_req_complete(req, ret);
	return 0;
}
5116 static int io_close_prep(struct io_kiocb
*req
, const struct io_uring_sqe
*sqe
)
5118 if (unlikely(req
->ctx
->flags
& IORING_SETUP_IOPOLL
))
5120 if (sqe
->ioprio
|| sqe
->off
|| sqe
->addr
|| sqe
->len
||
5121 sqe
->rw_flags
|| sqe
->buf_index
)
5123 if (req
->flags
& REQ_F_FIXED_FILE
)
5126 req
->close
.fd
= READ_ONCE(sqe
->fd
);
5127 req
->close
.file_slot
= READ_ONCE(sqe
->file_index
);
5128 if (req
->close
.file_slot
&& req
->close
.fd
)
5134 static int io_close(struct io_kiocb
*req
, unsigned int issue_flags
)
5136 struct files_struct
*files
= current
->files
;
5137 struct io_close
*close
= &req
->close
;
5138 struct fdtable
*fdt
;
5139 struct file
*file
= NULL
;
5142 if (req
->close
.file_slot
) {
5143 ret
= io_close_fixed(req
, issue_flags
);
5147 spin_lock(&files
->file_lock
);
5148 fdt
= files_fdtable(files
);
5149 if (close
->fd
>= fdt
->max_fds
) {
5150 spin_unlock(&files
->file_lock
);
5153 file
= fdt
->fd
[close
->fd
];
5154 if (!file
|| file
->f_op
== &io_uring_fops
) {
5155 spin_unlock(&files
->file_lock
);
5160 /* if the file has a flush method, be safe and punt to async */
5161 if (file
->f_op
->flush
&& (issue_flags
& IO_URING_F_NONBLOCK
)) {
5162 spin_unlock(&files
->file_lock
);
5166 ret
= __close_fd_get_file(close
->fd
, &file
);
5167 spin_unlock(&files
->file_lock
);
5174 /* No ->flush() or already async, safely close from here */
5175 ret
= filp_close(file
, current
->files
);
5181 __io_req_complete(req
, issue_flags
, ret
, 0);
5185 static int io_sfr_prep(struct io_kiocb
*req
, const struct io_uring_sqe
*sqe
)
5187 struct io_ring_ctx
*ctx
= req
->ctx
;
5189 if (unlikely(ctx
->flags
& IORING_SETUP_IOPOLL
))
5191 if (unlikely(sqe
->addr
|| sqe
->ioprio
|| sqe
->buf_index
||
5195 req
->sync
.off
= READ_ONCE(sqe
->off
);
5196 req
->sync
.len
= READ_ONCE(sqe
->len
);
5197 req
->sync
.flags
= READ_ONCE(sqe
->sync_range_flags
);
5201 static int io_sync_file_range(struct io_kiocb
*req
, unsigned int issue_flags
)
5205 /* sync_file_range always requires a blocking context */
5206 if (issue_flags
& IO_URING_F_NONBLOCK
)
5209 ret
= sync_file_range(req
->file
, req
->sync
.off
, req
->sync
.len
,
5213 io_req_complete(req
, ret
);
5217 #if defined(CONFIG_NET)
5218 static int io_setup_async_msg(struct io_kiocb
*req
,
5219 struct io_async_msghdr
*kmsg
)
5221 struct io_async_msghdr
*async_msg
= req
->async_data
;
5225 if (io_alloc_async_data(req
)) {
5226 kfree(kmsg
->free_iov
);
5229 async_msg
= req
->async_data
;
5230 req
->flags
|= REQ_F_NEED_CLEANUP
;
5231 memcpy(async_msg
, kmsg
, sizeof(*kmsg
));
5232 async_msg
->msg
.msg_name
= &async_msg
->addr
;
5233 /* if were using fast_iov, set it to the new one */
5234 if (!async_msg
->free_iov
)
5235 async_msg
->msg
.msg_iter
.iov
= async_msg
->fast_iov
;
5240 static int io_sendmsg_copy_hdr(struct io_kiocb
*req
,
5241 struct io_async_msghdr
*iomsg
)
5243 iomsg
->msg
.msg_name
= &iomsg
->addr
;
5244 iomsg
->free_iov
= iomsg
->fast_iov
;
5245 return sendmsg_copy_msghdr(&iomsg
->msg
, req
->sr_msg
.umsg
,
5246 req
->sr_msg
.msg_flags
, &iomsg
->free_iov
);
5249 static int io_sendmsg_prep_async(struct io_kiocb
*req
)
5253 ret
= io_sendmsg_copy_hdr(req
, req
->async_data
);
5255 req
->flags
|= REQ_F_NEED_CLEANUP
;
5259 static int io_sendmsg_prep(struct io_kiocb
*req
, const struct io_uring_sqe
*sqe
)
5261 struct io_sr_msg
*sr
= &req
->sr_msg
;
5263 if (unlikely(req
->ctx
->flags
& IORING_SETUP_IOPOLL
))
5265 if (unlikely(sqe
->addr2
|| sqe
->file_index
|| sqe
->ioprio
))
5268 sr
->umsg
= u64_to_user_ptr(READ_ONCE(sqe
->addr
));
5269 sr
->len
= READ_ONCE(sqe
->len
);
5270 sr
->msg_flags
= READ_ONCE(sqe
->msg_flags
) | MSG_NOSIGNAL
;
5271 if (sr
->msg_flags
& MSG_DONTWAIT
)
5272 req
->flags
|= REQ_F_NOWAIT
;
5274 #ifdef CONFIG_COMPAT
5275 if (req
->ctx
->compat
)
5276 sr
->msg_flags
|= MSG_CMSG_COMPAT
;
5281 static int io_sendmsg(struct io_kiocb
*req
, unsigned int issue_flags
)
5283 struct io_async_msghdr iomsg
, *kmsg
;
5284 struct socket
*sock
;
5289 sock
= sock_from_file(req
->file
);
5290 if (unlikely(!sock
))
5293 if (req_has_async_data(req
)) {
5294 kmsg
= req
->async_data
;
5296 ret
= io_sendmsg_copy_hdr(req
, &iomsg
);
5302 flags
= req
->sr_msg
.msg_flags
;
5303 if (issue_flags
& IO_URING_F_NONBLOCK
)
5304 flags
|= MSG_DONTWAIT
;
5305 if (flags
& MSG_WAITALL
)
5306 min_ret
= iov_iter_count(&kmsg
->msg
.msg_iter
);
5308 ret
= __sys_sendmsg_sock(sock
, &kmsg
->msg
, flags
);
5310 if (ret
< min_ret
) {
5311 if (ret
== -EAGAIN
&& (issue_flags
& IO_URING_F_NONBLOCK
))
5312 return io_setup_async_msg(req
, kmsg
);
5313 if (ret
== -ERESTARTSYS
)
5317 /* fast path, check for non-NULL to avoid function call */
5319 kfree(kmsg
->free_iov
);
5320 req
->flags
&= ~REQ_F_NEED_CLEANUP
;
5321 __io_req_complete(req
, issue_flags
, ret
, 0);
5325 static int io_send(struct io_kiocb
*req
, unsigned int issue_flags
)
5327 struct io_sr_msg
*sr
= &req
->sr_msg
;
5330 struct socket
*sock
;
5335 sock
= sock_from_file(req
->file
);
5336 if (unlikely(!sock
))
5339 ret
= import_single_range(WRITE
, sr
->buf
, sr
->len
, &iov
, &msg
.msg_iter
);
5343 msg
.msg_name
= NULL
;
5344 msg
.msg_control
= NULL
;
5345 msg
.msg_controllen
= 0;
5346 msg
.msg_namelen
= 0;
5348 flags
= req
->sr_msg
.msg_flags
;
5349 if (issue_flags
& IO_URING_F_NONBLOCK
)
5350 flags
|= MSG_DONTWAIT
;
5351 if (flags
& MSG_WAITALL
)
5352 min_ret
= iov_iter_count(&msg
.msg_iter
);
5354 msg
.msg_flags
= flags
;
5355 ret
= sock_sendmsg(sock
, &msg
);
5356 if (ret
< min_ret
) {
5357 if (ret
== -EAGAIN
&& (issue_flags
& IO_URING_F_NONBLOCK
))
5359 if (ret
== -ERESTARTSYS
)
5363 __io_req_complete(req
, issue_flags
, ret
, 0);
5367 static int __io_recvmsg_copy_hdr(struct io_kiocb
*req
,
5368 struct io_async_msghdr
*iomsg
)
5370 struct io_sr_msg
*sr
= &req
->sr_msg
;
5371 struct iovec __user
*uiov
;
5375 ret
= __copy_msghdr_from_user(&iomsg
->msg
, sr
->umsg
,
5376 &iomsg
->uaddr
, &uiov
, &iov_len
);
5380 if (req
->flags
& REQ_F_BUFFER_SELECT
) {
5383 if (copy_from_user(iomsg
->fast_iov
, uiov
, sizeof(*uiov
)))
5385 sr
->len
= iomsg
->fast_iov
[0].iov_len
;
5386 iomsg
->free_iov
= NULL
;
5388 iomsg
->free_iov
= iomsg
->fast_iov
;
5389 ret
= __import_iovec(READ
, uiov
, iov_len
, UIO_FASTIOV
,
5390 &iomsg
->free_iov
, &iomsg
->msg
.msg_iter
,
5399 #ifdef CONFIG_COMPAT
5400 static int __io_compat_recvmsg_copy_hdr(struct io_kiocb
*req
,
5401 struct io_async_msghdr
*iomsg
)
5403 struct io_sr_msg
*sr
= &req
->sr_msg
;
5404 struct compat_iovec __user
*uiov
;
5409 ret
= __get_compat_msghdr(&iomsg
->msg
, sr
->umsg_compat
, &iomsg
->uaddr
,
5414 uiov
= compat_ptr(ptr
);
5415 if (req
->flags
& REQ_F_BUFFER_SELECT
) {
5416 compat_ssize_t clen
;
5420 if (!access_ok(uiov
, sizeof(*uiov
)))
5422 if (__get_user(clen
, &uiov
->iov_len
))
5427 iomsg
->free_iov
= NULL
;
5429 iomsg
->free_iov
= iomsg
->fast_iov
;
5430 ret
= __import_iovec(READ
, (struct iovec __user
*)uiov
, len
,
5431 UIO_FASTIOV
, &iomsg
->free_iov
,
5432 &iomsg
->msg
.msg_iter
, true);
5441 static int io_recvmsg_copy_hdr(struct io_kiocb
*req
,
5442 struct io_async_msghdr
*iomsg
)
5444 iomsg
->msg
.msg_name
= &iomsg
->addr
;
5446 #ifdef CONFIG_COMPAT
5447 if (req
->ctx
->compat
)
5448 return __io_compat_recvmsg_copy_hdr(req
, iomsg
);
5451 return __io_recvmsg_copy_hdr(req
, iomsg
);
5454 static struct io_buffer
*io_recv_buffer_select(struct io_kiocb
*req
,
5455 unsigned int issue_flags
)
5457 struct io_sr_msg
*sr
= &req
->sr_msg
;
5459 return io_buffer_select(req
, &sr
->len
, sr
->bgid
, issue_flags
);
5462 static int io_recvmsg_prep_async(struct io_kiocb
*req
)
5466 ret
= io_recvmsg_copy_hdr(req
, req
->async_data
);
5468 req
->flags
|= REQ_F_NEED_CLEANUP
;
5472 static int io_recvmsg_prep(struct io_kiocb
*req
, const struct io_uring_sqe
*sqe
)
5474 struct io_sr_msg
*sr
= &req
->sr_msg
;
5476 if (unlikely(req
->ctx
->flags
& IORING_SETUP_IOPOLL
))
5478 if (unlikely(sqe
->addr2
|| sqe
->file_index
|| sqe
->ioprio
))
5481 sr
->umsg
= u64_to_user_ptr(READ_ONCE(sqe
->addr
));
5482 sr
->len
= READ_ONCE(sqe
->len
);
5483 sr
->bgid
= READ_ONCE(sqe
->buf_group
);
5484 sr
->msg_flags
= READ_ONCE(sqe
->msg_flags
) | MSG_NOSIGNAL
;
5485 if (sr
->msg_flags
& MSG_DONTWAIT
)
5486 req
->flags
|= REQ_F_NOWAIT
;
5488 #ifdef CONFIG_COMPAT
5489 if (req
->ctx
->compat
)
5490 sr
->msg_flags
|= MSG_CMSG_COMPAT
;
static bool io_net_retry(struct socket *sock, int flags)
{
	if (!(flags & MSG_WAITALL))
		return false;
	return sock->type == SOCK_STREAM || sock->type == SOCK_SEQPACKET;
}
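/*
 * A short receive is only worth retrying when MSG_WAITALL was requested and
 * the socket preserves ordering across calls (stream or seqpacket); for other
 * socket types a retry would not continue the same message, so the partial
 * result is returned as-is.
 */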
5503 static int io_recvmsg(struct io_kiocb
*req
, unsigned int issue_flags
)
5505 struct io_async_msghdr iomsg
, *kmsg
;
5506 struct io_sr_msg
*sr
= &req
->sr_msg
;
5507 struct socket
*sock
;
5508 struct io_buffer
*kbuf
;
5510 int ret
, min_ret
= 0;
5511 bool force_nonblock
= issue_flags
& IO_URING_F_NONBLOCK
;
5513 sock
= sock_from_file(req
->file
);
5514 if (unlikely(!sock
))
5517 if (req_has_async_data(req
)) {
5518 kmsg
= req
->async_data
;
5520 ret
= io_recvmsg_copy_hdr(req
, &iomsg
);
5526 if (req
->flags
& REQ_F_BUFFER_SELECT
) {
5527 kbuf
= io_recv_buffer_select(req
, issue_flags
);
5529 return PTR_ERR(kbuf
);
5530 kmsg
->fast_iov
[0].iov_base
= u64_to_user_ptr(kbuf
->addr
);
5531 kmsg
->fast_iov
[0].iov_len
= req
->sr_msg
.len
;
5532 iov_iter_init(&kmsg
->msg
.msg_iter
, READ
, kmsg
->fast_iov
,
5533 1, req
->sr_msg
.len
);
5536 flags
= req
->sr_msg
.msg_flags
;
5538 flags
|= MSG_DONTWAIT
;
5539 if (flags
& MSG_WAITALL
)
5540 min_ret
= iov_iter_count(&kmsg
->msg
.msg_iter
);
5542 ret
= __sys_recvmsg_sock(sock
, &kmsg
->msg
, req
->sr_msg
.umsg
,
5543 kmsg
->uaddr
, flags
);
5544 if (ret
< min_ret
) {
5545 if (ret
== -EAGAIN
&& force_nonblock
)
5546 return io_setup_async_msg(req
, kmsg
);
5547 if (ret
== -ERESTARTSYS
)
5549 if (ret
> 0 && io_net_retry(sock
, flags
)) {
5551 req
->flags
|= REQ_F_PARTIAL_IO
;
5552 return io_setup_async_msg(req
, kmsg
);
5555 } else if ((flags
& MSG_WAITALL
) && (kmsg
->msg
.msg_flags
& (MSG_TRUNC
| MSG_CTRUNC
))) {
5559 /* fast path, check for non-NULL to avoid function call */
5561 kfree(kmsg
->free_iov
);
5562 req
->flags
&= ~REQ_F_NEED_CLEANUP
;
5565 else if (sr
->done_io
)
5567 __io_req_complete(req
, issue_flags
, ret
, io_put_kbuf(req
, issue_flags
));
5571 static int io_recv(struct io_kiocb
*req
, unsigned int issue_flags
)
5573 struct io_buffer
*kbuf
;
5574 struct io_sr_msg
*sr
= &req
->sr_msg
;
5576 void __user
*buf
= sr
->buf
;
5577 struct socket
*sock
;
5580 int ret
, min_ret
= 0;
5581 bool force_nonblock
= issue_flags
& IO_URING_F_NONBLOCK
;
5583 sock
= sock_from_file(req
->file
);
5584 if (unlikely(!sock
))
5587 if (req
->flags
& REQ_F_BUFFER_SELECT
) {
5588 kbuf
= io_recv_buffer_select(req
, issue_flags
);
5590 return PTR_ERR(kbuf
);
5591 buf
= u64_to_user_ptr(kbuf
->addr
);
5594 ret
= import_single_range(READ
, buf
, sr
->len
, &iov
, &msg
.msg_iter
);
5598 msg
.msg_name
= NULL
;
5599 msg
.msg_control
= NULL
;
5600 msg
.msg_controllen
= 0;
5601 msg
.msg_namelen
= 0;
5602 msg
.msg_iocb
= NULL
;
5605 flags
= req
->sr_msg
.msg_flags
;
5607 flags
|= MSG_DONTWAIT
;
5608 if (flags
& MSG_WAITALL
)
5609 min_ret
= iov_iter_count(&msg
.msg_iter
);
5611 ret
= sock_recvmsg(sock
, &msg
, flags
);
5612 if (ret
< min_ret
) {
5613 if (ret
== -EAGAIN
&& force_nonblock
)
5615 if (ret
== -ERESTARTSYS
)
5617 if (ret
> 0 && io_net_retry(sock
, flags
)) {
5621 req
->flags
|= REQ_F_PARTIAL_IO
;
5625 } else if ((flags
& MSG_WAITALL
) && (msg
.msg_flags
& (MSG_TRUNC
| MSG_CTRUNC
))) {
5632 else if (sr
->done_io
)
5634 __io_req_complete(req
, issue_flags
, ret
, io_put_kbuf(req
, issue_flags
));
5638 static int io_accept_prep(struct io_kiocb
*req
, const struct io_uring_sqe
*sqe
)
5640 struct io_accept
*accept
= &req
->accept
;
5642 if (unlikely(req
->ctx
->flags
& IORING_SETUP_IOPOLL
))
5644 if (sqe
->ioprio
|| sqe
->len
|| sqe
->buf_index
)
5647 accept
->addr
= u64_to_user_ptr(READ_ONCE(sqe
->addr
));
5648 accept
->addr_len
= u64_to_user_ptr(READ_ONCE(sqe
->addr2
));
5649 accept
->flags
= READ_ONCE(sqe
->accept_flags
);
5650 accept
->nofile
= rlimit(RLIMIT_NOFILE
);
5652 accept
->file_slot
= READ_ONCE(sqe
->file_index
);
5653 if (accept
->file_slot
&& (accept
->flags
& SOCK_CLOEXEC
))
5655 if (accept
->flags
& ~(SOCK_CLOEXEC
| SOCK_NONBLOCK
))
5657 if (SOCK_NONBLOCK
!= O_NONBLOCK
&& (accept
->flags
& SOCK_NONBLOCK
))
5658 accept
->flags
= (accept
->flags
& ~SOCK_NONBLOCK
) | O_NONBLOCK
;
5662 static int io_accept(struct io_kiocb
*req
, unsigned int issue_flags
)
5664 struct io_accept
*accept
= &req
->accept
;
5665 bool force_nonblock
= issue_flags
& IO_URING_F_NONBLOCK
;
5666 unsigned int file_flags
= force_nonblock
? O_NONBLOCK
: 0;
5667 bool fixed
= !!accept
->file_slot
;
5672 fd
= __get_unused_fd_flags(accept
->flags
, accept
->nofile
);
5673 if (unlikely(fd
< 0))
5676 file
= do_accept(req
->file
, file_flags
, accept
->addr
, accept
->addr_len
,
5681 ret
= PTR_ERR(file
);
5682 if (ret
== -EAGAIN
&& force_nonblock
)
5684 if (ret
== -ERESTARTSYS
)
5687 } else if (!fixed
) {
5688 fd_install(fd
, file
);
5691 ret
= io_install_fixed_file(req
, file
, issue_flags
,
5692 accept
->file_slot
- 1);
5694 __io_req_complete(req
, issue_flags
, ret
, 0);
static int io_connect_prep_async(struct io_kiocb *req)
{
	struct io_async_connect *io = req->async_data;
	struct io_connect *conn = &req->connect;

	return move_addr_to_kernel(conn->addr, conn->addr_len, &io->address);
}

static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_connect *conn = &req->connect;

	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
		return -EINVAL;
	if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags ||
	    sqe->splice_fd_in)
		return -EINVAL;

	conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
	conn->addr_len = READ_ONCE(sqe->addr2);
	return 0;
}
5721 static int io_connect(struct io_kiocb
*req
, unsigned int issue_flags
)
5723 struct io_async_connect __io
, *io
;
5724 unsigned file_flags
;
5726 bool force_nonblock
= issue_flags
& IO_URING_F_NONBLOCK
;
5728 if (req_has_async_data(req
)) {
5729 io
= req
->async_data
;
5731 ret
= move_addr_to_kernel(req
->connect
.addr
,
5732 req
->connect
.addr_len
,
5739 file_flags
= force_nonblock
? O_NONBLOCK
: 0;
5741 ret
= __sys_connect_file(req
->file
, &io
->address
,
5742 req
->connect
.addr_len
, file_flags
);
5743 if ((ret
== -EAGAIN
|| ret
== -EINPROGRESS
) && force_nonblock
) {
5744 if (req_has_async_data(req
))
5746 if (io_alloc_async_data(req
)) {
5750 memcpy(req
->async_data
, &__io
, sizeof(__io
));
5753 if (ret
== -ERESTARTSYS
)
5758 __io_req_complete(req
, issue_flags
, ret
, 0);
5761 #else /* !CONFIG_NET */
5762 #define IO_NETOP_FN(op) \
5763 static int io_##op(struct io_kiocb *req, unsigned int issue_flags) \
5765 return -EOPNOTSUPP; \
5768 #define IO_NETOP_PREP(op) \
5770 static int io_##op##_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) \
5772 return -EOPNOTSUPP; \
5775 #define IO_NETOP_PREP_ASYNC(op) \
5777 static int io_##op##_prep_async(struct io_kiocb *req) \
5779 return -EOPNOTSUPP; \
5782 IO_NETOP_PREP_ASYNC(sendmsg
);
5783 IO_NETOP_PREP_ASYNC(recvmsg
);
5784 IO_NETOP_PREP_ASYNC(connect
);
5785 IO_NETOP_PREP(accept
);
5788 #endif /* CONFIG_NET */
struct io_poll_table {
	struct poll_table_struct pt;
	struct io_kiocb *req;
	int nr_entries;
	int error;
};

#define IO_POLL_CANCEL_FLAG	BIT(31)
#define IO_POLL_REF_MASK	GENMASK(30, 0)

/*
 * If the refs part of ->poll_refs (see IO_POLL_REF_MASK) is 0, it's free. We
 * can bump it and acquire ownership. A request must not be modified without
 * owning it; this prevents races when enqueueing task_work and between arming
 * poll and wakeups.
 */
static inline bool io_poll_get_ownership(struct io_kiocb *req)
{
	return !(atomic_fetch_inc(&req->poll_refs) & IO_POLL_REF_MASK);
}

static void io_poll_mark_cancelled(struct io_kiocb *req)
{
	atomic_or(IO_POLL_CANCEL_FLAG, &req->poll_refs);
}
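/*
 * In other words: the low 31 bits of ->poll_refs act as a pending-work
 * counter. Whoever bumps it from zero owns the request and must process
 * events; later wakeups only add a reference and rely on the owner to notice
 * them when it drops its references in io_poll_check_events().
 */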
static struct io_poll_iocb *io_poll_get_double(struct io_kiocb *req)
{
	/* pure poll stashes this in ->async_data, poll driven retry elsewhere */
	if (req->opcode == IORING_OP_POLL_ADD)
		return req->async_data;
	return req->apoll->double_poll;
}

static struct io_poll_iocb *io_poll_get_single(struct io_kiocb *req)
{
	if (req->opcode == IORING_OP_POLL_ADD)
		return &req->poll;
	return &req->apoll->poll;
}
static void io_poll_req_insert(struct io_kiocb *req)
{
	struct io_ring_ctx *ctx = req->ctx;
	struct hlist_head *list;

	list = &ctx->cancel_hash[hash_long(req->user_data, ctx->cancel_hash_bits)];
	hlist_add_head(&req->hash_node, list);
}

static void io_init_poll_iocb(struct io_poll_iocb *poll, __poll_t events,
			      wait_queue_func_t wake_func)
{
	poll->head = NULL;
#define IO_POLL_UNMASK	(EPOLLERR|EPOLLHUP|EPOLLNVAL|EPOLLRDHUP)
	/* mask in events that we always want/need */
	poll->events = events | IO_POLL_UNMASK;
	INIT_LIST_HEAD(&poll->wait.entry);
	init_waitqueue_func_entry(&poll->wait, wake_func);
}
static inline void io_poll_remove_entry(struct io_poll_iocb *poll)
{
	struct wait_queue_head *head = smp_load_acquire(&poll->head);

	if (head) {
		spin_lock_irq(&head->lock);
		list_del_init(&poll->wait.entry);
		poll->head = NULL;
		spin_unlock_irq(&head->lock);
	}
}
5863 static void io_poll_remove_entries(struct io_kiocb
*req
)
5866 * Nothing to do if neither of those flags are set. Avoid dipping
5867 * into the poll/apoll/double cachelines if we can.
5869 if (!(req
->flags
& (REQ_F_SINGLE_POLL
| REQ_F_DOUBLE_POLL
)))
5873 * While we hold the waitqueue lock and the waitqueue is nonempty,
5874 * wake_up_pollfree() will wait for us. However, taking the waitqueue
5875 * lock in the first place can race with the waitqueue being freed.
5877 * We solve this as eventpoll does: by taking advantage of the fact that
5878 * all users of wake_up_pollfree() will RCU-delay the actual free. If
5879 * we enter rcu_read_lock() and see that the pointer to the queue is
5880 * non-NULL, we can then lock it without the memory being freed out from
5883 * Keep holding rcu_read_lock() as long as we hold the queue lock, in
5884 * case the caller deletes the entry from the queue, leaving it empty.
5885 * In that case, only RCU prevents the queue memory from being freed.
5888 if (req
->flags
& REQ_F_SINGLE_POLL
)
5889 io_poll_remove_entry(io_poll_get_single(req
));
5890 if (req
->flags
& REQ_F_DOUBLE_POLL
)
5891 io_poll_remove_entry(io_poll_get_double(req
));
5896 * All poll tw should go through this. Checks for poll events, manages
5897 * references, does rewait, etc.
5899 * Returns a negative error on failure. >0 when no action require, which is
5900 * either spurious wakeup or multishot CQE is served. 0 when it's done with
5901 * the request, then the mask is stored in req->result.
5903 static int io_poll_check_events(struct io_kiocb
*req
, bool locked
)
5905 struct io_ring_ctx
*ctx
= req
->ctx
;
5908 /* req->task == current here, checking PF_EXITING is safe */
5909 if (unlikely(req
->task
->flags
& PF_EXITING
))
5910 io_poll_mark_cancelled(req
);
5913 v
= atomic_read(&req
->poll_refs
);
5915 /* tw handler should be the owner, and so have some references */
5916 if (WARN_ON_ONCE(!(v
& IO_POLL_REF_MASK
)))
5918 if (v
& IO_POLL_CANCEL_FLAG
)
5922 struct poll_table_struct pt
= { ._key
= req
->apoll_events
};
5923 req
->result
= vfs_poll(req
->file
, &pt
) & req
->apoll_events
;
5926 /* multishot, just fill an CQE and proceed */
5927 if (req
->result
&& !(req
->apoll_events
& EPOLLONESHOT
)) {
5928 __poll_t mask
= mangle_poll(req
->result
& req
->apoll_events
);
5931 spin_lock(&ctx
->completion_lock
);
5932 filled
= io_fill_cqe_aux(ctx
, req
->user_data
, mask
,
5934 io_commit_cqring(ctx
);
5935 spin_unlock(&ctx
->completion_lock
);
5936 if (unlikely(!filled
))
5938 io_cqring_ev_posted(ctx
);
5939 } else if (req
->result
) {
5944 * Release all references, retry if someone tried to restart
5945 * task_work while we were executing it.
5947 } while (atomic_sub_return(v
& IO_POLL_REF_MASK
, &req
->poll_refs
));
5952 static void io_poll_task_func(struct io_kiocb
*req
, bool *locked
)
5954 struct io_ring_ctx
*ctx
= req
->ctx
;
5957 ret
= io_poll_check_events(req
, *locked
);
5962 req
->result
= mangle_poll(req
->result
& req
->poll
.events
);
5968 io_poll_remove_entries(req
);
5969 spin_lock(&ctx
->completion_lock
);
5970 hash_del(&req
->hash_node
);
5971 __io_req_complete_post(req
, req
->result
, 0);
5972 io_commit_cqring(ctx
);
5973 spin_unlock(&ctx
->completion_lock
);
5974 io_cqring_ev_posted(ctx
);
5977 static void io_apoll_task_func(struct io_kiocb
*req
, bool *locked
)
5979 struct io_ring_ctx
*ctx
= req
->ctx
;
5982 ret
= io_poll_check_events(req
, *locked
);
5986 io_poll_remove_entries(req
);
5987 spin_lock(&ctx
->completion_lock
);
5988 hash_del(&req
->hash_node
);
5989 spin_unlock(&ctx
->completion_lock
);
5992 io_req_task_submit(req
, locked
);
5994 io_req_complete_failed(req
, ret
);
5997 static void __io_poll_execute(struct io_kiocb
*req
, int mask
,
5998 __poll_t __maybe_unused events
)
6002 * This is useful for poll that is armed on behalf of another
6003 * request, and where the wakeup path could be on a different
6004 * CPU. We want to avoid pulling in req->apoll->events for that
6007 if (req
->opcode
== IORING_OP_POLL_ADD
)
6008 req
->io_task_work
.func
= io_poll_task_func
;
6010 req
->io_task_work
.func
= io_apoll_task_func
;
6012 trace_io_uring_task_add(req
->ctx
, req
, req
->user_data
, req
->opcode
, mask
);
6013 io_req_task_work_add(req
, false);
static inline void io_poll_execute(struct io_kiocb *req, int res,
		__poll_t events)
{
	if (io_poll_get_ownership(req))
		__io_poll_execute(req, res, events);
}

static void io_poll_cancel_req(struct io_kiocb *req)
{
	io_poll_mark_cancelled(req);
	/* kick tw, which should complete the request */
	io_poll_execute(req, 0, 0);
}
6030 #define wqe_to_req(wait) ((void *)((unsigned long) (wait)->private & ~1))
6031 #define wqe_is_double(wait) ((unsigned long) (wait)->private & 1)
6032 #define IO_ASYNC_POLL_COMMON (EPOLLONESHOT | POLLPRI)
6034 static int io_poll_wake(struct wait_queue_entry
*wait
, unsigned mode
, int sync
,
6037 struct io_kiocb
*req
= wqe_to_req(wait
);
6038 struct io_poll_iocb
*poll
= container_of(wait
, struct io_poll_iocb
,
6040 __poll_t mask
= key_to_poll(key
);
6042 if (unlikely(mask
& POLLFREE
)) {
6043 io_poll_mark_cancelled(req
);
6044 /* we have to kick tw in case it's not already */
6045 io_poll_execute(req
, 0, poll
->events
);
6048 * If the waitqueue is being freed early but someone is already
6049 * holds ownership over it, we have to tear down the request as
6050 * best we can. That means immediately removing the request from
6051 * its waitqueue and preventing all further accesses to the
6052 * waitqueue via the request.
6054 list_del_init(&poll
->wait
.entry
);
6057 * Careful: this *must* be the last step, since as soon
6058 * as req->head is NULL'ed out, the request can be
6059 * completed and freed, since aio_poll_complete_work()
6060 * will no longer need to take the waitqueue lock.
6062 smp_store_release(&poll
->head
, NULL
);
6066 /* for instances that support it check for an event match first */
6067 if (mask
&& !(mask
& (poll
->events
& ~IO_ASYNC_POLL_COMMON
)))
6070 if (io_poll_get_ownership(req
)) {
6071 /* optional, saves extra locking for removal in tw handler */
6072 if (mask
&& poll
->events
& EPOLLONESHOT
) {
6073 list_del_init(&poll
->wait
.entry
);
6075 if (wqe_is_double(wait
))
6076 req
->flags
&= ~REQ_F_DOUBLE_POLL
;
6078 req
->flags
&= ~REQ_F_SINGLE_POLL
;
6080 __io_poll_execute(req
, mask
, poll
->events
);
6085 static void __io_queue_proc(struct io_poll_iocb
*poll
, struct io_poll_table
*pt
,
6086 struct wait_queue_head
*head
,
6087 struct io_poll_iocb
**poll_ptr
)
6089 struct io_kiocb
*req
= pt
->req
;
6090 unsigned long wqe_private
= (unsigned long) req
;
6093 * The file being polled uses multiple waitqueues for poll handling
6094 * (e.g. one for read, one for write). Setup a separate io_poll_iocb
6097 if (unlikely(pt
->nr_entries
)) {
6098 struct io_poll_iocb
*first
= poll
;
6100 /* double add on the same waitqueue head, ignore */
6101 if (first
->head
== head
)
6103 /* already have a 2nd entry, fail a third attempt */
6105 if ((*poll_ptr
)->head
== head
)
6107 pt
->error
= -EINVAL
;
6111 poll
= kmalloc(sizeof(*poll
), GFP_ATOMIC
);
6113 pt
->error
= -ENOMEM
;
6116 /* mark as double wq entry */
6118 req
->flags
|= REQ_F_DOUBLE_POLL
;
6119 io_init_poll_iocb(poll
, first
->events
, first
->wait
.func
);
6121 if (req
->opcode
== IORING_OP_POLL_ADD
)
6122 req
->flags
|= REQ_F_ASYNC_DATA
;
6125 req
->flags
|= REQ_F_SINGLE_POLL
;
6128 poll
->wait
.private = (void *) wqe_private
;
6130 if (poll
->events
& EPOLLEXCLUSIVE
)
6131 add_wait_queue_exclusive(head
, &poll
->wait
);
6133 add_wait_queue(head
, &poll
->wait
);
static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
			       struct poll_table_struct *p)
{
	struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);

	__io_queue_proc(&pt->req->poll, pt, head,
			(struct io_poll_iocb **) &pt->req->async_data);
}
6145 static int __io_arm_poll_handler(struct io_kiocb
*req
,
6146 struct io_poll_iocb
*poll
,
6147 struct io_poll_table
*ipt
, __poll_t mask
)
6149 struct io_ring_ctx
*ctx
= req
->ctx
;
6152 INIT_HLIST_NODE(&req
->hash_node
);
6153 io_init_poll_iocb(poll
, mask
, io_poll_wake
);
6154 poll
->file
= req
->file
;
6156 req
->apoll_events
= poll
->events
;
6158 ipt
->pt
._key
= mask
;
6161 ipt
->nr_entries
= 0;
6164 * Take the ownership to delay any tw execution up until we're done
6165 * with poll arming. see io_poll_get_ownership().
6167 atomic_set(&req
->poll_refs
, 1);
6168 mask
= vfs_poll(req
->file
, &ipt
->pt
) & poll
->events
;
6170 if (mask
&& (poll
->events
& EPOLLONESHOT
)) {
6171 io_poll_remove_entries(req
);
6172 /* no one else has access to the req, forget about the ref */
6175 if (!mask
&& unlikely(ipt
->error
|| !ipt
->nr_entries
)) {
6176 io_poll_remove_entries(req
);
6178 ipt
->error
= -EINVAL
;
6182 spin_lock(&ctx
->completion_lock
);
6183 io_poll_req_insert(req
);
6184 spin_unlock(&ctx
->completion_lock
);
6187 /* can't multishot if failed, just queue the event we've got */
6188 if (unlikely(ipt
->error
|| !ipt
->nr_entries
)) {
6189 poll
->events
|= EPOLLONESHOT
;
6190 req
->apoll_events
|= EPOLLONESHOT
;
6193 __io_poll_execute(req
, mask
, poll
->events
);
6198 * Release ownership. If someone tried to queue a tw while it was
6199 * locked, kick it off for them.
6201 v
= atomic_dec_return(&req
->poll_refs
);
6202 if (unlikely(v
& IO_POLL_REF_MASK
))
6203 __io_poll_execute(req
, 0, poll
->events
);
static void io_async_queue_proc(struct file *file, struct wait_queue_head *head,
			       struct poll_table_struct *p)
{
	struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
	struct async_poll *apoll = pt->req->apoll;

	__io_queue_proc(&apoll->poll, pt, head, &apoll->double_poll);
}
6222 static int io_arm_poll_handler(struct io_kiocb
*req
, unsigned issue_flags
)
6224 const struct io_op_def
*def
= &io_op_defs
[req
->opcode
];
6225 struct io_ring_ctx
*ctx
= req
->ctx
;
6226 struct async_poll
*apoll
;
6227 struct io_poll_table ipt
;
6228 __poll_t mask
= IO_ASYNC_POLL_COMMON
| POLLERR
;
6231 if (!def
->pollin
&& !def
->pollout
)
6232 return IO_APOLL_ABORTED
;
6233 if (!file_can_poll(req
->file
) || (req
->flags
& REQ_F_POLLED
))
6234 return IO_APOLL_ABORTED
;
6237 mask
|= POLLIN
| POLLRDNORM
;
6239 /* If reading from MSG_ERRQUEUE using recvmsg, ignore POLLIN */
6240 if ((req
->opcode
== IORING_OP_RECVMSG
) &&
6241 (req
->sr_msg
.msg_flags
& MSG_ERRQUEUE
))
6244 mask
|= POLLOUT
| POLLWRNORM
;
6246 if (def
->poll_exclusive
)
6247 mask
|= EPOLLEXCLUSIVE
;
6248 if (!(issue_flags
& IO_URING_F_UNLOCKED
) &&
6249 !list_empty(&ctx
->apoll_cache
)) {
6250 apoll
= list_first_entry(&ctx
->apoll_cache
, struct async_poll
,
6252 list_del_init(&apoll
->poll
.wait
.entry
);
6254 apoll
= kmalloc(sizeof(*apoll
), GFP_ATOMIC
);
6255 if (unlikely(!apoll
))
6256 return IO_APOLL_ABORTED
;
6258 apoll
->double_poll
= NULL
;
6260 req
->flags
|= REQ_F_POLLED
;
6261 ipt
.pt
._qproc
= io_async_queue_proc
;
6263 io_kbuf_recycle(req
, issue_flags
);
6265 ret
= __io_arm_poll_handler(req
, &apoll
->poll
, &ipt
, mask
);
6266 if (ret
|| ipt
.error
)
6267 return ret
? IO_APOLL_READY
: IO_APOLL_ABORTED
;
6269 trace_io_uring_poll_arm(ctx
, req
, req
->user_data
, req
->opcode
,
6270 mask
, apoll
->poll
.events
);
6275 * Returns true if we found and killed one or more poll requests
6277 static __cold
bool io_poll_remove_all(struct io_ring_ctx
*ctx
,
6278 struct task_struct
*tsk
, bool cancel_all
)
6280 struct hlist_node
*tmp
;
6281 struct io_kiocb
*req
;
6285 spin_lock(&ctx
->completion_lock
);
6286 for (i
= 0; i
< (1U << ctx
->cancel_hash_bits
); i
++) {
6287 struct hlist_head
*list
;
6289 list
= &ctx
->cancel_hash
[i
];
6290 hlist_for_each_entry_safe(req
, tmp
, list
, hash_node
) {
6291 if (io_match_task_safe(req
, tsk
, cancel_all
)) {
6292 hlist_del_init(&req
->hash_node
);
6293 io_poll_cancel_req(req
);
6298 spin_unlock(&ctx
->completion_lock
);
6302 static struct io_kiocb
*io_poll_find(struct io_ring_ctx
*ctx
, __u64 sqe_addr
,
6304 __must_hold(&ctx
->completion_lock
)
6306 struct hlist_head
*list
;
6307 struct io_kiocb
*req
;
6309 list
= &ctx
->cancel_hash
[hash_long(sqe_addr
, ctx
->cancel_hash_bits
)];
6310 hlist_for_each_entry(req
, list
, hash_node
) {
6311 if (sqe_addr
!= req
->user_data
)
6313 if (poll_only
&& req
->opcode
!= IORING_OP_POLL_ADD
)
static bool io_poll_disarm(struct io_kiocb *req)
	__must_hold(&ctx->completion_lock)
{
	if (!io_poll_get_ownership(req))
		return false;
	io_poll_remove_entries(req);
	hash_del(&req->hash_node);
	return true;
}

static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr,
			  bool poll_only)
	__must_hold(&ctx->completion_lock)
{
	struct io_kiocb *req = io_poll_find(ctx, sqe_addr, poll_only);

	if (!req)
		return -ENOENT;
	io_poll_cancel_req(req);
	return 0;
}
static __poll_t io_poll_parse_events(const struct io_uring_sqe *sqe,
				     unsigned int flags)
{
	u32 events;

	events = READ_ONCE(sqe->poll32_events);
#ifdef __BIG_ENDIAN
	events = swahw32(events);
#endif
	if (!(flags & IORING_POLL_ADD_MULTI))
		events |= EPOLLONESHOT;

	return demangle_poll(events) | (events & (EPOLLEXCLUSIVE|EPOLLONESHOT));
}
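/*
 * sqe->poll32_events keeps the layout of the original 16-bit poll_events
 * field, hence the halfword swap on big-endian kernels before the mask is
 * demangled. demangle_poll() only translates the classic POLL* bits, so
 * EPOLLEXCLUSIVE and EPOLLONESHOT are carried over explicitly.
 */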
6356 static int io_poll_update_prep(struct io_kiocb
*req
,
6357 const struct io_uring_sqe
*sqe
)
6359 struct io_poll_update
*upd
= &req
->poll_update
;
6362 if (unlikely(req
->ctx
->flags
& IORING_SETUP_IOPOLL
))
6364 if (sqe
->ioprio
|| sqe
->buf_index
|| sqe
->splice_fd_in
)
6366 flags
= READ_ONCE(sqe
->len
);
6367 if (flags
& ~(IORING_POLL_UPDATE_EVENTS
| IORING_POLL_UPDATE_USER_DATA
|
6368 IORING_POLL_ADD_MULTI
))
6370 /* meaningless without update */
6371 if (flags
== IORING_POLL_ADD_MULTI
)
6374 upd
->old_user_data
= READ_ONCE(sqe
->addr
);
6375 upd
->update_events
= flags
& IORING_POLL_UPDATE_EVENTS
;
6376 upd
->update_user_data
= flags
& IORING_POLL_UPDATE_USER_DATA
;
6378 upd
->new_user_data
= READ_ONCE(sqe
->off
);
6379 if (!upd
->update_user_data
&& upd
->new_user_data
)
6381 if (upd
->update_events
)
6382 upd
->events
= io_poll_parse_events(sqe
, flags
);
6383 else if (sqe
->poll32_events
)
6389 static int io_poll_add_prep(struct io_kiocb
*req
, const struct io_uring_sqe
*sqe
)
6391 struct io_poll_iocb
*poll
= &req
->poll
;
6394 if (unlikely(req
->ctx
->flags
& IORING_SETUP_IOPOLL
))
6396 if (sqe
->ioprio
|| sqe
->buf_index
|| sqe
->off
|| sqe
->addr
)
6398 flags
= READ_ONCE(sqe
->len
);
6399 if (flags
& ~IORING_POLL_ADD_MULTI
)
6401 if ((flags
& IORING_POLL_ADD_MULTI
) && (req
->flags
& REQ_F_CQE_SKIP
))
6404 io_req_set_refcount(req
);
6405 poll
->events
= io_poll_parse_events(sqe
, flags
);
static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_poll_iocb *poll = &req->poll;
	struct io_poll_table ipt;
	int ret;

	ipt.pt._qproc = io_poll_queue_proc;

	ret = __io_arm_poll_handler(req, &req->poll, &ipt, poll->events);
	if (!ret && ipt.error)
		req_set_fail(req);
	ret = ret ?: ipt.error;
	if (ret)
		__io_req_complete(req, issue_flags, ret, 0);
	return 0;
}
6426 static int io_poll_update(struct io_kiocb
*req
, unsigned int issue_flags
)
6428 struct io_ring_ctx
*ctx
= req
->ctx
;
6429 struct io_kiocb
*preq
;
6433 spin_lock(&ctx
->completion_lock
);
6434 preq
= io_poll_find(ctx
, req
->poll_update
.old_user_data
, true);
6435 if (!preq
|| !io_poll_disarm(preq
)) {
6436 spin_unlock(&ctx
->completion_lock
);
6437 ret
= preq
? -EALREADY
: -ENOENT
;
6440 spin_unlock(&ctx
->completion_lock
);
6442 if (req
->poll_update
.update_events
|| req
->poll_update
.update_user_data
) {
6443 /* only mask one event flags, keep behavior flags */
6444 if (req
->poll_update
.update_events
) {
6445 preq
->poll
.events
&= ~0xffff;
6446 preq
->poll
.events
|= req
->poll_update
.events
& 0xffff;
6447 preq
->poll
.events
|= IO_POLL_UNMASK
;
6449 if (req
->poll_update
.update_user_data
)
6450 preq
->user_data
= req
->poll_update
.new_user_data
;
6452 ret2
= io_poll_add(preq
, issue_flags
);
6453 /* successfully updated, don't complete poll request */
6459 preq
->result
= -ECANCELED
;
6460 locked
= !(issue_flags
& IO_URING_F_UNLOCKED
);
6461 io_req_task_complete(preq
, &locked
);
6465 /* complete update request, we're done with it */
6466 __io_req_complete(req
, issue_flags
, ret
, 0);
6470 static enum hrtimer_restart
io_timeout_fn(struct hrtimer
*timer
)
6472 struct io_timeout_data
*data
= container_of(timer
,
6473 struct io_timeout_data
, timer
);
6474 struct io_kiocb
*req
= data
->req
;
6475 struct io_ring_ctx
*ctx
= req
->ctx
;
6476 unsigned long flags
;
6478 spin_lock_irqsave(&ctx
->timeout_lock
, flags
);
6479 list_del_init(&req
->timeout
.list
);
6480 atomic_set(&req
->ctx
->cq_timeouts
,
6481 atomic_read(&req
->ctx
->cq_timeouts
) + 1);
6482 spin_unlock_irqrestore(&ctx
->timeout_lock
, flags
);
6484 if (!(data
->flags
& IORING_TIMEOUT_ETIME_SUCCESS
))
6487 req
->result
= -ETIME
;
6488 req
->io_task_work
.func
= io_req_task_complete
;
6489 io_req_task_work_add(req
, false);
6490 return HRTIMER_NORESTART
;
6493 static struct io_kiocb
*io_timeout_extract(struct io_ring_ctx
*ctx
,
6495 __must_hold(&ctx
->timeout_lock
)
6497 struct io_timeout_data
*io
;
6498 struct io_kiocb
*req
;
6501 list_for_each_entry(req
, &ctx
->timeout_list
, timeout
.list
) {
6502 found
= user_data
== req
->user_data
;
6507 return ERR_PTR(-ENOENT
);
6509 io
= req
->async_data
;
6510 if (hrtimer_try_to_cancel(&io
->timer
) == -1)
6511 return ERR_PTR(-EALREADY
);
6512 list_del_init(&req
->timeout
.list
);
static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
	__must_hold(&ctx->completion_lock)
	__must_hold(&ctx->timeout_lock)
{
	struct io_kiocb *req = io_timeout_extract(ctx, user_data);

	if (IS_ERR(req))
		return PTR_ERR(req);
	io_req_task_queue_fail(req, -ECANCELED);
	return 0;
}

static clockid_t io_timeout_get_clock(struct io_timeout_data *data)
{
	switch (data->flags & IORING_TIMEOUT_CLOCK_MASK) {
	case IORING_TIMEOUT_BOOTTIME:
		return CLOCK_BOOTTIME;
	case IORING_TIMEOUT_REALTIME:
		return CLOCK_REALTIME;
	default:
		/* can't happen, vetted at prep time */
		WARN_ON_ONCE(1);
		fallthrough;
	case 0:
		return CLOCK_MONOTONIC;
	}
}
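/*
 * Only one of the IORING_TIMEOUT_* clock flags may be set; prep rejects
 * anything with more than one bit in IORING_TIMEOUT_CLOCK_MASK, so the
 * default case above is unreachable and CLOCK_MONOTONIC remains the fallback
 * when no clock flag is given.
 */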
6544 static int io_linked_timeout_update(struct io_ring_ctx
*ctx
, __u64 user_data
,
6545 struct timespec64
*ts
, enum hrtimer_mode mode
)
6546 __must_hold(&ctx
->timeout_lock
)
6548 struct io_timeout_data
*io
;
6549 struct io_kiocb
*req
;
6552 list_for_each_entry(req
, &ctx
->ltimeout_list
, timeout
.list
) {
6553 found
= user_data
== req
->user_data
;
6560 io
= req
->async_data
;
6561 if (hrtimer_try_to_cancel(&io
->timer
) == -1)
6563 hrtimer_init(&io
->timer
, io_timeout_get_clock(io
), mode
);
6564 io
->timer
.function
= io_link_timeout_fn
;
6565 hrtimer_start(&io
->timer
, timespec64_to_ktime(*ts
), mode
);
6569 static int io_timeout_update(struct io_ring_ctx
*ctx
, __u64 user_data
,
6570 struct timespec64
*ts
, enum hrtimer_mode mode
)
6571 __must_hold(&ctx
->timeout_lock
)
6573 struct io_kiocb
*req
= io_timeout_extract(ctx
, user_data
);
6574 struct io_timeout_data
*data
;
6577 return PTR_ERR(req
);
6579 req
->timeout
.off
= 0; /* noseq */
6580 data
= req
->async_data
;
6581 list_add_tail(&req
->timeout
.list
, &ctx
->timeout_list
);
6582 hrtimer_init(&data
->timer
, io_timeout_get_clock(data
), mode
);
6583 data
->timer
.function
= io_timeout_fn
;
6584 hrtimer_start(&data
->timer
, timespec64_to_ktime(*ts
), mode
);
6588 static int io_timeout_remove_prep(struct io_kiocb
*req
,
6589 const struct io_uring_sqe
*sqe
)
6591 struct io_timeout_rem
*tr
= &req
->timeout_rem
;
6593 if (unlikely(req
->ctx
->flags
& IORING_SETUP_IOPOLL
))
6595 if (unlikely(req
->flags
& (REQ_F_FIXED_FILE
| REQ_F_BUFFER_SELECT
)))
6597 if (sqe
->ioprio
|| sqe
->buf_index
|| sqe
->len
|| sqe
->splice_fd_in
)
6600 tr
->ltimeout
= false;
6601 tr
->addr
= READ_ONCE(sqe
->addr
);
6602 tr
->flags
= READ_ONCE(sqe
->timeout_flags
);
6603 if (tr
->flags
& IORING_TIMEOUT_UPDATE_MASK
) {
6604 if (hweight32(tr
->flags
& IORING_TIMEOUT_CLOCK_MASK
) > 1)
6606 if (tr
->flags
& IORING_LINK_TIMEOUT_UPDATE
)
6607 tr
->ltimeout
= true;
6608 if (tr
->flags
& ~(IORING_TIMEOUT_UPDATE_MASK
|IORING_TIMEOUT_ABS
))
6610 if (get_timespec64(&tr
->ts
, u64_to_user_ptr(sqe
->addr2
)))
6612 if (tr
->ts
.tv_sec
< 0 || tr
->ts
.tv_nsec
< 0)
6614 } else if (tr
->flags
) {
6615 /* timeout removal doesn't support flags */
static inline enum hrtimer_mode io_translate_timeout_mode(unsigned int flags)
{
	return (flags & IORING_TIMEOUT_ABS) ? HRTIMER_MODE_ABS
					    : HRTIMER_MODE_REL;
}
6629 * Remove or update an existing timeout command
6631 static int io_timeout_remove(struct io_kiocb
*req
, unsigned int issue_flags
)
6633 struct io_timeout_rem
*tr
= &req
->timeout_rem
;
6634 struct io_ring_ctx
*ctx
= req
->ctx
;
6637 if (!(req
->timeout_rem
.flags
& IORING_TIMEOUT_UPDATE
)) {
6638 spin_lock(&ctx
->completion_lock
);
6639 spin_lock_irq(&ctx
->timeout_lock
);
6640 ret
= io_timeout_cancel(ctx
, tr
->addr
);
6641 spin_unlock_irq(&ctx
->timeout_lock
);
6642 spin_unlock(&ctx
->completion_lock
);
6644 enum hrtimer_mode mode
= io_translate_timeout_mode(tr
->flags
);
6646 spin_lock_irq(&ctx
->timeout_lock
);
6648 ret
= io_linked_timeout_update(ctx
, tr
->addr
, &tr
->ts
, mode
);
6650 ret
= io_timeout_update(ctx
, tr
->addr
, &tr
->ts
, mode
);
6651 spin_unlock_irq(&ctx
->timeout_lock
);
6656 io_req_complete_post(req
, ret
, 0);
6660 static int io_timeout_prep(struct io_kiocb
*req
, const struct io_uring_sqe
*sqe
,
6661 bool is_timeout_link
)
6663 struct io_timeout_data
*data
;
6665 u32 off
= READ_ONCE(sqe
->off
);
6667 if (unlikely(req
->ctx
->flags
& IORING_SETUP_IOPOLL
))
6669 if (sqe
->ioprio
|| sqe
->buf_index
|| sqe
->len
!= 1 ||
6672 if (off
&& is_timeout_link
)
6674 flags
= READ_ONCE(sqe
->timeout_flags
);
6675 if (flags
& ~(IORING_TIMEOUT_ABS
| IORING_TIMEOUT_CLOCK_MASK
|
6676 IORING_TIMEOUT_ETIME_SUCCESS
))
6678 /* more than one clock specified is invalid, obviously */
6679 if (hweight32(flags
& IORING_TIMEOUT_CLOCK_MASK
) > 1)
6682 INIT_LIST_HEAD(&req
->timeout
.list
);
6683 req
->timeout
.off
= off
;
6684 if (unlikely(off
&& !req
->ctx
->off_timeout_used
))
6685 req
->ctx
->off_timeout_used
= true;
6687 if (WARN_ON_ONCE(req_has_async_data(req
)))
6689 if (io_alloc_async_data(req
))
6692 data
= req
->async_data
;
6694 data
->flags
= flags
;
6696 if (get_timespec64(&data
->ts
, u64_to_user_ptr(sqe
->addr
)))
6699 if (data
->ts
.tv_sec
< 0 || data
->ts
.tv_nsec
< 0)
6702 INIT_LIST_HEAD(&req
->timeout
.list
);
6703 data
->mode
= io_translate_timeout_mode(flags
);
6704 hrtimer_init(&data
->timer
, io_timeout_get_clock(data
), data
->mode
);
6706 if (is_timeout_link
) {
6707 struct io_submit_link
*link
= &req
->ctx
->submit_state
.link
;
6711 if (link
->last
->opcode
== IORING_OP_LINK_TIMEOUT
)
6713 req
->timeout
.head
= link
->last
;
6714 link
->last
->flags
|= REQ_F_ARM_LTIMEOUT
;
6719 static int io_timeout(struct io_kiocb
*req
, unsigned int issue_flags
)
6721 struct io_ring_ctx
*ctx
= req
->ctx
;
6722 struct io_timeout_data
*data
= req
->async_data
;
6723 struct list_head
*entry
;
6724 u32 tail
, off
= req
->timeout
.off
;
6726 spin_lock_irq(&ctx
->timeout_lock
);
6729 * sqe->off holds how many events that need to occur for this
6730 * timeout event to be satisfied. If it isn't set, then this is
6731 * a pure timeout request, sequence isn't used.
6733 if (io_is_timeout_noseq(req
)) {
6734 entry
= ctx
->timeout_list
.prev
;
6738 tail
= ctx
->cached_cq_tail
- atomic_read(&ctx
->cq_timeouts
);
6739 req
->timeout
.target_seq
= tail
+ off
;
6741 /* Update the last seq here in case io_flush_timeouts() hasn't.
6742 * This is safe because ->completion_lock is held, and submissions
6743 * and completions are never mixed in the same ->completion_lock section.
6745 ctx
->cq_last_tm_flush
= tail
;
6748 * Insertion sort, ensuring the first entry in the list is always
6749 * the one we need first.
6751 list_for_each_prev(entry
, &ctx
->timeout_list
) {
6752 struct io_kiocb
*nxt
= list_entry(entry
, struct io_kiocb
,
6755 if (io_is_timeout_noseq(nxt
))
6757 /* nxt.seq is behind @tail, otherwise would've been completed */
6758 if (off
>= nxt
->timeout
.target_seq
- tail
)
6762 list_add(&req
->timeout
.list
, entry
);
6763 data
->timer
.function
= io_timeout_fn
;
6764 hrtimer_start(&data
->timer
, timespec64_to_ktime(data
->ts
), data
->mode
);
6765 spin_unlock_irq(&ctx
->timeout_lock
);
struct io_cancel_data {
	struct io_ring_ctx *ctx;
	u64 user_data;
};

static bool io_cancel_cb(struct io_wq_work *work, void *data)
{
	struct io_kiocb *req = container_of(work, struct io_kiocb, work);
	struct io_cancel_data *cd = data;

	return req->ctx == cd->ctx && req->user_data == cd->user_data;
}

static int io_async_cancel_one(struct io_uring_task *tctx, u64 user_data,
			       struct io_ring_ctx *ctx)
{
	struct io_cancel_data data = { .ctx = ctx, .user_data = user_data, };
	enum io_wq_cancel cancel_ret;
	int ret = 0;

	if (!tctx || !tctx->io_wq)
		return -ENOENT;

	cancel_ret = io_wq_cancel_cb(tctx->io_wq, io_cancel_cb, &data, false);
	switch (cancel_ret) {
	case IO_WQ_CANCEL_OK:
		ret = 0;
		break;
	case IO_WQ_CANCEL_RUNNING:
		ret = -EALREADY;
		break;
	case IO_WQ_CANCEL_NOTFOUND:
		ret = -ENOENT;
		break;
	}

	return ret;
}
6808 static int io_try_cancel_userdata(struct io_kiocb
*req
, u64 sqe_addr
)
6810 struct io_ring_ctx
*ctx
= req
->ctx
;
6813 WARN_ON_ONCE(!io_wq_current_is_worker() && req
->task
!= current
);
6815 ret
= io_async_cancel_one(req
->task
->io_uring
, sqe_addr
, ctx
);
6817 * Fall-through even for -EALREADY, as we may have poll armed
6818 * that need unarming.
6823 spin_lock(&ctx
->completion_lock
);
6824 ret
= io_poll_cancel(ctx
, sqe_addr
, false);
6828 spin_lock_irq(&ctx
->timeout_lock
);
6829 ret
= io_timeout_cancel(ctx
, sqe_addr
);
6830 spin_unlock_irq(&ctx
->timeout_lock
);
6832 spin_unlock(&ctx
->completion_lock
);
static int io_async_cancel_prep(struct io_kiocb *req,
				const struct io_uring_sqe *sqe)
{
	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
		return -EINVAL;
	if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
		return -EINVAL;
	if (sqe->ioprio || sqe->off || sqe->len || sqe->cancel_flags ||
	    sqe->splice_fd_in)
		return -EINVAL;

	req->cancel.addr = READ_ONCE(sqe->addr);
	return 0;
}
6851 static int io_async_cancel(struct io_kiocb
*req
, unsigned int issue_flags
)
6853 struct io_ring_ctx
*ctx
= req
->ctx
;
6854 u64 sqe_addr
= req
->cancel
.addr
;
6855 bool needs_lock
= issue_flags
& IO_URING_F_UNLOCKED
;
6856 struct io_tctx_node
*node
;
6859 ret
= io_try_cancel_userdata(req
, sqe_addr
);
6863 /* slow path, try all io-wq's */
6864 io_ring_submit_lock(ctx
, needs_lock
);
6866 list_for_each_entry(node
, &ctx
->tctx_list
, ctx_node
) {
6867 struct io_uring_task
*tctx
= node
->task
->io_uring
;
6869 ret
= io_async_cancel_one(tctx
, req
->cancel
.addr
, ctx
);
6873 io_ring_submit_unlock(ctx
, needs_lock
);
6877 io_req_complete_post(req
, ret
, 0);
static int io_rsrc_update_prep(struct io_kiocb *req,
				const struct io_uring_sqe *sqe)
{
	if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
		return -EINVAL;
	if (sqe->ioprio || sqe->rw_flags || sqe->splice_fd_in)
		return -EINVAL;

	req->rsrc_update.offset = READ_ONCE(sqe->off);
	req->rsrc_update.nr_args = READ_ONCE(sqe->len);
	if (!req->rsrc_update.nr_args)
		return -EINVAL;
	req->rsrc_update.arg = READ_ONCE(sqe->addr);
	return 0;
}

static int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_ring_ctx *ctx = req->ctx;
	bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
	struct io_uring_rsrc_update2 up;
	int ret;

	up.offset = req->rsrc_update.offset;
	up.data = req->rsrc_update.arg;
	up.nr = 0;
	up.tags = 0;
	up.resv = 0;

	io_ring_submit_lock(ctx, needs_lock);
	ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE,
					&up, req->rsrc_update.nr_args);
	io_ring_submit_unlock(ctx, needs_lock);

	if (ret < 0)
		req_set_fail(req);
	__io_req_complete(req, issue_flags, ret, 0);
	return 0;
}
6922 static int io_req_prep(struct io_kiocb
*req
, const struct io_uring_sqe
*sqe
)
6924 switch (req
->opcode
) {
6927 case IORING_OP_READV
:
6928 case IORING_OP_READ_FIXED
:
6929 case IORING_OP_READ
:
6930 case IORING_OP_WRITEV
:
6931 case IORING_OP_WRITE_FIXED
:
6932 case IORING_OP_WRITE
:
6933 return io_prep_rw(req
, sqe
);
6934 case IORING_OP_POLL_ADD
:
6935 return io_poll_add_prep(req
, sqe
);
6936 case IORING_OP_POLL_REMOVE
:
6937 return io_poll_update_prep(req
, sqe
);
6938 case IORING_OP_FSYNC
:
6939 return io_fsync_prep(req
, sqe
);
6940 case IORING_OP_SYNC_FILE_RANGE
:
6941 return io_sfr_prep(req
, sqe
);
6942 case IORING_OP_SENDMSG
:
6943 case IORING_OP_SEND
:
6944 return io_sendmsg_prep(req
, sqe
);
6945 case IORING_OP_RECVMSG
:
6946 case IORING_OP_RECV
:
6947 return io_recvmsg_prep(req
, sqe
);
6948 case IORING_OP_CONNECT
:
6949 return io_connect_prep(req
, sqe
);
6950 case IORING_OP_TIMEOUT
:
6951 return io_timeout_prep(req
, sqe
, false);
6952 case IORING_OP_TIMEOUT_REMOVE
:
6953 return io_timeout_remove_prep(req
, sqe
);
6954 case IORING_OP_ASYNC_CANCEL
:
6955 return io_async_cancel_prep(req
, sqe
);
6956 case IORING_OP_LINK_TIMEOUT
:
6957 return io_timeout_prep(req
, sqe
, true);
6958 case IORING_OP_ACCEPT
:
6959 return io_accept_prep(req
, sqe
);
6960 case IORING_OP_FALLOCATE
:
6961 return io_fallocate_prep(req
, sqe
);
6962 case IORING_OP_OPENAT
:
6963 return io_openat_prep(req
, sqe
);
6964 case IORING_OP_CLOSE
:
6965 return io_close_prep(req
, sqe
);
6966 case IORING_OP_FILES_UPDATE
:
6967 return io_rsrc_update_prep(req
, sqe
);
6968 case IORING_OP_STATX
:
6969 return io_statx_prep(req
, sqe
);
6970 case IORING_OP_FADVISE
:
6971 return io_fadvise_prep(req
, sqe
);
6972 case IORING_OP_MADVISE
:
6973 return io_madvise_prep(req
, sqe
);
6974 case IORING_OP_OPENAT2
:
6975 return io_openat2_prep(req
, sqe
);
6976 case IORING_OP_EPOLL_CTL
:
6977 return io_epoll_ctl_prep(req
, sqe
);
6978 case IORING_OP_SPLICE
:
6979 return io_splice_prep(req
, sqe
);
6980 case IORING_OP_PROVIDE_BUFFERS
:
6981 return io_provide_buffers_prep(req
, sqe
);
6982 case IORING_OP_REMOVE_BUFFERS
:
6983 return io_remove_buffers_prep(req
, sqe
);
6985 return io_tee_prep(req
, sqe
);
6986 case IORING_OP_SHUTDOWN
:
6987 return io_shutdown_prep(req
, sqe
);
6988 case IORING_OP_RENAMEAT
:
6989 return io_renameat_prep(req
, sqe
);
6990 case IORING_OP_UNLINKAT
:
6991 return io_unlinkat_prep(req
, sqe
);
6992 case IORING_OP_MKDIRAT
:
6993 return io_mkdirat_prep(req
, sqe
);
6994 case IORING_OP_SYMLINKAT
:
6995 return io_symlinkat_prep(req
, sqe
);
6996 case IORING_OP_LINKAT
:
6997 return io_linkat_prep(req
, sqe
);
6998 case IORING_OP_MSG_RING
:
6999 return io_msg_ring_prep(req
, sqe
);
7002 printk_once(KERN_WARNING
"io_uring: unhandled opcode %d\n",
7007 static int io_req_prep_async(struct io_kiocb
*req
)
7009 const struct io_op_def
*def
= &io_op_defs
[req
->opcode
];
7011 /* assign early for deferred execution for non-fixed file */
7012 if (def
->needs_file
&& !(req
->flags
& REQ_F_FIXED_FILE
))
7013 req
->file
= io_file_get_normal(req
, req
->fd
);
7014 if (!def
->needs_async_setup
)
7016 if (WARN_ON_ONCE(req_has_async_data(req
)))
7018 if (io_alloc_async_data(req
))
7021 switch (req
->opcode
) {
7022 case IORING_OP_READV
:
7023 return io_rw_prep_async(req
, READ
);
7024 case IORING_OP_WRITEV
:
7025 return io_rw_prep_async(req
, WRITE
);
7026 case IORING_OP_SENDMSG
:
7027 return io_sendmsg_prep_async(req
);
7028 case IORING_OP_RECVMSG
:
7029 return io_recvmsg_prep_async(req
);
7030 case IORING_OP_CONNECT
:
7031 return io_connect_prep_async(req
);
7033 printk_once(KERN_WARNING
"io_uring: prep_async() bad opcode %d\n",
static u32 io_get_sequence(struct io_kiocb *req)
{
	u32 seq = req->ctx->cached_sq_head;

	/* need original cached_sq_head, but it was increased for each req */
	io_for_each_link(req, req)
		seq--;
	return seq;
}
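/*
 * Every SQE consumed bumps ctx->cached_sq_head, including the members of a
 * link chain, so walking the chain and decrementing once per request
 * recovers the sequence number of the head request for drain ordering.
 */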
static __cold void io_drain_req(struct io_kiocb *req)
{
	struct io_ring_ctx *ctx = req->ctx;
	struct io_defer_entry *de;
	int ret;
	u32 seq = io_get_sequence(req);

	/* Still need defer if there is pending req in defer list. */
	spin_lock(&ctx->completion_lock);
	if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list)) {
		spin_unlock(&ctx->completion_lock);
queue:
		ctx->drain_active = false;
		io_req_task_queue(req);
		return;
	}
	spin_unlock(&ctx->completion_lock);

	ret = io_req_prep_async(req);
	if (ret) {
fail:
		io_req_complete_failed(req, ret);
		return;
	}
	io_prep_async_link(req);
	de = kmalloc(sizeof(*de), GFP_KERNEL);
	if (!de) {
		ret = -ENOMEM;
		goto fail;
	}

	spin_lock(&ctx->completion_lock);
	if (!req_need_defer(req, seq) && list_empty(&ctx->defer_list)) {
		spin_unlock(&ctx->completion_lock);
		kfree(de);
		goto queue;
	}

	trace_io_uring_defer(ctx, req, req->user_data, req->opcode);
	de->req = req;
	de->seq = seq;
	list_add_tail(&de->list, &ctx->defer_list);
	spin_unlock(&ctx->completion_lock);
}
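
/*
 * io_drain_req() above deliberately uses a check/alloc/recheck pattern: the
 * defer list is first inspected under ->completion_lock, the lock is dropped
 * for io_req_prep_async() and the kmalloc(), and the condition is evaluated
 * again before the entry is queued, since the list may have drained while
 * the lock was not held.
 */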
static void io_clean_op(struct io_kiocb *req)
{
	if (req->flags & REQ_F_BUFFER_SELECTED) {
		spin_lock(&req->ctx->completion_lock);
		io_put_kbuf_comp(req);
		spin_unlock(&req->ctx->completion_lock);
	}

	if (req->flags & REQ_F_NEED_CLEANUP) {
		switch (req->opcode) {
		case IORING_OP_READV:
		case IORING_OP_READ_FIXED:
		case IORING_OP_READ:
		case IORING_OP_WRITEV:
		case IORING_OP_WRITE_FIXED:
		case IORING_OP_WRITE: {
			struct io_async_rw *io = req->async_data;

			kfree(io->free_iovec);
			break;
			}
		case IORING_OP_RECVMSG:
		case IORING_OP_SENDMSG: {
			struct io_async_msghdr *io = req->async_data;

			kfree(io->free_iov);
			break;
			}
		case IORING_OP_OPENAT:
		case IORING_OP_OPENAT2:
			if (req->open.filename)
				putname(req->open.filename);
			break;
		case IORING_OP_RENAMEAT:
			putname(req->rename.oldpath);
			putname(req->rename.newpath);
			break;
		case IORING_OP_UNLINKAT:
			putname(req->unlink.filename);
			break;
		case IORING_OP_MKDIRAT:
			putname(req->mkdir.filename);
			break;
		case IORING_OP_SYMLINKAT:
			putname(req->symlink.oldpath);
			putname(req->symlink.newpath);
			break;
		case IORING_OP_LINKAT:
			putname(req->hardlink.oldpath);
			putname(req->hardlink.newpath);
			break;
		case IORING_OP_STATX:
			if (req->statx.filename)
				putname(req->statx.filename);
			break;
		}
	}
	if ((req->flags & REQ_F_POLLED) && req->apoll) {
		kfree(req->apoll->double_poll);
		kfree(req->apoll);
		req->apoll = NULL;
	}
	if (req->flags & REQ_F_INFLIGHT) {
		struct io_uring_task *tctx = req->task->io_uring;

		atomic_dec(&tctx->inflight_tracked);
	}
	if (req->flags & REQ_F_CREDS)
		put_cred(req->creds);
	if (req->flags & REQ_F_ASYNC_DATA) {
		kfree(req->async_data);
		req->async_data = NULL;
	}
	req->flags &= ~IO_REQ_CLEAN_FLAGS;
}
static bool io_assign_file(struct io_kiocb *req, unsigned int issue_flags)
{
	if (req->file || !io_op_defs[req->opcode].needs_file)
		return true;

	if (req->flags & REQ_F_FIXED_FILE)
		req->file = io_file_get_fixed(req, req->fd, issue_flags);
	else
		req->file = io_file_get_normal(req, req->fd);
	if (req->file)
		return true;

	req->result = -EBADF;
	return false;
}
static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
{
	const struct cred *creds = NULL;
	int ret;

	if (unlikely(!io_assign_file(req, issue_flags)))
		return -EBADF;

	if (unlikely((req->flags & REQ_F_CREDS) && req->creds != current_cred()))
		creds = override_creds(req->creds);

	if (!io_op_defs[req->opcode].audit_skip)
		audit_uring_entry(req->opcode);

	switch (req->opcode) {
	case IORING_OP_NOP:
		ret = io_nop(req, issue_flags);
		break;
	case IORING_OP_READV:
	case IORING_OP_READ_FIXED:
	case IORING_OP_READ:
		ret = io_read(req, issue_flags);
		break;
	case IORING_OP_WRITEV:
	case IORING_OP_WRITE_FIXED:
	case IORING_OP_WRITE:
		ret = io_write(req, issue_flags);
		break;
	case IORING_OP_FSYNC:
		ret = io_fsync(req, issue_flags);
		break;
	case IORING_OP_POLL_ADD:
		ret = io_poll_add(req, issue_flags);
		break;
	case IORING_OP_POLL_REMOVE:
		ret = io_poll_update(req, issue_flags);
		break;
	case IORING_OP_SYNC_FILE_RANGE:
		ret = io_sync_file_range(req, issue_flags);
		break;
	case IORING_OP_SENDMSG:
		ret = io_sendmsg(req, issue_flags);
		break;
	case IORING_OP_SEND:
		ret = io_send(req, issue_flags);
		break;
	case IORING_OP_RECVMSG:
		ret = io_recvmsg(req, issue_flags);
		break;
	case IORING_OP_RECV:
		ret = io_recv(req, issue_flags);
		break;
	case IORING_OP_TIMEOUT:
		ret = io_timeout(req, issue_flags);
		break;
	case IORING_OP_TIMEOUT_REMOVE:
		ret = io_timeout_remove(req, issue_flags);
		break;
	case IORING_OP_ACCEPT:
		ret = io_accept(req, issue_flags);
		break;
	case IORING_OP_CONNECT:
		ret = io_connect(req, issue_flags);
		break;
	case IORING_OP_ASYNC_CANCEL:
		ret = io_async_cancel(req, issue_flags);
		break;
	case IORING_OP_FALLOCATE:
		ret = io_fallocate(req, issue_flags);
		break;
	case IORING_OP_OPENAT:
		ret = io_openat(req, issue_flags);
		break;
	case IORING_OP_CLOSE:
		ret = io_close(req, issue_flags);
		break;
	case IORING_OP_FILES_UPDATE:
		ret = io_files_update(req, issue_flags);
		break;
	case IORING_OP_STATX:
		ret = io_statx(req, issue_flags);
		break;
	case IORING_OP_FADVISE:
		ret = io_fadvise(req, issue_flags);
		break;
	case IORING_OP_MADVISE:
		ret = io_madvise(req, issue_flags);
		break;
	case IORING_OP_OPENAT2:
		ret = io_openat2(req, issue_flags);
		break;
	case IORING_OP_EPOLL_CTL:
		ret = io_epoll_ctl(req, issue_flags);
		break;
	case IORING_OP_SPLICE:
		ret = io_splice(req, issue_flags);
		break;
	case IORING_OP_PROVIDE_BUFFERS:
		ret = io_provide_buffers(req, issue_flags);
		break;
	case IORING_OP_REMOVE_BUFFERS:
		ret = io_remove_buffers(req, issue_flags);
		break;
	case IORING_OP_TEE:
		ret = io_tee(req, issue_flags);
		break;
	case IORING_OP_SHUTDOWN:
		ret = io_shutdown(req, issue_flags);
		break;
	case IORING_OP_RENAMEAT:
		ret = io_renameat(req, issue_flags);
		break;
	case IORING_OP_UNLINKAT:
		ret = io_unlinkat(req, issue_flags);
		break;
	case IORING_OP_MKDIRAT:
		ret = io_mkdirat(req, issue_flags);
		break;
	case IORING_OP_SYMLINKAT:
		ret = io_symlinkat(req, issue_flags);
		break;
	case IORING_OP_LINKAT:
		ret = io_linkat(req, issue_flags);
		break;
	case IORING_OP_MSG_RING:
		ret = io_msg_ring(req, issue_flags);
		break;
	default:
		ret = -EINVAL;
		break;
	}

	if (!io_op_defs[req->opcode].audit_skip)
		audit_uring_exit(!ret, ret);

	if (creds)
		revert_creds(creds);

	if (ret)
		return ret;
	/* If the op doesn't have a file, we're not polling for it */
	if ((req->ctx->flags & IORING_SETUP_IOPOLL) && req->file)
		io_iopoll_req_issued(req, issue_flags);

	return 0;
}

static struct io_wq_work *io_wq_free_work(struct io_wq_work *work)
{
	struct io_kiocb *req = container_of(work, struct io_kiocb, work);

	req = io_put_req_find_next(req);
	return req ? &req->work : NULL;
}
static void io_wq_submit_work(struct io_wq_work *work)
{
	struct io_kiocb *req = container_of(work, struct io_kiocb, work);
	const struct io_op_def *def = &io_op_defs[req->opcode];
	unsigned int issue_flags = IO_URING_F_UNLOCKED;
	bool needs_poll = false;
	struct io_kiocb *timeout;
	int ret = 0, err = -ECANCELED;

	/* one will be dropped by ->io_free_work() after returning to io-wq */
	if (!(req->flags & REQ_F_REFCOUNT))
		__io_req_set_refcount(req, 2);
	else
		req_ref_get(req);

	timeout = io_prep_linked_timeout(req);
	if (timeout)
		io_queue_linked_timeout(timeout);

	/* either cancelled or io-wq is dying, so don't touch tctx->iowq */
	if (work->flags & IO_WQ_WORK_CANCEL) {
fail:
		io_req_task_queue_fail(req, err);
		return;
	}
	if (!io_assign_file(req, issue_flags)) {
		err = -EBADF;
		work->flags |= IO_WQ_WORK_CANCEL;
		goto fail;
	}

	if (req->flags & REQ_F_FORCE_ASYNC) {
		bool opcode_poll = def->pollin || def->pollout;

		if (opcode_poll && file_can_poll(req->file)) {
			needs_poll = true;
			issue_flags |= IO_URING_F_NONBLOCK;
		}
	}

	do {
		ret = io_issue_sqe(req, issue_flags);
		if (ret != -EAGAIN)
			break;
		/*
		 * We can get EAGAIN for iopolled IO even though we're
		 * forcing a sync submission from here, since we can't
		 * wait for request slots on the block side.
		 */
		if (!needs_poll) {
			if (!(req->ctx->flags & IORING_SETUP_IOPOLL))
				break;
			cond_resched();
			continue;
		}

		if (io_arm_poll_handler(req, issue_flags) == IO_APOLL_OK)
			return;
		/* aborted or ready, in either case retry blocking */
		needs_poll = false;
		issue_flags &= ~IO_URING_F_NONBLOCK;
	} while (1);

	/* avoid locking problems by failing it from a clean context */
	if (ret)
		io_req_task_queue_fail(req, ret);
}
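
/*
 * The issue loop in io_wq_submit_work() first attempts the request with
 * IO_URING_F_NONBLOCK when the opcode is pollable and the file supports
 * polling; only if that returns -EAGAIN and arming the poll handler does not
 * take ownership does it clear the flag and retry blocking from the worker.
 * This keeps io-wq threads from sleeping inside ->read()/->write() when
 * readiness can be waited for through the poll machinery instead.
 */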
static inline struct io_fixed_file *io_fixed_file_slot(struct io_file_table *table,
						       unsigned i)
{
	return &table->files[i];
}

static inline struct file *io_file_from_index(struct io_ring_ctx *ctx,
					      int index)
{
	struct io_fixed_file *slot = io_fixed_file_slot(&ctx->file_table, index);

	return (struct file *) (slot->file_ptr & FFS_MASK);
}

static void io_fixed_file_set(struct io_fixed_file *file_slot, struct file *file)
{
	unsigned long file_ptr = (unsigned long) file;

	file_ptr |= io_file_get_flags(file);
	file_slot->file_ptr = file_ptr;
}
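
/*
 * The fixed-file table stores more than a pointer: io_fixed_file_set() packs
 * the flags returned by io_file_get_flags() into the low bits of
 * file_slot->file_ptr, and FFS_MASK strips them off again on lookup.
 * io_file_get_fixed() below shifts those same bits into req->flags (the
 * REQ_F_SUPPORT_NOWAIT group), so per-file state is recovered without
 * dereferencing the struct file on the submission fast path.
 */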
static inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd,
					     unsigned int issue_flags)
{
	struct io_ring_ctx *ctx = req->ctx;
	struct file *file = NULL;
	unsigned long file_ptr;

	if (issue_flags & IO_URING_F_UNLOCKED)
		mutex_lock(&ctx->uring_lock);

	if (unlikely((unsigned int)fd >= ctx->nr_user_files))
		goto out;
	fd = array_index_nospec(fd, ctx->nr_user_files);
	file_ptr = io_fixed_file_slot(&ctx->file_table, fd)->file_ptr;
	file = (struct file *) (file_ptr & FFS_MASK);
	file_ptr &= ~FFS_MASK;
	/* mask in overlapping REQ_F and FFS bits */
	req->flags |= (file_ptr << REQ_F_SUPPORT_NOWAIT_BIT);
	io_req_set_rsrc_node(req, ctx, 0);
out:
	if (issue_flags & IO_URING_F_UNLOCKED)
		mutex_unlock(&ctx->uring_lock);
	return file;
}
static struct file *io_file_get_normal(struct io_kiocb *req, int fd)
{
	struct file *file = fget(fd);

	trace_io_uring_file_get(req->ctx, req, req->user_data, fd);

	/* we don't allow fixed io_uring files */
	if (file && file->f_op == &io_uring_fops)
		io_req_track_inflight(req);
	return file;
}
static void io_req_task_link_timeout(struct io_kiocb *req, bool *locked)
{
	struct io_kiocb *prev = req->timeout.prev;
	int ret = -ENOENT;

	if (prev) {
		if (!(req->task->flags & PF_EXITING))
			ret = io_try_cancel_userdata(req, prev->user_data);
		io_req_complete_post(req, ret ?: -ETIME, 0);
		io_put_req(prev);
	} else {
		io_req_complete_post(req, -ETIME, 0);
	}
}
static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
{
	struct io_timeout_data *data = container_of(timer,
						struct io_timeout_data, timer);
	struct io_kiocb *prev, *req = data->req;
	struct io_ring_ctx *ctx = req->ctx;
	unsigned long flags;

	spin_lock_irqsave(&ctx->timeout_lock, flags);
	prev = req->timeout.head;
	req->timeout.head = NULL;

	/*
	 * We don't expect the list to be empty, that will only happen if we
	 * race with the completion of the linked work.
	 */
	if (prev) {
		io_remove_next_linked(prev);
		if (!req_ref_inc_not_zero(prev))
			prev = NULL;
	}
	list_del(&req->timeout.list);
	req->timeout.prev = prev;
	spin_unlock_irqrestore(&ctx->timeout_lock, flags);

	req->io_task_work.func = io_req_task_link_timeout;
	io_req_task_work_add(req, false);
	return HRTIMER_NORESTART;
}
static void io_queue_linked_timeout(struct io_kiocb *req)
{
	struct io_ring_ctx *ctx = req->ctx;

	spin_lock_irq(&ctx->timeout_lock);
	/*
	 * If the back reference is NULL, then our linked request finished
	 * before we got a chance to set up the timer
	 */
	if (req->timeout.head) {
		struct io_timeout_data *data = req->async_data;

		data->timer.function = io_link_timeout_fn;
		hrtimer_start(&data->timer, timespec64_to_ktime(data->ts),
				data->mode);
		list_add_tail(&req->timeout.list, &ctx->ltimeout_list);
	}
	spin_unlock_irq(&ctx->timeout_lock);
	/* drop submission reference */
	io_put_req(req);
}
static void io_queue_sqe_arm_apoll(struct io_kiocb *req)
	__must_hold(&req->ctx->uring_lock)
{
	struct io_kiocb *linked_timeout = io_prep_linked_timeout(req);

	switch (io_arm_poll_handler(req, 0)) {
	case IO_APOLL_READY:
		io_req_task_queue(req);
		break;
	case IO_APOLL_ABORTED:
		/*
		 * Queued up for async execution, worker will release
		 * submit reference when the iocb is actually submitted.
		 */
		io_queue_async_work(req, NULL);
		break;
	case IO_APOLL_OK:
		break;
	}

	if (linked_timeout)
		io_queue_linked_timeout(linked_timeout);
}
static inline void __io_queue_sqe(struct io_kiocb *req)
	__must_hold(&req->ctx->uring_lock)
{
	struct io_kiocb *linked_timeout;
	int ret;

	ret = io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER);

	if (req->flags & REQ_F_COMPLETE_INLINE) {
		io_req_add_compl_list(req);
		return;
	}
	/*
	 * We async punt it if the file wasn't marked NOWAIT, or if the file
	 * doesn't support non-blocking read/write attempts
	 */
	if (likely(!ret)) {
		linked_timeout = io_prep_linked_timeout(req);
		if (linked_timeout)
			io_queue_linked_timeout(linked_timeout);
	} else if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) {
		io_queue_sqe_arm_apoll(req);
	} else {
		io_req_complete_failed(req, ret);
	}
}
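
/*
 * Three outcomes fall out of the nonblocking issue above: the request
 * completed inline and is batched onto the completion list; it returned
 * -EAGAIN without REQ_F_NOWAIT and is handed to io_queue_sqe_arm_apoll() to
 * wait for readiness (or punt to io-wq); or it failed outright and is
 * completed with the error. A successful but still-pending issue only needs
 * its linked timeout started.
 */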
static void io_queue_sqe_fallback(struct io_kiocb *req)
	__must_hold(&req->ctx->uring_lock)
{
	if (req->flags & REQ_F_FAIL) {
		io_req_complete_fail_submit(req);
	} else if (unlikely(req->ctx->drain_active)) {
		io_drain_req(req);
	} else {
		int ret = io_req_prep_async(req);

		if (unlikely(ret))
			io_req_complete_failed(req, ret);
		else
			io_queue_async_work(req, NULL);
	}
}
static inline void io_queue_sqe(struct io_kiocb *req)
	__must_hold(&req->ctx->uring_lock)
{
	if (likely(!(req->flags & (REQ_F_FORCE_ASYNC | REQ_F_FAIL))))
		__io_queue_sqe(req);
	else
		io_queue_sqe_fallback(req);
}
/*
 * Check SQE restrictions (opcode and flags).
 *
 * Returns 'true' if SQE is allowed, 'false' otherwise.
 */
static inline bool io_check_restriction(struct io_ring_ctx *ctx,
					struct io_kiocb *req,
					unsigned int sqe_flags)
{
	if (!test_bit(req->opcode, ctx->restrictions.sqe_op))
		return false;

	if ((sqe_flags & ctx->restrictions.sqe_flags_required) !=
	    ctx->restrictions.sqe_flags_required)
		return false;

	if (sqe_flags & ~(ctx->restrictions.sqe_flags_allowed |
			  ctx->restrictions.sqe_flags_required))
		return false;

	return true;
}
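
/*
 * Illustration (not part of this file): the restriction set consulted above
 * is installed from userspace on a ring created with IORING_SETUP_R_DISABLED,
 * via the io_uring_register() syscall, roughly along these lines (uapi field
 * names, error handling omitted):
 *
 *	struct io_uring_restriction res[2] = { };
 *
 *	res[0].opcode    = IORING_RESTRICTION_SQE_OP;
 *	res[0].sqe_op    = IORING_OP_READV;
 *	res[1].opcode    = IORING_RESTRICTION_SQE_FLAGS_ALLOWED;
 *	res[1].sqe_flags = IOSQE_FIXED_FILE;
 *
 *	io_uring_register(ring_fd, IORING_REGISTER_RESTRICTIONS, res, 2);
 *	io_uring_register(ring_fd, IORING_REGISTER_ENABLE_RINGS, NULL, 0);
 *
 * After the ring is enabled, io_check_restriction() rejects any SQE whose
 * opcode or flags fall outside the registered set.
 */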
static void io_init_req_drain(struct io_kiocb *req)
{
	struct io_ring_ctx *ctx = req->ctx;
	struct io_kiocb *head = ctx->submit_state.link.head;

	ctx->drain_active = true;
	if (head) {
		/*
		 * If we need to drain a request in the middle of a link, drain
		 * the head request and the next request/link after the current
		 * link. Considering sequential execution of links,
		 * REQ_F_IO_DRAIN will be maintained for every request of our
		 * link.
		 */
		head->flags |= REQ_F_IO_DRAIN | REQ_F_FORCE_ASYNC;
		ctx->drain_next = true;
	}
}
static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
		       const struct io_uring_sqe *sqe)
	__must_hold(&ctx->uring_lock)
{
	unsigned int sqe_flags;
	int personality;
	u8 opcode;

	/* req is partially pre-initialised, see io_preinit_req() */
	req->opcode = opcode = READ_ONCE(sqe->opcode);
	/* same numerical values with corresponding REQ_F_*, safe to copy */
	req->flags = sqe_flags = READ_ONCE(sqe->flags);
	req->user_data = READ_ONCE(sqe->user_data);
	req->file = NULL;
	req->fixed_rsrc_refs = NULL;
	req->task = current;

	if (unlikely(opcode >= IORING_OP_LAST)) {
		req->opcode = 0;
		return -EINVAL;
	}
	if (unlikely(sqe_flags & ~SQE_COMMON_FLAGS)) {
		/* enforce forwards compatibility on users */
		if (sqe_flags & ~SQE_VALID_FLAGS)
			return -EINVAL;
		if ((sqe_flags & IOSQE_BUFFER_SELECT) &&
		    !io_op_defs[opcode].buffer_select)
			return -EOPNOTSUPP;
		if (sqe_flags & IOSQE_CQE_SKIP_SUCCESS)
			ctx->drain_disabled = true;
		if (sqe_flags & IOSQE_IO_DRAIN) {
			if (ctx->drain_disabled)
				return -EOPNOTSUPP;
			io_init_req_drain(req);
		}
	}
	if (unlikely(ctx->restricted || ctx->drain_active || ctx->drain_next)) {
		if (ctx->restricted && !io_check_restriction(ctx, req, sqe_flags))
			return -EACCES;
		/* knock it to the slow queue path, will be drained there */
		if (ctx->drain_active)
			req->flags |= REQ_F_FORCE_ASYNC;
		/* if there is no link, we're at "next" request and need to drain */
		if (unlikely(ctx->drain_next) && !ctx->submit_state.link.head) {
			ctx->drain_next = false;
			ctx->drain_active = true;
			req->flags |= REQ_F_IO_DRAIN | REQ_F_FORCE_ASYNC;
		}
	}

	if (io_op_defs[opcode].needs_file) {
		struct io_submit_state *state = &ctx->submit_state;

		req->fd = READ_ONCE(sqe->fd);

		/*
		 * Plug now if we have more than 2 IO left after this, and the
		 * target is potentially a read/write to block based storage.
		 */
		if (state->need_plug && io_op_defs[opcode].plug) {
			state->plug_started = true;
			state->need_plug = false;
			blk_start_plug_nr_ios(&state->plug, state->submit_nr);
		}
	}

	personality = READ_ONCE(sqe->personality);
	if (personality) {
		int ret;

		req->creds = xa_load(&ctx->personalities, personality);
		if (!req->creds)
			return -EINVAL;
		get_cred(req->creds);
		ret = security_uring_override_creds(req->creds);
		if (ret) {
			put_cred(req->creds);
			return ret;
		}
		req->flags |= REQ_F_CREDS;
	}

	return io_req_prep(req, sqe);
}
static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
			 const struct io_uring_sqe *sqe)
	__must_hold(&ctx->uring_lock)
{
	struct io_submit_link *link = &ctx->submit_state.link;
	int ret;

	ret = io_init_req(ctx, req, sqe);
	if (unlikely(ret)) {
		trace_io_uring_req_failed(sqe, ctx, req, ret);

		/* fail even hard links since we don't submit */
		if (link->head) {
			/*
			 * we can judge a link req is failed or cancelled by if
			 * REQ_F_FAIL is set, but the head is an exception since
			 * it may be set REQ_F_FAIL because of other req's failure
			 * so let's leverage req->result to distinguish if a head
			 * is set REQ_F_FAIL because of its failure or other req's
			 * failure so that we can set the correct ret code for it.
			 * init result here to avoid affecting the normal path.
			 */
			if (!(link->head->flags & REQ_F_FAIL))
				req_fail_link_node(link->head, -ECANCELED);
		} else if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) {
			/*
			 * the current req is a normal req, we should return
			 * error and thus break the submission loop.
			 */
			io_req_complete_failed(req, ret);
			return ret;
		}
		req_fail_link_node(req, ret);
	}

	/* don't need @sqe from now on */
	trace_io_uring_submit_sqe(ctx, req, req->user_data, req->opcode,
				  req->flags, true,
				  ctx->flags & IORING_SETUP_SQPOLL);

	/*
	 * If we already have a head request, queue this one for async
	 * submittal once the head completes. If we don't have a head but
	 * IOSQE_IO_LINK is set in the sqe, start a new head. This one will be
	 * submitted sync once the chain is complete. If none of those
	 * conditions are true (normal request), then just queue it.
	 */
	if (link->head) {
		struct io_kiocb *head = link->head;

		if (!(req->flags & REQ_F_FAIL)) {
			ret = io_req_prep_async(req);
			if (unlikely(ret)) {
				req_fail_link_node(req, ret);
				if (!(head->flags & REQ_F_FAIL))
					req_fail_link_node(head, -ECANCELED);
			}
		}
		trace_io_uring_link(ctx, req, head);
		link->last->link = req;
		link->last = req;

		if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK))
			return 0;
		/* last request of a link, enqueue the link */
		link->head = NULL;
		req = head;
	} else if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) {
		link->head = req;
		link->last = req;
		return 0;
	}

	io_queue_sqe(req);
	return 0;
}
/*
 * Batched submission is done, ensure local IO is flushed out.
 */
static void io_submit_state_end(struct io_ring_ctx *ctx)
{
	struct io_submit_state *state = &ctx->submit_state;

	if (state->link.head)
		io_queue_sqe(state->link.head);
	/* flush only after queuing links as they can generate completions */
	io_submit_flush_completions(ctx);
	if (state->plug_started)
		blk_finish_plug(&state->plug);
}

/*
 * Start submission side cache.
 */
static void io_submit_state_start(struct io_submit_state *state,
				  unsigned int max_ios)
{
	state->plug_started = false;
	state->need_plug = max_ios > 2;
	state->submit_nr = max_ios;
	/* set only head, no need to init link_last in advance */
	state->link.head = NULL;
}
static void io_commit_sqring(struct io_ring_ctx *ctx)
{
	struct io_rings *rings = ctx->rings;

	/*
	 * Ensure any loads from the SQEs are done at this point,
	 * since once we write the new head, the application could
	 * write new data to them.
	 */
	smp_store_release(&rings->sq.head, ctx->cached_sq_head);
}
/*
 * Fetch an sqe, if one is available. Note this returns a pointer to memory
 * that is mapped by userspace. This means that care needs to be taken to
 * ensure that reads are stable, as we cannot rely on userspace always
 * being a good citizen. If members of the sqe are validated and then later
 * used, it's important that those reads are done through READ_ONCE() to
 * prevent a re-load down the line.
 */
static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx)
{
	unsigned head, mask = ctx->sq_entries - 1;
	unsigned sq_idx = ctx->cached_sq_head++ & mask;

	/*
	 * The cached sq head (or cq tail) serves two purposes:
	 *
	 * 1) allows us to batch the cost of updating the user visible
	 *    head updates.
	 * 2) allows the kernel side to track the head on its own, even
	 *    though the application is the one updating it.
	 */
	head = READ_ONCE(ctx->sq_array[sq_idx]);
	if (likely(head < ctx->sq_entries))
		return &ctx->sq_sqes[head];

	/* drop invalid entries */
	ctx->cq_extra--;
	WRITE_ONCE(ctx->rings->sq_dropped,
		   READ_ONCE(ctx->rings->sq_dropped) + 1);
	return NULL;
}
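
/*
 * Indexing note for io_get_sqe(): the SQ ring proper (ctx->sq_array) holds
 * indices into the separately mapped SQE array (ctx->sq_sqes), so fetching a
 * submission is a two-step lookup:
 *
 *	sq_idx = cached_sq_head++ & (sq_entries - 1);	// ring slot
 *	head   = READ_ONCE(sq_array[sq_idx]);		// SQE index chosen by the app
 *	sqe    = &sq_sqes[head];
 *
 * The indirection lets the application fill SQEs in any order and publish
 * one only by writing its index into the ring; an out-of-range index is
 * dropped and accounted in sq_dropped, as above.
 */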
static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
	__must_hold(&ctx->uring_lock)
{
	unsigned int entries = io_sqring_entries(ctx);
	int submitted = 0;

	if (unlikely(!entries))
		return 0;
	/* make sure SQ entry isn't read before tail */
	nr = min3(nr, ctx->sq_entries, entries);
	io_get_task_refs(nr);

	io_submit_state_start(&ctx->submit_state, nr);
	do {
		const struct io_uring_sqe *sqe;
		struct io_kiocb *req;

		if (unlikely(!io_alloc_req_refill(ctx))) {
			if (!submitted)
				submitted = -EAGAIN;
			break;
		}
		req = io_alloc_req(ctx);
		sqe = io_get_sqe(ctx);
		if (unlikely(!sqe)) {
			wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list);
			break;
		}
		/* will complete beyond this point, count as submitted */
		submitted++;
		if (io_submit_sqe(ctx, req, sqe)) {
			/*
			 * Continue submitting even for sqe failure if the
			 * ring was setup with IORING_SETUP_SUBMIT_ALL
			 */
			if (!(ctx->flags & IORING_SETUP_SUBMIT_ALL))
				break;
		}
	} while (submitted < nr);

	if (unlikely(submitted != nr)) {
		int ref_used = (submitted == -EAGAIN) ? 0 : submitted;
		int unused = nr - ref_used;

		current->io_uring->cached_refs += unused;
	}

	io_submit_state_end(ctx);
	/* Commit SQ ring head once we've consumed and submitted all SQEs */
	io_commit_sqring(ctx);

	return submitted;
}
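
/*
 * Accounting note for io_submit_sqes(): task references are taken up front
 * for the whole clamped batch (io_get_task_refs() above). If the loop stops
 * early, the references belonging to SQEs that never became requests are
 * returned in bulk through current->io_uring->cached_refs instead of being
 * dropped one at a time.
 */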
7942 static inline bool io_sqd_events_pending(struct io_sq_data
*sqd
)
7944 return READ_ONCE(sqd
->state
);
7947 static inline void io_ring_set_wakeup_flag(struct io_ring_ctx
*ctx
)
7949 /* Tell userspace we may need a wakeup call */
7950 spin_lock(&ctx
->completion_lock
);
7951 WRITE_ONCE(ctx
->rings
->sq_flags
,
7952 ctx
->rings
->sq_flags
| IORING_SQ_NEED_WAKEUP
);
7953 spin_unlock(&ctx
->completion_lock
);
7956 static inline void io_ring_clear_wakeup_flag(struct io_ring_ctx
*ctx
)
7958 spin_lock(&ctx
->completion_lock
);
7959 WRITE_ONCE(ctx
->rings
->sq_flags
,
7960 ctx
->rings
->sq_flags
& ~IORING_SQ_NEED_WAKEUP
);
7961 spin_unlock(&ctx
->completion_lock
);
7964 static int __io_sq_thread(struct io_ring_ctx
*ctx
, bool cap_entries
)
7966 unsigned int to_submit
;
7969 to_submit
= io_sqring_entries(ctx
);
7970 /* if we're handling multiple rings, cap submit size for fairness */
7971 if (cap_entries
&& to_submit
> IORING_SQPOLL_CAP_ENTRIES_VALUE
)
7972 to_submit
= IORING_SQPOLL_CAP_ENTRIES_VALUE
;
7974 if (!wq_list_empty(&ctx
->iopoll_list
) || to_submit
) {
7975 const struct cred
*creds
= NULL
;
7977 if (ctx
->sq_creds
!= current_cred())
7978 creds
= override_creds(ctx
->sq_creds
);
7980 mutex_lock(&ctx
->uring_lock
);
7981 if (!wq_list_empty(&ctx
->iopoll_list
))
7982 io_do_iopoll(ctx
, true);
7985 * Don't submit if refs are dying, good for io_uring_register(),
7986 * but also it is relied upon by io_ring_exit_work()
7988 if (to_submit
&& likely(!percpu_ref_is_dying(&ctx
->refs
)) &&
7989 !(ctx
->flags
& IORING_SETUP_R_DISABLED
))
7990 ret
= io_submit_sqes(ctx
, to_submit
);
7991 mutex_unlock(&ctx
->uring_lock
);
7993 if (to_submit
&& wq_has_sleeper(&ctx
->sqo_sq_wait
))
7994 wake_up(&ctx
->sqo_sq_wait
);
7996 revert_creds(creds
);
8002 static __cold
void io_sqd_update_thread_idle(struct io_sq_data
*sqd
)
8004 struct io_ring_ctx
*ctx
;
8005 unsigned sq_thread_idle
= 0;
8007 list_for_each_entry(ctx
, &sqd
->ctx_list
, sqd_list
)
8008 sq_thread_idle
= max(sq_thread_idle
, ctx
->sq_thread_idle
);
8009 sqd
->sq_thread_idle
= sq_thread_idle
;
8012 static bool io_sqd_handle_event(struct io_sq_data
*sqd
)
8014 bool did_sig
= false;
8015 struct ksignal ksig
;
8017 if (test_bit(IO_SQ_THREAD_SHOULD_PARK
, &sqd
->state
) ||
8018 signal_pending(current
)) {
8019 mutex_unlock(&sqd
->lock
);
8020 if (signal_pending(current
))
8021 did_sig
= get_signal(&ksig
);
8023 mutex_lock(&sqd
->lock
);
8025 return did_sig
|| test_bit(IO_SQ_THREAD_SHOULD_STOP
, &sqd
->state
);
8028 static int io_sq_thread(void *data
)
8030 struct io_sq_data
*sqd
= data
;
8031 struct io_ring_ctx
*ctx
;
8032 unsigned long timeout
= 0;
8033 char buf
[TASK_COMM_LEN
];
8036 snprintf(buf
, sizeof(buf
), "iou-sqp-%d", sqd
->task_pid
);
8037 set_task_comm(current
, buf
);
8039 if (sqd
->sq_cpu
!= -1)
8040 set_cpus_allowed_ptr(current
, cpumask_of(sqd
->sq_cpu
));
8042 set_cpus_allowed_ptr(current
, cpu_online_mask
);
8043 current
->flags
|= PF_NO_SETAFFINITY
;
8045 audit_alloc_kernel(current
);
8047 mutex_lock(&sqd
->lock
);
8049 bool cap_entries
, sqt_spin
= false;
8051 if (io_sqd_events_pending(sqd
) || signal_pending(current
)) {
8052 if (io_sqd_handle_event(sqd
))
8054 timeout
= jiffies
+ sqd
->sq_thread_idle
;
8057 cap_entries
= !list_is_singular(&sqd
->ctx_list
);
8058 list_for_each_entry(ctx
, &sqd
->ctx_list
, sqd_list
) {
8059 int ret
= __io_sq_thread(ctx
, cap_entries
);
8061 if (!sqt_spin
&& (ret
> 0 || !wq_list_empty(&ctx
->iopoll_list
)))
8064 if (io_run_task_work())
8067 if (sqt_spin
|| !time_after(jiffies
, timeout
)) {
8070 timeout
= jiffies
+ sqd
->sq_thread_idle
;
8074 prepare_to_wait(&sqd
->wait
, &wait
, TASK_INTERRUPTIBLE
);
8075 if (!io_sqd_events_pending(sqd
) && !task_work_pending(current
)) {
8076 bool needs_sched
= true;
8078 list_for_each_entry(ctx
, &sqd
->ctx_list
, sqd_list
) {
8079 io_ring_set_wakeup_flag(ctx
);
8081 if ((ctx
->flags
& IORING_SETUP_IOPOLL
) &&
8082 !wq_list_empty(&ctx
->iopoll_list
)) {
8083 needs_sched
= false;
8088 * Ensure the store of the wakeup flag is not
8089 * reordered with the load of the SQ tail
8093 if (io_sqring_entries(ctx
)) {
8094 needs_sched
= false;
8100 mutex_unlock(&sqd
->lock
);
8102 mutex_lock(&sqd
->lock
);
8104 list_for_each_entry(ctx
, &sqd
->ctx_list
, sqd_list
)
8105 io_ring_clear_wakeup_flag(ctx
);
8108 finish_wait(&sqd
->wait
, &wait
);
8109 timeout
= jiffies
+ sqd
->sq_thread_idle
;
8112 io_uring_cancel_generic(true, sqd
);
8114 list_for_each_entry(ctx
, &sqd
->ctx_list
, sqd_list
)
8115 io_ring_set_wakeup_flag(ctx
);
8117 mutex_unlock(&sqd
->lock
);
8119 audit_free(current
);
8121 complete(&sqd
->exited
);
8125 struct io_wait_queue
{
8126 struct wait_queue_entry wq
;
8127 struct io_ring_ctx
*ctx
;
8129 unsigned nr_timeouts
;
8132 static inline bool io_should_wake(struct io_wait_queue
*iowq
)
8134 struct io_ring_ctx
*ctx
= iowq
->ctx
;
8135 int dist
= ctx
->cached_cq_tail
- (int) iowq
->cq_tail
;
8138 * Wake up if we have enough events, or if a timeout occurred since we
8139 * started waiting. For timeouts, we always want to return to userspace,
8140 * regardless of event count.
8142 return dist
>= 0 || atomic_read(&ctx
->cq_timeouts
) != iowq
->nr_timeouts
;
8145 static int io_wake_function(struct wait_queue_entry
*curr
, unsigned int mode
,
8146 int wake_flags
, void *key
)
8148 struct io_wait_queue
*iowq
= container_of(curr
, struct io_wait_queue
,
8152 * Cannot safely flush overflowed CQEs from here, ensure we wake up
8153 * the task, and the next invocation will do it.
8155 if (io_should_wake(iowq
) || test_bit(0, &iowq
->ctx
->check_cq_overflow
))
8156 return autoremove_wake_function(curr
, mode
, wake_flags
, key
);
8160 static int io_run_task_work_sig(void)
8162 if (io_run_task_work())
8164 if (test_thread_flag(TIF_NOTIFY_SIGNAL
))
8165 return -ERESTARTSYS
;
8166 if (task_sigpending(current
))
8171 /* when returns >0, the caller should retry */
8172 static inline int io_cqring_wait_schedule(struct io_ring_ctx
*ctx
,
8173 struct io_wait_queue
*iowq
,
8178 /* make sure we run task_work before checking for signals */
8179 ret
= io_run_task_work_sig();
8180 if (ret
|| io_should_wake(iowq
))
8182 /* let the caller flush overflows, retry */
8183 if (test_bit(0, &ctx
->check_cq_overflow
))
8186 if (!schedule_hrtimeout(&timeout
, HRTIMER_MODE_ABS
))
8192 * Wait until events become available, if we don't already have some. The
8193 * application must reap them itself, as they reside on the shared cq ring.
8195 static int io_cqring_wait(struct io_ring_ctx
*ctx
, int min_events
,
8196 const sigset_t __user
*sig
, size_t sigsz
,
8197 struct __kernel_timespec __user
*uts
)
8199 struct io_wait_queue iowq
;
8200 struct io_rings
*rings
= ctx
->rings
;
8201 ktime_t timeout
= KTIME_MAX
;
8205 io_cqring_overflow_flush(ctx
);
8206 if (io_cqring_events(ctx
) >= min_events
)
8208 if (!io_run_task_work())
8213 #ifdef CONFIG_COMPAT
8214 if (in_compat_syscall())
8215 ret
= set_compat_user_sigmask((const compat_sigset_t __user
*)sig
,
8219 ret
= set_user_sigmask(sig
, sigsz
);
8226 struct timespec64 ts
;
8228 if (get_timespec64(&ts
, uts
))
8230 timeout
= ktime_add_ns(timespec64_to_ktime(ts
), ktime_get_ns());
8233 init_waitqueue_func_entry(&iowq
.wq
, io_wake_function
);
8234 iowq
.wq
.private = current
;
8235 INIT_LIST_HEAD(&iowq
.wq
.entry
);
8237 iowq
.nr_timeouts
= atomic_read(&ctx
->cq_timeouts
);
8238 iowq
.cq_tail
= READ_ONCE(ctx
->rings
->cq
.head
) + min_events
;
8240 trace_io_uring_cqring_wait(ctx
, min_events
);
8242 /* if we can't even flush overflow, don't wait for more */
8243 if (!io_cqring_overflow_flush(ctx
)) {
8247 prepare_to_wait_exclusive(&ctx
->cq_wait
, &iowq
.wq
,
8248 TASK_INTERRUPTIBLE
);
8249 ret
= io_cqring_wait_schedule(ctx
, &iowq
, timeout
);
8250 finish_wait(&ctx
->cq_wait
, &iowq
.wq
);
8254 restore_saved_sigmask_unless(ret
== -EINTR
);
8256 return READ_ONCE(rings
->cq
.head
) == READ_ONCE(rings
->cq
.tail
) ? ret
: 0;
8259 static void io_free_page_table(void **table
, size_t size
)
8261 unsigned i
, nr_tables
= DIV_ROUND_UP(size
, PAGE_SIZE
);
8263 for (i
= 0; i
< nr_tables
; i
++)
8268 static __cold
void **io_alloc_page_table(size_t size
)
8270 unsigned i
, nr_tables
= DIV_ROUND_UP(size
, PAGE_SIZE
);
8271 size_t init_size
= size
;
8274 table
= kcalloc(nr_tables
, sizeof(*table
), GFP_KERNEL_ACCOUNT
);
8278 for (i
= 0; i
< nr_tables
; i
++) {
8279 unsigned int this_size
= min_t(size_t, size
, PAGE_SIZE
);
8281 table
[i
] = kzalloc(this_size
, GFP_KERNEL_ACCOUNT
);
8283 io_free_page_table(table
, init_size
);
8291 static void io_rsrc_node_destroy(struct io_rsrc_node
*ref_node
)
8293 percpu_ref_exit(&ref_node
->refs
);
8297 static __cold
void io_rsrc_node_ref_zero(struct percpu_ref
*ref
)
8299 struct io_rsrc_node
*node
= container_of(ref
, struct io_rsrc_node
, refs
);
8300 struct io_ring_ctx
*ctx
= node
->rsrc_data
->ctx
;
8301 unsigned long flags
;
8302 bool first_add
= false;
8303 unsigned long delay
= HZ
;
8305 spin_lock_irqsave(&ctx
->rsrc_ref_lock
, flags
);
8308 /* if we are mid-quiesce then do not delay */
8309 if (node
->rsrc_data
->quiesce
)
8312 while (!list_empty(&ctx
->rsrc_ref_list
)) {
8313 node
= list_first_entry(&ctx
->rsrc_ref_list
,
8314 struct io_rsrc_node
, node
);
8315 /* recycle ref nodes in order */
8318 list_del(&node
->node
);
8319 first_add
|= llist_add(&node
->llist
, &ctx
->rsrc_put_llist
);
8321 spin_unlock_irqrestore(&ctx
->rsrc_ref_lock
, flags
);
8324 mod_delayed_work(system_wq
, &ctx
->rsrc_put_work
, delay
);
8327 static struct io_rsrc_node
*io_rsrc_node_alloc(void)
8329 struct io_rsrc_node
*ref_node
;
8331 ref_node
= kzalloc(sizeof(*ref_node
), GFP_KERNEL
);
8335 if (percpu_ref_init(&ref_node
->refs
, io_rsrc_node_ref_zero
,
8340 INIT_LIST_HEAD(&ref_node
->node
);
8341 INIT_LIST_HEAD(&ref_node
->rsrc_list
);
8342 ref_node
->done
= false;
8346 static void io_rsrc_node_switch(struct io_ring_ctx
*ctx
,
8347 struct io_rsrc_data
*data_to_kill
)
8348 __must_hold(&ctx
->uring_lock
)
8350 WARN_ON_ONCE(!ctx
->rsrc_backup_node
);
8351 WARN_ON_ONCE(data_to_kill
&& !ctx
->rsrc_node
);
8353 io_rsrc_refs_drop(ctx
);
8356 struct io_rsrc_node
*rsrc_node
= ctx
->rsrc_node
;
8358 rsrc_node
->rsrc_data
= data_to_kill
;
8359 spin_lock_irq(&ctx
->rsrc_ref_lock
);
8360 list_add_tail(&rsrc_node
->node
, &ctx
->rsrc_ref_list
);
8361 spin_unlock_irq(&ctx
->rsrc_ref_lock
);
8363 atomic_inc(&data_to_kill
->refs
);
8364 percpu_ref_kill(&rsrc_node
->refs
);
8365 ctx
->rsrc_node
= NULL
;
8368 if (!ctx
->rsrc_node
) {
8369 ctx
->rsrc_node
= ctx
->rsrc_backup_node
;
8370 ctx
->rsrc_backup_node
= NULL
;
8374 static int io_rsrc_node_switch_start(struct io_ring_ctx
*ctx
)
8376 if (ctx
->rsrc_backup_node
)
8378 ctx
->rsrc_backup_node
= io_rsrc_node_alloc();
8379 return ctx
->rsrc_backup_node
? 0 : -ENOMEM
;
8382 static __cold
int io_rsrc_ref_quiesce(struct io_rsrc_data
*data
,
8383 struct io_ring_ctx
*ctx
)
8387 /* As we may drop ->uring_lock, other task may have started quiesce */
8391 data
->quiesce
= true;
8393 ret
= io_rsrc_node_switch_start(ctx
);
8396 io_rsrc_node_switch(ctx
, data
);
8398 /* kill initial ref, already quiesced if zero */
8399 if (atomic_dec_and_test(&data
->refs
))
8401 mutex_unlock(&ctx
->uring_lock
);
8402 flush_delayed_work(&ctx
->rsrc_put_work
);
8403 ret
= wait_for_completion_interruptible(&data
->done
);
8405 mutex_lock(&ctx
->uring_lock
);
8406 if (atomic_read(&data
->refs
) > 0) {
8408 * it has been revived by another thread while
8411 mutex_unlock(&ctx
->uring_lock
);
8417 atomic_inc(&data
->refs
);
8418 /* wait for all works potentially completing data->done */
8419 flush_delayed_work(&ctx
->rsrc_put_work
);
8420 reinit_completion(&data
->done
);
8422 ret
= io_run_task_work_sig();
8423 mutex_lock(&ctx
->uring_lock
);
8425 data
->quiesce
= false;
8430 static u64
*io_get_tag_slot(struct io_rsrc_data
*data
, unsigned int idx
)
8432 unsigned int off
= idx
& IO_RSRC_TAG_TABLE_MASK
;
8433 unsigned int table_idx
= idx
>> IO_RSRC_TAG_TABLE_SHIFT
;
8435 return &data
->tags
[table_idx
][off
];
8438 static void io_rsrc_data_free(struct io_rsrc_data
*data
)
8440 size_t size
= data
->nr
* sizeof(data
->tags
[0][0]);
8443 io_free_page_table((void **)data
->tags
, size
);
8447 static __cold
int io_rsrc_data_alloc(struct io_ring_ctx
*ctx
, rsrc_put_fn
*do_put
,
8448 u64 __user
*utags
, unsigned nr
,
8449 struct io_rsrc_data
**pdata
)
8451 struct io_rsrc_data
*data
;
8455 data
= kzalloc(sizeof(*data
), GFP_KERNEL
);
8458 data
->tags
= (u64
**)io_alloc_page_table(nr
* sizeof(data
->tags
[0][0]));
8466 data
->do_put
= do_put
;
8469 for (i
= 0; i
< nr
; i
++) {
8470 u64
*tag_slot
= io_get_tag_slot(data
, i
);
8472 if (copy_from_user(tag_slot
, &utags
[i
],
8478 atomic_set(&data
->refs
, 1);
8479 init_completion(&data
->done
);
8483 io_rsrc_data_free(data
);
8487 static bool io_alloc_file_tables(struct io_file_table
*table
, unsigned nr_files
)
8489 table
->files
= kvcalloc(nr_files
, sizeof(table
->files
[0]),
8490 GFP_KERNEL_ACCOUNT
);
8491 return !!table
->files
;
8494 static void io_free_file_tables(struct io_file_table
*table
)
8496 kvfree(table
->files
);
8497 table
->files
= NULL
;
8500 static void __io_sqe_files_unregister(struct io_ring_ctx
*ctx
)
8502 #if defined(CONFIG_UNIX)
8503 if (ctx
->ring_sock
) {
8504 struct sock
*sock
= ctx
->ring_sock
->sk
;
8505 struct sk_buff
*skb
;
8507 while ((skb
= skb_dequeue(&sock
->sk_receive_queue
)) != NULL
)
8513 for (i
= 0; i
< ctx
->nr_user_files
; i
++) {
8516 file
= io_file_from_index(ctx
, i
);
8521 io_free_file_tables(&ctx
->file_table
);
8522 io_rsrc_data_free(ctx
->file_data
);
8523 ctx
->file_data
= NULL
;
8524 ctx
->nr_user_files
= 0;
8527 static int io_sqe_files_unregister(struct io_ring_ctx
*ctx
)
8529 unsigned nr
= ctx
->nr_user_files
;
8532 if (!ctx
->file_data
)
8536 * Quiesce may unlock ->uring_lock, and while it's not held
8537 * prevent new requests using the table.
8539 ctx
->nr_user_files
= 0;
8540 ret
= io_rsrc_ref_quiesce(ctx
->file_data
, ctx
);
8541 ctx
->nr_user_files
= nr
;
8543 __io_sqe_files_unregister(ctx
);
8547 static void io_sq_thread_unpark(struct io_sq_data
*sqd
)
8548 __releases(&sqd
->lock
)
8550 WARN_ON_ONCE(sqd
->thread
== current
);
8553 * Do the dance but not conditional clear_bit() because it'd race with
8554 * other threads incrementing park_pending and setting the bit.
8556 clear_bit(IO_SQ_THREAD_SHOULD_PARK
, &sqd
->state
);
8557 if (atomic_dec_return(&sqd
->park_pending
))
8558 set_bit(IO_SQ_THREAD_SHOULD_PARK
, &sqd
->state
);
8559 mutex_unlock(&sqd
->lock
);
8562 static void io_sq_thread_park(struct io_sq_data
*sqd
)
8563 __acquires(&sqd
->lock
)
8565 WARN_ON_ONCE(sqd
->thread
== current
);
8567 atomic_inc(&sqd
->park_pending
);
8568 set_bit(IO_SQ_THREAD_SHOULD_PARK
, &sqd
->state
);
8569 mutex_lock(&sqd
->lock
);
8571 wake_up_process(sqd
->thread
);
8574 static void io_sq_thread_stop(struct io_sq_data
*sqd
)
8576 WARN_ON_ONCE(sqd
->thread
== current
);
8577 WARN_ON_ONCE(test_bit(IO_SQ_THREAD_SHOULD_STOP
, &sqd
->state
));
8579 set_bit(IO_SQ_THREAD_SHOULD_STOP
, &sqd
->state
);
8580 mutex_lock(&sqd
->lock
);
8582 wake_up_process(sqd
->thread
);
8583 mutex_unlock(&sqd
->lock
);
8584 wait_for_completion(&sqd
->exited
);
8587 static void io_put_sq_data(struct io_sq_data
*sqd
)
8589 if (refcount_dec_and_test(&sqd
->refs
)) {
8590 WARN_ON_ONCE(atomic_read(&sqd
->park_pending
));
8592 io_sq_thread_stop(sqd
);
8597 static void io_sq_thread_finish(struct io_ring_ctx
*ctx
)
8599 struct io_sq_data
*sqd
= ctx
->sq_data
;
8602 io_sq_thread_park(sqd
);
8603 list_del_init(&ctx
->sqd_list
);
8604 io_sqd_update_thread_idle(sqd
);
8605 io_sq_thread_unpark(sqd
);
8607 io_put_sq_data(sqd
);
8608 ctx
->sq_data
= NULL
;
8612 static struct io_sq_data
*io_attach_sq_data(struct io_uring_params
*p
)
8614 struct io_ring_ctx
*ctx_attach
;
8615 struct io_sq_data
*sqd
;
8618 f
= fdget(p
->wq_fd
);
8620 return ERR_PTR(-ENXIO
);
8621 if (f
.file
->f_op
!= &io_uring_fops
) {
8623 return ERR_PTR(-EINVAL
);
8626 ctx_attach
= f
.file
->private_data
;
8627 sqd
= ctx_attach
->sq_data
;
8630 return ERR_PTR(-EINVAL
);
8632 if (sqd
->task_tgid
!= current
->tgid
) {
8634 return ERR_PTR(-EPERM
);
8637 refcount_inc(&sqd
->refs
);
8642 static struct io_sq_data
*io_get_sq_data(struct io_uring_params
*p
,
8645 struct io_sq_data
*sqd
;
8648 if (p
->flags
& IORING_SETUP_ATTACH_WQ
) {
8649 sqd
= io_attach_sq_data(p
);
8654 /* fall through for EPERM case, setup new sqd/task */
8655 if (PTR_ERR(sqd
) != -EPERM
)
8659 sqd
= kzalloc(sizeof(*sqd
), GFP_KERNEL
);
8661 return ERR_PTR(-ENOMEM
);
8663 atomic_set(&sqd
->park_pending
, 0);
8664 refcount_set(&sqd
->refs
, 1);
8665 INIT_LIST_HEAD(&sqd
->ctx_list
);
8666 mutex_init(&sqd
->lock
);
8667 init_waitqueue_head(&sqd
->wait
);
8668 init_completion(&sqd
->exited
);
8672 #if defined(CONFIG_UNIX)
8674 * Ensure the UNIX gc is aware of our file set, so we are certain that
8675 * the io_uring can be safely unregistered on process exit, even if we have
8676 * loops in the file referencing.
8678 static int __io_sqe_files_scm(struct io_ring_ctx
*ctx
, int nr
, int offset
)
8680 struct sock
*sk
= ctx
->ring_sock
->sk
;
8681 struct scm_fp_list
*fpl
;
8682 struct sk_buff
*skb
;
8685 fpl
= kzalloc(sizeof(*fpl
), GFP_KERNEL
);
8689 skb
= alloc_skb(0, GFP_KERNEL
);
8698 fpl
->user
= get_uid(current_user());
8699 for (i
= 0; i
< nr
; i
++) {
8700 struct file
*file
= io_file_from_index(ctx
, i
+ offset
);
8704 fpl
->fp
[nr_files
] = get_file(file
);
8705 unix_inflight(fpl
->user
, fpl
->fp
[nr_files
]);
8710 fpl
->max
= SCM_MAX_FD
;
8711 fpl
->count
= nr_files
;
8712 UNIXCB(skb
).fp
= fpl
;
8713 skb
->destructor
= unix_destruct_scm
;
8714 refcount_add(skb
->truesize
, &sk
->sk_wmem_alloc
);
8715 skb_queue_head(&sk
->sk_receive_queue
, skb
);
8717 for (i
= 0; i
< nr
; i
++) {
8718 struct file
*file
= io_file_from_index(ctx
, i
+ offset
);
8725 free_uid(fpl
->user
);
8733 * If UNIX sockets are enabled, fd passing can cause a reference cycle which
8734 * causes regular reference counting to break down. We rely on the UNIX
8735 * garbage collection to take care of this problem for us.
8737 static int io_sqe_files_scm(struct io_ring_ctx
*ctx
)
8739 unsigned left
, total
;
8743 left
= ctx
->nr_user_files
;
8745 unsigned this_files
= min_t(unsigned, left
, SCM_MAX_FD
);
8747 ret
= __io_sqe_files_scm(ctx
, this_files
, total
);
8751 total
+= this_files
;
8757 while (total
< ctx
->nr_user_files
) {
8758 struct file
*file
= io_file_from_index(ctx
, total
);
8768 static int io_sqe_files_scm(struct io_ring_ctx
*ctx
)
8774 static void io_rsrc_file_put(struct io_ring_ctx
*ctx
, struct io_rsrc_put
*prsrc
)
8776 struct file
*file
= prsrc
->file
;
8777 #if defined(CONFIG_UNIX)
8778 struct sock
*sock
= ctx
->ring_sock
->sk
;
8779 struct sk_buff_head list
, *head
= &sock
->sk_receive_queue
;
8780 struct sk_buff
*skb
;
8783 __skb_queue_head_init(&list
);
8786 * Find the skb that holds this file in its SCM_RIGHTS. When found,
8787 * remove this entry and rearrange the file array.
8789 skb
= skb_dequeue(head
);
8791 struct scm_fp_list
*fp
;
8793 fp
= UNIXCB(skb
).fp
;
8794 for (i
= 0; i
< fp
->count
; i
++) {
8797 if (fp
->fp
[i
] != file
)
8800 unix_notinflight(fp
->user
, fp
->fp
[i
]);
8801 left
= fp
->count
- 1 - i
;
8803 memmove(&fp
->fp
[i
], &fp
->fp
[i
+ 1],
8804 left
* sizeof(struct file
*));
8811 __skb_queue_tail(&list
, skb
);
8821 __skb_queue_tail(&list
, skb
);
8823 skb
= skb_dequeue(head
);
8826 if (skb_peek(&list
)) {
8827 spin_lock_irq(&head
->lock
);
8828 while ((skb
= __skb_dequeue(&list
)) != NULL
)
8829 __skb_queue_tail(head
, skb
);
8830 spin_unlock_irq(&head
->lock
);
8837 static void __io_rsrc_put_work(struct io_rsrc_node
*ref_node
)
8839 struct io_rsrc_data
*rsrc_data
= ref_node
->rsrc_data
;
8840 struct io_ring_ctx
*ctx
= rsrc_data
->ctx
;
8841 struct io_rsrc_put
*prsrc
, *tmp
;
8843 list_for_each_entry_safe(prsrc
, tmp
, &ref_node
->rsrc_list
, list
) {
8844 list_del(&prsrc
->list
);
8847 bool lock_ring
= ctx
->flags
& IORING_SETUP_IOPOLL
;
8849 io_ring_submit_lock(ctx
, lock_ring
);
8850 spin_lock(&ctx
->completion_lock
);
8851 io_fill_cqe_aux(ctx
, prsrc
->tag
, 0, 0);
8852 io_commit_cqring(ctx
);
8853 spin_unlock(&ctx
->completion_lock
);
8854 io_cqring_ev_posted(ctx
);
8855 io_ring_submit_unlock(ctx
, lock_ring
);
8858 rsrc_data
->do_put(ctx
, prsrc
);
8862 io_rsrc_node_destroy(ref_node
);
8863 if (atomic_dec_and_test(&rsrc_data
->refs
))
8864 complete(&rsrc_data
->done
);
8867 static void io_rsrc_put_work(struct work_struct
*work
)
8869 struct io_ring_ctx
*ctx
;
8870 struct llist_node
*node
;
8872 ctx
= container_of(work
, struct io_ring_ctx
, rsrc_put_work
.work
);
8873 node
= llist_del_all(&ctx
->rsrc_put_llist
);
8876 struct io_rsrc_node
*ref_node
;
8877 struct llist_node
*next
= node
->next
;
8879 ref_node
= llist_entry(node
, struct io_rsrc_node
, llist
);
8880 __io_rsrc_put_work(ref_node
);
8885 static int io_sqe_files_register(struct io_ring_ctx
*ctx
, void __user
*arg
,
8886 unsigned nr_args
, u64 __user
*tags
)
8888 __s32 __user
*fds
= (__s32 __user
*) arg
;
8897 if (nr_args
> IORING_MAX_FIXED_FILES
)
8899 if (nr_args
> rlimit(RLIMIT_NOFILE
))
8901 ret
= io_rsrc_node_switch_start(ctx
);
8904 ret
= io_rsrc_data_alloc(ctx
, io_rsrc_file_put
, tags
, nr_args
,
8910 if (!io_alloc_file_tables(&ctx
->file_table
, nr_args
))
8913 for (i
= 0; i
< nr_args
; i
++, ctx
->nr_user_files
++) {
8914 if (copy_from_user(&fd
, &fds
[i
], sizeof(fd
))) {
8918 /* allow sparse sets */
8921 if (unlikely(*io_get_tag_slot(ctx
->file_data
, i
)))
8928 if (unlikely(!file
))
8932 * Don't allow io_uring instances to be registered. If UNIX
8933 * isn't enabled, then this causes a reference cycle and this
8934 * instance can never get freed. If UNIX is enabled we'll
8935 * handle it just fine, but there's still no point in allowing
8936 * a ring fd as it doesn't support regular read/write anyway.
8938 if (file
->f_op
== &io_uring_fops
) {
8942 io_fixed_file_set(io_fixed_file_slot(&ctx
->file_table
, i
), file
);
8945 ret
= io_sqe_files_scm(ctx
);
8947 __io_sqe_files_unregister(ctx
);
8951 io_rsrc_node_switch(ctx
, NULL
);
8954 for (i
= 0; i
< ctx
->nr_user_files
; i
++) {
8955 file
= io_file_from_index(ctx
, i
);
8959 io_free_file_tables(&ctx
->file_table
);
8960 ctx
->nr_user_files
= 0;
8962 io_rsrc_data_free(ctx
->file_data
);
8963 ctx
->file_data
= NULL
;
8967 static int io_sqe_file_register(struct io_ring_ctx
*ctx
, struct file
*file
,
8970 #if defined(CONFIG_UNIX)
8971 struct sock
*sock
= ctx
->ring_sock
->sk
;
8972 struct sk_buff_head
*head
= &sock
->sk_receive_queue
;
8973 struct sk_buff
*skb
;
8976 * See if we can merge this file into an existing skb SCM_RIGHTS
8977 * file set. If there's no room, fall back to allocating a new skb
8978 * and filling it in.
8980 spin_lock_irq(&head
->lock
);
8981 skb
= skb_peek(head
);
8983 struct scm_fp_list
*fpl
= UNIXCB(skb
).fp
;
8985 if (fpl
->count
< SCM_MAX_FD
) {
8986 __skb_unlink(skb
, head
);
8987 spin_unlock_irq(&head
->lock
);
8988 fpl
->fp
[fpl
->count
] = get_file(file
);
8989 unix_inflight(fpl
->user
, fpl
->fp
[fpl
->count
]);
8991 spin_lock_irq(&head
->lock
);
8992 __skb_queue_head(head
, skb
);
8997 spin_unlock_irq(&head
->lock
);
9004 return __io_sqe_files_scm(ctx
, 1, index
);
9010 static int io_queue_rsrc_removal(struct io_rsrc_data
*data
, unsigned idx
,
9011 struct io_rsrc_node
*node
, void *rsrc
)
9013 u64
*tag_slot
= io_get_tag_slot(data
, idx
);
9014 struct io_rsrc_put
*prsrc
;
9016 prsrc
= kzalloc(sizeof(*prsrc
), GFP_KERNEL
);
9020 prsrc
->tag
= *tag_slot
;
9023 list_add(&prsrc
->list
, &node
->rsrc_list
);
9027 static int io_install_fixed_file(struct io_kiocb
*req
, struct file
*file
,
9028 unsigned int issue_flags
, u32 slot_index
)
9030 struct io_ring_ctx
*ctx
= req
->ctx
;
9031 bool needs_lock
= issue_flags
& IO_URING_F_UNLOCKED
;
9032 bool needs_switch
= false;
9033 struct io_fixed_file
*file_slot
;
9036 io_ring_submit_lock(ctx
, needs_lock
);
9037 if (file
->f_op
== &io_uring_fops
)
9040 if (!ctx
->file_data
)
9043 if (slot_index
>= ctx
->nr_user_files
)
9046 slot_index
= array_index_nospec(slot_index
, ctx
->nr_user_files
);
9047 file_slot
= io_fixed_file_slot(&ctx
->file_table
, slot_index
);
9049 if (file_slot
->file_ptr
) {
9050 struct file
*old_file
;
9052 ret
= io_rsrc_node_switch_start(ctx
);
9056 old_file
= (struct file
*)(file_slot
->file_ptr
& FFS_MASK
);
9057 ret
= io_queue_rsrc_removal(ctx
->file_data
, slot_index
,
9058 ctx
->rsrc_node
, old_file
);
9061 file_slot
->file_ptr
= 0;
9062 needs_switch
= true;
9065 *io_get_tag_slot(ctx
->file_data
, slot_index
) = 0;
9066 io_fixed_file_set(file_slot
, file
);
9067 ret
= io_sqe_file_register(ctx
, file
, slot_index
);
9069 file_slot
->file_ptr
= 0;
9076 io_rsrc_node_switch(ctx
, ctx
->file_data
);
9077 io_ring_submit_unlock(ctx
, needs_lock
);
9083 static int io_close_fixed(struct io_kiocb
*req
, unsigned int issue_flags
)
9085 unsigned int offset
= req
->close
.file_slot
- 1;
9086 struct io_ring_ctx
*ctx
= req
->ctx
;
9087 bool needs_lock
= issue_flags
& IO_URING_F_UNLOCKED
;
9088 struct io_fixed_file
*file_slot
;
9092 io_ring_submit_lock(ctx
, needs_lock
);
9094 if (unlikely(!ctx
->file_data
))
9097 if (offset
>= ctx
->nr_user_files
)
9099 ret
= io_rsrc_node_switch_start(ctx
);
9103 offset
= array_index_nospec(offset
, ctx
->nr_user_files
);
9104 file_slot
= io_fixed_file_slot(&ctx
->file_table
, offset
);
9106 if (!file_slot
->file_ptr
)
9109 file
= (struct file
*)(file_slot
->file_ptr
& FFS_MASK
);
9110 ret
= io_queue_rsrc_removal(ctx
->file_data
, offset
, ctx
->rsrc_node
, file
);
9114 file_slot
->file_ptr
= 0;
9115 io_rsrc_node_switch(ctx
, ctx
->file_data
);
9118 io_ring_submit_unlock(ctx
, needs_lock
);
9122 static int __io_sqe_files_update(struct io_ring_ctx
*ctx
,
9123 struct io_uring_rsrc_update2
*up
,
9126 u64 __user
*tags
= u64_to_user_ptr(up
->tags
);
9127 __s32 __user
*fds
= u64_to_user_ptr(up
->data
);
9128 struct io_rsrc_data
*data
= ctx
->file_data
;
9129 struct io_fixed_file
*file_slot
;
9133 bool needs_switch
= false;
9135 if (!ctx
->file_data
)
9137 if (up
->offset
+ nr_args
> ctx
->nr_user_files
)
9140 for (done
= 0; done
< nr_args
; done
++) {
9143 if ((tags
&& copy_from_user(&tag
, &tags
[done
], sizeof(tag
))) ||
9144 copy_from_user(&fd
, &fds
[done
], sizeof(fd
))) {
9148 if ((fd
== IORING_REGISTER_FILES_SKIP
|| fd
== -1) && tag
) {
9152 if (fd
== IORING_REGISTER_FILES_SKIP
)
9155 i
= array_index_nospec(up
->offset
+ done
, ctx
->nr_user_files
);
9156 file_slot
= io_fixed_file_slot(&ctx
->file_table
, i
);
9158 if (file_slot
->file_ptr
) {
9159 file
= (struct file
*)(file_slot
->file_ptr
& FFS_MASK
);
9160 err
= io_queue_rsrc_removal(data
, i
, ctx
->rsrc_node
, file
);
9163 file_slot
->file_ptr
= 0;
9164 needs_switch
= true;
9173 * Don't allow io_uring instances to be registered. If
9174 * UNIX isn't enabled, then this causes a reference
9175 * cycle and this instance can never get freed. If UNIX
9176 * is enabled we'll handle it just fine, but there's
9177 * still no point in allowing a ring fd as it doesn't
9178 * support regular read/write anyway.
9180 if (file
->f_op
== &io_uring_fops
) {
9185 *io_get_tag_slot(data
, i
) = tag
;
9186 io_fixed_file_set(file_slot
, file
);
9187 err
= io_sqe_file_register(ctx
, file
, i
);
9189 file_slot
->file_ptr
= 0;
9197 io_rsrc_node_switch(ctx
, data
);
9198 return done
? done
: err
;
9201 static struct io_wq
*io_init_wq_offload(struct io_ring_ctx
*ctx
,
9202 struct task_struct
*task
)
9204 struct io_wq_hash
*hash
;
9205 struct io_wq_data data
;
9206 unsigned int concurrency
;
9208 mutex_lock(&ctx
->uring_lock
);
9209 hash
= ctx
->hash_map
;
9211 hash
= kzalloc(sizeof(*hash
), GFP_KERNEL
);
9213 mutex_unlock(&ctx
->uring_lock
);
9214 return ERR_PTR(-ENOMEM
);
9216 refcount_set(&hash
->refs
, 1);
9217 init_waitqueue_head(&hash
->wait
);
9218 ctx
->hash_map
= hash
;
9220 mutex_unlock(&ctx
->uring_lock
);
9224 data
.free_work
= io_wq_free_work
;
9225 data
.do_work
= io_wq_submit_work
;
9227 /* Do QD, or 4 * CPUS, whatever is smallest */
9228 concurrency
= min(ctx
->sq_entries
, 4 * num_online_cpus());
9230 return io_wq_create(concurrency
, &data
);
9233 static __cold
int io_uring_alloc_task_context(struct task_struct
*task
,
9234 struct io_ring_ctx
*ctx
)
9236 struct io_uring_task
*tctx
;
9239 tctx
= kzalloc(sizeof(*tctx
), GFP_KERNEL
);
9240 if (unlikely(!tctx
))
9243 tctx
->registered_rings
= kcalloc(IO_RINGFD_REG_MAX
,
9244 sizeof(struct file
*), GFP_KERNEL
);
9245 if (unlikely(!tctx
->registered_rings
)) {
9250 ret
= percpu_counter_init(&tctx
->inflight
, 0, GFP_KERNEL
);
9251 if (unlikely(ret
)) {
9252 kfree(tctx
->registered_rings
);
9257 tctx
->io_wq
= io_init_wq_offload(ctx
, task
);
9258 if (IS_ERR(tctx
->io_wq
)) {
9259 ret
= PTR_ERR(tctx
->io_wq
);
9260 percpu_counter_destroy(&tctx
->inflight
);
9261 kfree(tctx
->registered_rings
);
9267 init_waitqueue_head(&tctx
->wait
);
9268 atomic_set(&tctx
->in_idle
, 0);
9269 atomic_set(&tctx
->inflight_tracked
, 0);
9270 task
->io_uring
= tctx
;
9271 spin_lock_init(&tctx
->task_lock
);
9272 INIT_WQ_LIST(&tctx
->task_list
);
9273 INIT_WQ_LIST(&tctx
->prior_task_list
);
9274 init_task_work(&tctx
->task_work
, tctx_task_work
);
9278 void __io_uring_free(struct task_struct
*tsk
)
9280 struct io_uring_task
*tctx
= tsk
->io_uring
;
9282 WARN_ON_ONCE(!xa_empty(&tctx
->xa
));
9283 WARN_ON_ONCE(tctx
->io_wq
);
9284 WARN_ON_ONCE(tctx
->cached_refs
);
9286 kfree(tctx
->registered_rings
);
9287 percpu_counter_destroy(&tctx
->inflight
);
9289 tsk
->io_uring
= NULL
;
9292 static __cold
int io_sq_offload_create(struct io_ring_ctx
*ctx
,
9293 struct io_uring_params
*p
)
9297 /* Retain compatibility with failing for an invalid attach attempt */
9298 if ((ctx
->flags
& (IORING_SETUP_ATTACH_WQ
| IORING_SETUP_SQPOLL
)) ==
9299 IORING_SETUP_ATTACH_WQ
) {
9302 f
= fdget(p
->wq_fd
);
9305 if (f
.file
->f_op
!= &io_uring_fops
) {
9311 if (ctx
->flags
& IORING_SETUP_SQPOLL
) {
9312 struct task_struct
*tsk
;
9313 struct io_sq_data
*sqd
;
9316 ret
= security_uring_sqpoll();
9320 sqd
= io_get_sq_data(p
, &attached
);
9326 ctx
->sq_creds
= get_current_cred();
9328 ctx
->sq_thread_idle
= msecs_to_jiffies(p
->sq_thread_idle
);
9329 if (!ctx
->sq_thread_idle
)
9330 ctx
->sq_thread_idle
= HZ
;
9332 io_sq_thread_park(sqd
);
9333 list_add(&ctx
->sqd_list
, &sqd
->ctx_list
);
9334 io_sqd_update_thread_idle(sqd
);
9335 /* don't attach to a dying SQPOLL thread, would be racy */
9336 ret
= (attached
&& !sqd
->thread
) ? -ENXIO
: 0;
9337 io_sq_thread_unpark(sqd
);
9344 if (p
->flags
& IORING_SETUP_SQ_AFF
) {
9345 int cpu
= p
->sq_thread_cpu
;
9348 if (cpu
>= nr_cpu_ids
|| !cpu_online(cpu
))
9355 sqd
->task_pid
= current
->pid
;
9356 sqd
->task_tgid
= current
->tgid
;
9357 tsk
= create_io_thread(io_sq_thread
, sqd
, NUMA_NO_NODE
);
9364 ret
= io_uring_alloc_task_context(tsk
, ctx
);
9365 wake_up_new_task(tsk
);
9368 } else if (p
->flags
& IORING_SETUP_SQ_AFF
) {
9369 /* Can't have SQ_AFF without SQPOLL */
9376 complete(&ctx
->sq_data
->exited
);
9378 io_sq_thread_finish(ctx
);
9382 static inline void __io_unaccount_mem(struct user_struct
*user
,
9383 unsigned long nr_pages
)
9385 atomic_long_sub(nr_pages
, &user
->locked_vm
);
9388 static inline int __io_account_mem(struct user_struct
*user
,
9389 unsigned long nr_pages
)
9391 unsigned long page_limit
, cur_pages
, new_pages
;
9393 /* Don't allow more pages than we can safely lock */
9394 page_limit
= rlimit(RLIMIT_MEMLOCK
) >> PAGE_SHIFT
;
9397 cur_pages
= atomic_long_read(&user
->locked_vm
);
9398 new_pages
= cur_pages
+ nr_pages
;
9399 if (new_pages
> page_limit
)
9401 } while (atomic_long_cmpxchg(&user
->locked_vm
, cur_pages
,
9402 new_pages
) != cur_pages
);
9407 static void io_unaccount_mem(struct io_ring_ctx
*ctx
, unsigned long nr_pages
)
9410 __io_unaccount_mem(ctx
->user
, nr_pages
);
9412 if (ctx
->mm_account
)
9413 atomic64_sub(nr_pages
, &ctx
->mm_account
->pinned_vm
);
9416 static int io_account_mem(struct io_ring_ctx
*ctx
, unsigned long nr_pages
)
9421 ret
= __io_account_mem(ctx
->user
, nr_pages
);
9426 if (ctx
->mm_account
)
9427 atomic64_add(nr_pages
, &ctx
->mm_account
->pinned_vm
);
9432 static void io_mem_free(void *ptr
)
9439 page
= virt_to_head_page(ptr
);
9440 if (put_page_testzero(page
))
9441 free_compound_page(page
);
9444 static void *io_mem_alloc(size_t size
)
9446 gfp_t gfp
= GFP_KERNEL_ACCOUNT
| __GFP_ZERO
| __GFP_NOWARN
| __GFP_COMP
;
9448 return (void *) __get_free_pages(gfp
, get_order(size
));
9451 static unsigned long rings_size(unsigned sq_entries
, unsigned cq_entries
,
9454 struct io_rings
*rings
;
9455 size_t off
, sq_array_size
;
9457 off
= struct_size(rings
, cqes
, cq_entries
);
9458 if (off
== SIZE_MAX
)
9462 off
= ALIGN(off
, SMP_CACHE_BYTES
);
9470 sq_array_size
= array_size(sizeof(u32
), sq_entries
);
9471 if (sq_array_size
== SIZE_MAX
)
9474 if (check_add_overflow(off
, sq_array_size
, &off
))
static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slot)
	struct io_mapped_ubuf *imu = *slot;

	if (imu != ctx->dummy_ubuf) {
		for (i = 0; i < imu->nr_bvecs; i++)
			unpin_user_page(imu->bvec[i].bv_page);
		if (imu->acct_pages)
			io_unaccount_mem(ctx, imu->acct_pages);

static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
	io_buffer_unmap(ctx, &prsrc->buf);

static void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
	for (i = 0; i < ctx->nr_user_bufs; i++)
		io_buffer_unmap(ctx, &ctx->user_bufs[i]);
	kfree(ctx->user_bufs);
	io_rsrc_data_free(ctx->buf_data);
	ctx->user_bufs = NULL;
	ctx->buf_data = NULL;
	ctx->nr_user_bufs = 0;

static int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
	unsigned nr = ctx->nr_user_bufs;

	/*
	 * Quiesce may unlock ->uring_lock, and while it's not held
	 * prevent new requests using the table.
	 */
	ctx->nr_user_bufs = 0;
	ret = io_rsrc_ref_quiesce(ctx->buf_data, ctx);
	ctx->nr_user_bufs = nr;
		__io_sqe_buffers_unregister(ctx);

static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
		       void __user *arg, unsigned index)
	struct iovec __user *src;

#ifdef CONFIG_COMPAT
	struct compat_iovec __user *ciovs;
	struct compat_iovec ciov;

	ciovs = (struct compat_iovec __user *) arg;
	if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov)))
	dst->iov_base = u64_to_user_ptr((u64)ciov.iov_base);
	dst->iov_len = ciov.iov_len;
	src = (struct iovec __user *) arg;
	if (copy_from_user(dst, &src[index], sizeof(*dst)))
/*
 * Not super efficient, but this is just a registration time. And we do cache
 * the last compound head, so generally we'll only do a full search if we don't
 *
 * We check if the given compound head page has already been accounted, to
 * avoid double accounting it. This allows us to account the full size of the
 * page, not just the constituent pages of a huge page.
 */
static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages,
				  int nr_pages, struct page *hpage)
	/* check current page array */
	for (i = 0; i < nr_pages; i++) {
		if (!PageCompound(pages[i]))
		if (compound_head(pages[i]) == hpage)

	/* check previously registered pages */
	for (i = 0; i < ctx->nr_user_bufs; i++) {
		struct io_mapped_ubuf *imu = ctx->user_bufs[i];

		for (j = 0; j < imu->nr_bvecs; j++) {
			if (!PageCompound(imu->bvec[j].bv_page))
			if (compound_head(imu->bvec[j].bv_page) == hpage)

static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
				 int nr_pages, struct io_mapped_ubuf *imu,
				 struct page **last_hpage)
	imu->acct_pages = 0;
	for (i = 0; i < nr_pages; i++) {
		if (!PageCompound(pages[i])) {
		hpage = compound_head(pages[i]);
		if (hpage == *last_hpage)
		*last_hpage = hpage;
		if (headpage_already_acct(ctx, pages, i, hpage))
		imu->acct_pages += page_size(hpage) >> PAGE_SHIFT;

	if (!imu->acct_pages)
	ret = io_account_mem(ctx, imu->acct_pages);
		imu->acct_pages = 0;
static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
				  struct io_mapped_ubuf **pimu,
				  struct page **last_hpage)
	struct io_mapped_ubuf *imu = NULL;
	struct vm_area_struct **vmas = NULL;
	struct page **pages = NULL;
	unsigned long off, start, end, ubuf;
	int ret, pret, nr_pages, i;

	if (!iov->iov_base) {
		*pimu = ctx->dummy_ubuf;

	ubuf = (unsigned long) iov->iov_base;
	end = (ubuf + iov->iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
	start = ubuf >> PAGE_SHIFT;
	nr_pages = end - start;

	pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL);
	vmas = kvmalloc_array(nr_pages, sizeof(struct vm_area_struct *),
	imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);

	mmap_read_lock(current->mm);
	pret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
	if (pret == nr_pages) {
		/* don't support file backed memory */
		for (i = 0; i < nr_pages; i++) {
			struct vm_area_struct *vma = vmas[i];

			if (vma_is_shmem(vma))
			    !is_file_hugepages(vma->vm_file)) {
		ret = pret < 0 ? pret : -EFAULT;
	mmap_read_unlock(current->mm);
		/*
		 * if we did partial map, or found file backed vmas,
		 * release any pages we did get
		 */
			unpin_user_pages(pages, pret);

	ret = io_buffer_account_pin(ctx, pages, pret, imu, last_hpage);
		unpin_user_pages(pages, pret);

	off = ubuf & ~PAGE_MASK;
	size = iov->iov_len;
	for (i = 0; i < nr_pages; i++) {
		vec_len = min_t(size_t, size, PAGE_SIZE - off);
		imu->bvec[i].bv_page = pages[i];
		imu->bvec[i].bv_len = vec_len;
		imu->bvec[i].bv_offset = off;

	/* store original address for later verification */
	imu->ubuf_end = ubuf + iov->iov_len;
	imu->nr_bvecs = nr_pages;

static int io_buffers_map_alloc(struct io_ring_ctx *ctx, unsigned int nr_args)
	ctx->user_bufs = kcalloc(nr_args, sizeof(*ctx->user_bufs), GFP_KERNEL);
	return ctx->user_bufs ? 0 : -ENOMEM;
static int io_buffer_validate(struct iovec *iov)
	unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1);

	/*
	 * Don't impose further limits on the size and buffer
	 * constraints here, we'll -EINVAL later when IO is
	 * submitted if they are wrong.
	 */
		return iov->iov_len ? -EFAULT : 0;

	/* arbitrary limit, but we need something */
	if (iov->iov_len > SZ_1G)
	if (check_add_overflow((unsigned long)iov->iov_base, acct_len, &tmp))

static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
				   unsigned int nr_args, u64 __user *tags)
	struct page *last_hpage = NULL;
	struct io_rsrc_data *data;

	if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS)
	ret = io_rsrc_node_switch_start(ctx);
	ret = io_rsrc_data_alloc(ctx, io_rsrc_buf_put, tags, nr_args, &data);
	ret = io_buffers_map_alloc(ctx, nr_args);
		io_rsrc_data_free(data);

	for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) {
		ret = io_copy_iov(ctx, &iov, arg, i);
		ret = io_buffer_validate(&iov);
		if (!iov.iov_base && *io_get_tag_slot(data, i)) {
		ret = io_sqe_buffer_register(ctx, &iov, &ctx->user_bufs[i],

	WARN_ON_ONCE(ctx->buf_data);

	ctx->buf_data = data;
		__io_sqe_buffers_unregister(ctx);
		io_rsrc_node_switch(ctx, NULL);
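/*
 * Userspace view of the buffer registration path above: a minimal sketch
 * (not part of this file) of registering one fixed buffer with
 * IORING_REGISTER_BUFFERS via the raw io_uring_register() syscall. The
 * "ring_fd" is assumed to come from a prior io_uring_setup() call; error
 * handling is elided.
 *
 *	#include <sys/mman.h>
 *	#include <sys/uio.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *	#include <linux/io_uring.h>
 *
 *	static int register_one_buffer(int ring_fd, size_t len)
 *	{
 *		struct iovec iov;
 *
 *		iov.iov_base = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *				    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *		iov.iov_len = len;
 *		// The pages get pinned by io_sqe_buffer_register() and are
 *		// accounted against RLIMIT_MEMLOCK via io_account_mem().
 *		return syscall(__NR_io_uring_register, ring_fd,
 *			       IORING_REGISTER_BUFFERS, &iov, 1);
 *	}
 */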
static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
				   struct io_uring_rsrc_update2 *up,
				   unsigned int nr_args)
	u64 __user *tags = u64_to_user_ptr(up->tags);
	struct iovec iov, __user *iovs = u64_to_user_ptr(up->data);
	struct page *last_hpage = NULL;
	bool needs_switch = false;

	if (up->offset + nr_args > ctx->nr_user_bufs)

	for (done = 0; done < nr_args; done++) {
		struct io_mapped_ubuf *imu;
		int offset = up->offset + done;

		err = io_copy_iov(ctx, &iov, iovs, done);
		if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) {
		err = io_buffer_validate(&iov);
		if (!iov.iov_base && tag) {
		err = io_sqe_buffer_register(ctx, &iov, &imu, &last_hpage);

		i = array_index_nospec(offset, ctx->nr_user_bufs);
		if (ctx->user_bufs[i] != ctx->dummy_ubuf) {
			err = io_queue_rsrc_removal(ctx->buf_data, i,
						    ctx->rsrc_node, ctx->user_bufs[i]);
			if (unlikely(err)) {
				io_buffer_unmap(ctx, &imu);
			ctx->user_bufs[i] = NULL;
			needs_switch = true;

		ctx->user_bufs[i] = imu;
		*io_get_tag_slot(ctx->buf_data, offset) = tag;

		io_rsrc_node_switch(ctx, ctx->buf_data);
	return done ? done : err;
static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,
			       unsigned int eventfd_async)
	struct io_ev_fd *ev_fd;
	__s32 __user *fds = arg;

	ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
					  lockdep_is_held(&ctx->uring_lock));
	if (copy_from_user(&fd, fds, sizeof(*fds)))

	ev_fd = kmalloc(sizeof(*ev_fd), GFP_KERNEL);
	ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd);
	if (IS_ERR(ev_fd->cq_ev_fd)) {
		int ret = PTR_ERR(ev_fd->cq_ev_fd);

	ev_fd->eventfd_async = eventfd_async;
	ctx->has_evfd = true;
	rcu_assign_pointer(ctx->io_ev_fd, ev_fd);

static void io_eventfd_put(struct rcu_head *rcu)
	struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu);

	eventfd_ctx_put(ev_fd->cq_ev_fd);

static int io_eventfd_unregister(struct io_ring_ctx *ctx)
	struct io_ev_fd *ev_fd;

	ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
					  lockdep_is_held(&ctx->uring_lock));
	ctx->has_evfd = false;
	rcu_assign_pointer(ctx->io_ev_fd, NULL);
	call_rcu(&ev_fd->rcu, io_eventfd_put);
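/*
 * Userspace sketch (not part of this file) of the eventfd registration the
 * functions above implement: the application hands io_uring an eventfd via
 * IORING_REGISTER_EVENTFD and then poll()s or read()s that fd to learn about
 * CQ completions. Error handling is omitted.
 *
 *	#include <sys/eventfd.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *	#include <linux/io_uring.h>
 *
 *	static int attach_cq_eventfd(int ring_fd)
 *	{
 *		int efd = eventfd(0, EFD_CLOEXEC);
 *
 *		// ev_fd->cq_ev_fd above ends up referencing this eventfd
 *		if (syscall(__NR_io_uring_register, ring_fd,
 *			    IORING_REGISTER_EVENTFD, &efd, 1) < 0)
 *			return -1;
 *		return efd;	// read() it to consume completion notifications
 *	}
 */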
static void io_destroy_buffers(struct io_ring_ctx *ctx)
	for (i = 0; i < (1U << IO_BUFFERS_HASH_BITS); i++) {
		struct list_head *list = &ctx->io_buffers[i];

		while (!list_empty(list)) {
			struct io_buffer_list *bl;

			bl = list_first_entry(list, struct io_buffer_list, list);
			__io_remove_buffers(ctx, bl, -1U);
			list_del(&bl->list);

	while (!list_empty(&ctx->io_buffers_pages)) {
		page = list_first_entry(&ctx->io_buffers_pages, struct page, lru);
		list_del_init(&page->lru);

static void io_req_caches_free(struct io_ring_ctx *ctx)
	struct io_submit_state *state = &ctx->submit_state;

	mutex_lock(&ctx->uring_lock);
	io_flush_cached_locked_reqs(ctx, state);

	while (state->free_list.next) {
		struct io_wq_work_node *node;
		struct io_kiocb *req;

		node = wq_stack_extract(&state->free_list);
		req = container_of(node, struct io_kiocb, comp_list);
		kmem_cache_free(req_cachep, req);

	percpu_ref_put_many(&ctx->refs, nr);
	mutex_unlock(&ctx->uring_lock);

static void io_wait_rsrc_data(struct io_rsrc_data *data)
	if (data && !atomic_dec_and_test(&data->refs))
		wait_for_completion(&data->done);

static void io_flush_apoll_cache(struct io_ring_ctx *ctx)
	struct async_poll *apoll;

	while (!list_empty(&ctx->apoll_cache)) {
		apoll = list_first_entry(&ctx->apoll_cache, struct async_poll,
		list_del(&apoll->poll.wait.entry);
static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
	io_sq_thread_finish(ctx);

	if (ctx->mm_account) {
		mmdrop(ctx->mm_account);
		ctx->mm_account = NULL;

	io_rsrc_refs_drop(ctx);
	/* __io_rsrc_put_work() may need uring_lock to progress, wait w/o it */
	io_wait_rsrc_data(ctx->buf_data);
	io_wait_rsrc_data(ctx->file_data);

	mutex_lock(&ctx->uring_lock);
	__io_sqe_buffers_unregister(ctx);
	if (ctx->file_data)
		__io_sqe_files_unregister(ctx);
	__io_cqring_overflow_flush(ctx, true);
	io_eventfd_unregister(ctx);
	io_flush_apoll_cache(ctx);
	mutex_unlock(&ctx->uring_lock);
	io_destroy_buffers(ctx);
	put_cred(ctx->sq_creds);

	/* there are no registered resources left, nobody uses it */
	if (ctx->rsrc_node)
		io_rsrc_node_destroy(ctx->rsrc_node);
	if (ctx->rsrc_backup_node)
		io_rsrc_node_destroy(ctx->rsrc_backup_node);
	flush_delayed_work(&ctx->rsrc_put_work);
	flush_delayed_work(&ctx->fallback_work);

	WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list));
	WARN_ON_ONCE(!llist_empty(&ctx->rsrc_put_llist));

#if defined(CONFIG_UNIX)
	if (ctx->ring_sock) {
		ctx->ring_sock->file = NULL; /* so that iput() is called */
		sock_release(ctx->ring_sock);
	WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list));

	io_mem_free(ctx->rings);
	io_mem_free(ctx->sq_sqes);

	percpu_ref_exit(&ctx->refs);
	free_uid(ctx->user);
	io_req_caches_free(ctx);
	io_wq_put_hash(ctx->hash_map);
	kfree(ctx->cancel_hash);
	kfree(ctx->dummy_ubuf);
	kfree(ctx->io_buffers);
static __poll_t io_uring_poll(struct file *file, poll_table *wait)
	struct io_ring_ctx *ctx = file->private_data;

	poll_wait(file, &ctx->cq_wait, wait);
	/*
	 * synchronizes with barrier from wq_has_sleeper call in
	 */
	if (!io_sqring_full(ctx))
		mask |= EPOLLOUT | EPOLLWRNORM;

	/*
	 * Don't flush cqring overflow list here, just do a simple check.
	 * Otherwise there could possibly be ABBA deadlock:
	 *      lock(&ctx->uring_lock);
	 *      lock(&ctx->uring_lock);
	 *
	 * Users may get EPOLLIN meanwhile seeing nothing in cqring, this
	 * pushes them to do the flush.
	 */
	if (io_cqring_events(ctx) || test_bit(0, &ctx->check_cq_overflow))
		mask |= EPOLLIN | EPOLLRDNORM;
static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
	const struct cred *creds;

	creds = xa_erase(&ctx->personalities, id);

struct io_tctx_exit {
	struct callback_head		task_work;
	struct completion		completion;
	struct io_ring_ctx		*ctx;

static __cold void io_tctx_exit_cb(struct callback_head *cb)
	struct io_uring_task *tctx = current->io_uring;
	struct io_tctx_exit *work;

	work = container_of(cb, struct io_tctx_exit, task_work);
	/*
	 * When @in_idle, we're in cancellation and it's racy to remove the
	 * node. It'll be removed by the end of cancellation, just ignore it.
	 */
	if (!atomic_read(&tctx->in_idle))
		io_uring_del_tctx_node((unsigned long)work->ctx);
	complete(&work->completion);

static __cold bool io_cancel_ctx_cb(struct io_wq_work *work, void *data)
	struct io_kiocb *req = container_of(work, struct io_kiocb, work);

	return req->ctx == data;

static __cold void io_ring_exit_work(struct work_struct *work)
	struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, exit_work);
	unsigned long timeout = jiffies + HZ * 60 * 5;
	unsigned long interval = HZ / 20;
	struct io_tctx_exit exit;
	struct io_tctx_node *node;

	/*
	 * If we're doing polled IO and end up having requests being
	 * submitted async (out-of-line), then completions can come in while
	 * we're waiting for refs to drop. We need to reap these manually,
	 * as nobody else will be looking for them.
	 */
		io_uring_try_cancel_requests(ctx, NULL, true);
		if (ctx->sq_data) {
			struct io_sq_data *sqd = ctx->sq_data;
			struct task_struct *tsk;

			io_sq_thread_park(sqd);
			if (tsk && tsk->io_uring && tsk->io_uring->io_wq)
				io_wq_cancel_cb(tsk->io_uring->io_wq,
						io_cancel_ctx_cb, ctx, true);
			io_sq_thread_unpark(sqd);

		io_req_caches_free(ctx);

		if (WARN_ON_ONCE(time_after(jiffies, timeout))) {
			/* there is little hope left, don't run it too often */
			interval = HZ * 60;
	} while (!wait_for_completion_timeout(&ctx->ref_comp, interval));

	init_completion(&exit.completion);
	init_task_work(&exit.task_work, io_tctx_exit_cb);
	/*
	 * Some may use context even when all refs and requests have been put,
	 * and they are free to do so while still holding uring_lock or
	 * completion_lock, see io_req_task_submit(). Apart from other work,
	 * this lock/unlock section also waits for them to finish.
	 */
	mutex_lock(&ctx->uring_lock);
	while (!list_empty(&ctx->tctx_list)) {
		WARN_ON_ONCE(time_after(jiffies, timeout));

		node = list_first_entry(&ctx->tctx_list, struct io_tctx_node,
		/* don't spin on a single task if cancellation failed */
		list_rotate_left(&ctx->tctx_list);
		ret = task_work_add(node->task, &exit.task_work, TWA_SIGNAL);
		if (WARN_ON_ONCE(ret))

		mutex_unlock(&ctx->uring_lock);
		wait_for_completion(&exit.completion);
		mutex_lock(&ctx->uring_lock);
	mutex_unlock(&ctx->uring_lock);
	spin_lock(&ctx->completion_lock);
	spin_unlock(&ctx->completion_lock);

	io_ring_ctx_free(ctx);
/* Returns true if we found and killed one or more timeouts */
static __cold bool io_kill_timeouts(struct io_ring_ctx *ctx,
				    struct task_struct *tsk, bool cancel_all)
	struct io_kiocb *req, *tmp;

	spin_lock(&ctx->completion_lock);
	spin_lock_irq(&ctx->timeout_lock);
	list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) {
		if (io_match_task(req, tsk, cancel_all)) {
			io_kill_timeout(req, -ECANCELED);
	spin_unlock_irq(&ctx->timeout_lock);
	io_commit_cqring(ctx);
	spin_unlock(&ctx->completion_lock);
	io_cqring_ev_posted(ctx);
	return canceled != 0;

static __cold void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
	unsigned long index;
	struct creds *creds;

	mutex_lock(&ctx->uring_lock);
	percpu_ref_kill(&ctx->refs);
	__io_cqring_overflow_flush(ctx, true);
	xa_for_each(&ctx->personalities, index, creds)
		io_unregister_personality(ctx, index);
	mutex_unlock(&ctx->uring_lock);

	io_kill_timeouts(ctx, NULL, true);
	io_poll_remove_all(ctx, NULL, true);

	/* if we failed setting up the ctx, we might not have any rings */
	io_iopoll_try_reap_events(ctx);

	INIT_WORK(&ctx->exit_work, io_ring_exit_work);
	/*
	 * Use system_unbound_wq to avoid spawning tons of event kworkers
	 * if we're exiting a ton of rings at the same time. It just adds
	 * noise and overhead, there's no discernable change in runtime
	 * over using system_wq.
	 */
	queue_work(system_unbound_wq, &ctx->exit_work);

static int io_uring_release(struct inode *inode, struct file *file)
	struct io_ring_ctx *ctx = file->private_data;

	file->private_data = NULL;
	io_ring_ctx_wait_and_kill(ctx);
struct io_task_cancel {
	struct task_struct *task;

static bool io_cancel_task_cb(struct io_wq_work *work, void *data)
	struct io_kiocb *req = container_of(work, struct io_kiocb, work);
	struct io_task_cancel *cancel = data;

	return io_match_task_safe(req, cancel->task, cancel->all);

static __cold bool io_cancel_defer_files(struct io_ring_ctx *ctx,
					 struct task_struct *task,
	struct io_defer_entry *de;

	spin_lock(&ctx->completion_lock);
	list_for_each_entry_reverse(de, &ctx->defer_list, list) {
		if (io_match_task_safe(de->req, task, cancel_all)) {
			list_cut_position(&list, &ctx->defer_list, &de->list);
	spin_unlock(&ctx->completion_lock);
	if (list_empty(&list))

	while (!list_empty(&list)) {
		de = list_first_entry(&list, struct io_defer_entry, list);
		list_del_init(&de->list);
		io_req_complete_failed(de->req, -ECANCELED);

static __cold bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx)
	struct io_tctx_node *node;
	enum io_wq_cancel cret;

	mutex_lock(&ctx->uring_lock);
	list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
		struct io_uring_task *tctx = node->task->io_uring;

		/*
		 * io_wq will stay alive while we hold uring_lock, because it's
		 * killed after ctx nodes, which requires to take the lock.
		 */
		if (!tctx || !tctx->io_wq)
		cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_ctx_cb, ctx, true);
		ret |= (cret != IO_WQ_CANCEL_NOTFOUND);
	mutex_unlock(&ctx->uring_lock);

static __cold void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
						struct task_struct *task,
	struct io_task_cancel cancel = { .task = task, .all = cancel_all, };
	struct io_uring_task *tctx = task ? task->io_uring : NULL;
		enum io_wq_cancel cret;

		ret |= io_uring_try_cancel_iowq(ctx);
	} else if (tctx && tctx->io_wq) {
		/*
		 * Cancels requests of all rings, not only @ctx, but
		 * it's fine as the task is in exit/exec.
		 */
		cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_task_cb,
		ret |= (cret != IO_WQ_CANCEL_NOTFOUND);

	/* SQPOLL thread does its own polling */
	if ((!(ctx->flags & IORING_SETUP_SQPOLL) && cancel_all) ||
	    (ctx->sq_data && ctx->sq_data->thread == current)) {
		while (!wq_list_empty(&ctx->iopoll_list)) {
			io_iopoll_try_reap_events(ctx);

	ret |= io_cancel_defer_files(ctx, task, cancel_all);
	ret |= io_poll_remove_all(ctx, task, cancel_all);
	ret |= io_kill_timeouts(ctx, task, cancel_all);
	ret |= io_run_task_work();
static int __io_uring_add_tctx_node(struct io_ring_ctx *ctx)
	struct io_uring_task *tctx = current->io_uring;
	struct io_tctx_node *node;

	if (unlikely(!tctx)) {
		ret = io_uring_alloc_task_context(current, ctx);

		tctx = current->io_uring;
		if (ctx->iowq_limits_set) {
			unsigned int limits[2] = { ctx->iowq_limits[0],
						   ctx->iowq_limits[1], };

			ret = io_wq_max_workers(tctx->io_wq, limits);
	if (!xa_load(&tctx->xa, (unsigned long)ctx)) {
		node = kmalloc(sizeof(*node), GFP_KERNEL);
		node->task = current;

		ret = xa_err(xa_store(&tctx->xa, (unsigned long)ctx,
				      node, GFP_KERNEL));

		mutex_lock(&ctx->uring_lock);
		list_add(&node->ctx_node, &ctx->tctx_list);
		mutex_unlock(&ctx->uring_lock);

/*
 * Note that this task has used io_uring. We use it for cancelation purposes.
 */
static inline int io_uring_add_tctx_node(struct io_ring_ctx *ctx)
	struct io_uring_task *tctx = current->io_uring;

	if (likely(tctx && tctx->last == ctx))
	return __io_uring_add_tctx_node(ctx);

/*
 * Remove this io_uring_file -> task mapping.
 */
static __cold void io_uring_del_tctx_node(unsigned long index)
	struct io_uring_task *tctx = current->io_uring;
	struct io_tctx_node *node;

	node = xa_erase(&tctx->xa, index);
	WARN_ON_ONCE(current != node->task);
	WARN_ON_ONCE(list_empty(&node->ctx_node));

	mutex_lock(&node->ctx->uring_lock);
	list_del(&node->ctx_node);
	mutex_unlock(&node->ctx->uring_lock);
	if (tctx->last == node->ctx)

static __cold void io_uring_clean_tctx(struct io_uring_task *tctx)
	struct io_wq *wq = tctx->io_wq;
	struct io_tctx_node *node;
	unsigned long index;

	xa_for_each(&tctx->xa, index, node) {
		io_uring_del_tctx_node(index);
	/*
	 * Must be after io_uring_del_tctx_node() (removes nodes under
	 * uring_lock) to avoid race with io_uring_try_cancel_iowq().
	 */
	io_wq_put_and_exit(wq);
	tctx->io_wq = NULL;
static s64 tctx_inflight(struct io_uring_task *tctx, bool tracked)
		return atomic_read(&tctx->inflight_tracked);
	return percpu_counter_sum(&tctx->inflight);

/*
 * Find any io_uring ctx that this task has registered or done IO on, and cancel
 * requests. @sqd should be not-null IFF it's an SQPOLL thread cancellation.
 */
static __cold void io_uring_cancel_generic(bool cancel_all,
					   struct io_sq_data *sqd)
	struct io_uring_task *tctx = current->io_uring;
	struct io_ring_ctx *ctx;

	WARN_ON_ONCE(sqd && sqd->thread != current);

	if (!current->io_uring)
	io_wq_exit_start(tctx->io_wq);

	atomic_inc(&tctx->in_idle);
		io_uring_drop_tctx_refs(current);
		/* read completions before cancelations */
		inflight = tctx_inflight(tctx, !cancel_all);
			struct io_tctx_node *node;
			unsigned long index;

			xa_for_each(&tctx->xa, index, node) {
				/* sqpoll task will cancel all its requests */
				if (node->ctx->sq_data)
				io_uring_try_cancel_requests(node->ctx, current,
			list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
				io_uring_try_cancel_requests(ctx, current,

		prepare_to_wait(&tctx->wait, &wait, TASK_INTERRUPTIBLE);
		io_run_task_work();
		io_uring_drop_tctx_refs(current);

		/*
		 * If we've seen completions, retry without waiting. This
		 * avoids a race where a completion comes in before we did
		 * prepare_to_wait().
		 */
		if (inflight == tctx_inflight(tctx, !cancel_all))
		finish_wait(&tctx->wait, &wait);

	io_uring_clean_tctx(tctx);
		/*
		 * We shouldn't run task_works after cancel, so just leave
		 * ->in_idle set for normal exit.
		 */
		atomic_dec(&tctx->in_idle);
		/* for exec all current's requests should be gone, kill tctx */
		__io_uring_free(current);

void __io_uring_cancel(bool cancel_all)
	io_uring_cancel_generic(cancel_all, NULL);
void io_uring_unreg_ringfd(void)
	struct io_uring_task *tctx = current->io_uring;

	for (i = 0; i < IO_RINGFD_REG_MAX; i++) {
		if (tctx->registered_rings[i]) {
			fput(tctx->registered_rings[i]);
			tctx->registered_rings[i] = NULL;

static int io_ring_add_registered_fd(struct io_uring_task *tctx, int fd,
				     int start, int end)
	for (offset = start; offset < end; offset++) {
		offset = array_index_nospec(offset, IO_RINGFD_REG_MAX);
		if (tctx->registered_rings[offset])
		} else if (file->f_op != &io_uring_fops) {
			return -EOPNOTSUPP;
		tctx->registered_rings[offset] = file;
/*
 * Register a ring fd to avoid fdget/fdput for each io_uring_enter()
 * invocation. User passes in an array of struct io_uring_rsrc_update
 * with ->data set to the ring_fd, and ->offset given for the desired
 * index. If no index is desired, application may set ->offset == -1U
 * and we'll find an available index. Returns number of entries
 * successfully processed, or < 0 on error if none were processed.
 */
static int io_ringfd_register(struct io_ring_ctx *ctx, void __user *__arg,
	struct io_uring_rsrc_update __user *arg = __arg;
	struct io_uring_rsrc_update reg;
	struct io_uring_task *tctx;

	if (!nr_args || nr_args > IO_RINGFD_REG_MAX)

	mutex_unlock(&ctx->uring_lock);
	ret = io_uring_add_tctx_node(ctx);
	mutex_lock(&ctx->uring_lock);

	tctx = current->io_uring;
	for (i = 0; i < nr_args; i++) {
		if (copy_from_user(&reg, &arg[i], sizeof(reg))) {
		if (reg.offset == -1U) {
			end = IO_RINGFD_REG_MAX;
			if (reg.offset >= IO_RINGFD_REG_MAX) {
			start = reg.offset;

		ret = io_ring_add_registered_fd(tctx, reg.data, start, end);
		if (copy_to_user(&arg[i], &reg, sizeof(reg))) {
			fput(tctx->registered_rings[reg.offset]);
			tctx->registered_rings[reg.offset] = NULL;

	return i ? i : ret;
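/*
 * Userspace sketch (not part of this file) of using the registration path
 * above: register the ring's own fd with IORING_REGISTER_RING_FDS, then pass
 * the returned index to io_uring_enter() together with
 * IORING_ENTER_REGISTERED_RING, skipping the per-call fdget/fdput. Error
 * handling is elided.
 *
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *	#include <linux/io_uring.h>
 *
 *	static int submit_via_registered_fd(int ring_fd, unsigned to_submit)
 *	{
 *		struct io_uring_rsrc_update reg = {
 *			.offset = -1U,		// let the kernel pick a slot
 *			.data	= ring_fd,
 *		};
 *
 *		if (syscall(__NR_io_uring_register, ring_fd,
 *			    IORING_REGISTER_RING_FDS, &reg, 1) != 1)
 *			return -1;
 *		// From now on reg.offset, not ring_fd, names the ring.
 *		return syscall(__NR_io_uring_enter, reg.offset, to_submit, 0,
 *			       IORING_ENTER_REGISTERED_RING, NULL, 0);
 *	}
 */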
static int io_ringfd_unregister(struct io_ring_ctx *ctx, void __user *__arg,
	struct io_uring_rsrc_update __user *arg = __arg;
	struct io_uring_task *tctx = current->io_uring;
	struct io_uring_rsrc_update reg;

	if (!nr_args || nr_args > IO_RINGFD_REG_MAX)

	for (i = 0; i < nr_args; i++) {
		if (copy_from_user(&reg, &arg[i], sizeof(reg))) {
		if (reg.resv || reg.data || reg.offset >= IO_RINGFD_REG_MAX) {
		reg.offset = array_index_nospec(reg.offset, IO_RINGFD_REG_MAX);
		if (tctx->registered_rings[reg.offset]) {
			fput(tctx->registered_rings[reg.offset]);
			tctx->registered_rings[reg.offset] = NULL;

	return i ? i : ret;
static void *io_uring_validate_mmap_request(struct file *file,
					    loff_t pgoff, size_t sz)
	struct io_ring_ctx *ctx = file->private_data;
	loff_t offset = pgoff << PAGE_SHIFT;

	case IORING_OFF_SQ_RING:
	case IORING_OFF_CQ_RING:
	case IORING_OFF_SQES:
		ptr = ctx->sq_sqes;
		return ERR_PTR(-EINVAL);

	page = virt_to_head_page(ptr);
	if (sz > page_size(page))
		return ERR_PTR(-EINVAL);

static __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
	size_t sz = vma->vm_end - vma->vm_start;

	ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz);
		return PTR_ERR(ptr);

	pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
	return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);

#else /* !CONFIG_MMU */

static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
	return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -EINVAL;

static unsigned int io_uring_nommu_mmap_capabilities(struct file *file)
	return NOMMU_MAP_DIRECT | NOMMU_MAP_READ | NOMMU_MAP_WRITE;

static unsigned long io_uring_nommu_get_unmapped_area(struct file *file,
			unsigned long addr, unsigned long len,
			unsigned long pgoff, unsigned long flags)
	ptr = io_uring_validate_mmap_request(file, pgoff, len);
		return PTR_ERR(ptr);

	return (unsigned long) ptr;

#endif /* !CONFIG_MMU */
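/*
 * Userspace sketch (not part of this file) of the mmap requests that
 * io_uring_validate_mmap_request() services: after io_uring_setup() returns,
 * the application maps the SQ ring and SQE array at the well-known
 * IORING_OFF_* offsets using sizes derived from io_uring_params. Error
 * handling is elided.
 *
 *	#include <sys/mman.h>
 *	#include <linux/io_uring.h>
 *
 *	static void map_rings(int ring_fd, struct io_uring_params *p,
 *			      void **sq_ring, void **sqes)
 *	{
 *		size_t sq_sz = p->sq_off.array + p->sq_entries * sizeof(__u32);
 *
 *		*sq_ring = mmap(NULL, sq_sz, PROT_READ | PROT_WRITE,
 *				MAP_SHARED | MAP_POPULATE, ring_fd,
 *				IORING_OFF_SQ_RING);
 *		*sqes = mmap(NULL, p->sq_entries * sizeof(struct io_uring_sqe),
 *			     PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
 *			     ring_fd, IORING_OFF_SQES);
 *		// With IORING_FEAT_SINGLE_MMAP the CQ ring shares the first
 *		// mapping; otherwise mmap IORING_OFF_CQ_RING separately.
 *	}
 */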
static int io_sqpoll_wait_sq(struct io_ring_ctx *ctx)
		if (!io_sqring_full(ctx))
		prepare_to_wait(&ctx->sqo_sq_wait, &wait, TASK_INTERRUPTIBLE);
		if (!io_sqring_full(ctx))
	} while (!signal_pending(current));

	finish_wait(&ctx->sqo_sq_wait, &wait);

static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz,
			  struct __kernel_timespec __user **ts,
			  const sigset_t __user **sig)
	struct io_uring_getevents_arg arg;

	/*
	 * If EXT_ARG isn't set, then we have no timespec and the argp pointer
	 * is just a pointer to the sigset_t.
	 */
	if (!(flags & IORING_ENTER_EXT_ARG)) {
		*sig = (const sigset_t __user *) argp;

	/*
	 * EXT_ARG is set - ensure we agree on the size of it and copy in our
	 * timespec and sigset_t pointers if good.
	 */
	if (*argsz != sizeof(arg))
	if (copy_from_user(&arg, argp, sizeof(arg)))
	*sig = u64_to_user_ptr(arg.sigmask);
	*argsz = arg.sigmask_sz;
	*ts = u64_to_user_ptr(arg.ts);
, unsigned int, fd
, u32
, to_submit
,
10806 u32
, min_complete
, u32
, flags
, const void __user
*, argp
,
10809 struct io_ring_ctx
*ctx
;
10814 io_run_task_work();
10816 if (unlikely(flags
& ~(IORING_ENTER_GETEVENTS
| IORING_ENTER_SQ_WAKEUP
|
10817 IORING_ENTER_SQ_WAIT
| IORING_ENTER_EXT_ARG
|
10818 IORING_ENTER_REGISTERED_RING
)))
10822 * Ring fd has been registered via IORING_REGISTER_RING_FDS, we
10823 * need only dereference our task private array to find it.
10825 if (flags
& IORING_ENTER_REGISTERED_RING
) {
10826 struct io_uring_task
*tctx
= current
->io_uring
;
10828 if (!tctx
|| fd
>= IO_RINGFD_REG_MAX
)
10830 fd
= array_index_nospec(fd
, IO_RINGFD_REG_MAX
);
10831 f
.file
= tctx
->registered_rings
[fd
];
10832 if (unlikely(!f
.file
))
10836 if (unlikely(!f
.file
))
10841 if (unlikely(f
.file
->f_op
!= &io_uring_fops
))
10845 ctx
= f
.file
->private_data
;
10846 if (unlikely(!percpu_ref_tryget(&ctx
->refs
)))
10850 if (unlikely(ctx
->flags
& IORING_SETUP_R_DISABLED
))
10854 * For SQ polling, the thread will do all submissions and completions.
10855 * Just return the requested submit count, and wake the thread if
10856 * we were asked to.
10859 if (ctx
->flags
& IORING_SETUP_SQPOLL
) {
10860 io_cqring_overflow_flush(ctx
);
10862 if (unlikely(ctx
->sq_data
->thread
== NULL
)) {
10866 if (flags
& IORING_ENTER_SQ_WAKEUP
)
10867 wake_up(&ctx
->sq_data
->wait
);
10868 if (flags
& IORING_ENTER_SQ_WAIT
) {
10869 ret
= io_sqpoll_wait_sq(ctx
);
10873 submitted
= to_submit
;
10874 } else if (to_submit
) {
10875 ret
= io_uring_add_tctx_node(ctx
);
10878 mutex_lock(&ctx
->uring_lock
);
10879 submitted
= io_submit_sqes(ctx
, to_submit
);
10880 mutex_unlock(&ctx
->uring_lock
);
10882 if (submitted
!= to_submit
)
10885 if (flags
& IORING_ENTER_GETEVENTS
) {
10886 const sigset_t __user
*sig
;
10887 struct __kernel_timespec __user
*ts
;
10889 ret
= io_get_ext_arg(flags
, argp
, &argsz
, &ts
, &sig
);
10893 min_complete
= min(min_complete
, ctx
->cq_entries
);
10896 * When SETUP_IOPOLL and SETUP_SQPOLL are both enabled, user
10897 * space applications don't need to do io completion events
10898 * polling again, they can rely on io_sq_thread to do polling
10899 * work, which can reduce cpu usage and uring_lock contention.
10901 if (ctx
->flags
& IORING_SETUP_IOPOLL
&&
10902 !(ctx
->flags
& IORING_SETUP_SQPOLL
)) {
10903 ret
= io_iopoll_check(ctx
, min_complete
);
10905 ret
= io_cqring_wait(ctx
, min_complete
, sig
, argsz
, ts
);
10910 percpu_ref_put(&ctx
->refs
);
10912 if (!(flags
& IORING_ENTER_REGISTERED_RING
))
10914 return submitted
? submitted
: ret
;
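/*
 * Userspace sketch (not part of this file) of the SQPOLL branch above: with
 * IORING_SETUP_SQPOLL the application normally only publishes new SQ entries,
 * and calls io_uring_enter() with IORING_ENTER_SQ_WAKEUP once the kernel
 * thread has gone idle and flagged IORING_SQ_NEED_WAKEUP in the mapped SQ
 * flags word. "sq_flags" is assumed to point at that field of the mapped SQ
 * ring; error handling is elided.
 *
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *	#include <linux/io_uring.h>
 *
 *	static void kick_sqpoll_if_needed(int ring_fd, unsigned *sq_flags)
 *	{
 *		// Read the flags word that io_sq_thread() updates
 *		if (__atomic_load_n(sq_flags, __ATOMIC_SEQ_CST) &
 *		    IORING_SQ_NEED_WAKEUP)
 *			syscall(__NR_io_uring_enter, ring_fd, 0, 0,
 *				IORING_ENTER_SQ_WAKEUP, NULL, 0);
 *	}
 */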
#ifdef CONFIG_PROC_FS
static __cold int io_uring_show_cred(struct seq_file *m, unsigned int id,
				     const struct cred *cred)
	struct user_namespace *uns = seq_user_ns(m);
	struct group_info *gi;

	seq_printf(m, "%5d\n", id);
	seq_put_decimal_ull(m, "\tUid:\t", from_kuid_munged(uns, cred->uid));
	seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->euid));
	seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->suid));
	seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->fsuid));
	seq_put_decimal_ull(m, "\n\tGid:\t", from_kgid_munged(uns, cred->gid));
	seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->egid));
	seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->sgid));
	seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->fsgid));
	seq_puts(m, "\n\tGroups:\t");
	gi = cred->group_info;
	for (g = 0; g < gi->ngroups; g++) {
		seq_put_decimal_ull(m, g ? " " : "",
				    from_kgid_munged(uns, gi->gid[g]));
	seq_puts(m, "\n\tCapEff:\t");
	cap = cred->cap_effective;
	CAP_FOR_EACH_U32(__capi)
		seq_put_hex_ll(m, NULL, cap.cap[CAP_LAST_U32 - __capi], 8);
void __io_uring_show_fdinfo(struct io_ring_ctx
*ctx
,
10951 struct seq_file
*m
)
10953 struct io_sq_data
*sq
= NULL
;
10954 struct io_overflow_cqe
*ocqe
;
10955 struct io_rings
*r
= ctx
->rings
;
10956 unsigned int sq_mask
= ctx
->sq_entries
- 1, cq_mask
= ctx
->cq_entries
- 1;
10957 unsigned int sq_head
= READ_ONCE(r
->sq
.head
);
10958 unsigned int sq_tail
= READ_ONCE(r
->sq
.tail
);
10959 unsigned int cq_head
= READ_ONCE(r
->cq
.head
);
10960 unsigned int cq_tail
= READ_ONCE(r
->cq
.tail
);
10961 unsigned int sq_entries
, cq_entries
;
10966 * we may get imprecise sqe and cqe info if uring is actively running
10967 * since we get cached_sq_head and cached_cq_tail without uring_lock
10968 * and sq_tail and cq_head are changed by userspace. But it's ok since
10969 * we usually use these info when it is stuck.
10971 seq_printf(m
, "SqMask:\t0x%x\n", sq_mask
);
10972 seq_printf(m
, "SqHead:\t%u\n", sq_head
);
10973 seq_printf(m
, "SqTail:\t%u\n", sq_tail
);
10974 seq_printf(m
, "CachedSqHead:\t%u\n", ctx
->cached_sq_head
);
10975 seq_printf(m
, "CqMask:\t0x%x\n", cq_mask
);
10976 seq_printf(m
, "CqHead:\t%u\n", cq_head
);
10977 seq_printf(m
, "CqTail:\t%u\n", cq_tail
);
10978 seq_printf(m
, "CachedCqTail:\t%u\n", ctx
->cached_cq_tail
);
10979 seq_printf(m
, "SQEs:\t%u\n", sq_tail
- ctx
->cached_sq_head
);
10980 sq_entries
= min(sq_tail
- sq_head
, ctx
->sq_entries
);
10981 for (i
= 0; i
< sq_entries
; i
++) {
10982 unsigned int entry
= i
+ sq_head
;
10983 unsigned int sq_idx
= READ_ONCE(ctx
->sq_array
[entry
& sq_mask
]);
10984 struct io_uring_sqe
*sqe
;
10986 if (sq_idx
> sq_mask
)
10988 sqe
= &ctx
->sq_sqes
[sq_idx
];
10989 seq_printf(m
, "%5u: opcode:%d, fd:%d, flags:%x, user_data:%llu\n",
10990 sq_idx
, sqe
->opcode
, sqe
->fd
, sqe
->flags
,
10993 seq_printf(m
, "CQEs:\t%u\n", cq_tail
- cq_head
);
10994 cq_entries
= min(cq_tail
- cq_head
, ctx
->cq_entries
);
10995 for (i
= 0; i
< cq_entries
; i
++) {
10996 unsigned int entry
= i
+ cq_head
;
10997 struct io_uring_cqe
*cqe
= &r
->cqes
[entry
& cq_mask
];
10999 seq_printf(m
, "%5u: user_data:%llu, res:%d, flag:%x\n",
11000 entry
& cq_mask
, cqe
->user_data
, cqe
->res
,
11005 * Avoid ABBA deadlock between the seq lock and the io_uring mutex,
11006 * since fdinfo case grabs it in the opposite direction of normal use
11007 * cases. If we fail to get the lock, we just don't iterate any
11008 * structures that could be going away outside the io_uring mutex.
11010 has_lock
= mutex_trylock(&ctx
->uring_lock
);
11012 if (has_lock
&& (ctx
->flags
& IORING_SETUP_SQPOLL
)) {
11018 seq_printf(m
, "SqThread:\t%d\n", sq
? task_pid_nr(sq
->thread
) : -1);
11019 seq_printf(m
, "SqThreadCpu:\t%d\n", sq
? task_cpu(sq
->thread
) : -1);
11020 seq_printf(m
, "UserFiles:\t%u\n", ctx
->nr_user_files
);
11021 for (i
= 0; has_lock
&& i
< ctx
->nr_user_files
; i
++) {
11022 struct file
*f
= io_file_from_index(ctx
, i
);
11025 seq_printf(m
, "%5u: %s\n", i
, file_dentry(f
)->d_iname
);
11027 seq_printf(m
, "%5u: <none>\n", i
);
11029 seq_printf(m
, "UserBufs:\t%u\n", ctx
->nr_user_bufs
);
11030 for (i
= 0; has_lock
&& i
< ctx
->nr_user_bufs
; i
++) {
11031 struct io_mapped_ubuf
*buf
= ctx
->user_bufs
[i
];
11032 unsigned int len
= buf
->ubuf_end
- buf
->ubuf
;
11034 seq_printf(m
, "%5u: 0x%llx/%u\n", i
, buf
->ubuf
, len
);
11036 if (has_lock
&& !xa_empty(&ctx
->personalities
)) {
11037 unsigned long index
;
11038 const struct cred
*cred
;
11040 seq_printf(m
, "Personalities:\n");
11041 xa_for_each(&ctx
->personalities
, index
, cred
)
11042 io_uring_show_cred(m
, index
, cred
);
11045 mutex_unlock(&ctx
->uring_lock
);
11047 seq_puts(m
, "PollList:\n");
11048 spin_lock(&ctx
->completion_lock
);
11049 for (i
= 0; i
< (1U << ctx
->cancel_hash_bits
); i
++) {
11050 struct hlist_head
*list
= &ctx
->cancel_hash
[i
];
11051 struct io_kiocb
*req
;
11053 hlist_for_each_entry(req
, list
, hash_node
)
11054 seq_printf(m
, " op=%d, task_works=%d\n", req
->opcode
,
11055 task_work_pending(req
->task
));
11058 seq_puts(m
, "CqOverflowList:\n");
11059 list_for_each_entry(ocqe
, &ctx
->cq_overflow_list
, list
) {
11060 struct io_uring_cqe
*cqe
= &ocqe
->cqe
;
11062 seq_printf(m
, " user_data=%llu, res=%d, flags=%x\n",
11063 cqe
->user_data
, cqe
->res
, cqe
->flags
);
11067 spin_unlock(&ctx
->completion_lock
);
11070 static __cold
void io_uring_show_fdinfo(struct seq_file
*m
, struct file
*f
)
11072 struct io_ring_ctx
*ctx
= f
->private_data
;
11074 if (percpu_ref_tryget(&ctx
->refs
)) {
11075 __io_uring_show_fdinfo(ctx
, m
);
11076 percpu_ref_put(&ctx
->refs
);
11081 static const struct file_operations io_uring_fops
= {
11082 .release
= io_uring_release
,
11083 .mmap
= io_uring_mmap
,
11085 .get_unmapped_area
= io_uring_nommu_get_unmapped_area
,
11086 .mmap_capabilities
= io_uring_nommu_mmap_capabilities
,
11088 .poll
= io_uring_poll
,
11089 #ifdef CONFIG_PROC_FS
11090 .show_fdinfo
= io_uring_show_fdinfo
,
static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
					 struct io_uring_params *p)
	struct io_rings *rings;
	size_t size, sq_array_offset;

	/* make sure these are sane, as we already accounted them */
	ctx->sq_entries = p->sq_entries;
	ctx->cq_entries = p->cq_entries;

	size = rings_size(p->sq_entries, p->cq_entries, &sq_array_offset);
	if (size == SIZE_MAX)

	rings = io_mem_alloc(size);

	ctx->rings = rings;
	ctx->sq_array = (u32 *)((char *)rings + sq_array_offset);
	rings->sq_ring_mask = p->sq_entries - 1;
	rings->cq_ring_mask = p->cq_entries - 1;
	rings->sq_ring_entries = p->sq_entries;
	rings->cq_ring_entries = p->cq_entries;

	size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
	if (size == SIZE_MAX) {
		io_mem_free(ctx->rings);

	ctx->sq_sqes = io_mem_alloc(size);
	if (!ctx->sq_sqes) {
		io_mem_free(ctx->rings);

static int io_uring_install_fd(struct io_ring_ctx *ctx, struct file *file)
	fd = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
	ret = io_uring_add_tctx_node(ctx);
	fd_install(fd, file);
/*
 * Allocate an anonymous fd, this is what constitutes the application
 * visible backing of an io_uring instance. The application mmaps this
 * fd to gain access to the SQ/CQ ring details. If UNIX sockets are enabled,
 * we have to tie this fd to a socket for file garbage collection purposes.
 */
static struct file *io_uring_get_file(struct io_ring_ctx *ctx)
#if defined(CONFIG_UNIX)
	ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP,
		return ERR_PTR(ret);

	file = anon_inode_getfile_secure("[io_uring]", &io_uring_fops, ctx,
					 O_RDWR | O_CLOEXEC, NULL);
#if defined(CONFIG_UNIX)
	if (IS_ERR(file)) {
		sock_release(ctx->ring_sock);
		ctx->ring_sock = NULL;
	ctx->ring_sock->file = file;
static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
				  struct io_uring_params __user *params)
	struct io_ring_ctx *ctx;

	if (entries > IORING_MAX_ENTRIES) {
		if (!(p->flags & IORING_SETUP_CLAMP))
		entries = IORING_MAX_ENTRIES;

	/*
	 * Use twice as many entries for the CQ ring. It's possible for the
	 * application to drive a higher depth than the size of the SQ ring,
	 * since the sqes are only used at submission time. This allows for
	 * some flexibility in overcommitting a bit. If the application has
	 * set IORING_SETUP_CQSIZE, it will have passed in the desired number
	 * of CQ ring entries manually.
	 */
	p->sq_entries = roundup_pow_of_two(entries);
	if (p->flags & IORING_SETUP_CQSIZE) {
		/*
		 * If IORING_SETUP_CQSIZE is set, we do the same roundup
		 * to a power-of-two, if it isn't already. We do NOT impose
		 * any cq vs sq ring sizing.
		 */
		if (!p->cq_entries)
		if (p->cq_entries > IORING_MAX_CQ_ENTRIES) {
			if (!(p->flags & IORING_SETUP_CLAMP))
			p->cq_entries = IORING_MAX_CQ_ENTRIES;
		p->cq_entries = roundup_pow_of_two(p->cq_entries);
		if (p->cq_entries < p->sq_entries)
		p->cq_entries = 2 * p->sq_entries;

	ctx = io_ring_ctx_alloc(p);
	ctx->compat = in_compat_syscall();
	if (!capable(CAP_IPC_LOCK))
		ctx->user = get_uid(current_user());

	/*
	 * This is just grabbed for accounting purposes. When a process exits,
	 * the mm is exited and dropped before the files, hence we need to hang
	 * on to this mm purely for the purposes of being able to unaccount
	 * memory (locked/pinned vm). It's not used for anything else.
	 */
	mmgrab(current->mm);
	ctx->mm_account = current->mm;

	ret = io_allocate_scq_urings(ctx, p);
	ret = io_sq_offload_create(ctx, p);
	/* always set a rsrc node */
	ret = io_rsrc_node_switch_start(ctx);
	io_rsrc_node_switch(ctx, NULL);

	memset(&p->sq_off, 0, sizeof(p->sq_off));
	p->sq_off.head = offsetof(struct io_rings, sq.head);
	p->sq_off.tail = offsetof(struct io_rings, sq.tail);
	p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask);
	p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries);
	p->sq_off.flags = offsetof(struct io_rings, sq_flags);
	p->sq_off.dropped = offsetof(struct io_rings, sq_dropped);
	p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings;

	memset(&p->cq_off, 0, sizeof(p->cq_off));
	p->cq_off.head = offsetof(struct io_rings, cq.head);
	p->cq_off.tail = offsetof(struct io_rings, cq.tail);
	p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask);
	p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries);
	p->cq_off.overflow = offsetof(struct io_rings, cq_overflow);
	p->cq_off.cqes = offsetof(struct io_rings, cqes);
	p->cq_off.flags = offsetof(struct io_rings, cq_flags);

	p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
			IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS |
			IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL |
			IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED |
			IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS |
			IORING_FEAT_RSRC_TAGS | IORING_FEAT_CQE_SKIP |
			IORING_FEAT_LINKED_FILE;

	if (copy_to_user(params, p, sizeof(*p))) {

	file = io_uring_get_file(ctx);
	if (IS_ERR(file)) {
		ret = PTR_ERR(file);

	/*
	 * Install ring fd as the very last thing, so we don't risk someone
	 * having closed it before we finish setup
	 */
	ret = io_uring_install_fd(ctx, file);
		/* fput will clean it up */

	trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags);
	io_ring_ctx_wait_and_kill(ctx);

/*
 * Sets up an aio uring context, and returns the fd. Applications ask for a
 * ring size, we return the actual sq/cq ring sizes (among other things) in the
 * params structure passed in.
 */
static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
	struct io_uring_params p;

	if (copy_from_user(&p, params, sizeof(p)))
	for (i = 0; i < ARRAY_SIZE(p.resv); i++) {
	if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
			IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE |
			IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ |
			IORING_SETUP_R_DISABLED | IORING_SETUP_SUBMIT_ALL))

	return io_uring_create(entries, &p, params);

SYSCALL_DEFINE2(io_uring_setup, u32, entries,
		struct io_uring_params __user *, params)
	return io_uring_setup(entries, params);
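/*
 * Userspace sketch (not part of this file) of the setup path above: ask for
 * an SQ depth and, via IORING_SETUP_CQSIZE, a larger CQ ring; the kernel
 * rounds both to powers of two, clamps them when IORING_SETUP_CLAMP is set,
 * and reports the final geometry back in the same io_uring_params. Error
 * handling is elided.
 *
 *	#include <string.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *	#include <linux/io_uring.h>
 *
 *	static int setup_ring(unsigned sq_depth, unsigned cq_depth,
 *			      struct io_uring_params *p)
 *	{
 *		memset(p, 0, sizeof(*p));
 *		p->flags = IORING_SETUP_CQSIZE | IORING_SETUP_CLAMP;
 *		p->cq_entries = cq_depth;
 *
 *		// On return p->sq_entries/p->cq_entries hold the real sizes
 *		// and p->features advertises e.g. IORING_FEAT_SINGLE_MMAP.
 *		return syscall(__NR_io_uring_setup, sq_depth, p);
 *	}
 */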
static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
	struct io_uring_probe *p;

	size = struct_size(p, ops, nr_args);
	if (size == SIZE_MAX)
	p = kzalloc(size, GFP_KERNEL);

	if (copy_from_user(p, arg, size))
	if (memchr_inv(p, 0, size))
	p->last_op = IORING_OP_LAST - 1;
	if (nr_args > IORING_OP_LAST)
		nr_args = IORING_OP_LAST;

	for (i = 0; i < nr_args; i++) {
		if (!io_op_defs[i].not_supported)
			p->ops[i].flags = IO_URING_OP_SUPPORTED;

	if (copy_to_user(arg, p, size))
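/*
 * Userspace sketch (not part of this file) of the probe interface served by
 * io_probe() above: ask the kernel which opcodes it supports before relying
 * on newer ones. The probe buffer must be zeroed before the call. Error
 * handling is elided.
 *
 *	#include <stdlib.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *	#include <linux/io_uring.h>
 *
 *	static int opcode_supported(int ring_fd, int opcode)
 *	{
 *		size_t len = sizeof(struct io_uring_probe) +
 *			     IORING_OP_LAST * sizeof(struct io_uring_probe_op);
 *		struct io_uring_probe *probe = calloc(1, len);
 *		int ok;
 *
 *		syscall(__NR_io_uring_register, ring_fd, IORING_REGISTER_PROBE,
 *			probe, IORING_OP_LAST);
 *		ok = opcode <= probe->last_op &&
 *		     (probe->ops[opcode].flags & IO_URING_OP_SUPPORTED);
 *		free(probe);
 *		return ok;
 *	}
 */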
static int io_register_personality(struct io_ring_ctx *ctx)
	const struct cred *creds;

	creds = get_current_cred();

	ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds,
			      XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL);
static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
					   void __user *arg, unsigned int nr_args)
	struct io_uring_restriction *res;

	/* Restrictions allowed only if rings started disabled */
	if (!(ctx->flags & IORING_SETUP_R_DISABLED))

	/* We allow only a single restrictions registration */
	if (ctx->restrictions.registered)

	if (!arg || nr_args > IORING_MAX_RESTRICTIONS)

	size = array_size(nr_args, sizeof(*res));
	if (size == SIZE_MAX)

	res = memdup_user(arg, size);
		return PTR_ERR(res);

	for (i = 0; i < nr_args; i++) {
		switch (res[i].opcode) {
		case IORING_RESTRICTION_REGISTER_OP:
			if (res[i].register_op >= IORING_REGISTER_LAST) {
			__set_bit(res[i].register_op,
				  ctx->restrictions.register_op);
		case IORING_RESTRICTION_SQE_OP:
			if (res[i].sqe_op >= IORING_OP_LAST) {
			__set_bit(res[i].sqe_op, ctx->restrictions.sqe_op);
		case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
			ctx->restrictions.sqe_flags_allowed = res[i].sqe_flags;
		case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
			ctx->restrictions.sqe_flags_required = res[i].sqe_flags;

	/* Reset all restrictions if an error happened */
		memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
		ctx->restrictions.registered = true;

static int io_register_enable_rings(struct io_ring_ctx *ctx)
	if (!(ctx->flags & IORING_SETUP_R_DISABLED))

	if (ctx->restrictions.registered)
		ctx->restricted = 1;

	ctx->flags &= ~IORING_SETUP_R_DISABLED;
	if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait))
		wake_up(&ctx->sq_data->wait);
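/*
 * Userspace sketch (not part of this file) of the restricted-ring flow the
 * two functions above support: create the ring with IORING_SETUP_R_DISABLED,
 * register a whitelist of SQE opcodes, then enable the ring. "ring_fd" is
 * assumed to come from io_uring_setup() with that flag set; error handling
 * is elided.
 *
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *	#include <linux/io_uring.h>
 *
 *	static int allow_only_reads(int ring_fd)
 *	{
 *		struct io_uring_restriction res = {
 *			.opcode = IORING_RESTRICTION_SQE_OP,
 *			.sqe_op = IORING_OP_READ,
 *		};
 *
 *		if (syscall(__NR_io_uring_register, ring_fd,
 *			    IORING_REGISTER_RESTRICTIONS, &res, 1) < 0)
 *			return -1;
 *		// Ring starts processing SQEs only after this call.
 *		return syscall(__NR_io_uring_register, ring_fd,
 *			       IORING_REGISTER_ENABLE_RINGS, NULL, 0);
 *	}
 */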
static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
				     struct io_uring_rsrc_update2 *up,
	if (check_add_overflow(up->offset, nr_args, &tmp))
	err = io_rsrc_node_switch_start(ctx);

	case IORING_RSRC_FILE:
		return __io_sqe_files_update(ctx, up, nr_args);
	case IORING_RSRC_BUFFER:
		return __io_sqe_buffers_update(ctx, up, nr_args);

static int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg,
	struct io_uring_rsrc_update2 up;

	memset(&up, 0, sizeof(up));
	if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update)))
	if (up.resv || up.resv2)
	return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args);

static int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
				   unsigned size, unsigned type)
	struct io_uring_rsrc_update2 up;

	if (size != sizeof(up))
	if (copy_from_user(&up, arg, sizeof(up)))
	if (!up.nr || up.resv || up.resv2)
	return __io_register_rsrc_update(ctx, type, &up, up.nr);

static __cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
				   unsigned int size, unsigned int type)
	struct io_uring_rsrc_register rr;

	/* keep it extendible */
	if (size != sizeof(rr))

	memset(&rr, 0, sizeof(rr));
	if (copy_from_user(&rr, arg, size))
	if (!rr.nr || rr.resv || rr.resv2)

	case IORING_RSRC_FILE:
		return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data),
					     rr.nr, u64_to_user_ptr(rr.tags));
	case IORING_RSRC_BUFFER:
		return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data),
					       rr.nr, u64_to_user_ptr(rr.tags));
int io_register_iowq_aff(struct io_ring_ctx
*ctx
,
11563 void __user
*arg
, unsigned len
)
11565 struct io_uring_task
*tctx
= current
->io_uring
;
11566 cpumask_var_t new_mask
;
11569 if (!tctx
|| !tctx
->io_wq
)
11572 if (!alloc_cpumask_var(&new_mask
, GFP_KERNEL
))
11575 cpumask_clear(new_mask
);
11576 if (len
> cpumask_size())
11577 len
= cpumask_size();
11579 if (in_compat_syscall()) {
11580 ret
= compat_get_bitmap(cpumask_bits(new_mask
),
11581 (const compat_ulong_t __user
*)arg
,
11582 len
* 8 /* CHAR_BIT */);
11584 ret
= copy_from_user(new_mask
, arg
, len
);
11588 free_cpumask_var(new_mask
);
11592 ret
= io_wq_cpu_affinity(tctx
->io_wq
, new_mask
);
11593 free_cpumask_var(new_mask
);
11597 static __cold
int io_unregister_iowq_aff(struct io_ring_ctx
*ctx
)
11599 struct io_uring_task
*tctx
= current
->io_uring
;
11601 if (!tctx
|| !tctx
->io_wq
)
11604 return io_wq_cpu_affinity(tctx
->io_wq
, NULL
);
11607 static __cold
int io_register_iowq_max_workers(struct io_ring_ctx
*ctx
,
11609 __must_hold(&ctx
->uring_lock
)
11611 struct io_tctx_node
*node
;
11612 struct io_uring_task
*tctx
= NULL
;
11613 struct io_sq_data
*sqd
= NULL
;
11614 __u32 new_count
[2];
11617 if (copy_from_user(new_count
, arg
, sizeof(new_count
)))
11619 for (i
= 0; i
< ARRAY_SIZE(new_count
); i
++)
11620 if (new_count
[i
] > INT_MAX
)
11623 if (ctx
->flags
& IORING_SETUP_SQPOLL
) {
11624 sqd
= ctx
->sq_data
;
11627 * Observe the correct sqd->lock -> ctx->uring_lock
11628 * ordering. Fine to drop uring_lock here, we hold
11629 * a ref to the ctx.
11631 refcount_inc(&sqd
->refs
);
11632 mutex_unlock(&ctx
->uring_lock
);
11633 mutex_lock(&sqd
->lock
);
11634 mutex_lock(&ctx
->uring_lock
);
11636 tctx
= sqd
->thread
->io_uring
;
11639 tctx
= current
->io_uring
;
11642 BUILD_BUG_ON(sizeof(new_count
) != sizeof(ctx
->iowq_limits
));
11644 for (i
= 0; i
< ARRAY_SIZE(new_count
); i
++)
11646 ctx
->iowq_limits
[i
] = new_count
[i
];
11647 ctx
->iowq_limits_set
= true;
11649 if (tctx
&& tctx
->io_wq
) {
11650 ret
= io_wq_max_workers(tctx
->io_wq
, new_count
);
11654 memset(new_count
, 0, sizeof(new_count
));
11658 mutex_unlock(&sqd
->lock
);
11659 io_put_sq_data(sqd
);
11662 if (copy_to_user(arg
, new_count
, sizeof(new_count
)))
11665 /* that's it for SQPOLL, only the SQPOLL task creates requests */
11669 /* now propagate the restriction to all registered users */
11670 list_for_each_entry(node
, &ctx
->tctx_list
, ctx_node
) {
11671 struct io_uring_task
*tctx
= node
->task
->io_uring
;
11673 if (WARN_ON_ONCE(!tctx
->io_wq
))
11676 for (i
= 0; i
< ARRAY_SIZE(new_count
); i
++)
11677 new_count
[i
] = ctx
->iowq_limits
[i
];
11678 /* ignore errors, it always returns zero anyway */
11679 (void)io_wq_max_workers(tctx
->io_wq
, new_count
);
11684 mutex_unlock(&sqd
->lock
);
11685 io_put_sq_data(sqd
);
11690 static int __io_uring_register(struct io_ring_ctx
*ctx
, unsigned opcode
,
11691 void __user
*arg
, unsigned nr_args
)
11692 __releases(ctx
->uring_lock
)
11693 __acquires(ctx
->uring_lock
)
11698 * We're inside the ring mutex, if the ref is already dying, then
11699 * someone else killed the ctx or is already going through
11700 * io_uring_register().
11702 if (percpu_ref_is_dying(&ctx
->refs
))
11705 if (ctx
->restricted
) {
11706 if (opcode
>= IORING_REGISTER_LAST
)
11708 opcode
= array_index_nospec(opcode
, IORING_REGISTER_LAST
);
11709 if (!test_bit(opcode
, ctx
->restrictions
.register_op
))
11714 case IORING_REGISTER_BUFFERS
:
11715 ret
= io_sqe_buffers_register(ctx
, arg
, nr_args
, NULL
);
11717 case IORING_UNREGISTER_BUFFERS
:
11719 if (arg
|| nr_args
)
11721 ret
= io_sqe_buffers_unregister(ctx
);
11723 case IORING_REGISTER_FILES
:
11724 ret
= io_sqe_files_register(ctx
, arg
, nr_args
, NULL
);
11726 case IORING_UNREGISTER_FILES
:
11728 if (arg
|| nr_args
)
11730 ret
= io_sqe_files_unregister(ctx
);
11732 case IORING_REGISTER_FILES_UPDATE
:
11733 ret
= io_register_files_update(ctx
, arg
, nr_args
);
11735 case IORING_REGISTER_EVENTFD
:
11739 ret
= io_eventfd_register(ctx
, arg
, 0);
11741 case IORING_REGISTER_EVENTFD_ASYNC
:
11745 ret
= io_eventfd_register(ctx
, arg
, 1);
11747 case IORING_UNREGISTER_EVENTFD
:
11749 if (arg
|| nr_args
)
11751 ret
= io_eventfd_unregister(ctx
);
11753 case IORING_REGISTER_PROBE
:
11755 if (!arg
|| nr_args
> 256)
11757 ret
= io_probe(ctx
, arg
, nr_args
);
11759 case IORING_REGISTER_PERSONALITY
:
11761 if (arg
|| nr_args
)
11763 ret
= io_register_personality(ctx
);
11765 case IORING_UNREGISTER_PERSONALITY
:
11769 ret
= io_unregister_personality(ctx
, nr_args
);
11771 case IORING_REGISTER_ENABLE_RINGS
:
11773 if (arg
|| nr_args
)
11775 ret
= io_register_enable_rings(ctx
);
11777 case IORING_REGISTER_RESTRICTIONS
:
11778 ret
= io_register_restrictions(ctx
, arg
, nr_args
);
11780 case IORING_REGISTER_FILES2
:
11781 ret
= io_register_rsrc(ctx
, arg
, nr_args
, IORING_RSRC_FILE
);
11783 case IORING_REGISTER_FILES_UPDATE2
:
11784 ret
= io_register_rsrc_update(ctx
, arg
, nr_args
,
11787 case IORING_REGISTER_BUFFERS2
:
11788 ret
= io_register_rsrc(ctx
, arg
, nr_args
, IORING_RSRC_BUFFER
);
11790 case IORING_REGISTER_BUFFERS_UPDATE
:
11791 ret
= io_register_rsrc_update(ctx
, arg
, nr_args
,
11792 IORING_RSRC_BUFFER
);
11794 case IORING_REGISTER_IOWQ_AFF
:
11796 if (!arg
|| !nr_args
)
11798 ret
= io_register_iowq_aff(ctx
, arg
, nr_args
);
11800 case IORING_UNREGISTER_IOWQ_AFF
:
11802 if (arg
|| nr_args
)
11804 ret
= io_unregister_iowq_aff(ctx
);
11806 case IORING_REGISTER_IOWQ_MAX_WORKERS
:
11808 if (!arg
|| nr_args
!= 2)
11810 ret
= io_register_iowq_max_workers(ctx
, arg
);
11812 case IORING_REGISTER_RING_FDS
:
11813 ret
= io_ringfd_register(ctx
, arg
, nr_args
);
11815 case IORING_UNREGISTER_RING_FDS
:
11816 ret
= io_ringfd_unregister(ctx
, arg
, nr_args
);
11826 SYSCALL_DEFINE4(io_uring_register
, unsigned int, fd
, unsigned int, opcode
,
11827 void __user
*, arg
, unsigned int, nr_args
)
11829 struct io_ring_ctx
*ctx
;
11838 if (f
.file
->f_op
!= &io_uring_fops
)
11841 ctx
= f
.file
->private_data
;
11843 io_run_task_work();
11845 mutex_lock(&ctx
->uring_lock
);
11846 ret
= __io_uring_register(ctx
, opcode
, arg
, nr_args
);
11847 mutex_unlock(&ctx
->uring_lock
);
11848 trace_io_uring_register(ctx
, opcode
, ctx
->nr_user_files
, ctx
->nr_user_bufs
, ret
);
static int __init io_uring_init(void)
#define __BUILD_BUG_VERIFY_ELEMENT(stype, eoffset, etype, ename) do { \
	BUILD_BUG_ON(offsetof(stype, ename) != eoffset); \
	BUILD_BUG_ON(sizeof(etype) != sizeof_field(stype, ename)); \

#define BUILD_BUG_SQE_ELEM(eoffset, etype, ename) \
	__BUILD_BUG_VERIFY_ELEMENT(struct io_uring_sqe, eoffset, etype, ename)
	BUILD_BUG_ON(sizeof(struct io_uring_sqe) != 64);
	BUILD_BUG_SQE_ELEM(0, __u8, opcode);
	BUILD_BUG_SQE_ELEM(1, __u8, flags);
	BUILD_BUG_SQE_ELEM(2, __u16, ioprio);
	BUILD_BUG_SQE_ELEM(4, __s32, fd);
	BUILD_BUG_SQE_ELEM(8, __u64, off);
	BUILD_BUG_SQE_ELEM(8, __u64, addr2);
	BUILD_BUG_SQE_ELEM(16, __u64, addr);
	BUILD_BUG_SQE_ELEM(16, __u64, splice_off_in);
	BUILD_BUG_SQE_ELEM(24, __u32, len);
	BUILD_BUG_SQE_ELEM(28, __kernel_rwf_t, rw_flags);
	BUILD_BUG_SQE_ELEM(28, /* compat */ int, rw_flags);
	BUILD_BUG_SQE_ELEM(28, /* compat */ __u32, rw_flags);
	BUILD_BUG_SQE_ELEM(28, __u32, fsync_flags);
	BUILD_BUG_SQE_ELEM(28, /* compat */ __u16, poll_events);
	BUILD_BUG_SQE_ELEM(28, __u32, poll32_events);
	BUILD_BUG_SQE_ELEM(28, __u32, sync_range_flags);
	BUILD_BUG_SQE_ELEM(28, __u32, msg_flags);
	BUILD_BUG_SQE_ELEM(28, __u32, timeout_flags);
	BUILD_BUG_SQE_ELEM(28, __u32, accept_flags);
	BUILD_BUG_SQE_ELEM(28, __u32, cancel_flags);
	BUILD_BUG_SQE_ELEM(28, __u32, open_flags);
	BUILD_BUG_SQE_ELEM(28, __u32, statx_flags);
	BUILD_BUG_SQE_ELEM(28, __u32, fadvise_advice);
	BUILD_BUG_SQE_ELEM(28, __u32, splice_flags);
	BUILD_BUG_SQE_ELEM(32, __u64, user_data);
	BUILD_BUG_SQE_ELEM(40, __u16, buf_index);
	BUILD_BUG_SQE_ELEM(40, __u16, buf_group);
	BUILD_BUG_SQE_ELEM(42, __u16, personality);
	BUILD_BUG_SQE_ELEM(44, __s32, splice_fd_in);
	BUILD_BUG_SQE_ELEM(44, __u32, file_index);

	BUILD_BUG_ON(sizeof(struct io_uring_files_update) !=
		     sizeof(struct io_uring_rsrc_update));
	BUILD_BUG_ON(sizeof(struct io_uring_rsrc_update) >
		     sizeof(struct io_uring_rsrc_update2));

	/* ->buf_index is u16 */
	BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16));

	/* should fit into one byte */
	BUILD_BUG_ON(SQE_VALID_FLAGS >= (1 << 8));
	BUILD_BUG_ON(SQE_COMMON_FLAGS >= (1 << 8));
	BUILD_BUG_ON((SQE_VALID_FLAGS | SQE_COMMON_FLAGS) != SQE_VALID_FLAGS);

	BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST);
	BUILD_BUG_ON(__REQ_F_LAST_BIT > 8 * sizeof(int));

	req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC |

__initcall(io_uring_init);