]> git.ipfire.org Git - thirdparty/linux.git/blame - fs/io_uring.c
io_uring: add IORING_OP_PROVIDE_BUFFERS
[thirdparty/linux.git] / fs / io_uring.c
CommitLineData
2b188cc1
JA
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Shared application/kernel submission and completion ring pairs, for
4 * supporting fast/efficient IO.
5 *
6 * A note on the read/write ordering memory barriers that are matched between
1e84b97b
SB
7 * the application and kernel side.
8 *
9 * After the application reads the CQ ring tail, it must use an
10 * appropriate smp_rmb() to pair with the smp_wmb() the kernel uses
11 * before writing the tail (using smp_load_acquire to read the tail will
12 * do). It also needs a smp_mb() before updating CQ head (ordering the
13 * entry load(s) with the head store), pairing with an implicit barrier
14 * through a control-dependency in io_get_cqring (smp_store_release to
15 * store head will do). Failure to do so could lead to reading invalid
16 * CQ entries.
17 *
18 * Likewise, the application must use an appropriate smp_wmb() before
19 * writing the SQ tail (ordering SQ entry stores with the tail store),
20 * which pairs with smp_load_acquire in io_get_sqring (smp_store_release
21 * to store the tail will do). And it needs a barrier ordering the SQ
22 * head load before writing new SQ entries (smp_load_acquire to read
23 * head will do).
24 *
25 * When using the SQ poll thread (IORING_SETUP_SQPOLL), the application
26 * needs to check the SQ flags for IORING_SQ_NEED_WAKEUP *after*
27 * updating the SQ tail; a full memory barrier smp_mb() is needed
28 * between.
2b188cc1
JA
29 *
30 * Also see the examples in the liburing library:
31 *
32 * git://git.kernel.dk/liburing
33 *
34 * io_uring also uses READ/WRITE_ONCE() for _any_ store or load that happens
35 * from data shared between the kernel and application. This is done both
36 * for ordering purposes, but also to ensure that once a value is loaded from
37 * data that the application could potentially modify, it remains stable.
38 *
39 * Copyright (C) 2018-2019 Jens Axboe
c992fe29 40 * Copyright (c) 2018-2019 Christoph Hellwig
2b188cc1
JA
41 */
42#include <linux/kernel.h>
43#include <linux/init.h>
44#include <linux/errno.h>
45#include <linux/syscalls.h>
46#include <linux/compat.h>
47#include <linux/refcount.h>
48#include <linux/uio.h>
6b47ee6e 49#include <linux/bits.h>
2b188cc1
JA
50
51#include <linux/sched/signal.h>
52#include <linux/fs.h>
53#include <linux/file.h>
54#include <linux/fdtable.h>
55#include <linux/mm.h>
56#include <linux/mman.h>
57#include <linux/mmu_context.h>
58#include <linux/percpu.h>
59#include <linux/slab.h>
6c271ce2 60#include <linux/kthread.h>
2b188cc1 61#include <linux/blkdev.h>
edafccee 62#include <linux/bvec.h>
2b188cc1
JA
63#include <linux/net.h>
64#include <net/sock.h>
65#include <net/af_unix.h>
6b06314c 66#include <net/scm.h>
2b188cc1
JA
67#include <linux/anon_inodes.h>
68#include <linux/sched/mm.h>
69#include <linux/uaccess.h>
70#include <linux/nospec.h>
edafccee
JA
71#include <linux/sizes.h>
72#include <linux/hugetlb.h>
aa4c3967 73#include <linux/highmem.h>
15b71abe
JA
74#include <linux/namei.h>
75#include <linux/fsnotify.h>
4840e418 76#include <linux/fadvise.h>
3e4827b0 77#include <linux/eventpoll.h>
ff002b30 78#include <linux/fs_struct.h>
7d67af2c 79#include <linux/splice.h>
b41e9852 80#include <linux/task_work.h>
2b188cc1 81
c826bd7a
DD
82#define CREATE_TRACE_POINTS
83#include <trace/events/io_uring.h>
84
2b188cc1
JA
85#include <uapi/linux/io_uring.h>
86
87#include "internal.h"
561fb04a 88#include "io-wq.h"
2b188cc1 89
5277deaa 90#define IORING_MAX_ENTRIES 32768
33a107f0 91#define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES)
65e19f54
JA
92
93/*
94 * Shift of 9 is 512 entries, or exactly one page on 64-bit archs
95 */
96#define IORING_FILE_TABLE_SHIFT 9
97#define IORING_MAX_FILES_TABLE (1U << IORING_FILE_TABLE_SHIFT)
98#define IORING_FILE_TABLE_MASK (IORING_MAX_FILES_TABLE - 1)
99#define IORING_MAX_FIXED_FILES (64 * IORING_MAX_FILES_TABLE)
2b188cc1
JA
100
101struct io_uring {
102 u32 head ____cacheline_aligned_in_smp;
103 u32 tail ____cacheline_aligned_in_smp;
104};
105
1e84b97b 106/*
75b28aff
HV
107 * This data is shared with the application through the mmap at offsets
108 * IORING_OFF_SQ_RING and IORING_OFF_CQ_RING.
1e84b97b
SB
109 *
110 * The offsets to the member fields are published through struct
111 * io_sqring_offsets when calling io_uring_setup.
112 */
75b28aff 113struct io_rings {
1e84b97b
SB
114 /*
115 * Head and tail offsets into the ring; the offsets need to be
116 * masked to get valid indices.
117 *
75b28aff
HV
118 * The kernel controls head of the sq ring and the tail of the cq ring,
119 * and the application controls tail of the sq ring and the head of the
120 * cq ring.
1e84b97b 121 */
75b28aff 122 struct io_uring sq, cq;
1e84b97b 123 /*
75b28aff 124 * Bitmasks to apply to head and tail offsets (constant, equals
1e84b97b
SB
125 * ring_entries - 1)
126 */
75b28aff
HV
127 u32 sq_ring_mask, cq_ring_mask;
128 /* Ring sizes (constant, power of 2) */
129 u32 sq_ring_entries, cq_ring_entries;
1e84b97b
SB
130 /*
131 * Number of invalid entries dropped by the kernel due to
132 * invalid index stored in array
133 *
134 * Written by the kernel, shouldn't be modified by the
135 * application (i.e. get number of "new events" by comparing to
136 * cached value).
137 *
138 * After a new SQ head value was read by the application this
139 * counter includes all submissions that were dropped reaching
140 * the new SQ head (and possibly more).
141 */
75b28aff 142 u32 sq_dropped;
1e84b97b
SB
143 /*
144 * Runtime flags
145 *
146 * Written by the kernel, shouldn't be modified by the
147 * application.
148 *
149 * The application needs a full memory barrier before checking
150 * for IORING_SQ_NEED_WAKEUP after updating the sq tail.
151 */
75b28aff 152 u32 sq_flags;
1e84b97b
SB
153 /*
154 * Number of completion events lost because the queue was full;
155 * this should be avoided by the application by making sure
0b4295b5 156 * there are not more requests pending than there is space in
1e84b97b
SB
157 * the completion queue.
158 *
159 * Written by the kernel, shouldn't be modified by the
160 * application (i.e. get number of "new events" by comparing to
161 * cached value).
162 *
163 * As completion events come in out of order this counter is not
164 * ordered with any other data.
165 */
75b28aff 166 u32 cq_overflow;
1e84b97b
SB
167 /*
168 * Ring buffer of completion events.
169 *
170 * The kernel writes completion events fresh every time they are
171 * produced, so the application is allowed to modify pending
172 * entries.
173 */
75b28aff 174 struct io_uring_cqe cqes[] ____cacheline_aligned_in_smp;
2b188cc1
JA
175};
176
edafccee
JA
177struct io_mapped_ubuf {
178 u64 ubuf;
179 size_t len;
180 struct bio_vec *bvec;
181 unsigned int nr_bvecs;
182};
183
65e19f54
JA
184struct fixed_file_table {
185 struct file **files;
31b51510
JA
186};
187
05f3fb3c
JA
188struct fixed_file_data {
189 struct fixed_file_table *table;
190 struct io_ring_ctx *ctx;
191
192 struct percpu_ref refs;
193 struct llist_head put_llist;
05f3fb3c
JA
194 struct work_struct ref_work;
195 struct completion done;
196};
197
5a2e745d
JA
198struct io_buffer {
199 struct list_head list;
200 __u64 addr;
201 __s32 len;
202 __u16 bid;
203};
204
2b188cc1
JA
205struct io_ring_ctx {
206 struct {
207 struct percpu_ref refs;
208 } ____cacheline_aligned_in_smp;
209
210 struct {
211 unsigned int flags;
e1d85334
RD
212 unsigned int compat: 1;
213 unsigned int account_mem: 1;
214 unsigned int cq_overflow_flushed: 1;
215 unsigned int drain_next: 1;
216 unsigned int eventfd_async: 1;
2b188cc1 217
75b28aff
HV
218 /*
219 * Ring buffer of indices into array of io_uring_sqe, which is
220 * mmapped by the application using the IORING_OFF_SQES offset.
221 *
222 * This indirection could e.g. be used to assign fixed
223 * io_uring_sqe entries to operations and only submit them to
224 * the queue when needed.
225 *
226 * The kernel modifies neither the indices array nor the entries
227 * array.
228 */
229 u32 *sq_array;
2b188cc1
JA
230 unsigned cached_sq_head;
231 unsigned sq_entries;
232 unsigned sq_mask;
6c271ce2 233 unsigned sq_thread_idle;
498ccd9e 234 unsigned cached_sq_dropped;
206aefde 235 atomic_t cached_cq_overflow;
ad3eb2c8 236 unsigned long sq_check_overflow;
de0617e4
JA
237
238 struct list_head defer_list;
5262f567 239 struct list_head timeout_list;
1d7bb1d5 240 struct list_head cq_overflow_list;
fcb323cc
JA
241
242 wait_queue_head_t inflight_wait;
ad3eb2c8 243 struct io_uring_sqe *sq_sqes;
2b188cc1
JA
244 } ____cacheline_aligned_in_smp;
245
206aefde
JA
246 struct io_rings *rings;
247
2b188cc1 248 /* IO offload */
561fb04a 249 struct io_wq *io_wq;
6c271ce2 250 struct task_struct *sqo_thread; /* if using sq thread polling */
2b188cc1 251 struct mm_struct *sqo_mm;
6c271ce2 252 wait_queue_head_t sqo_wait;
75b28aff 253
6b06314c
JA
254 /*
255 * If used, fixed file set. Writers must ensure that ->refs is dead,
256 * readers must ensure that ->refs is alive as long as the file* is
257 * used. Only updated through io_uring_register(2).
258 */
05f3fb3c 259 struct fixed_file_data *file_data;
6b06314c 260 unsigned nr_user_files;
b14cca0c
PB
261 int ring_fd;
262 struct file *ring_file;
6b06314c 263
edafccee
JA
264 /* if used, fixed mapped user buffers */
265 unsigned nr_user_bufs;
266 struct io_mapped_ubuf *user_bufs;
267
2b188cc1
JA
268 struct user_struct *user;
269
0b8c0ec7 270 const struct cred *creds;
181e448d 271
206aefde
JA
272 /* 0 is for ctx quiesce/reinit/free, 1 is for sqo_thread started */
273 struct completion *completions;
274
0ddf92e8
JA
275 /* if all else fails... */
276 struct io_kiocb *fallback_req;
277
206aefde
JA
278#if defined(CONFIG_UNIX)
279 struct socket *ring_sock;
280#endif
281
5a2e745d
JA
282 struct idr io_buffer_idr;
283
071698e1
JA
284 struct idr personality_idr;
285
206aefde
JA
286 struct {
287 unsigned cached_cq_tail;
288 unsigned cq_entries;
289 unsigned cq_mask;
290 atomic_t cq_timeouts;
ad3eb2c8 291 unsigned long cq_check_overflow;
206aefde
JA
292 struct wait_queue_head cq_wait;
293 struct fasync_struct *cq_fasync;
294 struct eventfd_ctx *cq_ev_fd;
295 } ____cacheline_aligned_in_smp;
2b188cc1
JA
296
297 struct {
298 struct mutex uring_lock;
299 wait_queue_head_t wait;
300 } ____cacheline_aligned_in_smp;
301
302 struct {
303 spinlock_t completion_lock;
e94f141b 304
def596e9
JA
305 /*
306 * ->poll_list is protected by the ctx->uring_lock for
307 * io_uring instances that don't use IORING_SETUP_SQPOLL.
308 * For SQPOLL, only the single threaded io_sq_thread() will
309 * manipulate the list, hence no extra locking is needed there.
310 */
311 struct list_head poll_list;
78076bb6
JA
312 struct hlist_head *cancel_hash;
313 unsigned cancel_hash_bits;
e94f141b 314 bool poll_multi_file;
31b51510 315
fcb323cc
JA
316 spinlock_t inflight_lock;
317 struct list_head inflight_list;
2b188cc1 318 } ____cacheline_aligned_in_smp;
2b188cc1
JA
319};
320
09bb8394
JA
321/*
322 * First field must be the file pointer in all the
323 * iocb unions! See also 'struct kiocb' in <linux/fs.h>
324 */
221c5eb2
JA
325struct io_poll_iocb {
326 struct file *file;
0969e783
JA
327 union {
328 struct wait_queue_head *head;
329 u64 addr;
330 };
221c5eb2 331 __poll_t events;
8c838788 332 bool done;
221c5eb2 333 bool canceled;
392edb45 334 struct wait_queue_entry wait;
221c5eb2
JA
335};
336
b5dba59e
JA
337struct io_close {
338 struct file *file;
339 struct file *put_file;
340 int fd;
341};
342
ad8a48ac
JA
343struct io_timeout_data {
344 struct io_kiocb *req;
345 struct hrtimer timer;
346 struct timespec64 ts;
347 enum hrtimer_mode mode;
cc42e0ac 348 u32 seq_offset;
ad8a48ac
JA
349};
350
8ed8d3c3
JA
351struct io_accept {
352 struct file *file;
353 struct sockaddr __user *addr;
354 int __user *addr_len;
355 int flags;
356};
357
358struct io_sync {
359 struct file *file;
360 loff_t len;
361 loff_t off;
362 int flags;
d63d1b5e 363 int mode;
8ed8d3c3
JA
364};
365
fbf23849
JA
366struct io_cancel {
367 struct file *file;
368 u64 addr;
369};
370
b29472ee
JA
371struct io_timeout {
372 struct file *file;
373 u64 addr;
374 int flags;
26a61679 375 unsigned count;
b29472ee
JA
376};
377
9adbd45d
JA
378struct io_rw {
379 /* NOTE: kiocb has the file as the first member, so don't do it here */
380 struct kiocb kiocb;
381 u64 addr;
382 u64 len;
383};
384
3fbb51c1
JA
385struct io_connect {
386 struct file *file;
387 struct sockaddr __user *addr;
388 int addr_len;
389};
390
e47293fd
JA
391struct io_sr_msg {
392 struct file *file;
fddaface
JA
393 union {
394 struct user_msghdr __user *msg;
395 void __user *buf;
396 };
e47293fd 397 int msg_flags;
fddaface 398 size_t len;
e47293fd
JA
399};
400
15b71abe
JA
401struct io_open {
402 struct file *file;
403 int dfd;
eddc7ef5 404 union {
eddc7ef5
JA
405 unsigned mask;
406 };
15b71abe 407 struct filename *filename;
eddc7ef5 408 struct statx __user *buffer;
c12cedf2 409 struct open_how how;
15b71abe
JA
410};
411
05f3fb3c
JA
412struct io_files_update {
413 struct file *file;
414 u64 arg;
415 u32 nr_args;
416 u32 offset;
417};
418
4840e418
JA
419struct io_fadvise {
420 struct file *file;
421 u64 offset;
422 u32 len;
423 u32 advice;
424};
425
c1ca757b
JA
426struct io_madvise {
427 struct file *file;
428 u64 addr;
429 u32 len;
430 u32 advice;
431};
432
3e4827b0
JA
433struct io_epoll {
434 struct file *file;
435 int epfd;
436 int op;
437 int fd;
438 struct epoll_event event;
e47293fd
JA
439};
440
7d67af2c
PB
441struct io_splice {
442 struct file *file_out;
443 struct file *file_in;
444 loff_t off_out;
445 loff_t off_in;
446 u64 len;
447 unsigned int flags;
448};
449
ddf0322d
JA
450struct io_provide_buf {
451 struct file *file;
452 __u64 addr;
453 __s32 len;
454 __u32 bgid;
455 __u16 nbufs;
456 __u16 bid;
457};
458
f499a021
JA
459struct io_async_connect {
460 struct sockaddr_storage address;
461};
462
03b1230c
JA
463struct io_async_msghdr {
464 struct iovec fast_iov[UIO_FASTIOV];
465 struct iovec *iov;
466 struct sockaddr __user *uaddr;
467 struct msghdr msg;
b537916c 468 struct sockaddr_storage addr;
03b1230c
JA
469};
470
f67676d1
JA
471struct io_async_rw {
472 struct iovec fast_iov[UIO_FASTIOV];
473 struct iovec *iov;
474 ssize_t nr_segs;
475 ssize_t size;
476};
477
1a6b74fc 478struct io_async_ctx {
f67676d1
JA
479 union {
480 struct io_async_rw rw;
03b1230c 481 struct io_async_msghdr msg;
f499a021 482 struct io_async_connect connect;
2d28390a 483 struct io_timeout_data timeout;
f67676d1 484 };
1a6b74fc
JA
485};
486
6b47ee6e
PB
487enum {
488 REQ_F_FIXED_FILE_BIT = IOSQE_FIXED_FILE_BIT,
489 REQ_F_IO_DRAIN_BIT = IOSQE_IO_DRAIN_BIT,
490 REQ_F_LINK_BIT = IOSQE_IO_LINK_BIT,
491 REQ_F_HARDLINK_BIT = IOSQE_IO_HARDLINK_BIT,
492 REQ_F_FORCE_ASYNC_BIT = IOSQE_ASYNC_BIT,
493
494 REQ_F_LINK_NEXT_BIT,
495 REQ_F_FAIL_LINK_BIT,
496 REQ_F_INFLIGHT_BIT,
497 REQ_F_CUR_POS_BIT,
498 REQ_F_NOWAIT_BIT,
499 REQ_F_IOPOLL_COMPLETED_BIT,
500 REQ_F_LINK_TIMEOUT_BIT,
501 REQ_F_TIMEOUT_BIT,
502 REQ_F_ISREG_BIT,
503 REQ_F_MUST_PUNT_BIT,
504 REQ_F_TIMEOUT_NOSEQ_BIT,
505 REQ_F_COMP_LOCKED_BIT,
99bc4c38 506 REQ_F_NEED_CLEANUP_BIT,
2ca10259 507 REQ_F_OVERFLOW_BIT,
d7718a9d 508 REQ_F_POLLED_BIT,
6b47ee6e
PB
509};
510
511enum {
512 /* ctx owns file */
513 REQ_F_FIXED_FILE = BIT(REQ_F_FIXED_FILE_BIT),
514 /* drain existing IO first */
515 REQ_F_IO_DRAIN = BIT(REQ_F_IO_DRAIN_BIT),
516 /* linked sqes */
517 REQ_F_LINK = BIT(REQ_F_LINK_BIT),
518 /* doesn't sever on completion < 0 */
519 REQ_F_HARDLINK = BIT(REQ_F_HARDLINK_BIT),
520 /* IOSQE_ASYNC */
521 REQ_F_FORCE_ASYNC = BIT(REQ_F_FORCE_ASYNC_BIT),
522
523 /* already grabbed next link */
524 REQ_F_LINK_NEXT = BIT(REQ_F_LINK_NEXT_BIT),
525 /* fail rest of links */
526 REQ_F_FAIL_LINK = BIT(REQ_F_FAIL_LINK_BIT),
527 /* on inflight list */
528 REQ_F_INFLIGHT = BIT(REQ_F_INFLIGHT_BIT),
529 /* read/write uses file position */
530 REQ_F_CUR_POS = BIT(REQ_F_CUR_POS_BIT),
531 /* must not punt to workers */
532 REQ_F_NOWAIT = BIT(REQ_F_NOWAIT_BIT),
533 /* polled IO has completed */
534 REQ_F_IOPOLL_COMPLETED = BIT(REQ_F_IOPOLL_COMPLETED_BIT),
535 /* has linked timeout */
536 REQ_F_LINK_TIMEOUT = BIT(REQ_F_LINK_TIMEOUT_BIT),
537 /* timeout request */
538 REQ_F_TIMEOUT = BIT(REQ_F_TIMEOUT_BIT),
539 /* regular file */
540 REQ_F_ISREG = BIT(REQ_F_ISREG_BIT),
541 /* must be punted even for NONBLOCK */
542 REQ_F_MUST_PUNT = BIT(REQ_F_MUST_PUNT_BIT),
543 /* no timeout sequence */
544 REQ_F_TIMEOUT_NOSEQ = BIT(REQ_F_TIMEOUT_NOSEQ_BIT),
545 /* completion under lock */
546 REQ_F_COMP_LOCKED = BIT(REQ_F_COMP_LOCKED_BIT),
99bc4c38
PB
547 /* needs cleanup */
548 REQ_F_NEED_CLEANUP = BIT(REQ_F_NEED_CLEANUP_BIT),
2ca10259
JA
549 /* in overflow list */
550 REQ_F_OVERFLOW = BIT(REQ_F_OVERFLOW_BIT),
d7718a9d
JA
551 /* already went through poll handler */
552 REQ_F_POLLED = BIT(REQ_F_POLLED_BIT),
553};
554
555struct async_poll {
556 struct io_poll_iocb poll;
557 struct io_wq_work work;
6b47ee6e
PB
558};
559
09bb8394
JA
560/*
561 * NOTE! Each of the iocb union members has the file pointer
562 * as the first entry in their struct definition. So you can
563 * access the file pointer through any of the sub-structs,
564 * or directly as just 'ki_filp' in this struct.
565 */
2b188cc1 566struct io_kiocb {
221c5eb2 567 union {
09bb8394 568 struct file *file;
9adbd45d 569 struct io_rw rw;
221c5eb2 570 struct io_poll_iocb poll;
8ed8d3c3
JA
571 struct io_accept accept;
572 struct io_sync sync;
fbf23849 573 struct io_cancel cancel;
b29472ee 574 struct io_timeout timeout;
3fbb51c1 575 struct io_connect connect;
e47293fd 576 struct io_sr_msg sr_msg;
15b71abe 577 struct io_open open;
b5dba59e 578 struct io_close close;
05f3fb3c 579 struct io_files_update files_update;
4840e418 580 struct io_fadvise fadvise;
c1ca757b 581 struct io_madvise madvise;
3e4827b0 582 struct io_epoll epoll;
7d67af2c 583 struct io_splice splice;
ddf0322d 584 struct io_provide_buf pbuf;
221c5eb2 585 };
2b188cc1 586
1a6b74fc 587 struct io_async_ctx *io;
cf6fd4bd 588 bool needs_fixed_file;
d625c6ee 589 u8 opcode;
2b188cc1
JA
590
591 struct io_ring_ctx *ctx;
d7718a9d 592 struct list_head list;
2b188cc1 593 unsigned int flags;
c16361c1 594 refcount_t refs;
d7718a9d 595 struct task_struct *task;
2b188cc1 596 u64 user_data;
9e645e11 597 u32 result;
de0617e4 598 u32 sequence;
2b188cc1 599
d7718a9d
JA
600 struct list_head link_list;
601
fcb323cc
JA
602 struct list_head inflight_entry;
603
b41e9852
JA
604 union {
605 /*
606 * Only commands that never go async can use the below fields,
d7718a9d
JA
607 * obviously. Right now only IORING_OP_POLL_ADD uses them, and
608 * async armed poll handlers for regular commands. The latter
609 * restore the work, if needed.
b41e9852
JA
610 */
611 struct {
b41e9852 612 struct callback_head task_work;
d7718a9d
JA
613 struct hlist_node hash_node;
614 struct async_poll *apoll;
b41e9852
JA
615 };
616 struct io_wq_work work;
617 };
2b188cc1
JA
618};
619
620#define IO_PLUG_THRESHOLD 2
def596e9 621#define IO_IOPOLL_BATCH 8
2b188cc1 622
9a56a232
JA
623struct io_submit_state {
624 struct blk_plug plug;
625
2579f913
JA
626 /*
627 * io_kiocb alloc cache
628 */
629 void *reqs[IO_IOPOLL_BATCH];
6c8a3134 630 unsigned int free_reqs;
2579f913 631
9a56a232
JA
632 /*
633 * File reference cache
634 */
635 struct file *file;
636 unsigned int fd;
637 unsigned int has_refs;
638 unsigned int used_refs;
639 unsigned int ios_left;
640};
641
d3656344
JA
642struct io_op_def {
643 /* needs req->io allocated for deferral/async */
644 unsigned async_ctx : 1;
645 /* needs current->mm setup, does mm access */
646 unsigned needs_mm : 1;
647 /* needs req->file assigned */
648 unsigned needs_file : 1;
649 /* needs req->file assigned IFF fd is >= 0 */
650 unsigned fd_non_neg : 1;
651 /* hash wq insertion if file is a regular file */
652 unsigned hash_reg_file : 1;
653 /* unbound wq insertion if file is a non-regular file */
654 unsigned unbound_nonreg_file : 1;
66f4af93
JA
655 /* opcode is not supported by this kernel */
656 unsigned not_supported : 1;
f86cd20c
JA
657 /* needs file table */
658 unsigned file_table : 1;
ff002b30
JA
659 /* needs ->fs */
660 unsigned needs_fs : 1;
8a72758c
JA
661 /* set if opcode supports polled "wait" */
662 unsigned pollin : 1;
663 unsigned pollout : 1;
d3656344
JA
664};
665
666static const struct io_op_def io_op_defs[] = {
0463b6c5
PB
667 [IORING_OP_NOP] = {},
668 [IORING_OP_READV] = {
d3656344
JA
669 .async_ctx = 1,
670 .needs_mm = 1,
671 .needs_file = 1,
672 .unbound_nonreg_file = 1,
8a72758c 673 .pollin = 1,
d3656344 674 },
0463b6c5 675 [IORING_OP_WRITEV] = {
d3656344
JA
676 .async_ctx = 1,
677 .needs_mm = 1,
678 .needs_file = 1,
679 .hash_reg_file = 1,
680 .unbound_nonreg_file = 1,
8a72758c 681 .pollout = 1,
d3656344 682 },
0463b6c5 683 [IORING_OP_FSYNC] = {
d3656344
JA
684 .needs_file = 1,
685 },
0463b6c5 686 [IORING_OP_READ_FIXED] = {
d3656344
JA
687 .needs_file = 1,
688 .unbound_nonreg_file = 1,
8a72758c 689 .pollin = 1,
d3656344 690 },
0463b6c5 691 [IORING_OP_WRITE_FIXED] = {
d3656344
JA
692 .needs_file = 1,
693 .hash_reg_file = 1,
694 .unbound_nonreg_file = 1,
8a72758c 695 .pollout = 1,
d3656344 696 },
0463b6c5 697 [IORING_OP_POLL_ADD] = {
d3656344
JA
698 .needs_file = 1,
699 .unbound_nonreg_file = 1,
700 },
0463b6c5
PB
701 [IORING_OP_POLL_REMOVE] = {},
702 [IORING_OP_SYNC_FILE_RANGE] = {
d3656344
JA
703 .needs_file = 1,
704 },
0463b6c5 705 [IORING_OP_SENDMSG] = {
d3656344
JA
706 .async_ctx = 1,
707 .needs_mm = 1,
708 .needs_file = 1,
709 .unbound_nonreg_file = 1,
ff002b30 710 .needs_fs = 1,
8a72758c 711 .pollout = 1,
d3656344 712 },
0463b6c5 713 [IORING_OP_RECVMSG] = {
d3656344
JA
714 .async_ctx = 1,
715 .needs_mm = 1,
716 .needs_file = 1,
717 .unbound_nonreg_file = 1,
ff002b30 718 .needs_fs = 1,
8a72758c 719 .pollin = 1,
d3656344 720 },
0463b6c5 721 [IORING_OP_TIMEOUT] = {
d3656344
JA
722 .async_ctx = 1,
723 .needs_mm = 1,
724 },
0463b6c5
PB
725 [IORING_OP_TIMEOUT_REMOVE] = {},
726 [IORING_OP_ACCEPT] = {
d3656344
JA
727 .needs_mm = 1,
728 .needs_file = 1,
729 .unbound_nonreg_file = 1,
f86cd20c 730 .file_table = 1,
8a72758c 731 .pollin = 1,
d3656344 732 },
0463b6c5
PB
733 [IORING_OP_ASYNC_CANCEL] = {},
734 [IORING_OP_LINK_TIMEOUT] = {
d3656344
JA
735 .async_ctx = 1,
736 .needs_mm = 1,
737 },
0463b6c5 738 [IORING_OP_CONNECT] = {
d3656344
JA
739 .async_ctx = 1,
740 .needs_mm = 1,
741 .needs_file = 1,
742 .unbound_nonreg_file = 1,
8a72758c 743 .pollout = 1,
d3656344 744 },
0463b6c5 745 [IORING_OP_FALLOCATE] = {
d3656344
JA
746 .needs_file = 1,
747 },
0463b6c5 748 [IORING_OP_OPENAT] = {
d3656344
JA
749 .needs_file = 1,
750 .fd_non_neg = 1,
f86cd20c 751 .file_table = 1,
ff002b30 752 .needs_fs = 1,
d3656344 753 },
0463b6c5 754 [IORING_OP_CLOSE] = {
d3656344 755 .needs_file = 1,
f86cd20c 756 .file_table = 1,
d3656344 757 },
0463b6c5 758 [IORING_OP_FILES_UPDATE] = {
d3656344 759 .needs_mm = 1,
f86cd20c 760 .file_table = 1,
d3656344 761 },
0463b6c5 762 [IORING_OP_STATX] = {
d3656344
JA
763 .needs_mm = 1,
764 .needs_file = 1,
765 .fd_non_neg = 1,
ff002b30 766 .needs_fs = 1,
d3656344 767 },
0463b6c5 768 [IORING_OP_READ] = {
3a6820f2
JA
769 .needs_mm = 1,
770 .needs_file = 1,
771 .unbound_nonreg_file = 1,
8a72758c 772 .pollin = 1,
3a6820f2 773 },
0463b6c5 774 [IORING_OP_WRITE] = {
3a6820f2
JA
775 .needs_mm = 1,
776 .needs_file = 1,
777 .unbound_nonreg_file = 1,
8a72758c 778 .pollout = 1,
3a6820f2 779 },
0463b6c5 780 [IORING_OP_FADVISE] = {
4840e418
JA
781 .needs_file = 1,
782 },
0463b6c5 783 [IORING_OP_MADVISE] = {
c1ca757b
JA
784 .needs_mm = 1,
785 },
0463b6c5 786 [IORING_OP_SEND] = {
fddaface
JA
787 .needs_mm = 1,
788 .needs_file = 1,
789 .unbound_nonreg_file = 1,
8a72758c 790 .pollout = 1,
fddaface 791 },
0463b6c5 792 [IORING_OP_RECV] = {
fddaface
JA
793 .needs_mm = 1,
794 .needs_file = 1,
795 .unbound_nonreg_file = 1,
8a72758c 796 .pollin = 1,
fddaface 797 },
0463b6c5 798 [IORING_OP_OPENAT2] = {
cebdb986
JA
799 .needs_file = 1,
800 .fd_non_neg = 1,
f86cd20c 801 .file_table = 1,
ff002b30 802 .needs_fs = 1,
cebdb986 803 },
3e4827b0
JA
804 [IORING_OP_EPOLL_CTL] = {
805 .unbound_nonreg_file = 1,
806 .file_table = 1,
807 },
7d67af2c
PB
808 [IORING_OP_SPLICE] = {
809 .needs_file = 1,
810 .hash_reg_file = 1,
811 .unbound_nonreg_file = 1,
ddf0322d
JA
812 },
813 [IORING_OP_PROVIDE_BUFFERS] = {},
d3656344
JA
814};
815
561fb04a 816static void io_wq_submit_work(struct io_wq_work **workptr);
78e19bbe 817static void io_cqring_fill_event(struct io_kiocb *req, long res);
ec9c02ad 818static void io_put_req(struct io_kiocb *req);
978db57e 819static void __io_double_put_req(struct io_kiocb *req);
94ae5e77
JA
820static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req);
821static void io_queue_linked_timeout(struct io_kiocb *req);
05f3fb3c
JA
822static int __io_sqe_files_update(struct io_ring_ctx *ctx,
823 struct io_uring_files_update *ip,
824 unsigned nr_args);
f86cd20c 825static int io_grab_files(struct io_kiocb *req);
2faf852d 826static void io_ring_file_ref_flush(struct fixed_file_data *data);
99bc4c38 827static void io_cleanup_req(struct io_kiocb *req);
b41e9852
JA
828static int io_file_get(struct io_submit_state *state, struct io_kiocb *req,
829 int fd, struct file **out_file, bool fixed);
830static void __io_queue_sqe(struct io_kiocb *req,
831 const struct io_uring_sqe *sqe);
de0617e4 832
2b188cc1
JA
833static struct kmem_cache *req_cachep;
834
835static const struct file_operations io_uring_fops;
836
837struct sock *io_uring_get_socket(struct file *file)
838{
839#if defined(CONFIG_UNIX)
840 if (file->f_op == &io_uring_fops) {
841 struct io_ring_ctx *ctx = file->private_data;
842
843 return ctx->ring_sock->sk;
844 }
845#endif
846 return NULL;
847}
848EXPORT_SYMBOL(io_uring_get_socket);
849
850static void io_ring_ctx_ref_free(struct percpu_ref *ref)
851{
852 struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);
853
206aefde 854 complete(&ctx->completions[0]);
2b188cc1
JA
855}
856
857static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
858{
859 struct io_ring_ctx *ctx;
78076bb6 860 int hash_bits;
2b188cc1
JA
861
862 ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
863 if (!ctx)
864 return NULL;
865
0ddf92e8
JA
866 ctx->fallback_req = kmem_cache_alloc(req_cachep, GFP_KERNEL);
867 if (!ctx->fallback_req)
868 goto err;
869
206aefde
JA
870 ctx->completions = kmalloc(2 * sizeof(struct completion), GFP_KERNEL);
871 if (!ctx->completions)
872 goto err;
873
78076bb6
JA
874 /*
875 * Use 5 bits less than the max cq entries, that should give us around
876 * 32 entries per hash list if totally full and uniformly spread.
877 */
878 hash_bits = ilog2(p->cq_entries);
879 hash_bits -= 5;
880 if (hash_bits <= 0)
881 hash_bits = 1;
882 ctx->cancel_hash_bits = hash_bits;
883 ctx->cancel_hash = kmalloc((1U << hash_bits) * sizeof(struct hlist_head),
884 GFP_KERNEL);
885 if (!ctx->cancel_hash)
886 goto err;
887 __hash_init(ctx->cancel_hash, 1U << hash_bits);
888
21482896 889 if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free,
206aefde
JA
890 PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
891 goto err;
2b188cc1
JA
892
893 ctx->flags = p->flags;
894 init_waitqueue_head(&ctx->cq_wait);
1d7bb1d5 895 INIT_LIST_HEAD(&ctx->cq_overflow_list);
206aefde
JA
896 init_completion(&ctx->completions[0]);
897 init_completion(&ctx->completions[1]);
5a2e745d 898 idr_init(&ctx->io_buffer_idr);
071698e1 899 idr_init(&ctx->personality_idr);
2b188cc1
JA
900 mutex_init(&ctx->uring_lock);
901 init_waitqueue_head(&ctx->wait);
902 spin_lock_init(&ctx->completion_lock);
def596e9 903 INIT_LIST_HEAD(&ctx->poll_list);
de0617e4 904 INIT_LIST_HEAD(&ctx->defer_list);
5262f567 905 INIT_LIST_HEAD(&ctx->timeout_list);
fcb323cc
JA
906 init_waitqueue_head(&ctx->inflight_wait);
907 spin_lock_init(&ctx->inflight_lock);
908 INIT_LIST_HEAD(&ctx->inflight_list);
2b188cc1 909 return ctx;
206aefde 910err:
0ddf92e8
JA
911 if (ctx->fallback_req)
912 kmem_cache_free(req_cachep, ctx->fallback_req);
206aefde 913 kfree(ctx->completions);
78076bb6 914 kfree(ctx->cancel_hash);
206aefde
JA
915 kfree(ctx);
916 return NULL;
2b188cc1
JA
917}
918
9d858b21 919static inline bool __req_need_defer(struct io_kiocb *req)
7adf4eaf 920{
a197f664
JL
921 struct io_ring_ctx *ctx = req->ctx;
922
498ccd9e
JA
923 return req->sequence != ctx->cached_cq_tail + ctx->cached_sq_dropped
924 + atomic_read(&ctx->cached_cq_overflow);
7adf4eaf
JA
925}
926
9d858b21 927static inline bool req_need_defer(struct io_kiocb *req)
de0617e4 928{
87987898 929 if (unlikely(req->flags & REQ_F_IO_DRAIN))
9d858b21 930 return __req_need_defer(req);
de0617e4 931
9d858b21 932 return false;
de0617e4
JA
933}
934
7adf4eaf 935static struct io_kiocb *io_get_deferred_req(struct io_ring_ctx *ctx)
de0617e4
JA
936{
937 struct io_kiocb *req;
938
7adf4eaf 939 req = list_first_entry_or_null(&ctx->defer_list, struct io_kiocb, list);
9d858b21 940 if (req && !req_need_defer(req)) {
de0617e4
JA
941 list_del_init(&req->list);
942 return req;
943 }
944
945 return NULL;
946}
947
5262f567
JA
948static struct io_kiocb *io_get_timeout_req(struct io_ring_ctx *ctx)
949{
7adf4eaf
JA
950 struct io_kiocb *req;
951
952 req = list_first_entry_or_null(&ctx->timeout_list, struct io_kiocb, list);
93bd25bb
JA
953 if (req) {
954 if (req->flags & REQ_F_TIMEOUT_NOSEQ)
955 return NULL;
fb4b3d3f 956 if (!__req_need_defer(req)) {
93bd25bb
JA
957 list_del_init(&req->list);
958 return req;
959 }
7adf4eaf
JA
960 }
961
962 return NULL;
5262f567
JA
963}
964
de0617e4 965static void __io_commit_cqring(struct io_ring_ctx *ctx)
2b188cc1 966{
75b28aff 967 struct io_rings *rings = ctx->rings;
2b188cc1 968
07910158
PB
969 /* order cqe stores with ring update */
970 smp_store_release(&rings->cq.tail, ctx->cached_cq_tail);
2b188cc1 971
07910158
PB
972 if (wq_has_sleeper(&ctx->cq_wait)) {
973 wake_up_interruptible(&ctx->cq_wait);
974 kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN);
2b188cc1
JA
975 }
976}
977
cccf0ee8
JA
978static inline void io_req_work_grab_env(struct io_kiocb *req,
979 const struct io_op_def *def)
980{
981 if (!req->work.mm && def->needs_mm) {
982 mmgrab(current->mm);
983 req->work.mm = current->mm;
2b188cc1 984 }
cccf0ee8
JA
985 if (!req->work.creds)
986 req->work.creds = get_current_cred();
ff002b30
JA
987 if (!req->work.fs && def->needs_fs) {
988 spin_lock(&current->fs->lock);
989 if (!current->fs->in_exec) {
990 req->work.fs = current->fs;
991 req->work.fs->users++;
992 } else {
993 req->work.flags |= IO_WQ_WORK_CANCEL;
994 }
995 spin_unlock(&current->fs->lock);
996 }
6ab23144
JA
997 if (!req->work.task_pid)
998 req->work.task_pid = task_pid_vnr(current);
2b188cc1
JA
999}
1000
cccf0ee8 1001static inline void io_req_work_drop_env(struct io_kiocb *req)
18d9be1a 1002{
cccf0ee8
JA
1003 if (req->work.mm) {
1004 mmdrop(req->work.mm);
1005 req->work.mm = NULL;
1006 }
1007 if (req->work.creds) {
1008 put_cred(req->work.creds);
1009 req->work.creds = NULL;
1010 }
ff002b30
JA
1011 if (req->work.fs) {
1012 struct fs_struct *fs = req->work.fs;
1013
1014 spin_lock(&req->work.fs->lock);
1015 if (--fs->users)
1016 fs = NULL;
1017 spin_unlock(&req->work.fs->lock);
1018 if (fs)
1019 free_fs_struct(fs);
1020 }
561fb04a
JA
1021}
1022
94ae5e77
JA
1023static inline bool io_prep_async_work(struct io_kiocb *req,
1024 struct io_kiocb **link)
18d9be1a 1025{
d3656344 1026 const struct io_op_def *def = &io_op_defs[req->opcode];
561fb04a 1027 bool do_hashed = false;
54a91f3b 1028
d3656344
JA
1029 if (req->flags & REQ_F_ISREG) {
1030 if (def->hash_reg_file)
3529d8c2 1031 do_hashed = true;
d3656344
JA
1032 } else {
1033 if (def->unbound_nonreg_file)
3529d8c2 1034 req->work.flags |= IO_WQ_WORK_UNBOUND;
54a91f3b 1035 }
cccf0ee8
JA
1036
1037 io_req_work_grab_env(req, def);
54a91f3b 1038
94ae5e77 1039 *link = io_prep_linked_timeout(req);
561fb04a
JA
1040 return do_hashed;
1041}
1042
a197f664 1043static inline void io_queue_async_work(struct io_kiocb *req)
561fb04a 1044{
a197f664 1045 struct io_ring_ctx *ctx = req->ctx;
94ae5e77
JA
1046 struct io_kiocb *link;
1047 bool do_hashed;
1048
1049 do_hashed = io_prep_async_work(req, &link);
561fb04a
JA
1050
1051 trace_io_uring_queue_async_work(ctx, do_hashed, req, &req->work,
1052 req->flags);
1053 if (!do_hashed) {
1054 io_wq_enqueue(ctx->io_wq, &req->work);
1055 } else {
1056 io_wq_enqueue_hashed(ctx->io_wq, &req->work,
1057 file_inode(req->file));
1058 }
94ae5e77
JA
1059
1060 if (link)
1061 io_queue_linked_timeout(link);
18d9be1a
JA
1062}
1063
5262f567
JA
1064static void io_kill_timeout(struct io_kiocb *req)
1065{
1066 int ret;
1067
2d28390a 1068 ret = hrtimer_try_to_cancel(&req->io->timeout.timer);
5262f567
JA
1069 if (ret != -1) {
1070 atomic_inc(&req->ctx->cq_timeouts);
842f9612 1071 list_del_init(&req->list);
78e19bbe 1072 io_cqring_fill_event(req, 0);
ec9c02ad 1073 io_put_req(req);
5262f567
JA
1074 }
1075}
1076
1077static void io_kill_timeouts(struct io_ring_ctx *ctx)
1078{
1079 struct io_kiocb *req, *tmp;
1080
1081 spin_lock_irq(&ctx->completion_lock);
1082 list_for_each_entry_safe(req, tmp, &ctx->timeout_list, list)
1083 io_kill_timeout(req);
1084 spin_unlock_irq(&ctx->completion_lock);
1085}
1086
de0617e4
JA
1087static void io_commit_cqring(struct io_ring_ctx *ctx)
1088{
1089 struct io_kiocb *req;
1090
5262f567
JA
1091 while ((req = io_get_timeout_req(ctx)) != NULL)
1092 io_kill_timeout(req);
1093
de0617e4
JA
1094 __io_commit_cqring(ctx);
1095
87987898 1096 while ((req = io_get_deferred_req(ctx)) != NULL)
a197f664 1097 io_queue_async_work(req);
de0617e4
JA
1098}
1099
2b188cc1
JA
1100static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx)
1101{
75b28aff 1102 struct io_rings *rings = ctx->rings;
2b188cc1
JA
1103 unsigned tail;
1104
1105 tail = ctx->cached_cq_tail;
115e12e5
SB
1106 /*
1107 * writes to the cq entry need to come after reading head; the
1108 * control dependency is enough as we're using WRITE_ONCE to
1109 * fill the cq entry
1110 */
75b28aff 1111 if (tail - READ_ONCE(rings->cq.head) == rings->cq_ring_entries)
2b188cc1
JA
1112 return NULL;
1113
1114 ctx->cached_cq_tail++;
75b28aff 1115 return &rings->cqes[tail & ctx->cq_mask];
2b188cc1
JA
1116}
1117
f2842ab5
JA
1118static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx)
1119{
f0b493e6
JA
1120 if (!ctx->cq_ev_fd)
1121 return false;
f2842ab5
JA
1122 if (!ctx->eventfd_async)
1123 return true;
b41e9852 1124 return io_wq_current_is_worker();
f2842ab5
JA
1125}
1126
b41e9852 1127static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
1d7bb1d5
JA
1128{
1129 if (waitqueue_active(&ctx->wait))
1130 wake_up(&ctx->wait);
1131 if (waitqueue_active(&ctx->sqo_wait))
1132 wake_up(&ctx->sqo_wait);
b41e9852 1133 if (io_should_trigger_evfd(ctx))
1d7bb1d5
JA
1134 eventfd_signal(ctx->cq_ev_fd, 1);
1135}
1136
c4a2ed72
JA
1137/* Returns true if there are no backlogged entries after the flush */
1138static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
1d7bb1d5
JA
1139{
1140 struct io_rings *rings = ctx->rings;
1141 struct io_uring_cqe *cqe;
1142 struct io_kiocb *req;
1143 unsigned long flags;
1144 LIST_HEAD(list);
1145
1146 if (!force) {
1147 if (list_empty_careful(&ctx->cq_overflow_list))
c4a2ed72 1148 return true;
1d7bb1d5
JA
1149 if ((ctx->cached_cq_tail - READ_ONCE(rings->cq.head) ==
1150 rings->cq_ring_entries))
c4a2ed72 1151 return false;
1d7bb1d5
JA
1152 }
1153
1154 spin_lock_irqsave(&ctx->completion_lock, flags);
1155
1156 /* if force is set, the ring is going away. always drop after that */
1157 if (force)
69b3e546 1158 ctx->cq_overflow_flushed = 1;
1d7bb1d5 1159
c4a2ed72 1160 cqe = NULL;
1d7bb1d5
JA
1161 while (!list_empty(&ctx->cq_overflow_list)) {
1162 cqe = io_get_cqring(ctx);
1163 if (!cqe && !force)
1164 break;
1165
1166 req = list_first_entry(&ctx->cq_overflow_list, struct io_kiocb,
1167 list);
1168 list_move(&req->list, &list);
2ca10259 1169 req->flags &= ~REQ_F_OVERFLOW;
1d7bb1d5
JA
1170 if (cqe) {
1171 WRITE_ONCE(cqe->user_data, req->user_data);
1172 WRITE_ONCE(cqe->res, req->result);
1173 WRITE_ONCE(cqe->flags, 0);
1174 } else {
1175 WRITE_ONCE(ctx->rings->cq_overflow,
1176 atomic_inc_return(&ctx->cached_cq_overflow));
1177 }
1178 }
1179
1180 io_commit_cqring(ctx);
ad3eb2c8
JA
1181 if (cqe) {
1182 clear_bit(0, &ctx->sq_check_overflow);
1183 clear_bit(0, &ctx->cq_check_overflow);
1184 }
1d7bb1d5
JA
1185 spin_unlock_irqrestore(&ctx->completion_lock, flags);
1186 io_cqring_ev_posted(ctx);
1187
1188 while (!list_empty(&list)) {
1189 req = list_first_entry(&list, struct io_kiocb, list);
1190 list_del(&req->list);
ec9c02ad 1191 io_put_req(req);
1d7bb1d5 1192 }
c4a2ed72
JA
1193
1194 return cqe != NULL;
1d7bb1d5
JA
1195}
1196
78e19bbe 1197static void io_cqring_fill_event(struct io_kiocb *req, long res)
2b188cc1 1198{
78e19bbe 1199 struct io_ring_ctx *ctx = req->ctx;
2b188cc1
JA
1200 struct io_uring_cqe *cqe;
1201
78e19bbe 1202 trace_io_uring_complete(ctx, req->user_data, res);
51c3ff62 1203
2b188cc1
JA
1204 /*
1205 * If we can't get a cq entry, userspace overflowed the
1206 * submission (by quite a lot). Increment the overflow count in
1207 * the ring.
1208 */
1209 cqe = io_get_cqring(ctx);
1d7bb1d5 1210 if (likely(cqe)) {
78e19bbe 1211 WRITE_ONCE(cqe->user_data, req->user_data);
2b188cc1 1212 WRITE_ONCE(cqe->res, res);
c71ffb67 1213 WRITE_ONCE(cqe->flags, 0);
1d7bb1d5 1214 } else if (ctx->cq_overflow_flushed) {
498ccd9e
JA
1215 WRITE_ONCE(ctx->rings->cq_overflow,
1216 atomic_inc_return(&ctx->cached_cq_overflow));
1d7bb1d5 1217 } else {
ad3eb2c8
JA
1218 if (list_empty(&ctx->cq_overflow_list)) {
1219 set_bit(0, &ctx->sq_check_overflow);
1220 set_bit(0, &ctx->cq_check_overflow);
1221 }
2ca10259 1222 req->flags |= REQ_F_OVERFLOW;
1d7bb1d5
JA
1223 refcount_inc(&req->refs);
1224 req->result = res;
1225 list_add_tail(&req->list, &ctx->cq_overflow_list);
2b188cc1
JA
1226 }
1227}
1228
78e19bbe 1229static void io_cqring_add_event(struct io_kiocb *req, long res)
2b188cc1 1230{
78e19bbe 1231 struct io_ring_ctx *ctx = req->ctx;
2b188cc1
JA
1232 unsigned long flags;
1233
1234 spin_lock_irqsave(&ctx->completion_lock, flags);
78e19bbe 1235 io_cqring_fill_event(req, res);
2b188cc1
JA
1236 io_commit_cqring(ctx);
1237 spin_unlock_irqrestore(&ctx->completion_lock, flags);
1238
8c838788 1239 io_cqring_ev_posted(ctx);
2b188cc1
JA
1240}
1241
0ddf92e8
JA
1242static inline bool io_is_fallback_req(struct io_kiocb *req)
1243{
1244 return req == (struct io_kiocb *)
1245 ((unsigned long) req->ctx->fallback_req & ~1UL);
1246}
1247
1248static struct io_kiocb *io_get_fallback_req(struct io_ring_ctx *ctx)
1249{
1250 struct io_kiocb *req;
1251
1252 req = ctx->fallback_req;
1253 if (!test_and_set_bit_lock(0, (unsigned long *) ctx->fallback_req))
1254 return req;
1255
1256 return NULL;
1257}
1258
2579f913
JA
1259static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
1260 struct io_submit_state *state)
2b188cc1 1261{
fd6fab2c 1262 gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
2b188cc1
JA
1263 struct io_kiocb *req;
1264
2579f913 1265 if (!state) {
fd6fab2c 1266 req = kmem_cache_alloc(req_cachep, gfp);
2579f913 1267 if (unlikely(!req))
0ddf92e8 1268 goto fallback;
2579f913
JA
1269 } else if (!state->free_reqs) {
1270 size_t sz;
1271 int ret;
1272
1273 sz = min_t(size_t, state->ios_left, ARRAY_SIZE(state->reqs));
fd6fab2c
JA
1274 ret = kmem_cache_alloc_bulk(req_cachep, gfp, sz, state->reqs);
1275
1276 /*
1277 * Bulk alloc is all-or-nothing. If we fail to get a batch,
1278 * retry single alloc to be on the safe side.
1279 */
1280 if (unlikely(ret <= 0)) {
1281 state->reqs[0] = kmem_cache_alloc(req_cachep, gfp);
1282 if (!state->reqs[0])
0ddf92e8 1283 goto fallback;
fd6fab2c
JA
1284 ret = 1;
1285 }
2579f913 1286 state->free_reqs = ret - 1;
6c8a3134 1287 req = state->reqs[ret - 1];
2579f913 1288 } else {
2579f913 1289 state->free_reqs--;
6c8a3134 1290 req = state->reqs[state->free_reqs];
2b188cc1
JA
1291 }
1292
0ddf92e8 1293got_it:
1a6b74fc 1294 req->io = NULL;
60c112b0 1295 req->file = NULL;
2579f913
JA
1296 req->ctx = ctx;
1297 req->flags = 0;
e65ef56d
JA
1298 /* one is dropped after submission, the other at completion */
1299 refcount_set(&req->refs, 2);
9e645e11 1300 req->result = 0;
561fb04a 1301 INIT_IO_WORK(&req->work, io_wq_submit_work);
2579f913 1302 return req;
0ddf92e8
JA
1303fallback:
1304 req = io_get_fallback_req(ctx);
1305 if (req)
1306 goto got_it;
6805b32e 1307 percpu_ref_put(&ctx->refs);
2b188cc1
JA
1308 return NULL;
1309}
1310
8da11c19
PB
1311static inline void io_put_file(struct io_kiocb *req, struct file *file,
1312 bool fixed)
1313{
1314 if (fixed)
1315 percpu_ref_put(&req->ctx->file_data->refs);
1316 else
1317 fput(file);
1318}
1319
2b85edfc 1320static void __io_req_do_free(struct io_kiocb *req)
def596e9 1321{
2b85edfc
PB
1322 if (likely(!io_is_fallback_req(req)))
1323 kmem_cache_free(req_cachep, req);
1324 else
1325 clear_bit_unlock(0, (unsigned long *) req->ctx->fallback_req);
1326}
1327
c6ca97b3 1328static void __io_req_aux_free(struct io_kiocb *req)
2b188cc1 1329{
929a3af9
PB
1330 if (req->flags & REQ_F_NEED_CLEANUP)
1331 io_cleanup_req(req);
1332
96fd84d8 1333 kfree(req->io);
8da11c19
PB
1334 if (req->file)
1335 io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE));
cccf0ee8
JA
1336
1337 io_req_work_drop_env(req);
def596e9
JA
1338}
1339
9e645e11 1340static void __io_free_req(struct io_kiocb *req)
2b188cc1 1341{
c6ca97b3 1342 __io_req_aux_free(req);
fcb323cc 1343
fcb323cc 1344 if (req->flags & REQ_F_INFLIGHT) {
c6ca97b3 1345 struct io_ring_ctx *ctx = req->ctx;
fcb323cc
JA
1346 unsigned long flags;
1347
1348 spin_lock_irqsave(&ctx->inflight_lock, flags);
1349 list_del(&req->inflight_entry);
1350 if (waitqueue_active(&ctx->inflight_wait))
1351 wake_up(&ctx->inflight_wait);
1352 spin_unlock_irqrestore(&ctx->inflight_lock, flags);
1353 }
2b85edfc
PB
1354
1355 percpu_ref_put(&req->ctx->refs);
1356 __io_req_do_free(req);
e65ef56d
JA
1357}
1358
c6ca97b3
JA
1359struct req_batch {
1360 void *reqs[IO_IOPOLL_BATCH];
1361 int to_free;
1362 int need_iter;
1363};
1364
1365static void io_free_req_many(struct io_ring_ctx *ctx, struct req_batch *rb)
1366{
10fef4be
JA
1367 int fixed_refs = rb->to_free;
1368
c6ca97b3
JA
1369 if (!rb->to_free)
1370 return;
1371 if (rb->need_iter) {
1372 int i, inflight = 0;
1373 unsigned long flags;
1374
10fef4be 1375 fixed_refs = 0;
c6ca97b3
JA
1376 for (i = 0; i < rb->to_free; i++) {
1377 struct io_kiocb *req = rb->reqs[i];
1378
10fef4be 1379 if (req->flags & REQ_F_FIXED_FILE) {
c6ca97b3 1380 req->file = NULL;
10fef4be
JA
1381 fixed_refs++;
1382 }
c6ca97b3
JA
1383 if (req->flags & REQ_F_INFLIGHT)
1384 inflight++;
c6ca97b3
JA
1385 __io_req_aux_free(req);
1386 }
1387 if (!inflight)
1388 goto do_free;
1389
1390 spin_lock_irqsave(&ctx->inflight_lock, flags);
1391 for (i = 0; i < rb->to_free; i++) {
1392 struct io_kiocb *req = rb->reqs[i];
1393
10fef4be 1394 if (req->flags & REQ_F_INFLIGHT) {
c6ca97b3
JA
1395 list_del(&req->inflight_entry);
1396 if (!--inflight)
1397 break;
1398 }
1399 }
1400 spin_unlock_irqrestore(&ctx->inflight_lock, flags);
1401
1402 if (waitqueue_active(&ctx->inflight_wait))
1403 wake_up(&ctx->inflight_wait);
1404 }
1405do_free:
1406 kmem_cache_free_bulk(req_cachep, rb->to_free, rb->reqs);
10fef4be
JA
1407 if (fixed_refs)
1408 percpu_ref_put_many(&ctx->file_data->refs, fixed_refs);
c6ca97b3 1409 percpu_ref_put_many(&ctx->refs, rb->to_free);
c6ca97b3 1410 rb->to_free = rb->need_iter = 0;
e65ef56d
JA
1411}
1412
a197f664 1413static bool io_link_cancel_timeout(struct io_kiocb *req)
2665abfd 1414{
a197f664 1415 struct io_ring_ctx *ctx = req->ctx;
2665abfd
JA
1416 int ret;
1417
2d28390a 1418 ret = hrtimer_try_to_cancel(&req->io->timeout.timer);
2665abfd 1419 if (ret != -1) {
78e19bbe 1420 io_cqring_fill_event(req, -ECANCELED);
2665abfd
JA
1421 io_commit_cqring(ctx);
1422 req->flags &= ~REQ_F_LINK;
ec9c02ad 1423 io_put_req(req);
2665abfd
JA
1424 return true;
1425 }
1426
1427 return false;
e65ef56d
JA
1428}
1429
ba816ad6 1430static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
9e645e11 1431{
2665abfd 1432 struct io_ring_ctx *ctx = req->ctx;
2665abfd 1433 bool wake_ev = false;
9e645e11 1434
4d7dd462
JA
1435 /* Already got next link */
1436 if (req->flags & REQ_F_LINK_NEXT)
1437 return;
1438
9e645e11
JA
1439 /*
1440 * The list should never be empty when we are called here. But could
1441 * potentially happen if the chain is messed up, check to be on the
1442 * safe side.
1443 */
4493233e
PB
1444 while (!list_empty(&req->link_list)) {
1445 struct io_kiocb *nxt = list_first_entry(&req->link_list,
1446 struct io_kiocb, link_list);
94ae5e77 1447
4493233e
PB
1448 if (unlikely((req->flags & REQ_F_LINK_TIMEOUT) &&
1449 (nxt->flags & REQ_F_TIMEOUT))) {
1450 list_del_init(&nxt->link_list);
94ae5e77 1451 wake_ev |= io_link_cancel_timeout(nxt);
94ae5e77
JA
1452 req->flags &= ~REQ_F_LINK_TIMEOUT;
1453 continue;
1454 }
9e645e11 1455
4493233e
PB
1456 list_del_init(&req->link_list);
1457 if (!list_empty(&nxt->link_list))
1458 nxt->flags |= REQ_F_LINK;
b18fdf71 1459 *nxtptr = nxt;
94ae5e77 1460 break;
9e645e11 1461 }
2665abfd 1462
4d7dd462 1463 req->flags |= REQ_F_LINK_NEXT;
2665abfd
JA
1464 if (wake_ev)
1465 io_cqring_ev_posted(ctx);
9e645e11
JA
1466}
1467
1468/*
1469 * Called if REQ_F_LINK is set, and we fail the head request
1470 */
1471static void io_fail_links(struct io_kiocb *req)
1472{
2665abfd 1473 struct io_ring_ctx *ctx = req->ctx;
2665abfd
JA
1474 unsigned long flags;
1475
1476 spin_lock_irqsave(&ctx->completion_lock, flags);
9e645e11
JA
1477
1478 while (!list_empty(&req->link_list)) {
4493233e
PB
1479 struct io_kiocb *link = list_first_entry(&req->link_list,
1480 struct io_kiocb, link_list);
9e645e11 1481
4493233e 1482 list_del_init(&link->link_list);
c826bd7a 1483 trace_io_uring_fail_link(req, link);
2665abfd
JA
1484
1485 if ((req->flags & REQ_F_LINK_TIMEOUT) &&
d625c6ee 1486 link->opcode == IORING_OP_LINK_TIMEOUT) {
a197f664 1487 io_link_cancel_timeout(link);
2665abfd 1488 } else {
78e19bbe 1489 io_cqring_fill_event(link, -ECANCELED);
978db57e 1490 __io_double_put_req(link);
2665abfd 1491 }
5d960724 1492 req->flags &= ~REQ_F_LINK_TIMEOUT;
9e645e11 1493 }
2665abfd
JA
1494
1495 io_commit_cqring(ctx);
1496 spin_unlock_irqrestore(&ctx->completion_lock, flags);
1497 io_cqring_ev_posted(ctx);
9e645e11
JA
1498}
1499
4d7dd462 1500static void io_req_find_next(struct io_kiocb *req, struct io_kiocb **nxt)
9e645e11 1501{
4d7dd462 1502 if (likely(!(req->flags & REQ_F_LINK)))
2665abfd 1503 return;
2665abfd 1504
9e645e11
JA
1505 /*
1506 * If LINK is set, we have dependent requests in this chain. If we
1507 * didn't fail this request, queue the first one up, moving any other
1508 * dependencies to the next request. In case of failure, fail the rest
1509 * of the chain.
1510 */
2665abfd
JA
1511 if (req->flags & REQ_F_FAIL_LINK) {
1512 io_fail_links(req);
7c9e7f0f
JA
1513 } else if ((req->flags & (REQ_F_LINK_TIMEOUT | REQ_F_COMP_LOCKED)) ==
1514 REQ_F_LINK_TIMEOUT) {
2665abfd
JA
1515 struct io_ring_ctx *ctx = req->ctx;
1516 unsigned long flags;
1517
1518 /*
1519 * If this is a timeout link, we could be racing with the
1520 * timeout timer. Grab the completion lock for this case to
7c9e7f0f 1521 * protect against that.
2665abfd
JA
1522 */
1523 spin_lock_irqsave(&ctx->completion_lock, flags);
1524 io_req_link_next(req, nxt);
1525 spin_unlock_irqrestore(&ctx->completion_lock, flags);
1526 } else {
1527 io_req_link_next(req, nxt);
9e645e11 1528 }
4d7dd462 1529}
9e645e11 1530
c69f8dbe
JL
1531static void io_free_req(struct io_kiocb *req)
1532{
944e58bf
PB
1533 struct io_kiocb *nxt = NULL;
1534
1535 io_req_find_next(req, &nxt);
70cf9f32 1536 __io_free_req(req);
944e58bf
PB
1537
1538 if (nxt)
1539 io_queue_async_work(nxt);
c69f8dbe
JL
1540}
1541
7a743e22
PB
1542static void io_link_work_cb(struct io_wq_work **workptr)
1543{
1544 struct io_wq_work *work = *workptr;
1545 struct io_kiocb *link = work->data;
1546
1547 io_queue_linked_timeout(link);
1548 io_wq_submit_work(workptr);
1549}
1550
1551static void io_wq_assign_next(struct io_wq_work **workptr, struct io_kiocb *nxt)
1552{
1553 struct io_kiocb *link;
1554
1555 *workptr = &nxt->work;
1556 link = io_prep_linked_timeout(nxt);
1557 if (link) {
1558 nxt->work.func = io_link_work_cb;
1559 nxt->work.data = link;
1560 }
1561}
1562
ba816ad6
JA
1563/*
1564 * Drop reference to request, return next in chain (if there is one) if this
1565 * was the last reference to this request.
1566 */
f9bd67f6 1567__attribute__((nonnull))
ec9c02ad 1568static void io_put_req_find_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
e65ef56d 1569{
2a44f467
JA
1570 if (refcount_dec_and_test(&req->refs)) {
1571 io_req_find_next(req, nxtptr);
4d7dd462 1572 __io_free_req(req);
2a44f467 1573 }
2b188cc1
JA
1574}
1575
e65ef56d
JA
1576static void io_put_req(struct io_kiocb *req)
1577{
1578 if (refcount_dec_and_test(&req->refs))
1579 io_free_req(req);
2b188cc1
JA
1580}
1581
e9fd9396
PB
1582static void io_steal_work(struct io_kiocb *req,
1583 struct io_wq_work **workptr)
7a743e22
PB
1584{
1585 /*
1586 * It's in an io-wq worker, so there always should be at least
1587 * one reference, which will be dropped in io_put_work() just
1588 * after the current handler returns.
1589 *
1590 * It also means, that if the counter dropped to 1, then there is
1591 * no asynchronous users left, so it's safe to steal the next work.
1592 */
7a743e22
PB
1593 if (refcount_read(&req->refs) == 1) {
1594 struct io_kiocb *nxt = NULL;
1595
1596 io_req_find_next(req, &nxt);
1597 if (nxt)
1598 io_wq_assign_next(workptr, nxt);
1599 }
1600}
1601
978db57e
JA
1602/*
1603 * Must only be used if we don't need to care about links, usually from
1604 * within the completion handling itself.
1605 */
1606static void __io_double_put_req(struct io_kiocb *req)
78e19bbe
JA
1607{
1608 /* drop both submit and complete references */
1609 if (refcount_sub_and_test(2, &req->refs))
1610 __io_free_req(req);
1611}
1612
978db57e
JA
1613static void io_double_put_req(struct io_kiocb *req)
1614{
1615 /* drop both submit and complete references */
1616 if (refcount_sub_and_test(2, &req->refs))
1617 io_free_req(req);
1618}
1619
1d7bb1d5 1620static unsigned io_cqring_events(struct io_ring_ctx *ctx, bool noflush)
a3a0e43f 1621{
84f97dc2
JA
1622 struct io_rings *rings = ctx->rings;
1623
ad3eb2c8
JA
1624 if (test_bit(0, &ctx->cq_check_overflow)) {
1625 /*
1626 * noflush == true is from the waitqueue handler, just ensure
1627 * we wake up the task, and the next invocation will flush the
1628 * entries. We cannot safely to it from here.
1629 */
1630 if (noflush && !list_empty(&ctx->cq_overflow_list))
1631 return -1U;
1d7bb1d5 1632
ad3eb2c8
JA
1633 io_cqring_overflow_flush(ctx, false);
1634 }
1d7bb1d5 1635
a3a0e43f
JA
1636 /* See comment at the top of this file */
1637 smp_rmb();
ad3eb2c8 1638 return ctx->cached_cq_tail - READ_ONCE(rings->cq.head);
a3a0e43f
JA
1639}
1640
fb5ccc98
PB
1641static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
1642{
1643 struct io_rings *rings = ctx->rings;
1644
1645 /* make sure SQ entry isn't read before tail */
1646 return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head;
1647}
1648
8237e045 1649static inline bool io_req_multi_free(struct req_batch *rb, struct io_kiocb *req)
e94f141b 1650{
c6ca97b3
JA
1651 if ((req->flags & REQ_F_LINK) || io_is_fallback_req(req))
1652 return false;
e94f141b 1653
c6ca97b3
JA
1654 if (!(req->flags & REQ_F_FIXED_FILE) || req->io)
1655 rb->need_iter++;
1656
1657 rb->reqs[rb->to_free++] = req;
1658 if (unlikely(rb->to_free == ARRAY_SIZE(rb->reqs)))
1659 io_free_req_many(req->ctx, rb);
1660 return true;
e94f141b
JA
1661}
1662
def596e9
JA
1663/*
1664 * Find and free completed poll iocbs
1665 */
1666static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
1667 struct list_head *done)
1668{
8237e045 1669 struct req_batch rb;
def596e9 1670 struct io_kiocb *req;
def596e9 1671
c6ca97b3 1672 rb.to_free = rb.need_iter = 0;
def596e9
JA
1673 while (!list_empty(done)) {
1674 req = list_first_entry(done, struct io_kiocb, list);
1675 list_del(&req->list);
1676
78e19bbe 1677 io_cqring_fill_event(req, req->result);
def596e9
JA
1678 (*nr_events)++;
1679
8237e045
JA
1680 if (refcount_dec_and_test(&req->refs) &&
1681 !io_req_multi_free(&rb, req))
1682 io_free_req(req);
def596e9 1683 }
def596e9 1684
09bb8394 1685 io_commit_cqring(ctx);
8237e045 1686 io_free_req_many(ctx, &rb);
def596e9
JA
1687}
1688
1689static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
1690 long min)
1691{
1692 struct io_kiocb *req, *tmp;
1693 LIST_HEAD(done);
1694 bool spin;
1695 int ret;
1696
1697 /*
1698 * Only spin for completions if we don't have multiple devices hanging
1699 * off our complete list, and we're under the requested amount.
1700 */
1701 spin = !ctx->poll_multi_file && *nr_events < min;
1702
1703 ret = 0;
1704 list_for_each_entry_safe(req, tmp, &ctx->poll_list, list) {
9adbd45d 1705 struct kiocb *kiocb = &req->rw.kiocb;
def596e9
JA
1706
1707 /*
1708 * Move completed entries to our local list. If we find a
1709 * request that requires polling, break out and complete
1710 * the done list first, if we have entries there.
1711 */
1712 if (req->flags & REQ_F_IOPOLL_COMPLETED) {
1713 list_move_tail(&req->list, &done);
1714 continue;
1715 }
1716 if (!list_empty(&done))
1717 break;
1718
1719 ret = kiocb->ki_filp->f_op->iopoll(kiocb, spin);
1720 if (ret < 0)
1721 break;
1722
1723 if (ret && spin)
1724 spin = false;
1725 ret = 0;
1726 }
1727
1728 if (!list_empty(&done))
1729 io_iopoll_complete(ctx, nr_events, &done);
1730
1731 return ret;
1732}
1733
1734/*
d195a66e 1735 * Poll for a minimum of 'min' events. Note that if min == 0 we consider that a
def596e9
JA
1736 * non-spinning poll check - we'll still enter the driver poll loop, but only
1737 * as a non-spinning completion check.
1738 */
1739static int io_iopoll_getevents(struct io_ring_ctx *ctx, unsigned int *nr_events,
1740 long min)
1741{
08f5439f 1742 while (!list_empty(&ctx->poll_list) && !need_resched()) {
def596e9
JA
1743 int ret;
1744
1745 ret = io_do_iopoll(ctx, nr_events, min);
1746 if (ret < 0)
1747 return ret;
1748 if (!min || *nr_events >= min)
1749 return 0;
1750 }
1751
1752 return 1;
1753}
1754
1755/*
1756 * We can't just wait for polled events to come to us, we have to actively
1757 * find and complete them.
1758 */
1759static void io_iopoll_reap_events(struct io_ring_ctx *ctx)
1760{
1761 if (!(ctx->flags & IORING_SETUP_IOPOLL))
1762 return;
1763
1764 mutex_lock(&ctx->uring_lock);
1765 while (!list_empty(&ctx->poll_list)) {
1766 unsigned int nr_events = 0;
1767
1768 io_iopoll_getevents(ctx, &nr_events, 1);
08f5439f
JA
1769
1770 /*
1771 * Ensure we allow local-to-the-cpu processing to take place,
1772 * in this case we need to ensure that we reap all events.
1773 */
1774 cond_resched();
def596e9
JA
1775 }
1776 mutex_unlock(&ctx->uring_lock);
1777}
1778
c7849be9
XW
1779static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events,
1780 long min)
def596e9 1781{
2b2ed975 1782 int iters = 0, ret = 0;
500f9fba 1783
c7849be9
XW
1784 /*
1785 * We disallow the app entering submit/complete with polling, but we
1786 * still need to lock the ring to prevent racing with polled issue
1787 * that got punted to a workqueue.
1788 */
1789 mutex_lock(&ctx->uring_lock);
def596e9
JA
1790 do {
1791 int tmin = 0;
1792
a3a0e43f
JA
1793 /*
1794 * Don't enter poll loop if we already have events pending.
1795 * If we do, we can potentially be spinning for commands that
1796 * already triggered a CQE (eg in error).
1797 */
1d7bb1d5 1798 if (io_cqring_events(ctx, false))
a3a0e43f
JA
1799 break;
1800
500f9fba
JA
1801 /*
1802 * If a submit got punted to a workqueue, we can have the
1803 * application entering polling for a command before it gets
1804 * issued. That app will hold the uring_lock for the duration
1805 * of the poll right here, so we need to take a breather every
1806 * now and then to ensure that the issue has a chance to add
1807 * the poll to the issued list. Otherwise we can spin here
1808 * forever, while the workqueue is stuck trying to acquire the
1809 * very same mutex.
1810 */
1811 if (!(++iters & 7)) {
1812 mutex_unlock(&ctx->uring_lock);
1813 mutex_lock(&ctx->uring_lock);
1814 }
1815
def596e9
JA
1816 if (*nr_events < min)
1817 tmin = min - *nr_events;
1818
1819 ret = io_iopoll_getevents(ctx, nr_events, tmin);
1820 if (ret <= 0)
1821 break;
1822 ret = 0;
1823 } while (min && !*nr_events && !need_resched());
1824
500f9fba 1825 mutex_unlock(&ctx->uring_lock);
def596e9
JA
1826 return ret;
1827}
1828
491381ce 1829static void kiocb_end_write(struct io_kiocb *req)
2b188cc1 1830{
491381ce
JA
1831 /*
1832 * Tell lockdep we inherited freeze protection from submission
1833 * thread.
1834 */
1835 if (req->flags & REQ_F_ISREG) {
1836 struct inode *inode = file_inode(req->file);
2b188cc1 1837
491381ce 1838 __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE);
2b188cc1 1839 }
491381ce 1840 file_end_write(req->file);
2b188cc1
JA
1841}
1842
4e88d6e7
JA
1843static inline void req_set_fail_links(struct io_kiocb *req)
1844{
1845 if ((req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) == REQ_F_LINK)
1846 req->flags |= REQ_F_FAIL_LINK;
1847}
1848
ba816ad6 1849static void io_complete_rw_common(struct kiocb *kiocb, long res)
2b188cc1 1850{
9adbd45d 1851 struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
2b188cc1 1852
491381ce
JA
1853 if (kiocb->ki_flags & IOCB_WRITE)
1854 kiocb_end_write(req);
2b188cc1 1855
4e88d6e7
JA
1856 if (res != req->result)
1857 req_set_fail_links(req);
78e19bbe 1858 io_cqring_add_event(req, res);
ba816ad6
JA
1859}
1860
1861static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
1862{
9adbd45d 1863 struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
ba816ad6
JA
1864
1865 io_complete_rw_common(kiocb, res);
e65ef56d 1866 io_put_req(req);
2b188cc1
JA
1867}
1868
def596e9
JA
1869static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
1870{
9adbd45d 1871 struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
def596e9 1872
491381ce
JA
1873 if (kiocb->ki_flags & IOCB_WRITE)
1874 kiocb_end_write(req);
def596e9 1875
4e88d6e7
JA
1876 if (res != req->result)
1877 req_set_fail_links(req);
9e645e11 1878 req->result = res;
def596e9
JA
1879 if (res != -EAGAIN)
1880 req->flags |= REQ_F_IOPOLL_COMPLETED;
1881}
1882
1883/*
1884 * After the iocb has been issued, it's safe to be found on the poll list.
1885 * Adding the kiocb to the list AFTER submission ensures that we don't
1886 * find it from a io_iopoll_getevents() thread before the issuer is done
1887 * accessing the kiocb cookie.
1888 */
1889static void io_iopoll_req_issued(struct io_kiocb *req)
1890{
1891 struct io_ring_ctx *ctx = req->ctx;
1892
1893 /*
1894 * Track whether we have multiple files in our lists. This will impact
1895 * how we do polling eventually, not spinning if we're on potentially
1896 * different devices.
1897 */
1898 if (list_empty(&ctx->poll_list)) {
1899 ctx->poll_multi_file = false;
1900 } else if (!ctx->poll_multi_file) {
1901 struct io_kiocb *list_req;
1902
1903 list_req = list_first_entry(&ctx->poll_list, struct io_kiocb,
1904 list);
9adbd45d 1905 if (list_req->file != req->file)
def596e9
JA
1906 ctx->poll_multi_file = true;
1907 }
1908
1909 /*
1910 * For fast devices, IO may have already completed. If it has, add
1911 * it to the front so we find it first.
1912 */
1913 if (req->flags & REQ_F_IOPOLL_COMPLETED)
1914 list_add(&req->list, &ctx->poll_list);
1915 else
1916 list_add_tail(&req->list, &ctx->poll_list);
bdcd3eab
XW
1917
1918 if ((ctx->flags & IORING_SETUP_SQPOLL) &&
1919 wq_has_sleeper(&ctx->sqo_wait))
1920 wake_up(&ctx->sqo_wait);
def596e9
JA
1921}
1922
3d6770fb 1923static void io_file_put(struct io_submit_state *state)
9a56a232 1924{
3d6770fb 1925 if (state->file) {
9a56a232
JA
1926 int diff = state->has_refs - state->used_refs;
1927
1928 if (diff)
1929 fput_many(state->file, diff);
1930 state->file = NULL;
1931 }
1932}
1933
1934/*
1935 * Get as many references to a file as we have IOs left in this submission,
1936 * assuming most submissions are for one file, or at least that each file
1937 * has more than one submission.
1938 */
8da11c19 1939static struct file *__io_file_get(struct io_submit_state *state, int fd)
9a56a232
JA
1940{
1941 if (!state)
1942 return fget(fd);
1943
1944 if (state->file) {
1945 if (state->fd == fd) {
1946 state->used_refs++;
1947 state->ios_left--;
1948 return state->file;
1949 }
3d6770fb 1950 io_file_put(state);
9a56a232
JA
1951 }
1952 state->file = fget_many(fd, state->ios_left);
1953 if (!state->file)
1954 return NULL;
1955
1956 state->fd = fd;
1957 state->has_refs = state->ios_left;
1958 state->used_refs = 1;
1959 state->ios_left--;
1960 return state->file;
1961}
1962
2b188cc1
JA
1963/*
1964 * If we tracked the file through the SCM inflight mechanism, we could support
1965 * any file. For now, just ensure that anything potentially problematic is done
1966 * inline.
1967 */
1968static bool io_file_supports_async(struct file *file)
1969{
1970 umode_t mode = file_inode(file)->i_mode;
1971
10d59345 1972 if (S_ISBLK(mode) || S_ISCHR(mode) || S_ISSOCK(mode))
2b188cc1
JA
1973 return true;
1974 if (S_ISREG(mode) && file->f_op != &io_uring_fops)
1975 return true;
1976
1977 return false;
1978}
1979
3529d8c2
JA
1980static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
1981 bool force_nonblock)
2b188cc1 1982{
def596e9 1983 struct io_ring_ctx *ctx = req->ctx;
9adbd45d 1984 struct kiocb *kiocb = &req->rw.kiocb;
09bb8394
JA
1985 unsigned ioprio;
1986 int ret;
2b188cc1 1987
491381ce
JA
1988 if (S_ISREG(file_inode(req->file)->i_mode))
1989 req->flags |= REQ_F_ISREG;
1990
2b188cc1 1991 kiocb->ki_pos = READ_ONCE(sqe->off);
ba04291e
JA
1992 if (kiocb->ki_pos == -1 && !(req->file->f_mode & FMODE_STREAM)) {
1993 req->flags |= REQ_F_CUR_POS;
1994 kiocb->ki_pos = req->file->f_pos;
1995 }
2b188cc1 1996 kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp));
3e577dcd
PB
1997 kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
1998 ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));
1999 if (unlikely(ret))
2000 return ret;
2b188cc1
JA
2001
2002 ioprio = READ_ONCE(sqe->ioprio);
2003 if (ioprio) {
2004 ret = ioprio_check_cap(ioprio);
2005 if (ret)
09bb8394 2006 return ret;
2b188cc1
JA
2007
2008 kiocb->ki_ioprio = ioprio;
2009 } else
2010 kiocb->ki_ioprio = get_current_ioprio();
2011
8449eeda 2012 /* don't allow async punt if RWF_NOWAIT was requested */
491381ce
JA
2013 if ((kiocb->ki_flags & IOCB_NOWAIT) ||
2014 (req->file->f_flags & O_NONBLOCK))
8449eeda
SB
2015 req->flags |= REQ_F_NOWAIT;
2016
2017 if (force_nonblock)
2b188cc1 2018 kiocb->ki_flags |= IOCB_NOWAIT;
8449eeda 2019
def596e9 2020 if (ctx->flags & IORING_SETUP_IOPOLL) {
def596e9
JA
2021 if (!(kiocb->ki_flags & IOCB_DIRECT) ||
2022 !kiocb->ki_filp->f_op->iopoll)
09bb8394 2023 return -EOPNOTSUPP;
2b188cc1 2024
def596e9
JA
2025 kiocb->ki_flags |= IOCB_HIPRI;
2026 kiocb->ki_complete = io_complete_rw_iopoll;
6873e0bd 2027 req->result = 0;
def596e9 2028 } else {
09bb8394
JA
2029 if (kiocb->ki_flags & IOCB_HIPRI)
2030 return -EINVAL;
def596e9
JA
2031 kiocb->ki_complete = io_complete_rw;
2032 }
9adbd45d 2033
3529d8c2
JA
2034 req->rw.addr = READ_ONCE(sqe->addr);
2035 req->rw.len = READ_ONCE(sqe->len);
9adbd45d
JA
2036 /* we own ->private, reuse it for the buffer index */
2037 req->rw.kiocb.private = (void *) (unsigned long)
3529d8c2 2038 READ_ONCE(sqe->buf_index);
2b188cc1 2039 return 0;
2b188cc1
JA
2040}
2041
2042static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
2043{
2044 switch (ret) {
2045 case -EIOCBQUEUED:
2046 break;
2047 case -ERESTARTSYS:
2048 case -ERESTARTNOINTR:
2049 case -ERESTARTNOHAND:
2050 case -ERESTART_RESTARTBLOCK:
2051 /*
2052 * We can't just restart the syscall, since previously
2053 * submitted sqes may already be in progress. Just fail this
2054 * IO with EINTR.
2055 */
2056 ret = -EINTR;
2057 /* fall through */
2058 default:
2059 kiocb->ki_complete(kiocb, ret, 0);
2060 }
2061}
2062
014db007 2063static void kiocb_done(struct kiocb *kiocb, ssize_t ret)
ba816ad6 2064{
ba04291e
JA
2065 struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
2066
2067 if (req->flags & REQ_F_CUR_POS)
2068 req->file->f_pos = kiocb->ki_pos;
bcaec089 2069 if (ret >= 0 && kiocb->ki_complete == io_complete_rw)
014db007 2070 io_complete_rw(kiocb, ret, 0);
ba816ad6
JA
2071 else
2072 io_rw_done(kiocb, ret);
2073}
2074
9adbd45d 2075static ssize_t io_import_fixed(struct io_kiocb *req, int rw,
7d009165 2076 struct iov_iter *iter)
edafccee 2077{
9adbd45d
JA
2078 struct io_ring_ctx *ctx = req->ctx;
2079 size_t len = req->rw.len;
edafccee
JA
2080 struct io_mapped_ubuf *imu;
2081 unsigned index, buf_index;
2082 size_t offset;
2083 u64 buf_addr;
2084
2085 /* attempt to use fixed buffers without having provided iovecs */
2086 if (unlikely(!ctx->user_bufs))
2087 return -EFAULT;
2088
9adbd45d 2089 buf_index = (unsigned long) req->rw.kiocb.private;
edafccee
JA
2090 if (unlikely(buf_index >= ctx->nr_user_bufs))
2091 return -EFAULT;
2092
2093 index = array_index_nospec(buf_index, ctx->nr_user_bufs);
2094 imu = &ctx->user_bufs[index];
9adbd45d 2095 buf_addr = req->rw.addr;
edafccee
JA
2096
2097 /* overflow */
2098 if (buf_addr + len < buf_addr)
2099 return -EFAULT;
2100 /* not inside the mapped region */
2101 if (buf_addr < imu->ubuf || buf_addr + len > imu->ubuf + imu->len)
2102 return -EFAULT;
2103
2104 /*
2105 * May not be a start of buffer, set size appropriately
2106 * and advance us to the beginning.
2107 */
2108 offset = buf_addr - imu->ubuf;
2109 iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len);
bd11b3a3
JA
2110
2111 if (offset) {
2112 /*
2113 * Don't use iov_iter_advance() here, as it's really slow for
2114 * using the latter parts of a big fixed buffer - it iterates
2115 * over each segment manually. We can cheat a bit here, because
2116 * we know that:
2117 *
2118 * 1) it's a BVEC iter, we set it up
2119 * 2) all bvecs are PAGE_SIZE in size, except potentially the
2120 * first and last bvec
2121 *
2122 * So just find our index, and adjust the iterator afterwards.
2123 * If the offset is within the first bvec (or the whole first
2124 * bvec, just use iov_iter_advance(). This makes it easier
2125 * since we can just skip the first segment, which may not
2126 * be PAGE_SIZE aligned.
2127 */
2128 const struct bio_vec *bvec = imu->bvec;
2129
2130 if (offset <= bvec->bv_len) {
2131 iov_iter_advance(iter, offset);
2132 } else {
2133 unsigned long seg_skip;
2134
2135 /* skip first vec */
2136 offset -= bvec->bv_len;
2137 seg_skip = 1 + (offset >> PAGE_SHIFT);
2138
2139 iter->bvec = bvec + seg_skip;
2140 iter->nr_segs -= seg_skip;
99c79f66 2141 iter->count -= bvec->bv_len + offset;
bd11b3a3 2142 iter->iov_offset = offset & ~PAGE_MASK;
bd11b3a3
JA
2143 }
2144 }
2145
5e559561 2146 return len;
edafccee
JA
2147}
2148
cf6fd4bd
PB
2149static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
2150 struct iovec **iovec, struct iov_iter *iter)
2b188cc1 2151{
9adbd45d
JA
2152 void __user *buf = u64_to_user_ptr(req->rw.addr);
2153 size_t sqe_len = req->rw.len;
edafccee
JA
2154 u8 opcode;
2155
d625c6ee 2156 opcode = req->opcode;
7d009165 2157 if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) {
edafccee 2158 *iovec = NULL;
9adbd45d 2159 return io_import_fixed(req, rw, iter);
edafccee 2160 }
2b188cc1 2161
9adbd45d
JA
2162 /* buffer index only valid with fixed read/write */
2163 if (req->rw.kiocb.private)
2164 return -EINVAL;
2165
3a6820f2
JA
2166 if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) {
2167 ssize_t ret;
2168 ret = import_single_range(rw, buf, sqe_len, *iovec, iter);
2169 *iovec = NULL;
3a901598 2170 return ret < 0 ? ret : sqe_len;
3a6820f2
JA
2171 }
2172
f67676d1
JA
2173 if (req->io) {
2174 struct io_async_rw *iorw = &req->io->rw;
2175
2176 *iovec = iorw->iov;
2177 iov_iter_init(iter, rw, *iovec, iorw->nr_segs, iorw->size);
2178 if (iorw->iov == iorw->fast_iov)
2179 *iovec = NULL;
2180 return iorw->size;
2181 }
2182
2b188cc1 2183#ifdef CONFIG_COMPAT
cf6fd4bd 2184 if (req->ctx->compat)
2b188cc1
JA
2185 return compat_import_iovec(rw, buf, sqe_len, UIO_FASTIOV,
2186 iovec, iter);
2187#endif
2188
2189 return import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter);
2190}
2191
31b51510 2192/*
32960613
JA
2193 * For files that don't have ->read_iter() and ->write_iter(), handle them
2194 * by looping over ->read() or ->write() manually.
31b51510 2195 */
32960613
JA
2196static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb,
2197 struct iov_iter *iter)
2198{
2199 ssize_t ret = 0;
2200
2201 /*
2202 * Don't support polled IO through this interface, and we can't
2203 * support non-blocking either. For the latter, this just causes
2204 * the kiocb to be handled from an async context.
2205 */
2206 if (kiocb->ki_flags & IOCB_HIPRI)
2207 return -EOPNOTSUPP;
2208 if (kiocb->ki_flags & IOCB_NOWAIT)
2209 return -EAGAIN;
2210
2211 while (iov_iter_count(iter)) {
311ae9e1 2212 struct iovec iovec;
32960613
JA
2213 ssize_t nr;
2214
311ae9e1
PB
2215 if (!iov_iter_is_bvec(iter)) {
2216 iovec = iov_iter_iovec(iter);
2217 } else {
2218 /* fixed buffers import bvec */
2219 iovec.iov_base = kmap(iter->bvec->bv_page)
2220 + iter->iov_offset;
2221 iovec.iov_len = min(iter->count,
2222 iter->bvec->bv_len - iter->iov_offset);
2223 }
2224
32960613
JA
2225 if (rw == READ) {
2226 nr = file->f_op->read(file, iovec.iov_base,
2227 iovec.iov_len, &kiocb->ki_pos);
2228 } else {
2229 nr = file->f_op->write(file, iovec.iov_base,
2230 iovec.iov_len, &kiocb->ki_pos);
2231 }
2232
311ae9e1
PB
2233 if (iov_iter_is_bvec(iter))
2234 kunmap(iter->bvec->bv_page);
2235
32960613
JA
2236 if (nr < 0) {
2237 if (!ret)
2238 ret = nr;
2239 break;
2240 }
2241 ret += nr;
2242 if (nr != iovec.iov_len)
2243 break;
2244 iov_iter_advance(iter, nr);
2245 }
2246
2247 return ret;
2248}
2249
b7bb4f7d 2250static void io_req_map_rw(struct io_kiocb *req, ssize_t io_size,
f67676d1
JA
2251 struct iovec *iovec, struct iovec *fast_iov,
2252 struct iov_iter *iter)
2253{
2254 req->io->rw.nr_segs = iter->nr_segs;
2255 req->io->rw.size = io_size;
2256 req->io->rw.iov = iovec;
2257 if (!req->io->rw.iov) {
2258 req->io->rw.iov = req->io->rw.fast_iov;
2259 memcpy(req->io->rw.iov, fast_iov,
2260 sizeof(struct iovec) * iter->nr_segs);
99bc4c38
PB
2261 } else {
2262 req->flags |= REQ_F_NEED_CLEANUP;
f67676d1
JA
2263 }
2264}
2265
b7bb4f7d 2266static int io_alloc_async_ctx(struct io_kiocb *req)
f67676d1 2267{
d3656344
JA
2268 if (!io_op_defs[req->opcode].async_ctx)
2269 return 0;
f67676d1 2270 req->io = kmalloc(sizeof(*req->io), GFP_KERNEL);
06b76d44 2271 return req->io == NULL;
b7bb4f7d
JA
2272}
2273
b7bb4f7d
JA
2274static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size,
2275 struct iovec *iovec, struct iovec *fast_iov,
2276 struct iov_iter *iter)
2277{
980ad263 2278 if (!io_op_defs[req->opcode].async_ctx)
74566df3 2279 return 0;
5d204bcf
JA
2280 if (!req->io) {
2281 if (io_alloc_async_ctx(req))
2282 return -ENOMEM;
b7bb4f7d 2283
5d204bcf
JA
2284 io_req_map_rw(req, io_size, iovec, fast_iov, iter);
2285 }
b7bb4f7d 2286 return 0;
f67676d1
JA
2287}
2288
3529d8c2
JA
2289static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
2290 bool force_nonblock)
f67676d1 2291{
3529d8c2
JA
2292 struct io_async_ctx *io;
2293 struct iov_iter iter;
f67676d1
JA
2294 ssize_t ret;
2295
3529d8c2
JA
2296 ret = io_prep_rw(req, sqe, force_nonblock);
2297 if (ret)
2298 return ret;
f67676d1 2299
3529d8c2
JA
2300 if (unlikely(!(req->file->f_mode & FMODE_READ)))
2301 return -EBADF;
f67676d1 2302
5f798bea
PB
2303 /* either don't need iovec imported or already have it */
2304 if (!req->io || req->flags & REQ_F_NEED_CLEANUP)
3529d8c2
JA
2305 return 0;
2306
2307 io = req->io;
2308 io->rw.iov = io->rw.fast_iov;
2309 req->io = NULL;
2310 ret = io_import_iovec(READ, req, &io->rw.iov, &iter);
2311 req->io = io;
2312 if (ret < 0)
2313 return ret;
2314
2315 io_req_map_rw(req, ret, io->rw.iov, io->rw.fast_iov, &iter);
2316 return 0;
f67676d1
JA
2317}
2318
014db007 2319static int io_read(struct io_kiocb *req, bool force_nonblock)
2b188cc1
JA
2320{
2321 struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
9adbd45d 2322 struct kiocb *kiocb = &req->rw.kiocb;
2b188cc1 2323 struct iov_iter iter;
31b51510 2324 size_t iov_count;
f67676d1 2325 ssize_t io_size, ret;
2b188cc1 2326
3529d8c2 2327 ret = io_import_iovec(READ, req, &iovec, &iter);
06b76d44
JA
2328 if (ret < 0)
2329 return ret;
2b188cc1 2330
fd6c2e4c
JA
2331 /* Ensure we clear previously set non-block flag */
2332 if (!force_nonblock)
29de5f6a 2333 kiocb->ki_flags &= ~IOCB_NOWAIT;
fd6c2e4c 2334
797f3f53 2335 req->result = 0;
f67676d1 2336 io_size = ret;
9e645e11 2337 if (req->flags & REQ_F_LINK)
f67676d1
JA
2338 req->result = io_size;
2339
2340 /*
2341 * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so
2342 * we know to async punt it even if it was opened O_NONBLOCK
2343 */
29de5f6a 2344 if (force_nonblock && !io_file_supports_async(req->file))
f67676d1 2345 goto copy_iov;
9e645e11 2346
31b51510 2347 iov_count = iov_iter_count(&iter);
9adbd45d 2348 ret = rw_verify_area(READ, req->file, &kiocb->ki_pos, iov_count);
2b188cc1
JA
2349 if (!ret) {
2350 ssize_t ret2;
2351
9adbd45d
JA
2352 if (req->file->f_op->read_iter)
2353 ret2 = call_read_iter(req->file, kiocb, &iter);
32960613 2354 else
9adbd45d 2355 ret2 = loop_rw_iter(READ, req->file, kiocb, &iter);
32960613 2356
9d93a3f5 2357 /* Catch -EAGAIN return for forced non-blocking submission */
f67676d1 2358 if (!force_nonblock || ret2 != -EAGAIN) {
014db007 2359 kiocb_done(kiocb, ret2);
f67676d1
JA
2360 } else {
2361copy_iov:
b7bb4f7d 2362 ret = io_setup_async_rw(req, io_size, iovec,
f67676d1
JA
2363 inline_vecs, &iter);
2364 if (ret)
2365 goto out_free;
29de5f6a
JA
2366 /* any defer here is final, must blocking retry */
2367 if (!(req->flags & REQ_F_NOWAIT))
2368 req->flags |= REQ_F_MUST_PUNT;
f67676d1
JA
2369 return -EAGAIN;
2370 }
2b188cc1 2371 }
f67676d1 2372out_free:
1e95081c 2373 kfree(iovec);
99bc4c38 2374 req->flags &= ~REQ_F_NEED_CLEANUP;
2b188cc1
JA
2375 return ret;
2376}
2377
3529d8c2
JA
2378static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
2379 bool force_nonblock)
f67676d1 2380{
3529d8c2
JA
2381 struct io_async_ctx *io;
2382 struct iov_iter iter;
f67676d1
JA
2383 ssize_t ret;
2384
3529d8c2
JA
2385 ret = io_prep_rw(req, sqe, force_nonblock);
2386 if (ret)
2387 return ret;
f67676d1 2388
3529d8c2
JA
2389 if (unlikely(!(req->file->f_mode & FMODE_WRITE)))
2390 return -EBADF;
f67676d1 2391
5f798bea
PB
2392 /* either don't need iovec imported or already have it */
2393 if (!req->io || req->flags & REQ_F_NEED_CLEANUP)
3529d8c2
JA
2394 return 0;
2395
2396 io = req->io;
2397 io->rw.iov = io->rw.fast_iov;
2398 req->io = NULL;
2399 ret = io_import_iovec(WRITE, req, &io->rw.iov, &iter);
2400 req->io = io;
2401 if (ret < 0)
2402 return ret;
2403
2404 io_req_map_rw(req, ret, io->rw.iov, io->rw.fast_iov, &iter);
2405 return 0;
f67676d1
JA
2406}
2407
014db007 2408static int io_write(struct io_kiocb *req, bool force_nonblock)
2b188cc1
JA
2409{
2410 struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
9adbd45d 2411 struct kiocb *kiocb = &req->rw.kiocb;
2b188cc1 2412 struct iov_iter iter;
31b51510 2413 size_t iov_count;
f67676d1 2414 ssize_t ret, io_size;
2b188cc1 2415
3529d8c2 2416 ret = io_import_iovec(WRITE, req, &iovec, &iter);
06b76d44
JA
2417 if (ret < 0)
2418 return ret;
2b188cc1 2419
fd6c2e4c
JA
2420 /* Ensure we clear previously set non-block flag */
2421 if (!force_nonblock)
9adbd45d 2422 req->rw.kiocb.ki_flags &= ~IOCB_NOWAIT;
fd6c2e4c 2423
797f3f53 2424 req->result = 0;
f67676d1 2425 io_size = ret;
9e645e11 2426 if (req->flags & REQ_F_LINK)
f67676d1 2427 req->result = io_size;
9e645e11 2428
f67676d1
JA
2429 /*
2430 * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so
2431 * we know to async punt it even if it was opened O_NONBLOCK
2432 */
29de5f6a 2433 if (force_nonblock && !io_file_supports_async(req->file))
f67676d1 2434 goto copy_iov;
31b51510 2435
10d59345
JA
2436 /* file path doesn't support NOWAIT for non-direct_IO */
2437 if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT) &&
2438 (req->flags & REQ_F_ISREG))
f67676d1 2439 goto copy_iov;
31b51510 2440
f67676d1 2441 iov_count = iov_iter_count(&iter);
9adbd45d 2442 ret = rw_verify_area(WRITE, req->file, &kiocb->ki_pos, iov_count);
2b188cc1 2443 if (!ret) {
9bf7933f
RP
2444 ssize_t ret2;
2445
2b188cc1
JA
2446 /*
2447 * Open-code file_start_write here to grab freeze protection,
2448 * which will be released by another thread in
2449 * io_complete_rw(). Fool lockdep by telling it the lock got
2450 * released so that it doesn't complain about the held lock when
2451 * we return to userspace.
2452 */
491381ce 2453 if (req->flags & REQ_F_ISREG) {
9adbd45d 2454 __sb_start_write(file_inode(req->file)->i_sb,
2b188cc1 2455 SB_FREEZE_WRITE, true);
9adbd45d 2456 __sb_writers_release(file_inode(req->file)->i_sb,
2b188cc1
JA
2457 SB_FREEZE_WRITE);
2458 }
2459 kiocb->ki_flags |= IOCB_WRITE;
9bf7933f 2460
9adbd45d
JA
2461 if (req->file->f_op->write_iter)
2462 ret2 = call_write_iter(req->file, kiocb, &iter);
32960613 2463 else
9adbd45d 2464 ret2 = loop_rw_iter(WRITE, req->file, kiocb, &iter);
faac996c
JA
2465 /*
2466 * Raw bdev writes will -EOPNOTSUPP for IOCB_NOWAIT. Just
2467 * retry them without IOCB_NOWAIT.
2468 */
2469 if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT))
2470 ret2 = -EAGAIN;
f67676d1 2471 if (!force_nonblock || ret2 != -EAGAIN) {
014db007 2472 kiocb_done(kiocb, ret2);
f67676d1
JA
2473 } else {
2474copy_iov:
b7bb4f7d 2475 ret = io_setup_async_rw(req, io_size, iovec,
f67676d1
JA
2476 inline_vecs, &iter);
2477 if (ret)
2478 goto out_free;
29de5f6a
JA
2479 /* any defer here is final, must blocking retry */
2480 req->flags |= REQ_F_MUST_PUNT;
f67676d1
JA
2481 return -EAGAIN;
2482 }
2b188cc1 2483 }
31b51510 2484out_free:
99bc4c38 2485 req->flags &= ~REQ_F_NEED_CLEANUP;
1e95081c 2486 kfree(iovec);
2b188cc1
JA
2487 return ret;
2488}
2489
7d67af2c
PB
2490static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
2491{
2492 struct io_splice* sp = &req->splice;
2493 unsigned int valid_flags = SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL;
2494 int ret;
2495
2496 if (req->flags & REQ_F_NEED_CLEANUP)
2497 return 0;
2498
2499 sp->file_in = NULL;
2500 sp->off_in = READ_ONCE(sqe->splice_off_in);
2501 sp->off_out = READ_ONCE(sqe->off);
2502 sp->len = READ_ONCE(sqe->len);
2503 sp->flags = READ_ONCE(sqe->splice_flags);
2504
2505 if (unlikely(sp->flags & ~valid_flags))
2506 return -EINVAL;
2507
2508 ret = io_file_get(NULL, req, READ_ONCE(sqe->splice_fd_in), &sp->file_in,
2509 (sp->flags & SPLICE_F_FD_IN_FIXED));
2510 if (ret)
2511 return ret;
2512 req->flags |= REQ_F_NEED_CLEANUP;
2513
2514 if (!S_ISREG(file_inode(sp->file_in)->i_mode))
2515 req->work.flags |= IO_WQ_WORK_UNBOUND;
2516
2517 return 0;
2518}
2519
2520static bool io_splice_punt(struct file *file)
2521{
2522 if (get_pipe_info(file))
2523 return false;
2524 if (!io_file_supports_async(file))
2525 return true;
2526 return !(file->f_mode & O_NONBLOCK);
2527}
2528
014db007 2529static int io_splice(struct io_kiocb *req, bool force_nonblock)
7d67af2c
PB
2530{
2531 struct io_splice *sp = &req->splice;
2532 struct file *in = sp->file_in;
2533 struct file *out = sp->file_out;
2534 unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
2535 loff_t *poff_in, *poff_out;
2536 long ret;
2537
2538 if (force_nonblock) {
2539 if (io_splice_punt(in) || io_splice_punt(out))
2540 return -EAGAIN;
2541 flags |= SPLICE_F_NONBLOCK;
2542 }
2543
2544 poff_in = (sp->off_in == -1) ? NULL : &sp->off_in;
2545 poff_out = (sp->off_out == -1) ? NULL : &sp->off_out;
2546 ret = do_splice(in, poff_in, out, poff_out, sp->len, flags);
2547 if (force_nonblock && ret == -EAGAIN)
2548 return -EAGAIN;
2549
2550 io_put_file(req, in, (sp->flags & SPLICE_F_FD_IN_FIXED));
2551 req->flags &= ~REQ_F_NEED_CLEANUP;
2552
2553 io_cqring_add_event(req, ret);
2554 if (ret != sp->len)
2555 req_set_fail_links(req);
014db007 2556 io_put_req(req);
7d67af2c
PB
2557 return 0;
2558}
2559
2b188cc1
JA
2560/*
2561 * IORING_OP_NOP just posts a completion event, nothing else.
2562 */
78e19bbe 2563static int io_nop(struct io_kiocb *req)
2b188cc1
JA
2564{
2565 struct io_ring_ctx *ctx = req->ctx;
2b188cc1 2566
def596e9
JA
2567 if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
2568 return -EINVAL;
2569
78e19bbe 2570 io_cqring_add_event(req, 0);
e65ef56d 2571 io_put_req(req);
2b188cc1
JA
2572 return 0;
2573}
2574
3529d8c2 2575static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe)
c992fe29 2576{
6b06314c 2577 struct io_ring_ctx *ctx = req->ctx;
c992fe29 2578
09bb8394
JA
2579 if (!req->file)
2580 return -EBADF;
c992fe29 2581
6b06314c 2582 if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
def596e9 2583 return -EINVAL;
edafccee 2584 if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
c992fe29
CH
2585 return -EINVAL;
2586
8ed8d3c3
JA
2587 req->sync.flags = READ_ONCE(sqe->fsync_flags);
2588 if (unlikely(req->sync.flags & ~IORING_FSYNC_DATASYNC))
2589 return -EINVAL;
2590
2591 req->sync.off = READ_ONCE(sqe->off);
2592 req->sync.len = READ_ONCE(sqe->len);
c992fe29
CH
2593 return 0;
2594}
2595
8ed8d3c3
JA
2596static bool io_req_cancelled(struct io_kiocb *req)
2597{
2598 if (req->work.flags & IO_WQ_WORK_CANCEL) {
2599 req_set_fail_links(req);
2600 io_cqring_add_event(req, -ECANCELED);
e9fd9396 2601 io_put_req(req);
8ed8d3c3
JA
2602 return true;
2603 }
2604
2605 return false;
2606}
2607
014db007 2608static void __io_fsync(struct io_kiocb *req)
8ed8d3c3 2609{
8ed8d3c3 2610 loff_t end = req->sync.off + req->sync.len;
8ed8d3c3
JA
2611 int ret;
2612
9adbd45d 2613 ret = vfs_fsync_range(req->file, req->sync.off,
8ed8d3c3
JA
2614 end > 0 ? end : LLONG_MAX,
2615 req->sync.flags & IORING_FSYNC_DATASYNC);
2616 if (ret < 0)
2617 req_set_fail_links(req);
2618 io_cqring_add_event(req, ret);
014db007 2619 io_put_req(req);
5ea62161
PB
2620}
2621
2622static void io_fsync_finish(struct io_wq_work **workptr)
2623{
2624 struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
5ea62161
PB
2625
2626 if (io_req_cancelled(req))
2627 return;
014db007 2628 __io_fsync(req);
e9fd9396 2629 io_steal_work(req, workptr);
8ed8d3c3
JA
2630}
2631
014db007 2632static int io_fsync(struct io_kiocb *req, bool force_nonblock)
c992fe29 2633{
c992fe29 2634 /* fsync always requires a blocking context */
8ed8d3c3 2635 if (force_nonblock) {
8ed8d3c3 2636 req->work.func = io_fsync_finish;
c992fe29 2637 return -EAGAIN;
8ed8d3c3 2638 }
014db007 2639 __io_fsync(req);
c992fe29
CH
2640 return 0;
2641}
2642
014db007 2643static void __io_fallocate(struct io_kiocb *req)
8ed8d3c3 2644{
8ed8d3c3
JA
2645 int ret;
2646
d63d1b5e
JA
2647 ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off,
2648 req->sync.len);
8ed8d3c3
JA
2649 if (ret < 0)
2650 req_set_fail_links(req);
2651 io_cqring_add_event(req, ret);
014db007 2652 io_put_req(req);
5ea62161
PB
2653}
2654
2655static void io_fallocate_finish(struct io_wq_work **workptr)
2656{
2657 struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
5ea62161 2658
594506fe
PB
2659 if (io_req_cancelled(req))
2660 return;
014db007 2661 __io_fallocate(req);
e9fd9396 2662 io_steal_work(req, workptr);
5d17b4a4
JA
2663}
2664
d63d1b5e
JA
2665static int io_fallocate_prep(struct io_kiocb *req,
2666 const struct io_uring_sqe *sqe)
2667{
2668 if (sqe->ioprio || sqe->buf_index || sqe->rw_flags)
2669 return -EINVAL;
2670
2671 req->sync.off = READ_ONCE(sqe->off);
2672 req->sync.len = READ_ONCE(sqe->addr);
2673 req->sync.mode = READ_ONCE(sqe->len);
2674 return 0;
2675}
2676
014db007 2677static int io_fallocate(struct io_kiocb *req, bool force_nonblock)
5d17b4a4 2678{
d63d1b5e 2679 /* fallocate always requiring blocking context */
8ed8d3c3 2680 if (force_nonblock) {
d63d1b5e 2681 req->work.func = io_fallocate_finish;
5d17b4a4 2682 return -EAGAIN;
8ed8d3c3 2683 }
5d17b4a4 2684
014db007 2685 __io_fallocate(req);
5d17b4a4
JA
2686 return 0;
2687}
2688
15b71abe 2689static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
b7bb4f7d 2690{
f8748881 2691 const char __user *fname;
15b71abe 2692 int ret;
b7bb4f7d 2693
15b71abe
JA
2694 if (sqe->ioprio || sqe->buf_index)
2695 return -EINVAL;
cf3040ca
JA
2696 if (sqe->flags & IOSQE_FIXED_FILE)
2697 return -EBADF;
0bdbdd08
PB
2698 if (req->flags & REQ_F_NEED_CLEANUP)
2699 return 0;
03b1230c 2700
15b71abe 2701 req->open.dfd = READ_ONCE(sqe->fd);
c12cedf2 2702 req->open.how.mode = READ_ONCE(sqe->len);
f8748881 2703 fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
c12cedf2 2704 req->open.how.flags = READ_ONCE(sqe->open_flags);
3529d8c2 2705
f8748881 2706 req->open.filename = getname(fname);
15b71abe
JA
2707 if (IS_ERR(req->open.filename)) {
2708 ret = PTR_ERR(req->open.filename);
2709 req->open.filename = NULL;
2710 return ret;
2711 }
3529d8c2 2712
8fef80bf 2713 req->flags |= REQ_F_NEED_CLEANUP;
15b71abe 2714 return 0;
03b1230c
JA
2715}
2716
cebdb986 2717static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
aa1fa28f 2718{
cebdb986
JA
2719 struct open_how __user *how;
2720 const char __user *fname;
2721 size_t len;
0fa03c62
JA
2722 int ret;
2723
cebdb986 2724 if (sqe->ioprio || sqe->buf_index)
0fa03c62 2725 return -EINVAL;
cf3040ca
JA
2726 if (sqe->flags & IOSQE_FIXED_FILE)
2727 return -EBADF;
0bdbdd08
PB
2728 if (req->flags & REQ_F_NEED_CLEANUP)
2729 return 0;
0fa03c62 2730
cebdb986
JA
2731 req->open.dfd = READ_ONCE(sqe->fd);
2732 fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
2733 how = u64_to_user_ptr(READ_ONCE(sqe->addr2));
2734 len = READ_ONCE(sqe->len);
0fa03c62 2735
cebdb986
JA
2736 if (len < OPEN_HOW_SIZE_VER0)
2737 return -EINVAL;
3529d8c2 2738
cebdb986
JA
2739 ret = copy_struct_from_user(&req->open.how, sizeof(req->open.how), how,
2740 len);
2741 if (ret)
2742 return ret;
3529d8c2 2743
cebdb986
JA
2744 if (!(req->open.how.flags & O_PATH) && force_o_largefile())
2745 req->open.how.flags |= O_LARGEFILE;
0fa03c62 2746
cebdb986
JA
2747 req->open.filename = getname(fname);
2748 if (IS_ERR(req->open.filename)) {
2749 ret = PTR_ERR(req->open.filename);
2750 req->open.filename = NULL;
2751 return ret;
2752 }
2753
8fef80bf 2754 req->flags |= REQ_F_NEED_CLEANUP;
cebdb986
JA
2755 return 0;
2756}
2757
014db007 2758static int io_openat2(struct io_kiocb *req, bool force_nonblock)
15b71abe
JA
2759{
2760 struct open_flags op;
15b71abe
JA
2761 struct file *file;
2762 int ret;
2763
f86cd20c 2764 if (force_nonblock)
15b71abe 2765 return -EAGAIN;
15b71abe 2766
cebdb986 2767 ret = build_open_flags(&req->open.how, &op);
15b71abe
JA
2768 if (ret)
2769 goto err;
2770
cebdb986 2771 ret = get_unused_fd_flags(req->open.how.flags);
15b71abe
JA
2772 if (ret < 0)
2773 goto err;
2774
2775 file = do_filp_open(req->open.dfd, req->open.filename, &op);
2776 if (IS_ERR(file)) {
2777 put_unused_fd(ret);
2778 ret = PTR_ERR(file);
2779 } else {
2780 fsnotify_open(file);
2781 fd_install(ret, file);
2782 }
2783err:
2784 putname(req->open.filename);
8fef80bf 2785 req->flags &= ~REQ_F_NEED_CLEANUP;
15b71abe
JA
2786 if (ret < 0)
2787 req_set_fail_links(req);
2788 io_cqring_add_event(req, ret);
014db007 2789 io_put_req(req);
15b71abe
JA
2790 return 0;
2791}
2792
014db007 2793static int io_openat(struct io_kiocb *req, bool force_nonblock)
cebdb986
JA
2794{
2795 req->open.how = build_open_how(req->open.how.flags, req->open.how.mode);
014db007 2796 return io_openat2(req, force_nonblock);
cebdb986
JA
2797}
2798
ddf0322d
JA
2799static int io_provide_buffers_prep(struct io_kiocb *req,
2800 const struct io_uring_sqe *sqe)
2801{
2802 struct io_provide_buf *p = &req->pbuf;
2803 u64 tmp;
2804
2805 if (sqe->ioprio || sqe->rw_flags)
2806 return -EINVAL;
2807
2808 tmp = READ_ONCE(sqe->fd);
2809 if (!tmp || tmp > USHRT_MAX)
2810 return -E2BIG;
2811 p->nbufs = tmp;
2812 p->addr = READ_ONCE(sqe->addr);
2813 p->len = READ_ONCE(sqe->len);
2814
2815 if (!access_ok(u64_to_user_ptr(p->addr), p->len))
2816 return -EFAULT;
2817
2818 p->bgid = READ_ONCE(sqe->buf_group);
2819 tmp = READ_ONCE(sqe->off);
2820 if (tmp > USHRT_MAX)
2821 return -E2BIG;
2822 p->bid = tmp;
2823 return 0;
2824}
2825
2826static int io_add_buffers(struct io_provide_buf *pbuf, struct io_buffer **head)
2827{
2828 struct io_buffer *buf;
2829 u64 addr = pbuf->addr;
2830 int i, bid = pbuf->bid;
2831
2832 for (i = 0; i < pbuf->nbufs; i++) {
2833 buf = kmalloc(sizeof(*buf), GFP_KERNEL);
2834 if (!buf)
2835 break;
2836
2837 buf->addr = addr;
2838 buf->len = pbuf->len;
2839 buf->bid = bid;
2840 addr += pbuf->len;
2841 bid++;
2842 if (!*head) {
2843 INIT_LIST_HEAD(&buf->list);
2844 *head = buf;
2845 } else {
2846 list_add_tail(&buf->list, &(*head)->list);
2847 }
2848 }
2849
2850 return i ? i : -ENOMEM;
2851}
2852
2853static void io_ring_submit_unlock(struct io_ring_ctx *ctx, bool needs_lock)
2854{
2855 if (needs_lock)
2856 mutex_unlock(&ctx->uring_lock);
2857}
2858
2859static void io_ring_submit_lock(struct io_ring_ctx *ctx, bool needs_lock)
2860{
2861 /*
2862 * "Normal" inline submissions always hold the uring_lock, since we
2863 * grab it from the system call. Same is true for the SQPOLL offload.
2864 * The only exception is when we've detached the request and issue it
2865 * from an async worker thread, grab the lock for that case.
2866 */
2867 if (needs_lock)
2868 mutex_lock(&ctx->uring_lock);
2869}
2870
2871static int io_provide_buffers(struct io_kiocb *req, bool force_nonblock)
2872{
2873 struct io_provide_buf *p = &req->pbuf;
2874 struct io_ring_ctx *ctx = req->ctx;
2875 struct io_buffer *head, *list;
2876 int ret = 0;
2877
2878 io_ring_submit_lock(ctx, !force_nonblock);
2879
2880 lockdep_assert_held(&ctx->uring_lock);
2881
2882 list = head = idr_find(&ctx->io_buffer_idr, p->bgid);
2883
2884 ret = io_add_buffers(p, &head);
2885 if (ret < 0)
2886 goto out;
2887
2888 if (!list) {
2889 ret = idr_alloc(&ctx->io_buffer_idr, head, p->bgid, p->bgid + 1,
2890 GFP_KERNEL);
2891 if (ret < 0) {
2892 while (!list_empty(&head->list)) {
2893 struct io_buffer *buf;
2894
2895 buf = list_first_entry(&head->list,
2896 struct io_buffer, list);
2897 list_del(&buf->list);
2898 kfree(buf);
2899 }
2900 kfree(head);
2901 goto out;
2902 }
2903 }
2904out:
2905 io_ring_submit_unlock(ctx, !force_nonblock);
2906 if (ret < 0)
2907 req_set_fail_links(req);
2908 io_cqring_add_event(req, ret);
2909 io_put_req(req);
2910 return 0;
2911}
2912
3e4827b0
JA
2913static int io_epoll_ctl_prep(struct io_kiocb *req,
2914 const struct io_uring_sqe *sqe)
2915{
2916#if defined(CONFIG_EPOLL)
2917 if (sqe->ioprio || sqe->buf_index)
2918 return -EINVAL;
2919
2920 req->epoll.epfd = READ_ONCE(sqe->fd);
2921 req->epoll.op = READ_ONCE(sqe->len);
2922 req->epoll.fd = READ_ONCE(sqe->off);
2923
2924 if (ep_op_has_event(req->epoll.op)) {
2925 struct epoll_event __user *ev;
2926
2927 ev = u64_to_user_ptr(READ_ONCE(sqe->addr));
2928 if (copy_from_user(&req->epoll.event, ev, sizeof(*ev)))
2929 return -EFAULT;
2930 }
2931
2932 return 0;
2933#else
2934 return -EOPNOTSUPP;
2935#endif
2936}
2937
014db007 2938static int io_epoll_ctl(struct io_kiocb *req, bool force_nonblock)
3e4827b0
JA
2939{
2940#if defined(CONFIG_EPOLL)
2941 struct io_epoll *ie = &req->epoll;
2942 int ret;
2943
2944 ret = do_epoll_ctl(ie->epfd, ie->op, ie->fd, &ie->event, force_nonblock);
2945 if (force_nonblock && ret == -EAGAIN)
2946 return -EAGAIN;
2947
2948 if (ret < 0)
2949 req_set_fail_links(req);
2950 io_cqring_add_event(req, ret);
014db007 2951 io_put_req(req);
3e4827b0
JA
2952 return 0;
2953#else
2954 return -EOPNOTSUPP;
2955#endif
2956}
2957
c1ca757b
JA
2958static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
2959{
2960#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
2961 if (sqe->ioprio || sqe->buf_index || sqe->off)
2962 return -EINVAL;
2963
2964 req->madvise.addr = READ_ONCE(sqe->addr);
2965 req->madvise.len = READ_ONCE(sqe->len);
2966 req->madvise.advice = READ_ONCE(sqe->fadvise_advice);
2967 return 0;
2968#else
2969 return -EOPNOTSUPP;
2970#endif
2971}
2972
014db007 2973static int io_madvise(struct io_kiocb *req, bool force_nonblock)
c1ca757b
JA
2974{
2975#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
2976 struct io_madvise *ma = &req->madvise;
2977 int ret;
2978
2979 if (force_nonblock)
2980 return -EAGAIN;
2981
2982 ret = do_madvise(ma->addr, ma->len, ma->advice);
2983 if (ret < 0)
2984 req_set_fail_links(req);
2985 io_cqring_add_event(req, ret);
014db007 2986 io_put_req(req);
c1ca757b
JA
2987 return 0;
2988#else
2989 return -EOPNOTSUPP;
2990#endif
2991}
2992
4840e418
JA
2993static int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
2994{
2995 if (sqe->ioprio || sqe->buf_index || sqe->addr)
2996 return -EINVAL;
2997
2998 req->fadvise.offset = READ_ONCE(sqe->off);
2999 req->fadvise.len = READ_ONCE(sqe->len);
3000 req->fadvise.advice = READ_ONCE(sqe->fadvise_advice);
3001 return 0;
3002}
3003
014db007 3004static int io_fadvise(struct io_kiocb *req, bool force_nonblock)
4840e418
JA
3005{
3006 struct io_fadvise *fa = &req->fadvise;
3007 int ret;
3008
3e69426d
JA
3009 if (force_nonblock) {
3010 switch (fa->advice) {
3011 case POSIX_FADV_NORMAL:
3012 case POSIX_FADV_RANDOM:
3013 case POSIX_FADV_SEQUENTIAL:
3014 break;
3015 default:
3016 return -EAGAIN;
3017 }
3018 }
4840e418
JA
3019
3020 ret = vfs_fadvise(req->file, fa->offset, fa->len, fa->advice);
3021 if (ret < 0)
3022 req_set_fail_links(req);
3023 io_cqring_add_event(req, ret);
014db007 3024 io_put_req(req);
4840e418
JA
3025 return 0;
3026}
3027
eddc7ef5
JA
3028static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3029{
f8748881 3030 const char __user *fname;
eddc7ef5
JA
3031 unsigned lookup_flags;
3032 int ret;
3033
3034 if (sqe->ioprio || sqe->buf_index)
3035 return -EINVAL;
cf3040ca
JA
3036 if (sqe->flags & IOSQE_FIXED_FILE)
3037 return -EBADF;
0bdbdd08
PB
3038 if (req->flags & REQ_F_NEED_CLEANUP)
3039 return 0;
eddc7ef5
JA
3040
3041 req->open.dfd = READ_ONCE(sqe->fd);
3042 req->open.mask = READ_ONCE(sqe->len);
f8748881 3043 fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
eddc7ef5 3044 req->open.buffer = u64_to_user_ptr(READ_ONCE(sqe->addr2));
c12cedf2 3045 req->open.how.flags = READ_ONCE(sqe->statx_flags);
eddc7ef5 3046
c12cedf2 3047 if (vfs_stat_set_lookup_flags(&lookup_flags, req->open.how.flags))
eddc7ef5
JA
3048 return -EINVAL;
3049
f8748881 3050 req->open.filename = getname_flags(fname, lookup_flags, NULL);
eddc7ef5
JA
3051 if (IS_ERR(req->open.filename)) {
3052 ret = PTR_ERR(req->open.filename);
3053 req->open.filename = NULL;
3054 return ret;
3055 }
3056
8fef80bf 3057 req->flags |= REQ_F_NEED_CLEANUP;
eddc7ef5
JA
3058 return 0;
3059}
3060
014db007 3061static int io_statx(struct io_kiocb *req, bool force_nonblock)
eddc7ef5
JA
3062{
3063 struct io_open *ctx = &req->open;
3064 unsigned lookup_flags;
3065 struct path path;
3066 struct kstat stat;
3067 int ret;
3068
3069 if (force_nonblock)
3070 return -EAGAIN;
3071
c12cedf2 3072 if (vfs_stat_set_lookup_flags(&lookup_flags, ctx->how.flags))
eddc7ef5
JA
3073 return -EINVAL;
3074
3075retry:
3076 /* filename_lookup() drops it, keep a reference */
3077 ctx->filename->refcnt++;
3078
3079 ret = filename_lookup(ctx->dfd, ctx->filename, lookup_flags, &path,
3080 NULL);
3081 if (ret)
3082 goto err;
3083
c12cedf2 3084 ret = vfs_getattr(&path, &stat, ctx->mask, ctx->how.flags);
eddc7ef5
JA
3085 path_put(&path);
3086 if (retry_estale(ret, lookup_flags)) {
3087 lookup_flags |= LOOKUP_REVAL;
3088 goto retry;
3089 }
3090 if (!ret)
3091 ret = cp_statx(&stat, ctx->buffer);
3092err:
3093 putname(ctx->filename);
8fef80bf 3094 req->flags &= ~REQ_F_NEED_CLEANUP;
eddc7ef5
JA
3095 if (ret < 0)
3096 req_set_fail_links(req);
3097 io_cqring_add_event(req, ret);
014db007 3098 io_put_req(req);
eddc7ef5
JA
3099 return 0;
3100}
3101
b5dba59e
JA
3102static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3103{
3104 /*
3105 * If we queue this for async, it must not be cancellable. That would
3106 * leave the 'file' in an undeterminate state.
3107 */
3108 req->work.flags |= IO_WQ_WORK_NO_CANCEL;
3109
3110 if (sqe->ioprio || sqe->off || sqe->addr || sqe->len ||
3111 sqe->rw_flags || sqe->buf_index)
3112 return -EINVAL;
3113 if (sqe->flags & IOSQE_FIXED_FILE)
cf3040ca 3114 return -EBADF;
b5dba59e
JA
3115
3116 req->close.fd = READ_ONCE(sqe->fd);
3117 if (req->file->f_op == &io_uring_fops ||
b14cca0c 3118 req->close.fd == req->ctx->ring_fd)
b5dba59e
JA
3119 return -EBADF;
3120
3121 return 0;
3122}
3123
a93b3331 3124/* only called when __close_fd_get_file() is done */
014db007 3125static void __io_close_finish(struct io_kiocb *req)
a93b3331
PB
3126{
3127 int ret;
3128
3129 ret = filp_close(req->close.put_file, req->work.files);
3130 if (ret < 0)
3131 req_set_fail_links(req);
3132 io_cqring_add_event(req, ret);
3133 fput(req->close.put_file);
014db007 3134 io_put_req(req);
a93b3331
PB
3135}
3136
b5dba59e
JA
3137static void io_close_finish(struct io_wq_work **workptr)
3138{
3139 struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
b5dba59e 3140
7fbeb95d 3141 /* not cancellable, don't do io_req_cancelled() */
014db007 3142 __io_close_finish(req);
e9fd9396 3143 io_steal_work(req, workptr);
b5dba59e
JA
3144}
3145
014db007 3146static int io_close(struct io_kiocb *req, bool force_nonblock)
b5dba59e
JA
3147{
3148 int ret;
3149
3150 req->close.put_file = NULL;
3151 ret = __close_fd_get_file(req->close.fd, &req->close.put_file);
3152 if (ret < 0)
3153 return ret;
3154
3155 /* if the file has a flush method, be safe and punt to async */
a2100672 3156 if (req->close.put_file->f_op->flush && force_nonblock) {
594506fe
PB
3157 /* submission ref will be dropped, take it for async */
3158 refcount_inc(&req->refs);
3159
a2100672
PB
3160 req->work.func = io_close_finish;
3161 /*
3162 * Do manual async queue here to avoid grabbing files - we don't
3163 * need the files, and it'll cause io_close_finish() to close
3164 * the file again and cause a double CQE entry for this request
3165 */
3166 io_queue_async_work(req);
3167 return 0;
3168 }
b5dba59e
JA
3169
3170 /*
3171 * No ->flush(), safely close from here and just punt the
3172 * fput() to async context.
3173 */
014db007 3174 __io_close_finish(req);
a93b3331 3175 return 0;
b5dba59e
JA
3176}
3177
3529d8c2 3178static int io_prep_sfr(struct io_kiocb *req, const struct io_uring_sqe *sqe)
5d17b4a4
JA
3179{
3180 struct io_ring_ctx *ctx = req->ctx;
5d17b4a4
JA
3181
3182 if (!req->file)
3183 return -EBADF;
5d17b4a4
JA
3184
3185 if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
3186 return -EINVAL;
3187 if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
3188 return -EINVAL;
3189
8ed8d3c3
JA
3190 req->sync.off = READ_ONCE(sqe->off);
3191 req->sync.len = READ_ONCE(sqe->len);
3192 req->sync.flags = READ_ONCE(sqe->sync_range_flags);
8ed8d3c3
JA
3193 return 0;
3194}
3195
014db007 3196static void __io_sync_file_range(struct io_kiocb *req)
8ed8d3c3 3197{
8ed8d3c3
JA
3198 int ret;
3199
9adbd45d 3200 ret = sync_file_range(req->file, req->sync.off, req->sync.len,
8ed8d3c3
JA
3201 req->sync.flags);
3202 if (ret < 0)
3203 req_set_fail_links(req);
3204 io_cqring_add_event(req, ret);
014db007 3205 io_put_req(req);
5ea62161
PB
3206}
3207
3208
3209static void io_sync_file_range_finish(struct io_wq_work **workptr)
3210{
3211 struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
3212 struct io_kiocb *nxt = NULL;
3213
3214 if (io_req_cancelled(req))
3215 return;
014db007 3216 __io_sync_file_range(req);
594506fe 3217 io_put_req(req); /* put submission ref */
8ed8d3c3 3218 if (nxt)
78912934 3219 io_wq_assign_next(workptr, nxt);
5d17b4a4
JA
3220}
3221
014db007 3222static int io_sync_file_range(struct io_kiocb *req, bool force_nonblock)
5d17b4a4 3223{
5d17b4a4 3224 /* sync_file_range always requires a blocking context */
8ed8d3c3 3225 if (force_nonblock) {
8ed8d3c3 3226 req->work.func = io_sync_file_range_finish;
5d17b4a4 3227 return -EAGAIN;
8ed8d3c3 3228 }
5d17b4a4 3229
014db007 3230 __io_sync_file_range(req);
5d17b4a4
JA
3231 return 0;
3232}
3233
02d27d89
PB
3234static int io_setup_async_msg(struct io_kiocb *req,
3235 struct io_async_msghdr *kmsg)
3236{
3237 if (req->io)
3238 return -EAGAIN;
3239 if (io_alloc_async_ctx(req)) {
3240 if (kmsg->iov != kmsg->fast_iov)
3241 kfree(kmsg->iov);
3242 return -ENOMEM;
3243 }
3244 req->flags |= REQ_F_NEED_CLEANUP;
3245 memcpy(&req->io->msg, kmsg, sizeof(*kmsg));
3246 return -EAGAIN;
3247}
3248
3529d8c2 3249static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
03b1230c 3250{
0fa03c62 3251#if defined(CONFIG_NET)
e47293fd 3252 struct io_sr_msg *sr = &req->sr_msg;
3529d8c2 3253 struct io_async_ctx *io = req->io;
99bc4c38 3254 int ret;
03b1230c 3255
e47293fd
JA
3256 sr->msg_flags = READ_ONCE(sqe->msg_flags);
3257 sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr));
fddaface 3258 sr->len = READ_ONCE(sqe->len);
3529d8c2 3259
d8768362
JA
3260#ifdef CONFIG_COMPAT
3261 if (req->ctx->compat)
3262 sr->msg_flags |= MSG_CMSG_COMPAT;
3263#endif
3264
fddaface 3265 if (!io || req->opcode == IORING_OP_SEND)
3529d8c2 3266 return 0;
5f798bea
PB
3267 /* iovec is already imported */
3268 if (req->flags & REQ_F_NEED_CLEANUP)
3269 return 0;
3529d8c2 3270
d9688565 3271 io->msg.iov = io->msg.fast_iov;
99bc4c38 3272 ret = sendmsg_copy_msghdr(&io->msg.msg, sr->msg, sr->msg_flags,
e47293fd 3273 &io->msg.iov);
99bc4c38
PB
3274 if (!ret)
3275 req->flags |= REQ_F_NEED_CLEANUP;
3276 return ret;
03b1230c 3277#else
e47293fd 3278 return -EOPNOTSUPP;
03b1230c
JA
3279#endif
3280}
3281
014db007 3282static int io_sendmsg(struct io_kiocb *req, bool force_nonblock)
aa1fa28f 3283{
03b1230c 3284#if defined(CONFIG_NET)
0b416c3e 3285 struct io_async_msghdr *kmsg = NULL;
0fa03c62
JA
3286 struct socket *sock;
3287 int ret;
3288
3289 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3290 return -EINVAL;
3291
3292 sock = sock_from_file(req->file, &ret);
3293 if (sock) {
b7bb4f7d 3294 struct io_async_ctx io;
0fa03c62
JA
3295 unsigned flags;
3296
03b1230c 3297 if (req->io) {
0b416c3e 3298 kmsg = &req->io->msg;
b537916c 3299 kmsg->msg.msg_name = &req->io->msg.addr;
0b416c3e
JA
3300 /* if iov is set, it's allocated already */
3301 if (!kmsg->iov)
3302 kmsg->iov = kmsg->fast_iov;
3303 kmsg->msg.msg_iter.iov = kmsg->iov;
03b1230c 3304 } else {
3529d8c2
JA
3305 struct io_sr_msg *sr = &req->sr_msg;
3306
0b416c3e 3307 kmsg = &io.msg;
b537916c 3308 kmsg->msg.msg_name = &io.msg.addr;
3529d8c2
JA
3309
3310 io.msg.iov = io.msg.fast_iov;
3311 ret = sendmsg_copy_msghdr(&io.msg.msg, sr->msg,
3312 sr->msg_flags, &io.msg.iov);
03b1230c 3313 if (ret)
3529d8c2 3314 return ret;
03b1230c 3315 }
0fa03c62 3316
e47293fd
JA
3317 flags = req->sr_msg.msg_flags;
3318 if (flags & MSG_DONTWAIT)
3319 req->flags |= REQ_F_NOWAIT;
3320 else if (force_nonblock)
3321 flags |= MSG_DONTWAIT;
3322
0b416c3e 3323 ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);
02d27d89
PB
3324 if (force_nonblock && ret == -EAGAIN)
3325 return io_setup_async_msg(req, kmsg);
441cdbd5
JA
3326 if (ret == -ERESTARTSYS)
3327 ret = -EINTR;
0fa03c62
JA
3328 }
3329
1e95081c 3330 if (kmsg && kmsg->iov != kmsg->fast_iov)
0b416c3e 3331 kfree(kmsg->iov);
99bc4c38 3332 req->flags &= ~REQ_F_NEED_CLEANUP;
78e19bbe 3333 io_cqring_add_event(req, ret);
4e88d6e7
JA
3334 if (ret < 0)
3335 req_set_fail_links(req);
014db007 3336 io_put_req(req);
5d17b4a4 3337 return 0;
03b1230c
JA
3338#else
3339 return -EOPNOTSUPP;
aa1fa28f 3340#endif
03b1230c 3341}
aa1fa28f 3342
014db007 3343static int io_send(struct io_kiocb *req, bool force_nonblock)
fddaface
JA
3344{
3345#if defined(CONFIG_NET)
3346 struct socket *sock;
3347 int ret;
3348
3349 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3350 return -EINVAL;
3351
3352 sock = sock_from_file(req->file, &ret);
3353 if (sock) {
3354 struct io_sr_msg *sr = &req->sr_msg;
3355 struct msghdr msg;
3356 struct iovec iov;
3357 unsigned flags;
3358
3359 ret = import_single_range(WRITE, sr->buf, sr->len, &iov,
3360 &msg.msg_iter);
3361 if (ret)
3362 return ret;
3363
3364 msg.msg_name = NULL;
3365 msg.msg_control = NULL;
3366 msg.msg_controllen = 0;
3367 msg.msg_namelen = 0;
3368
3369 flags = req->sr_msg.msg_flags;
3370 if (flags & MSG_DONTWAIT)
3371 req->flags |= REQ_F_NOWAIT;
3372 else if (force_nonblock)
3373 flags |= MSG_DONTWAIT;
3374
0b7b21e4
JA
3375 msg.msg_flags = flags;
3376 ret = sock_sendmsg(sock, &msg);
fddaface
JA
3377 if (force_nonblock && ret == -EAGAIN)
3378 return -EAGAIN;
3379 if (ret == -ERESTARTSYS)
3380 ret = -EINTR;
3381 }
3382
3383 io_cqring_add_event(req, ret);
3384 if (ret < 0)
3385 req_set_fail_links(req);
014db007 3386 io_put_req(req);
fddaface
JA
3387 return 0;
3388#else
3389 return -EOPNOTSUPP;
3390#endif
3391}
3392
3529d8c2
JA
3393static int io_recvmsg_prep(struct io_kiocb *req,
3394 const struct io_uring_sqe *sqe)
aa1fa28f
JA
3395{
3396#if defined(CONFIG_NET)
e47293fd 3397 struct io_sr_msg *sr = &req->sr_msg;
3529d8c2 3398 struct io_async_ctx *io = req->io;
99bc4c38 3399 int ret;
3529d8c2
JA
3400
3401 sr->msg_flags = READ_ONCE(sqe->msg_flags);
3402 sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr));
0b7b21e4 3403 sr->len = READ_ONCE(sqe->len);
06b76d44 3404
d8768362
JA
3405#ifdef CONFIG_COMPAT
3406 if (req->ctx->compat)
3407 sr->msg_flags |= MSG_CMSG_COMPAT;
3408#endif
3409
fddaface 3410 if (!io || req->opcode == IORING_OP_RECV)
06b76d44 3411 return 0;
5f798bea
PB
3412 /* iovec is already imported */
3413 if (req->flags & REQ_F_NEED_CLEANUP)
3414 return 0;
03b1230c 3415
d9688565 3416 io->msg.iov = io->msg.fast_iov;
99bc4c38 3417 ret = recvmsg_copy_msghdr(&io->msg.msg, sr->msg, sr->msg_flags,
e47293fd 3418 &io->msg.uaddr, &io->msg.iov);
99bc4c38
PB
3419 if (!ret)
3420 req->flags |= REQ_F_NEED_CLEANUP;
3421 return ret;
aa1fa28f 3422#else
e47293fd 3423 return -EOPNOTSUPP;
aa1fa28f
JA
3424#endif
3425}
3426
014db007 3427static int io_recvmsg(struct io_kiocb *req, bool force_nonblock)
aa1fa28f
JA
3428{
3429#if defined(CONFIG_NET)
0b416c3e 3430 struct io_async_msghdr *kmsg = NULL;
03b1230c
JA
3431 struct socket *sock;
3432 int ret;
3433
3434 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3435 return -EINVAL;
3436
3437 sock = sock_from_file(req->file, &ret);
3438 if (sock) {
b7bb4f7d 3439 struct io_async_ctx io;
03b1230c
JA
3440 unsigned flags;
3441
03b1230c 3442 if (req->io) {
0b416c3e 3443 kmsg = &req->io->msg;
b537916c 3444 kmsg->msg.msg_name = &req->io->msg.addr;
0b416c3e
JA
3445 /* if iov is set, it's allocated already */
3446 if (!kmsg->iov)
3447 kmsg->iov = kmsg->fast_iov;
3448 kmsg->msg.msg_iter.iov = kmsg->iov;
03b1230c 3449 } else {
3529d8c2
JA
3450 struct io_sr_msg *sr = &req->sr_msg;
3451
0b416c3e 3452 kmsg = &io.msg;
b537916c 3453 kmsg->msg.msg_name = &io.msg.addr;
3529d8c2
JA
3454
3455 io.msg.iov = io.msg.fast_iov;
3456 ret = recvmsg_copy_msghdr(&io.msg.msg, sr->msg,
3457 sr->msg_flags, &io.msg.uaddr,
3458 &io.msg.iov);
03b1230c 3459 if (ret)
3529d8c2 3460 return ret;
03b1230c
JA
3461 }
3462
e47293fd
JA
3463 flags = req->sr_msg.msg_flags;
3464 if (flags & MSG_DONTWAIT)
3465 req->flags |= REQ_F_NOWAIT;
3466 else if (force_nonblock)
3467 flags |= MSG_DONTWAIT;
3468
3469 ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.msg,
3470 kmsg->uaddr, flags);
02d27d89
PB
3471 if (force_nonblock && ret == -EAGAIN)
3472 return io_setup_async_msg(req, kmsg);
03b1230c
JA
3473 if (ret == -ERESTARTSYS)
3474 ret = -EINTR;
3475 }
3476
1e95081c 3477 if (kmsg && kmsg->iov != kmsg->fast_iov)
0b416c3e 3478 kfree(kmsg->iov);
99bc4c38 3479 req->flags &= ~REQ_F_NEED_CLEANUP;
03b1230c 3480 io_cqring_add_event(req, ret);
4e88d6e7
JA
3481 if (ret < 0)
3482 req_set_fail_links(req);
014db007 3483 io_put_req(req);
03b1230c 3484 return 0;
0fa03c62
JA
3485#else
3486 return -EOPNOTSUPP;
3487#endif
3488}
5d17b4a4 3489
014db007 3490static int io_recv(struct io_kiocb *req, bool force_nonblock)
fddaface
JA
3491{
3492#if defined(CONFIG_NET)
3493 struct socket *sock;
3494 int ret;
3495
3496 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3497 return -EINVAL;
3498
3499 sock = sock_from_file(req->file, &ret);
3500 if (sock) {
3501 struct io_sr_msg *sr = &req->sr_msg;
3502 struct msghdr msg;
3503 struct iovec iov;
3504 unsigned flags;
3505
3506 ret = import_single_range(READ, sr->buf, sr->len, &iov,
3507 &msg.msg_iter);
3508 if (ret)
3509 return ret;
3510
3511 msg.msg_name = NULL;
3512 msg.msg_control = NULL;
3513 msg.msg_controllen = 0;
3514 msg.msg_namelen = 0;
3515 msg.msg_iocb = NULL;
3516 msg.msg_flags = 0;
3517
3518 flags = req->sr_msg.msg_flags;
3519 if (flags & MSG_DONTWAIT)
3520 req->flags |= REQ_F_NOWAIT;
3521 else if (force_nonblock)
3522 flags |= MSG_DONTWAIT;
3523
0b7b21e4 3524 ret = sock_recvmsg(sock, &msg, flags);
fddaface
JA
3525 if (force_nonblock && ret == -EAGAIN)
3526 return -EAGAIN;
3527 if (ret == -ERESTARTSYS)
3528 ret = -EINTR;
3529 }
3530
3531 io_cqring_add_event(req, ret);
3532 if (ret < 0)
3533 req_set_fail_links(req);
014db007 3534 io_put_req(req);
fddaface
JA
3535 return 0;
3536#else
3537 return -EOPNOTSUPP;
3538#endif
3539}
3540
3541
3529d8c2 3542static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
17f2fe35
JA
3543{
3544#if defined(CONFIG_NET)
8ed8d3c3
JA
3545 struct io_accept *accept = &req->accept;
3546
17f2fe35
JA
3547 if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
3548 return -EINVAL;
8042d6ce 3549 if (sqe->ioprio || sqe->len || sqe->buf_index)
17f2fe35
JA
3550 return -EINVAL;
3551
d55e5f5b
JA
3552 accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
3553 accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2));
8ed8d3c3 3554 accept->flags = READ_ONCE(sqe->accept_flags);
8ed8d3c3
JA
3555 return 0;
3556#else
3557 return -EOPNOTSUPP;
3558#endif
3559}
17f2fe35 3560
8ed8d3c3 3561#if defined(CONFIG_NET)
014db007 3562static int __io_accept(struct io_kiocb *req, bool force_nonblock)
8ed8d3c3
JA
3563{
3564 struct io_accept *accept = &req->accept;
3565 unsigned file_flags;
3566 int ret;
3567
3568 file_flags = force_nonblock ? O_NONBLOCK : 0;
3569 ret = __sys_accept4_file(req->file, file_flags, accept->addr,
3570 accept->addr_len, accept->flags);
3571 if (ret == -EAGAIN && force_nonblock)
17f2fe35 3572 return -EAGAIN;
8e3cca12
JA
3573 if (ret == -ERESTARTSYS)
3574 ret = -EINTR;
4e88d6e7
JA
3575 if (ret < 0)
3576 req_set_fail_links(req);
78e19bbe 3577 io_cqring_add_event(req, ret);
014db007 3578 io_put_req(req);
17f2fe35 3579 return 0;
8ed8d3c3
JA
3580}
3581
3582static void io_accept_finish(struct io_wq_work **workptr)
3583{
3584 struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
8ed8d3c3
JA
3585
3586 if (io_req_cancelled(req))
3587 return;
014db007 3588 __io_accept(req, false);
e9fd9396 3589 io_steal_work(req, workptr);
8ed8d3c3
JA
3590}
3591#endif
3592
014db007 3593static int io_accept(struct io_kiocb *req, bool force_nonblock)
8ed8d3c3
JA
3594{
3595#if defined(CONFIG_NET)
3596 int ret;
3597
014db007 3598 ret = __io_accept(req, force_nonblock);
8ed8d3c3
JA
3599 if (ret == -EAGAIN && force_nonblock) {
3600 req->work.func = io_accept_finish;
8ed8d3c3
JA
3601 return -EAGAIN;
3602 }
3603 return 0;
0fa03c62
JA
3604#else
3605 return -EOPNOTSUPP;
3606#endif
3607}
5d17b4a4 3608
3529d8c2 3609static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
f499a021
JA
3610{
3611#if defined(CONFIG_NET)
3529d8c2
JA
3612 struct io_connect *conn = &req->connect;
3613 struct io_async_ctx *io = req->io;
f499a021 3614
3fbb51c1
JA
3615 if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
3616 return -EINVAL;
3617 if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags)
3618 return -EINVAL;
3619
3529d8c2
JA
3620 conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
3621 conn->addr_len = READ_ONCE(sqe->addr2);
3622
3623 if (!io)
3624 return 0;
3625
3626 return move_addr_to_kernel(conn->addr, conn->addr_len,
3fbb51c1 3627 &io->connect.address);
f499a021 3628#else
3fbb51c1 3629 return -EOPNOTSUPP;
f499a021
JA
3630#endif
3631}
3632
014db007 3633static int io_connect(struct io_kiocb *req, bool force_nonblock)
f8e85cf2
JA
3634{
3635#if defined(CONFIG_NET)
f499a021 3636 struct io_async_ctx __io, *io;
f8e85cf2 3637 unsigned file_flags;
3fbb51c1 3638 int ret;
f8e85cf2 3639
f499a021
JA
3640 if (req->io) {
3641 io = req->io;
3642 } else {
3529d8c2
JA
3643 ret = move_addr_to_kernel(req->connect.addr,
3644 req->connect.addr_len,
3645 &__io.connect.address);
f499a021
JA
3646 if (ret)
3647 goto out;
3648 io = &__io;
3649 }
3650
3fbb51c1
JA
3651 file_flags = force_nonblock ? O_NONBLOCK : 0;
3652
3653 ret = __sys_connect_file(req->file, &io->connect.address,
3654 req->connect.addr_len, file_flags);
87f80d62 3655 if ((ret == -EAGAIN || ret == -EINPROGRESS) && force_nonblock) {
b7bb4f7d
JA
3656 if (req->io)
3657 return -EAGAIN;
3658 if (io_alloc_async_ctx(req)) {
f499a021
JA
3659 ret = -ENOMEM;
3660 goto out;
3661 }
b7bb4f7d 3662 memcpy(&req->io->connect, &__io.connect, sizeof(__io.connect));
f8e85cf2 3663 return -EAGAIN;
f499a021 3664 }
f8e85cf2
JA
3665 if (ret == -ERESTARTSYS)
3666 ret = -EINTR;
f499a021 3667out:
4e88d6e7
JA
3668 if (ret < 0)
3669 req_set_fail_links(req);
f8e85cf2 3670 io_cqring_add_event(req, ret);
014db007 3671 io_put_req(req);
f8e85cf2
JA
3672 return 0;
3673#else
3674 return -EOPNOTSUPP;
3675#endif
3676}
3677
d7718a9d
JA
3678struct io_poll_table {
3679 struct poll_table_struct pt;
3680 struct io_kiocb *req;
3681 int error;
3682};
3683
3684static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt,
3685 struct wait_queue_head *head)
3686{
3687 if (unlikely(poll->head)) {
3688 pt->error = -EINVAL;
3689 return;
3690 }
3691
3692 pt->error = 0;
3693 poll->head = head;
3694 add_wait_queue(head, &poll->wait);
3695}
3696
3697static void io_async_queue_proc(struct file *file, struct wait_queue_head *head,
3698 struct poll_table_struct *p)
3699{
3700 struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
3701
3702 __io_queue_proc(&pt->req->apoll->poll, pt, head);
3703}
3704
3705static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
3706 __poll_t mask, task_work_func_t func)
3707{
3708 struct task_struct *tsk;
3709
3710 /* for instances that support it check for an event match first: */
3711 if (mask && !(mask & poll->events))
3712 return 0;
3713
3714 trace_io_uring_task_add(req->ctx, req->opcode, req->user_data, mask);
3715
3716 list_del_init(&poll->wait.entry);
3717
3718 tsk = req->task;
3719 req->result = mask;
3720 init_task_work(&req->task_work, func);
3721 /*
3722 * If this fails, then the task is exiting. If that is the case, then
3723 * the exit check will ultimately cancel these work items. Hence we
3724 * don't need to check here and handle it specifically.
3725 */
3726 task_work_add(tsk, &req->task_work, true);
3727 wake_up_process(tsk);
3728 return 1;
3729}
3730
3731static void io_async_task_func(struct callback_head *cb)
3732{
3733 struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
3734 struct async_poll *apoll = req->apoll;
3735 struct io_ring_ctx *ctx = req->ctx;
3736
3737 trace_io_uring_task_run(req->ctx, req->opcode, req->user_data);
3738
3739 WARN_ON_ONCE(!list_empty(&req->apoll->poll.wait.entry));
3740
3741 if (hash_hashed(&req->hash_node)) {
3742 spin_lock_irq(&ctx->completion_lock);
3743 hash_del(&req->hash_node);
3744 spin_unlock_irq(&ctx->completion_lock);
3745 }
3746
3747 /* restore ->work in case we need to retry again */
3748 memcpy(&req->work, &apoll->work, sizeof(req->work));
3749
3750 __set_current_state(TASK_RUNNING);
3751 mutex_lock(&ctx->uring_lock);
3752 __io_queue_sqe(req, NULL);
3753 mutex_unlock(&ctx->uring_lock);
3754
3755 kfree(apoll);
3756}
3757
3758static int io_async_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
3759 void *key)
3760{
3761 struct io_kiocb *req = wait->private;
3762 struct io_poll_iocb *poll = &req->apoll->poll;
3763
3764 trace_io_uring_poll_wake(req->ctx, req->opcode, req->user_data,
3765 key_to_poll(key));
3766
3767 return __io_async_wake(req, poll, key_to_poll(key), io_async_task_func);
3768}
3769
3770static void io_poll_req_insert(struct io_kiocb *req)
3771{
3772 struct io_ring_ctx *ctx = req->ctx;
3773 struct hlist_head *list;
3774
3775 list = &ctx->cancel_hash[hash_long(req->user_data, ctx->cancel_hash_bits)];
3776 hlist_add_head(&req->hash_node, list);
3777}
3778
3779static __poll_t __io_arm_poll_handler(struct io_kiocb *req,
3780 struct io_poll_iocb *poll,
3781 struct io_poll_table *ipt, __poll_t mask,
3782 wait_queue_func_t wake_func)
3783 __acquires(&ctx->completion_lock)
3784{
3785 struct io_ring_ctx *ctx = req->ctx;
3786 bool cancel = false;
3787
3788 poll->file = req->file;
3789 poll->head = NULL;
3790 poll->done = poll->canceled = false;
3791 poll->events = mask;
3792
3793 ipt->pt._key = mask;
3794 ipt->req = req;
3795 ipt->error = -EINVAL;
3796
3797 INIT_LIST_HEAD(&poll->wait.entry);
3798 init_waitqueue_func_entry(&poll->wait, wake_func);
3799 poll->wait.private = req;
3800
3801 mask = vfs_poll(req->file, &ipt->pt) & poll->events;
3802
3803 spin_lock_irq(&ctx->completion_lock);
3804 if (likely(poll->head)) {
3805 spin_lock(&poll->head->lock);
3806 if (unlikely(list_empty(&poll->wait.entry))) {
3807 if (ipt->error)
3808 cancel = true;
3809 ipt->error = 0;
3810 mask = 0;
3811 }
3812 if (mask || ipt->error)
3813 list_del_init(&poll->wait.entry);
3814 else if (cancel)
3815 WRITE_ONCE(poll->canceled, true);
3816 else if (!poll->done) /* actually waiting for an event */
3817 io_poll_req_insert(req);
3818 spin_unlock(&poll->head->lock);
3819 }
3820
3821 return mask;
3822}
3823
3824static bool io_arm_poll_handler(struct io_kiocb *req)
3825{
3826 const struct io_op_def *def = &io_op_defs[req->opcode];
3827 struct io_ring_ctx *ctx = req->ctx;
3828 struct async_poll *apoll;
3829 struct io_poll_table ipt;
3830 __poll_t mask, ret;
3831
3832 if (!req->file || !file_can_poll(req->file))
3833 return false;
3834 if (req->flags & (REQ_F_MUST_PUNT | REQ_F_POLLED))
3835 return false;
3836 if (!def->pollin && !def->pollout)
3837 return false;
3838
3839 apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
3840 if (unlikely(!apoll))
3841 return false;
3842
3843 req->flags |= REQ_F_POLLED;
3844 memcpy(&apoll->work, &req->work, sizeof(req->work));
3845
3846 /*
3847 * Don't need a reference here, as we're adding it to the task
3848 * task_works list. If the task exits, the list is pruned.
3849 */
3850 req->task = current;
3851 req->apoll = apoll;
3852 INIT_HLIST_NODE(&req->hash_node);
3853
8755d97a 3854 mask = 0;
d7718a9d 3855 if (def->pollin)
8755d97a 3856 mask |= POLLIN | POLLRDNORM;
d7718a9d
JA
3857 if (def->pollout)
3858 mask |= POLLOUT | POLLWRNORM;
3859 mask |= POLLERR | POLLPRI;
3860
3861 ipt.pt._qproc = io_async_queue_proc;
3862
3863 ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask,
3864 io_async_wake);
3865 if (ret) {
3866 ipt.error = 0;
3867 apoll->poll.done = true;
3868 spin_unlock_irq(&ctx->completion_lock);
3869 memcpy(&req->work, &apoll->work, sizeof(req->work));
3870 kfree(apoll);
3871 return false;
3872 }
3873 spin_unlock_irq(&ctx->completion_lock);
3874 trace_io_uring_poll_arm(ctx, req->opcode, req->user_data, mask,
3875 apoll->poll.events);
3876 return true;
3877}
3878
3879static bool __io_poll_remove_one(struct io_kiocb *req,
3880 struct io_poll_iocb *poll)
221c5eb2 3881{
b41e9852 3882 bool do_complete = false;
221c5eb2
JA
3883
3884 spin_lock(&poll->head->lock);
3885 WRITE_ONCE(poll->canceled, true);
392edb45
JA
3886 if (!list_empty(&poll->wait.entry)) {
3887 list_del_init(&poll->wait.entry);
b41e9852 3888 do_complete = true;
221c5eb2
JA
3889 }
3890 spin_unlock(&poll->head->lock);
d7718a9d
JA
3891 return do_complete;
3892}
3893
3894static bool io_poll_remove_one(struct io_kiocb *req)
3895{
3896 bool do_complete;
3897
3898 if (req->opcode == IORING_OP_POLL_ADD) {
3899 do_complete = __io_poll_remove_one(req, &req->poll);
3900 } else {
3901 /* non-poll requests have submit ref still */
3902 do_complete = __io_poll_remove_one(req, &req->apoll->poll);
3903 if (do_complete)
3904 io_put_req(req);
3905 }
3906
78076bb6 3907 hash_del(&req->hash_node);
d7718a9d 3908
b41e9852
JA
3909 if (do_complete) {
3910 io_cqring_fill_event(req, -ECANCELED);
3911 io_commit_cqring(req->ctx);
3912 req->flags |= REQ_F_COMP_LOCKED;
3913 io_put_req(req);
3914 }
3915
3916 return do_complete;
221c5eb2
JA
3917}
3918
3919static void io_poll_remove_all(struct io_ring_ctx *ctx)
3920{
78076bb6 3921 struct hlist_node *tmp;
221c5eb2 3922 struct io_kiocb *req;
78076bb6 3923 int i;
221c5eb2
JA
3924
3925 spin_lock_irq(&ctx->completion_lock);
78076bb6
JA
3926 for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
3927 struct hlist_head *list;
3928
3929 list = &ctx->cancel_hash[i];
3930 hlist_for_each_entry_safe(req, tmp, list, hash_node)
3931 io_poll_remove_one(req);
221c5eb2
JA
3932 }
3933 spin_unlock_irq(&ctx->completion_lock);
b41e9852
JA
3934
3935 io_cqring_ev_posted(ctx);
221c5eb2
JA
3936}
3937
47f46768
JA
3938static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr)
3939{
78076bb6 3940 struct hlist_head *list;
47f46768
JA
3941 struct io_kiocb *req;
3942
78076bb6
JA
3943 list = &ctx->cancel_hash[hash_long(sqe_addr, ctx->cancel_hash_bits)];
3944 hlist_for_each_entry(req, list, hash_node) {
b41e9852
JA
3945 if (sqe_addr != req->user_data)
3946 continue;
3947 if (io_poll_remove_one(req))
eac406c6 3948 return 0;
b41e9852 3949 return -EALREADY;
47f46768
JA
3950 }
3951
3952 return -ENOENT;
3953}
3954
3529d8c2
JA
3955static int io_poll_remove_prep(struct io_kiocb *req,
3956 const struct io_uring_sqe *sqe)
0969e783 3957{
0969e783
JA
3958 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3959 return -EINVAL;
3960 if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index ||
3961 sqe->poll_events)
3962 return -EINVAL;
3963
3964 req->poll.addr = READ_ONCE(sqe->addr);
0969e783
JA
3965 return 0;
3966}
3967
221c5eb2
JA
3968/*
3969 * Find a running poll command that matches one specified in sqe->addr,
3970 * and remove it if found.
3971 */
fc4df999 3972static int io_poll_remove(struct io_kiocb *req)
221c5eb2
JA
3973{
3974 struct io_ring_ctx *ctx = req->ctx;
0969e783 3975 u64 addr;
47f46768 3976 int ret;
221c5eb2 3977
0969e783 3978 addr = req->poll.addr;
221c5eb2 3979 spin_lock_irq(&ctx->completion_lock);
0969e783 3980 ret = io_poll_cancel(ctx, addr);
221c5eb2
JA
3981 spin_unlock_irq(&ctx->completion_lock);
3982
78e19bbe 3983 io_cqring_add_event(req, ret);
4e88d6e7
JA
3984 if (ret < 0)
3985 req_set_fail_links(req);
e65ef56d 3986 io_put_req(req);
221c5eb2
JA
3987 return 0;
3988}
3989
b0dd8a41 3990static void io_poll_complete(struct io_kiocb *req, __poll_t mask, int error)
221c5eb2 3991{
a197f664
JL
3992 struct io_ring_ctx *ctx = req->ctx;
3993
8c838788 3994 req->poll.done = true;
b0a20349 3995 io_cqring_fill_event(req, error ? error : mangle_poll(mask));
8c838788 3996 io_commit_cqring(ctx);
221c5eb2
JA
3997}
3998
b41e9852 3999static void io_poll_task_handler(struct io_kiocb *req, struct io_kiocb **nxt)
221c5eb2 4000{
221c5eb2 4001 struct io_ring_ctx *ctx = req->ctx;
221c5eb2 4002
221c5eb2 4003 spin_lock_irq(&ctx->completion_lock);
78076bb6 4004 hash_del(&req->hash_node);
b41e9852
JA
4005 io_poll_complete(req, req->result, 0);
4006 req->flags |= REQ_F_COMP_LOCKED;
4007 io_put_req_find_next(req, nxt);
e94f141b
JA
4008 spin_unlock_irq(&ctx->completion_lock);
4009
4010 io_cqring_ev_posted(ctx);
e94f141b
JA
4011}
4012
b41e9852 4013static void io_poll_task_func(struct callback_head *cb)
f0b493e6 4014{
b41e9852
JA
4015 struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
4016 struct io_kiocb *nxt = NULL;
f0b493e6 4017
b41e9852 4018 io_poll_task_handler(req, &nxt);
d7718a9d
JA
4019 if (nxt) {
4020 struct io_ring_ctx *ctx = nxt->ctx;
4021
4022 mutex_lock(&ctx->uring_lock);
b41e9852 4023 __io_queue_sqe(nxt, NULL);
d7718a9d
JA
4024 mutex_unlock(&ctx->uring_lock);
4025 }
f0b493e6
JA
4026}
4027
221c5eb2
JA
4028static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
4029 void *key)
4030{
c2f2eb7d
JA
4031 struct io_kiocb *req = wait->private;
4032 struct io_poll_iocb *poll = &req->poll;
221c5eb2 4033
d7718a9d 4034 return __io_async_wake(req, poll, key_to_poll(key), io_poll_task_func);
221c5eb2
JA
4035}
4036
221c5eb2
JA
4037static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
4038 struct poll_table_struct *p)
4039{
4040 struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
4041
d7718a9d 4042 __io_queue_proc(&pt->req->poll, pt, head);
eac406c6
JA
4043}
4044
3529d8c2 4045static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
221c5eb2
JA
4046{
4047 struct io_poll_iocb *poll = &req->poll;
221c5eb2 4048 u16 events;
221c5eb2
JA
4049
4050 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4051 return -EINVAL;
4052 if (sqe->addr || sqe->ioprio || sqe->off || sqe->len || sqe->buf_index)
4053 return -EINVAL;
09bb8394
JA
4054 if (!poll->file)
4055 return -EBADF;
221c5eb2 4056
221c5eb2
JA
4057 events = READ_ONCE(sqe->poll_events);
4058 poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP;
b41e9852 4059
d7718a9d
JA
4060 /*
4061 * Don't need a reference here, as we're adding it to the task
4062 * task_works list. If the task exits, the list is pruned.
4063 */
b41e9852 4064 req->task = current;
0969e783
JA
4065 return 0;
4066}
4067
014db007 4068static int io_poll_add(struct io_kiocb *req)
0969e783
JA
4069{
4070 struct io_poll_iocb *poll = &req->poll;
4071 struct io_ring_ctx *ctx = req->ctx;
4072 struct io_poll_table ipt;
0969e783 4073 __poll_t mask;
0969e783 4074
78076bb6 4075 INIT_HLIST_NODE(&req->hash_node);
36703247 4076 INIT_LIST_HEAD(&req->list);
d7718a9d 4077 ipt.pt._qproc = io_poll_queue_proc;
36703247 4078
d7718a9d
JA
4079 mask = __io_arm_poll_handler(req, &req->poll, &ipt, poll->events,
4080 io_poll_wake);
221c5eb2 4081
8c838788 4082 if (mask) { /* no async, we'd stolen it */
221c5eb2 4083 ipt.error = 0;
b0dd8a41 4084 io_poll_complete(req, mask, 0);
221c5eb2 4085 }
221c5eb2
JA
4086 spin_unlock_irq(&ctx->completion_lock);
4087
8c838788
JA
4088 if (mask) {
4089 io_cqring_ev_posted(ctx);
014db007 4090 io_put_req(req);
221c5eb2 4091 }
8c838788 4092 return ipt.error;
221c5eb2
JA
4093}
4094
5262f567
JA
4095static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
4096{
ad8a48ac
JA
4097 struct io_timeout_data *data = container_of(timer,
4098 struct io_timeout_data, timer);
4099 struct io_kiocb *req = data->req;
4100 struct io_ring_ctx *ctx = req->ctx;
5262f567
JA
4101 unsigned long flags;
4102
5262f567
JA
4103 atomic_inc(&ctx->cq_timeouts);
4104
4105 spin_lock_irqsave(&ctx->completion_lock, flags);
ef03681a 4106 /*
11365043
JA
4107 * We could be racing with timeout deletion. If the list is empty,
4108 * then timeout lookup already found it and will be handling it.
ef03681a 4109 */
842f9612 4110 if (!list_empty(&req->list)) {
11365043 4111 struct io_kiocb *prev;
5262f567 4112
11365043
JA
4113 /*
4114 * Adjust the reqs sequence before the current one because it
d195a66e 4115 * will consume a slot in the cq_ring and the cq_tail
11365043
JA
4116 * pointer will be increased, otherwise other timeout reqs may
4117 * return in advance without waiting for enough wait_nr.
4118 */
4119 prev = req;
4120 list_for_each_entry_continue_reverse(prev, &ctx->timeout_list, list)
4121 prev->sequence++;
11365043 4122 list_del_init(&req->list);
11365043 4123 }
5262f567 4124
78e19bbe 4125 io_cqring_fill_event(req, -ETIME);
5262f567
JA
4126 io_commit_cqring(ctx);
4127 spin_unlock_irqrestore(&ctx->completion_lock, flags);
4128
4129 io_cqring_ev_posted(ctx);
4e88d6e7 4130 req_set_fail_links(req);
5262f567
JA
4131 io_put_req(req);
4132 return HRTIMER_NORESTART;
4133}
4134
47f46768
JA
4135static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
4136{
4137 struct io_kiocb *req;
4138 int ret = -ENOENT;
4139
4140 list_for_each_entry(req, &ctx->timeout_list, list) {
4141 if (user_data == req->user_data) {
4142 list_del_init(&req->list);
4143 ret = 0;
4144 break;
4145 }
4146 }
4147
4148 if (ret == -ENOENT)
4149 return ret;
4150
2d28390a 4151 ret = hrtimer_try_to_cancel(&req->io->timeout.timer);
47f46768
JA
4152 if (ret == -1)
4153 return -EALREADY;
4154
4e88d6e7 4155 req_set_fail_links(req);
47f46768
JA
4156 io_cqring_fill_event(req, -ECANCELED);
4157 io_put_req(req);
4158 return 0;
4159}
4160
3529d8c2
JA
4161static int io_timeout_remove_prep(struct io_kiocb *req,
4162 const struct io_uring_sqe *sqe)
b29472ee 4163{
b29472ee
JA
4164 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4165 return -EINVAL;
4166 if (sqe->flags || sqe->ioprio || sqe->buf_index || sqe->len)
4167 return -EINVAL;
4168
4169 req->timeout.addr = READ_ONCE(sqe->addr);
4170 req->timeout.flags = READ_ONCE(sqe->timeout_flags);
4171 if (req->timeout.flags)
4172 return -EINVAL;
4173
b29472ee
JA
4174 return 0;
4175}
4176
11365043
JA
4177/*
4178 * Remove or update an existing timeout command
4179 */
fc4df999 4180static int io_timeout_remove(struct io_kiocb *req)
11365043
JA
4181{
4182 struct io_ring_ctx *ctx = req->ctx;
47f46768 4183 int ret;
11365043 4184
11365043 4185 spin_lock_irq(&ctx->completion_lock);
b29472ee 4186 ret = io_timeout_cancel(ctx, req->timeout.addr);
11365043 4187
47f46768 4188 io_cqring_fill_event(req, ret);
11365043
JA
4189 io_commit_cqring(ctx);
4190 spin_unlock_irq(&ctx->completion_lock);
5262f567 4191 io_cqring_ev_posted(ctx);
4e88d6e7
JA
4192 if (ret < 0)
4193 req_set_fail_links(req);
ec9c02ad 4194 io_put_req(req);
11365043 4195 return 0;
5262f567
JA
4196}
4197
3529d8c2 4198static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
2d28390a 4199 bool is_timeout_link)
5262f567 4200{
ad8a48ac 4201 struct io_timeout_data *data;
a41525ab 4202 unsigned flags;
5262f567 4203
ad8a48ac 4204 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5262f567 4205 return -EINVAL;
ad8a48ac 4206 if (sqe->ioprio || sqe->buf_index || sqe->len != 1)
a41525ab 4207 return -EINVAL;
2d28390a
JA
4208 if (sqe->off && is_timeout_link)
4209 return -EINVAL;
a41525ab
JA
4210 flags = READ_ONCE(sqe->timeout_flags);
4211 if (flags & ~IORING_TIMEOUT_ABS)
5262f567 4212 return -EINVAL;
bdf20073 4213
26a61679
JA
4214 req->timeout.count = READ_ONCE(sqe->off);
4215
3529d8c2 4216 if (!req->io && io_alloc_async_ctx(req))
26a61679
JA
4217 return -ENOMEM;
4218
4219 data = &req->io->timeout;
ad8a48ac 4220 data->req = req;
ad8a48ac
JA
4221 req->flags |= REQ_F_TIMEOUT;
4222
4223 if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr)))
5262f567
JA
4224 return -EFAULT;
4225
11365043 4226 if (flags & IORING_TIMEOUT_ABS)
ad8a48ac 4227 data->mode = HRTIMER_MODE_ABS;
11365043 4228 else
ad8a48ac 4229 data->mode = HRTIMER_MODE_REL;
11365043 4230
ad8a48ac
JA
4231 hrtimer_init(&data->timer, CLOCK_MONOTONIC, data->mode);
4232 return 0;
4233}
4234
fc4df999 4235static int io_timeout(struct io_kiocb *req)
ad8a48ac
JA
4236{
4237 unsigned count;
4238 struct io_ring_ctx *ctx = req->ctx;
4239 struct io_timeout_data *data;
4240 struct list_head *entry;
4241 unsigned span = 0;
ad8a48ac 4242
2d28390a 4243 data = &req->io->timeout;
93bd25bb 4244
5262f567
JA
4245 /*
4246 * sqe->off holds how many events that need to occur for this
93bd25bb
JA
4247 * timeout event to be satisfied. If it isn't set, then this is
4248 * a pure timeout request, sequence isn't used.
5262f567 4249 */
26a61679 4250 count = req->timeout.count;
93bd25bb
JA
4251 if (!count) {
4252 req->flags |= REQ_F_TIMEOUT_NOSEQ;
4253 spin_lock_irq(&ctx->completion_lock);
4254 entry = ctx->timeout_list.prev;
4255 goto add;
4256 }
5262f567
JA
4257
4258 req->sequence = ctx->cached_sq_head + count - 1;
2d28390a 4259 data->seq_offset = count;
5262f567
JA
4260
4261 /*
4262 * Insertion sort, ensuring the first entry in the list is always
4263 * the one we need first.
4264 */
5262f567
JA
4265 spin_lock_irq(&ctx->completion_lock);
4266 list_for_each_prev(entry, &ctx->timeout_list) {
4267 struct io_kiocb *nxt = list_entry(entry, struct io_kiocb, list);
5da0fb1a 4268 unsigned nxt_sq_head;
4269 long long tmp, tmp_nxt;
2d28390a 4270 u32 nxt_offset = nxt->io->timeout.seq_offset;
5262f567 4271
93bd25bb
JA
4272 if (nxt->flags & REQ_F_TIMEOUT_NOSEQ)
4273 continue;
4274
5da0fb1a 4275 /*
4276 * Since cached_sq_head + count - 1 can overflow, use type long
4277 * long to store it.
4278 */
4279 tmp = (long long)ctx->cached_sq_head + count - 1;
cc42e0ac
PB
4280 nxt_sq_head = nxt->sequence - nxt_offset + 1;
4281 tmp_nxt = (long long)nxt_sq_head + nxt_offset - 1;
5da0fb1a 4282
4283 /*
4284 * cached_sq_head may overflow, and it will never overflow twice
4285 * once there is some timeout req still be valid.
4286 */
4287 if (ctx->cached_sq_head < nxt_sq_head)
8b07a65a 4288 tmp += UINT_MAX;
5da0fb1a 4289
a1f58ba4 4290 if (tmp > tmp_nxt)
5262f567 4291 break;
a1f58ba4 4292
4293 /*
4294 * Sequence of reqs after the insert one and itself should
4295 * be adjusted because each timeout req consumes a slot.
4296 */
4297 span++;
4298 nxt->sequence++;
5262f567 4299 }
a1f58ba4 4300 req->sequence -= span;
93bd25bb 4301add:
5262f567 4302 list_add(&req->list, entry);
ad8a48ac
JA
4303 data->timer.function = io_timeout_fn;
4304 hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode);
5262f567 4305 spin_unlock_irq(&ctx->completion_lock);
5262f567
JA
4306 return 0;
4307}
5262f567 4308
62755e35
JA
4309static bool io_cancel_cb(struct io_wq_work *work, void *data)
4310{
4311 struct io_kiocb *req = container_of(work, struct io_kiocb, work);
4312
4313 return req->user_data == (unsigned long) data;
4314}
4315
e977d6d3 4316static int io_async_cancel_one(struct io_ring_ctx *ctx, void *sqe_addr)
62755e35 4317{
62755e35 4318 enum io_wq_cancel cancel_ret;
62755e35
JA
4319 int ret = 0;
4320
62755e35
JA
4321 cancel_ret = io_wq_cancel_cb(ctx->io_wq, io_cancel_cb, sqe_addr);
4322 switch (cancel_ret) {
4323 case IO_WQ_CANCEL_OK:
4324 ret = 0;
4325 break;
4326 case IO_WQ_CANCEL_RUNNING:
4327 ret = -EALREADY;
4328 break;
4329 case IO_WQ_CANCEL_NOTFOUND:
4330 ret = -ENOENT;
4331 break;
4332 }
4333
e977d6d3
JA
4334 return ret;
4335}
4336
47f46768
JA
4337static void io_async_find_and_cancel(struct io_ring_ctx *ctx,
4338 struct io_kiocb *req, __u64 sqe_addr,
014db007 4339 int success_ret)
47f46768
JA
4340{
4341 unsigned long flags;
4342 int ret;
4343
4344 ret = io_async_cancel_one(ctx, (void *) (unsigned long) sqe_addr);
4345 if (ret != -ENOENT) {
4346 spin_lock_irqsave(&ctx->completion_lock, flags);
4347 goto done;
4348 }
4349
4350 spin_lock_irqsave(&ctx->completion_lock, flags);
4351 ret = io_timeout_cancel(ctx, sqe_addr);
4352 if (ret != -ENOENT)
4353 goto done;
4354 ret = io_poll_cancel(ctx, sqe_addr);
4355done:
b0dd8a41
JA
4356 if (!ret)
4357 ret = success_ret;
47f46768
JA
4358 io_cqring_fill_event(req, ret);
4359 io_commit_cqring(ctx);
4360 spin_unlock_irqrestore(&ctx->completion_lock, flags);
4361 io_cqring_ev_posted(ctx);
4362
4e88d6e7
JA
4363 if (ret < 0)
4364 req_set_fail_links(req);
014db007 4365 io_put_req(req);
47f46768
JA
4366}
4367
3529d8c2
JA
4368static int io_async_cancel_prep(struct io_kiocb *req,
4369 const struct io_uring_sqe *sqe)
e977d6d3 4370{
fbf23849 4371 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
e977d6d3
JA
4372 return -EINVAL;
4373 if (sqe->flags || sqe->ioprio || sqe->off || sqe->len ||
4374 sqe->cancel_flags)
4375 return -EINVAL;
4376
fbf23849
JA
4377 req->cancel.addr = READ_ONCE(sqe->addr);
4378 return 0;
4379}
4380
014db007 4381static int io_async_cancel(struct io_kiocb *req)
fbf23849
JA
4382{
4383 struct io_ring_ctx *ctx = req->ctx;
fbf23849 4384
014db007 4385 io_async_find_and_cancel(ctx, req, req->cancel.addr, 0);
5262f567
JA
4386 return 0;
4387}
4388
05f3fb3c
JA
4389static int io_files_update_prep(struct io_kiocb *req,
4390 const struct io_uring_sqe *sqe)
4391{
4392 if (sqe->flags || sqe->ioprio || sqe->rw_flags)
4393 return -EINVAL;
4394
4395 req->files_update.offset = READ_ONCE(sqe->off);
4396 req->files_update.nr_args = READ_ONCE(sqe->len);
4397 if (!req->files_update.nr_args)
4398 return -EINVAL;
4399 req->files_update.arg = READ_ONCE(sqe->addr);
4400 return 0;
4401}
4402
4403static int io_files_update(struct io_kiocb *req, bool force_nonblock)
fbf23849
JA
4404{
4405 struct io_ring_ctx *ctx = req->ctx;
05f3fb3c
JA
4406 struct io_uring_files_update up;
4407 int ret;
fbf23849 4408
f86cd20c 4409 if (force_nonblock)
05f3fb3c 4410 return -EAGAIN;
05f3fb3c
JA
4411
4412 up.offset = req->files_update.offset;
4413 up.fds = req->files_update.arg;
4414
4415 mutex_lock(&ctx->uring_lock);
4416 ret = __io_sqe_files_update(ctx, &up, req->files_update.nr_args);
4417 mutex_unlock(&ctx->uring_lock);
4418
4419 if (ret < 0)
4420 req_set_fail_links(req);
4421 io_cqring_add_event(req, ret);
4422 io_put_req(req);
5262f567
JA
4423 return 0;
4424}
4425
3529d8c2
JA
4426static int io_req_defer_prep(struct io_kiocb *req,
4427 const struct io_uring_sqe *sqe)
f67676d1 4428{
e781573e 4429 ssize_t ret = 0;
f67676d1 4430
f86cd20c
JA
4431 if (io_op_defs[req->opcode].file_table) {
4432 ret = io_grab_files(req);
4433 if (unlikely(ret))
4434 return ret;
4435 }
4436
cccf0ee8
JA
4437 io_req_work_grab_env(req, &io_op_defs[req->opcode]);
4438
d625c6ee 4439 switch (req->opcode) {
e781573e
JA
4440 case IORING_OP_NOP:
4441 break;
f67676d1
JA
4442 case IORING_OP_READV:
4443 case IORING_OP_READ_FIXED:
3a6820f2 4444 case IORING_OP_READ:
3529d8c2 4445 ret = io_read_prep(req, sqe, true);
f67676d1
JA
4446 break;
4447 case IORING_OP_WRITEV:
4448 case IORING_OP_WRITE_FIXED:
3a6820f2 4449 case IORING_OP_WRITE:
3529d8c2 4450 ret = io_write_prep(req, sqe, true);
f67676d1 4451 break;
0969e783 4452 case IORING_OP_POLL_ADD:
3529d8c2 4453 ret = io_poll_add_prep(req, sqe);
0969e783
JA
4454 break;
4455 case IORING_OP_POLL_REMOVE:
3529d8c2 4456 ret = io_poll_remove_prep(req, sqe);
0969e783 4457 break;
8ed8d3c3 4458 case IORING_OP_FSYNC:
3529d8c2 4459 ret = io_prep_fsync(req, sqe);
8ed8d3c3
JA
4460 break;
4461 case IORING_OP_SYNC_FILE_RANGE:
3529d8c2 4462 ret = io_prep_sfr(req, sqe);
8ed8d3c3 4463 break;
03b1230c 4464 case IORING_OP_SENDMSG:
fddaface 4465 case IORING_OP_SEND:
3529d8c2 4466 ret = io_sendmsg_prep(req, sqe);
03b1230c
JA
4467 break;
4468 case IORING_OP_RECVMSG:
fddaface 4469 case IORING_OP_RECV:
3529d8c2 4470 ret = io_recvmsg_prep(req, sqe);
03b1230c 4471 break;
f499a021 4472 case IORING_OP_CONNECT:
3529d8c2 4473 ret = io_connect_prep(req, sqe);
f499a021 4474 break;
2d28390a 4475 case IORING_OP_TIMEOUT:
3529d8c2 4476 ret = io_timeout_prep(req, sqe, false);
b7bb4f7d 4477 break;
b29472ee 4478 case IORING_OP_TIMEOUT_REMOVE:
3529d8c2 4479 ret = io_timeout_remove_prep(req, sqe);
b29472ee 4480 break;
fbf23849 4481 case IORING_OP_ASYNC_CANCEL:
3529d8c2 4482 ret = io_async_cancel_prep(req, sqe);
fbf23849 4483 break;
2d28390a 4484 case IORING_OP_LINK_TIMEOUT:
3529d8c2 4485 ret = io_timeout_prep(req, sqe, true);
b7bb4f7d 4486 break;
8ed8d3c3 4487 case IORING_OP_ACCEPT:
3529d8c2 4488 ret = io_accept_prep(req, sqe);
8ed8d3c3 4489 break;
d63d1b5e
JA
4490 case IORING_OP_FALLOCATE:
4491 ret = io_fallocate_prep(req, sqe);
4492 break;
15b71abe
JA
4493 case IORING_OP_OPENAT:
4494 ret = io_openat_prep(req, sqe);
4495 break;
b5dba59e
JA
4496 case IORING_OP_CLOSE:
4497 ret = io_close_prep(req, sqe);
4498 break;
05f3fb3c
JA
4499 case IORING_OP_FILES_UPDATE:
4500 ret = io_files_update_prep(req, sqe);
4501 break;
eddc7ef5
JA
4502 case IORING_OP_STATX:
4503 ret = io_statx_prep(req, sqe);
4504 break;
4840e418
JA
4505 case IORING_OP_FADVISE:
4506 ret = io_fadvise_prep(req, sqe);
4507 break;
c1ca757b
JA
4508 case IORING_OP_MADVISE:
4509 ret = io_madvise_prep(req, sqe);
4510 break;
cebdb986
JA
4511 case IORING_OP_OPENAT2:
4512 ret = io_openat2_prep(req, sqe);
4513 break;
3e4827b0
JA
4514 case IORING_OP_EPOLL_CTL:
4515 ret = io_epoll_ctl_prep(req, sqe);
4516 break;
7d67af2c
PB
4517 case IORING_OP_SPLICE:
4518 ret = io_splice_prep(req, sqe);
4519 break;
ddf0322d
JA
4520 case IORING_OP_PROVIDE_BUFFERS:
4521 ret = io_provide_buffers_prep(req, sqe);
4522 break;
f67676d1 4523 default:
e781573e
JA
4524 printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n",
4525 req->opcode);
4526 ret = -EINVAL;
b7bb4f7d 4527 break;
f67676d1
JA
4528 }
4529
b7bb4f7d 4530 return ret;
f67676d1
JA
4531}
4532
3529d8c2 4533static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe)
de0617e4 4534{
a197f664 4535 struct io_ring_ctx *ctx = req->ctx;
f67676d1 4536 int ret;
de0617e4 4537
9d858b21
BL
4538 /* Still need defer if there is pending req in defer list. */
4539 if (!req_need_defer(req) && list_empty(&ctx->defer_list))
de0617e4
JA
4540 return 0;
4541
3529d8c2 4542 if (!req->io && io_alloc_async_ctx(req))
de0617e4
JA
4543 return -EAGAIN;
4544
3529d8c2 4545 ret = io_req_defer_prep(req, sqe);
b7bb4f7d 4546 if (ret < 0)
2d28390a 4547 return ret;
2d28390a 4548
de0617e4 4549 spin_lock_irq(&ctx->completion_lock);
9d858b21 4550 if (!req_need_defer(req) && list_empty(&ctx->defer_list)) {
de0617e4 4551 spin_unlock_irq(&ctx->completion_lock);
de0617e4
JA
4552 return 0;
4553 }
4554
915967f6 4555 trace_io_uring_defer(ctx, req, req->user_data);
de0617e4
JA
4556 list_add_tail(&req->list, &ctx->defer_list);
4557 spin_unlock_irq(&ctx->completion_lock);
4558 return -EIOCBQUEUED;
4559}
4560
99bc4c38
PB
4561static void io_cleanup_req(struct io_kiocb *req)
4562{
4563 struct io_async_ctx *io = req->io;
4564
4565 switch (req->opcode) {
4566 case IORING_OP_READV:
4567 case IORING_OP_READ_FIXED:
4568 case IORING_OP_READ:
4569 case IORING_OP_WRITEV:
4570 case IORING_OP_WRITE_FIXED:
4571 case IORING_OP_WRITE:
4572 if (io->rw.iov != io->rw.fast_iov)
4573 kfree(io->rw.iov);
4574 break;
4575 case IORING_OP_SENDMSG:
4576 case IORING_OP_RECVMSG:
4577 if (io->msg.iov != io->msg.fast_iov)
4578 kfree(io->msg.iov);
4579 break;
8fef80bf
PB
4580 case IORING_OP_OPENAT:
4581 case IORING_OP_OPENAT2:
4582 case IORING_OP_STATX:
4583 putname(req->open.filename);
4584 break;
7d67af2c
PB
4585 case IORING_OP_SPLICE:
4586 io_put_file(req, req->splice.file_in,
4587 (req->splice.flags & SPLICE_F_FD_IN_FIXED));
4588 break;
99bc4c38
PB
4589 }
4590
4591 req->flags &= ~REQ_F_NEED_CLEANUP;
4592}
4593
3529d8c2 4594static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
014db007 4595 bool force_nonblock)
2b188cc1 4596{
a197f664 4597 struct io_ring_ctx *ctx = req->ctx;
d625c6ee 4598 int ret;
2b188cc1 4599
d625c6ee 4600 switch (req->opcode) {
2b188cc1 4601 case IORING_OP_NOP:
78e19bbe 4602 ret = io_nop(req);
2b188cc1
JA
4603 break;
4604 case IORING_OP_READV:
edafccee 4605 case IORING_OP_READ_FIXED:
3a6820f2 4606 case IORING_OP_READ:
3529d8c2
JA
4607 if (sqe) {
4608 ret = io_read_prep(req, sqe, force_nonblock);
4609 if (ret < 0)
4610 break;
4611 }
014db007 4612 ret = io_read(req, force_nonblock);
edafccee 4613 break;
3529d8c2 4614 case IORING_OP_WRITEV:
edafccee 4615 case IORING_OP_WRITE_FIXED:
3a6820f2 4616 case IORING_OP_WRITE:
3529d8c2
JA
4617 if (sqe) {
4618 ret = io_write_prep(req, sqe, force_nonblock);
4619 if (ret < 0)
4620 break;
4621 }
014db007 4622 ret = io_write(req, force_nonblock);
2b188cc1 4623 break;
c992fe29 4624 case IORING_OP_FSYNC:
3529d8c2
JA
4625 if (sqe) {
4626 ret = io_prep_fsync(req, sqe);
4627 if (ret < 0)
4628 break;
4629 }
014db007 4630 ret = io_fsync(req, force_nonblock);
c992fe29 4631 break;
221c5eb2 4632 case IORING_OP_POLL_ADD:
3529d8c2
JA
4633 if (sqe) {
4634 ret = io_poll_add_prep(req, sqe);
4635 if (ret)
4636 break;
4637 }
014db007 4638 ret = io_poll_add(req);
221c5eb2
JA
4639 break;
4640 case IORING_OP_POLL_REMOVE:
3529d8c2
JA
4641 if (sqe) {
4642 ret = io_poll_remove_prep(req, sqe);
4643 if (ret < 0)
4644 break;
4645 }
fc4df999 4646 ret = io_poll_remove(req);
221c5eb2 4647 break;
5d17b4a4 4648 case IORING_OP_SYNC_FILE_RANGE:
3529d8c2
JA
4649 if (sqe) {
4650 ret = io_prep_sfr(req, sqe);
4651 if (ret < 0)
4652 break;
4653 }
014db007 4654 ret = io_sync_file_range(req, force_nonblock);
5d17b4a4 4655 break;
0fa03c62 4656 case IORING_OP_SENDMSG:
fddaface 4657 case IORING_OP_SEND:
3529d8c2
JA
4658 if (sqe) {
4659 ret = io_sendmsg_prep(req, sqe);
4660 if (ret < 0)
4661 break;
4662 }
fddaface 4663 if (req->opcode == IORING_OP_SENDMSG)
014db007 4664 ret = io_sendmsg(req, force_nonblock);
fddaface 4665 else
014db007 4666 ret = io_send(req, force_nonblock);
0fa03c62 4667 break;
aa1fa28f 4668 case IORING_OP_RECVMSG:
fddaface 4669 case IORING_OP_RECV:
3529d8c2
JA
4670 if (sqe) {
4671 ret = io_recvmsg_prep(req, sqe);
4672 if (ret)
4673 break;
4674 }
fddaface 4675 if (req->opcode == IORING_OP_RECVMSG)
014db007 4676 ret = io_recvmsg(req, force_nonblock);
fddaface 4677 else
014db007 4678 ret = io_recv(req, force_nonblock);
aa1fa28f 4679 break;
5262f567 4680 case IORING_OP_TIMEOUT:
3529d8c2
JA
4681 if (sqe) {
4682 ret = io_timeout_prep(req, sqe, false);
4683 if (ret)
4684 break;
4685 }
fc4df999 4686 ret = io_timeout(req);
5262f567 4687 break;
11365043 4688 case IORING_OP_TIMEOUT_REMOVE:
3529d8c2
JA
4689 if (sqe) {
4690 ret = io_timeout_remove_prep(req, sqe);
4691 if (ret)
4692 break;
4693 }
fc4df999 4694 ret = io_timeout_remove(req);
11365043 4695 break;
17f2fe35 4696 case IORING_OP_ACCEPT:
3529d8c2
JA
4697 if (sqe) {
4698 ret = io_accept_prep(req, sqe);
4699 if (ret)
4700 break;
4701 }
014db007 4702 ret = io_accept(req, force_nonblock);
17f2fe35 4703 break;
f8e85cf2 4704 case IORING_OP_CONNECT:
3529d8c2
JA
4705 if (sqe) {
4706 ret = io_connect_prep(req, sqe);
4707 if (ret)
4708 break;
4709 }
014db007 4710 ret = io_connect(req, force_nonblock);
f8e85cf2 4711 break;
62755e35 4712 case IORING_OP_ASYNC_CANCEL:
3529d8c2
JA
4713 if (sqe) {
4714 ret = io_async_cancel_prep(req, sqe);
4715 if (ret)
4716 break;
4717 }
014db007 4718 ret = io_async_cancel(req);
62755e35 4719 break;
d63d1b5e
JA
4720 case IORING_OP_FALLOCATE:
4721 if (sqe) {
4722 ret = io_fallocate_prep(req, sqe);
4723 if (ret)
4724 break;
4725 }
014db007 4726 ret = io_fallocate(req, force_nonblock);
d63d1b5e 4727 break;
15b71abe
JA
4728 case IORING_OP_OPENAT:
4729 if (sqe) {
4730 ret = io_openat_prep(req, sqe);
4731 if (ret)
4732 break;
4733 }
014db007 4734 ret = io_openat(req, force_nonblock);
15b71abe 4735 break;
b5dba59e
JA
4736 case IORING_OP_CLOSE:
4737 if (sqe) {
4738 ret = io_close_prep(req, sqe);
4739 if (ret)
4740 break;
4741 }
014db007 4742 ret = io_close(req, force_nonblock);
b5dba59e 4743 break;
05f3fb3c
JA
4744 case IORING_OP_FILES_UPDATE:
4745 if (sqe) {
4746 ret = io_files_update_prep(req, sqe);
4747 if (ret)
4748 break;
4749 }
4750 ret = io_files_update(req, force_nonblock);
4751 break;
eddc7ef5
JA
4752 case IORING_OP_STATX:
4753 if (sqe) {
4754 ret = io_statx_prep(req, sqe);
4755 if (ret)
4756 break;
4757 }
014db007 4758 ret = io_statx(req, force_nonblock);
eddc7ef5 4759 break;
4840e418
JA
4760 case IORING_OP_FADVISE:
4761 if (sqe) {
4762 ret = io_fadvise_prep(req, sqe);
4763 if (ret)
4764 break;
4765 }
014db007 4766 ret = io_fadvise(req, force_nonblock);
4840e418 4767 break;
c1ca757b
JA
4768 case IORING_OP_MADVISE:
4769 if (sqe) {
4770 ret = io_madvise_prep(req, sqe);
4771 if (ret)
4772 break;
4773 }
014db007 4774 ret = io_madvise(req, force_nonblock);
c1ca757b 4775 break;
cebdb986
JA
4776 case IORING_OP_OPENAT2:
4777 if (sqe) {
4778 ret = io_openat2_prep(req, sqe);
4779 if (ret)
4780 break;
4781 }
014db007 4782 ret = io_openat2(req, force_nonblock);
cebdb986 4783 break;
3e4827b0
JA
4784 case IORING_OP_EPOLL_CTL:
4785 if (sqe) {
4786 ret = io_epoll_ctl_prep(req, sqe);
4787 if (ret)
4788 break;
4789 }
014db007 4790 ret = io_epoll_ctl(req, force_nonblock);
3e4827b0 4791 break;
7d67af2c
PB
4792 case IORING_OP_SPLICE:
4793 if (sqe) {
4794 ret = io_splice_prep(req, sqe);
4795 if (ret < 0)
4796 break;
4797 }
014db007 4798 ret = io_splice(req, force_nonblock);
7d67af2c 4799 break;
ddf0322d
JA
4800 case IORING_OP_PROVIDE_BUFFERS:
4801 if (sqe) {
4802 ret = io_provide_buffers_prep(req, sqe);
4803 if (ret)
4804 break;
4805 }
4806 ret = io_provide_buffers(req, force_nonblock);
4807 break;
2b188cc1
JA
4808 default:
4809 ret = -EINVAL;
4810 break;
4811 }
4812
def596e9
JA
4813 if (ret)
4814 return ret;
4815
4816 if (ctx->flags & IORING_SETUP_IOPOLL) {
11ba820b
JA
4817 const bool in_async = io_wq_current_is_worker();
4818
9e645e11 4819 if (req->result == -EAGAIN)
def596e9
JA
4820 return -EAGAIN;
4821
11ba820b
JA
4822 /* workqueue context doesn't hold uring_lock, grab it now */
4823 if (in_async)
4824 mutex_lock(&ctx->uring_lock);
4825
def596e9 4826 io_iopoll_req_issued(req);
11ba820b
JA
4827
4828 if (in_async)
4829 mutex_unlock(&ctx->uring_lock);
def596e9
JA
4830 }
4831
4832 return 0;
2b188cc1
JA
4833}
4834
561fb04a 4835static void io_wq_submit_work(struct io_wq_work **workptr)
2b188cc1 4836{
561fb04a 4837 struct io_wq_work *work = *workptr;
2b188cc1 4838 struct io_kiocb *req = container_of(work, struct io_kiocb, work);
561fb04a 4839 int ret = 0;
2b188cc1 4840
0c9d5ccd
JA
4841 /* if NO_CANCEL is set, we must still run the work */
4842 if ((work->flags & (IO_WQ_WORK_CANCEL|IO_WQ_WORK_NO_CANCEL)) ==
4843 IO_WQ_WORK_CANCEL) {
561fb04a 4844 ret = -ECANCELED;
0c9d5ccd 4845 }
31b51510 4846
561fb04a 4847 if (!ret) {
561fb04a 4848 do {
014db007 4849 ret = io_issue_sqe(req, NULL, false);
561fb04a
JA
4850 /*
4851 * We can get EAGAIN for polled IO even though we're
4852 * forcing a sync submission from here, since we can't
4853 * wait for request slots on the block side.
4854 */
4855 if (ret != -EAGAIN)
4856 break;
4857 cond_resched();
4858 } while (1);
4859 }
31b51510 4860
561fb04a 4861 if (ret) {
4e88d6e7 4862 req_set_fail_links(req);
78e19bbe 4863 io_cqring_add_event(req, ret);
817869d2 4864 io_put_req(req);
edafccee 4865 }
2b188cc1 4866
e9fd9396 4867 io_steal_work(req, workptr);
2b188cc1
JA
4868}
4869
15b71abe 4870static int io_req_needs_file(struct io_kiocb *req, int fd)
9e3aa61a 4871{
d3656344 4872 if (!io_op_defs[req->opcode].needs_file)
9e3aa61a 4873 return 0;
0b5faf6b 4874 if ((fd == -1 || fd == AT_FDCWD) && io_op_defs[req->opcode].fd_non_neg)
d3656344
JA
4875 return 0;
4876 return 1;
09bb8394
JA
4877}
4878
65e19f54
JA
4879static inline struct file *io_file_from_index(struct io_ring_ctx *ctx,
4880 int index)
4881{
4882 struct fixed_file_table *table;
4883
05f3fb3c
JA
4884 table = &ctx->file_data->table[index >> IORING_FILE_TABLE_SHIFT];
4885 return table->files[index & IORING_FILE_TABLE_MASK];;
65e19f54
JA
4886}
4887
8da11c19
PB
4888static int io_file_get(struct io_submit_state *state, struct io_kiocb *req,
4889 int fd, struct file **out_file, bool fixed)
09bb8394 4890{
a197f664 4891 struct io_ring_ctx *ctx = req->ctx;
8da11c19 4892 struct file *file;
09bb8394 4893
8da11c19 4894 if (fixed) {
05f3fb3c 4895 if (unlikely(!ctx->file_data ||
09bb8394
JA
4896 (unsigned) fd >= ctx->nr_user_files))
4897 return -EBADF;
b7620121 4898 fd = array_index_nospec(fd, ctx->nr_user_files);
8da11c19
PB
4899 file = io_file_from_index(ctx, fd);
4900 if (!file)
08a45173 4901 return -EBADF;
05f3fb3c 4902 percpu_ref_get(&ctx->file_data->refs);
09bb8394 4903 } else {
c826bd7a 4904 trace_io_uring_file_get(ctx, fd);
8da11c19
PB
4905 file = __io_file_get(state, fd);
4906 if (unlikely(!file))
09bb8394
JA
4907 return -EBADF;
4908 }
4909
8da11c19 4910 *out_file = file;
09bb8394
JA
4911 return 0;
4912}
4913
8da11c19
PB
4914static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req,
4915 const struct io_uring_sqe *sqe)
4916{
4917 unsigned flags;
4918 int fd;
4919 bool fixed;
4920
4921 flags = READ_ONCE(sqe->flags);
4922 fd = READ_ONCE(sqe->fd);
4923
4924 if (!io_req_needs_file(req, fd))
4925 return 0;
4926
4927 fixed = (flags & IOSQE_FIXED_FILE);
4928 if (unlikely(!fixed && req->needs_fixed_file))
4929 return -EBADF;
4930
4931 return io_file_get(state, req, fd, &req->file, fixed);
4932}
4933
a197f664 4934static int io_grab_files(struct io_kiocb *req)
fcb323cc
JA
4935{
4936 int ret = -EBADF;
a197f664 4937 struct io_ring_ctx *ctx = req->ctx;
fcb323cc 4938
f86cd20c
JA
4939 if (req->work.files)
4940 return 0;
b14cca0c 4941 if (!ctx->ring_file)
b5dba59e
JA
4942 return -EBADF;
4943
fcb323cc
JA
4944 rcu_read_lock();
4945 spin_lock_irq(&ctx->inflight_lock);
4946 /*
4947 * We use the f_ops->flush() handler to ensure that we can flush
4948 * out work accessing these files if the fd is closed. Check if
4949 * the fd has changed since we started down this path, and disallow
4950 * this operation if it has.
4951 */
b14cca0c 4952 if (fcheck(ctx->ring_fd) == ctx->ring_file) {
fcb323cc
JA
4953 list_add(&req->inflight_entry, &ctx->inflight_list);
4954 req->flags |= REQ_F_INFLIGHT;
4955 req->work.files = current->files;
4956 ret = 0;
4957 }
4958 spin_unlock_irq(&ctx->inflight_lock);
4959 rcu_read_unlock();
4960
4961 return ret;
4962}
4963
2665abfd 4964static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
2b188cc1 4965{
ad8a48ac
JA
4966 struct io_timeout_data *data = container_of(timer,
4967 struct io_timeout_data, timer);
4968 struct io_kiocb *req = data->req;
2665abfd
JA
4969 struct io_ring_ctx *ctx = req->ctx;
4970 struct io_kiocb *prev = NULL;
4971 unsigned long flags;
2665abfd
JA
4972
4973 spin_lock_irqsave(&ctx->completion_lock, flags);
4974
4975 /*
4976 * We don't expect the list to be empty, that will only happen if we
4977 * race with the completion of the linked work.
4978 */
4493233e
PB
4979 if (!list_empty(&req->link_list)) {
4980 prev = list_entry(req->link_list.prev, struct io_kiocb,
4981 link_list);
5d960724 4982 if (refcount_inc_not_zero(&prev->refs)) {
4493233e 4983 list_del_init(&req->link_list);
5d960724
JA
4984 prev->flags &= ~REQ_F_LINK_TIMEOUT;
4985 } else
76a46e06 4986 prev = NULL;
2665abfd
JA
4987 }
4988
4989 spin_unlock_irqrestore(&ctx->completion_lock, flags);
4990
4991 if (prev) {
4e88d6e7 4992 req_set_fail_links(prev);
014db007 4993 io_async_find_and_cancel(ctx, req, prev->user_data, -ETIME);
76a46e06 4994 io_put_req(prev);
47f46768
JA
4995 } else {
4996 io_cqring_add_event(req, -ETIME);
4997 io_put_req(req);
2665abfd 4998 }
2665abfd
JA
4999 return HRTIMER_NORESTART;
5000}
5001
ad8a48ac 5002static void io_queue_linked_timeout(struct io_kiocb *req)
2665abfd 5003{
76a46e06 5004 struct io_ring_ctx *ctx = req->ctx;
2665abfd 5005
76a46e06
JA
5006 /*
5007 * If the list is now empty, then our linked request finished before
5008 * we got a chance to setup the timer
5009 */
5010 spin_lock_irq(&ctx->completion_lock);
4493233e 5011 if (!list_empty(&req->link_list)) {
2d28390a 5012 struct io_timeout_data *data = &req->io->timeout;
94ae5e77 5013
ad8a48ac
JA
5014 data->timer.function = io_link_timeout_fn;
5015 hrtimer_start(&data->timer, timespec64_to_ktime(data->ts),
5016 data->mode);
2665abfd 5017 }
76a46e06 5018 spin_unlock_irq(&ctx->completion_lock);
2665abfd 5019
2665abfd 5020 /* drop submission reference */
76a46e06
JA
5021 io_put_req(req);
5022}
2665abfd 5023
ad8a48ac 5024static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
2665abfd
JA
5025{
5026 struct io_kiocb *nxt;
5027
5028 if (!(req->flags & REQ_F_LINK))
5029 return NULL;
d7718a9d
JA
5030 /* for polled retry, if flag is set, we already went through here */
5031 if (req->flags & REQ_F_POLLED)
5032 return NULL;
2665abfd 5033
4493233e
PB
5034 nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb,
5035 link_list);
d625c6ee 5036 if (!nxt || nxt->opcode != IORING_OP_LINK_TIMEOUT)
76a46e06 5037 return NULL;
2665abfd 5038
76a46e06 5039 req->flags |= REQ_F_LINK_TIMEOUT;
76a46e06 5040 return nxt;
2665abfd
JA
5041}
5042
3529d8c2 5043static void __io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe)
2b188cc1 5044{
4a0a7a18 5045 struct io_kiocb *linked_timeout;
4bc4494e 5046 struct io_kiocb *nxt;
193155c8 5047 const struct cred *old_creds = NULL;
e0c5c576 5048 int ret;
2b188cc1 5049
4a0a7a18
JA
5050again:
5051 linked_timeout = io_prep_linked_timeout(req);
5052
193155c8
JA
5053 if (req->work.creds && req->work.creds != current_cred()) {
5054 if (old_creds)
5055 revert_creds(old_creds);
5056 if (old_creds == req->work.creds)
5057 old_creds = NULL; /* restored original creds */
5058 else
5059 old_creds = override_creds(req->work.creds);
5060 }
5061
014db007 5062 ret = io_issue_sqe(req, sqe, true);
491381ce
JA
5063
5064 /*
5065 * We async punt it if the file wasn't marked NOWAIT, or if the file
5066 * doesn't support non-blocking read/write attempts
5067 */
5068 if (ret == -EAGAIN && (!(req->flags & REQ_F_NOWAIT) ||
5069 (req->flags & REQ_F_MUST_PUNT))) {
d7718a9d
JA
5070 if (io_arm_poll_handler(req)) {
5071 if (linked_timeout)
5072 io_queue_linked_timeout(linked_timeout);
4bc4494e 5073 goto exit;
d7718a9d 5074 }
86a761f8 5075punt:
f86cd20c 5076 if (io_op_defs[req->opcode].file_table) {
bbad27b2
PB
5077 ret = io_grab_files(req);
5078 if (ret)
5079 goto err;
2b188cc1 5080 }
bbad27b2
PB
5081
5082 /*
5083 * Queued up for async execution, worker will release
5084 * submit reference when the iocb is actually submitted.
5085 */
5086 io_queue_async_work(req);
4bc4494e 5087 goto exit;
2b188cc1 5088 }
e65ef56d 5089
fcb323cc 5090err:
4bc4494e 5091 nxt = NULL;
76a46e06 5092 /* drop submission reference */
2a44f467 5093 io_put_req_find_next(req, &nxt);
e65ef56d 5094
f9bd67f6 5095 if (linked_timeout) {
76a46e06 5096 if (!ret)
f9bd67f6 5097 io_queue_linked_timeout(linked_timeout);
76a46e06 5098 else
f9bd67f6 5099 io_put_req(linked_timeout);
76a46e06
JA
5100 }
5101
e65ef56d 5102 /* and drop final reference, if we failed */
9e645e11 5103 if (ret) {
78e19bbe 5104 io_cqring_add_event(req, ret);
4e88d6e7 5105 req_set_fail_links(req);
e65ef56d 5106 io_put_req(req);
9e645e11 5107 }
4a0a7a18
JA
5108 if (nxt) {
5109 req = nxt;
86a761f8
PB
5110
5111 if (req->flags & REQ_F_FORCE_ASYNC)
5112 goto punt;
4a0a7a18
JA
5113 goto again;
5114 }
4bc4494e 5115exit:
193155c8
JA
5116 if (old_creds)
5117 revert_creds(old_creds);
2b188cc1
JA
5118}
5119
3529d8c2 5120static void io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4fe2c963
JL
5121{
5122 int ret;
5123
3529d8c2 5124 ret = io_req_defer(req, sqe);
4fe2c963
JL
5125 if (ret) {
5126 if (ret != -EIOCBQUEUED) {
1118591a 5127fail_req:
78e19bbe 5128 io_cqring_add_event(req, ret);
4e88d6e7 5129 req_set_fail_links(req);
78e19bbe 5130 io_double_put_req(req);
4fe2c963 5131 }
2550878f 5132 } else if (req->flags & REQ_F_FORCE_ASYNC) {
1118591a
PB
5133 ret = io_req_defer_prep(req, sqe);
5134 if (unlikely(ret < 0))
5135 goto fail_req;
ce35a47a
JA
5136 /*
5137 * Never try inline submit of IOSQE_ASYNC is set, go straight
5138 * to async execution.
5139 */
5140 req->work.flags |= IO_WQ_WORK_CONCURRENT;
5141 io_queue_async_work(req);
5142 } else {
3529d8c2 5143 __io_queue_sqe(req, sqe);
ce35a47a 5144 }
4fe2c963
JL
5145}
5146
1b4a51b6 5147static inline void io_queue_link_head(struct io_kiocb *req)
4fe2c963 5148{
94ae5e77 5149 if (unlikely(req->flags & REQ_F_FAIL_LINK)) {
1b4a51b6
PB
5150 io_cqring_add_event(req, -ECANCELED);
5151 io_double_put_req(req);
5152 } else
3529d8c2 5153 io_queue_sqe(req, NULL);
4fe2c963
JL
5154}
5155
4e88d6e7 5156#define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \
ce35a47a 5157 IOSQE_IO_HARDLINK | IOSQE_ASYNC)
9e645e11 5158
3529d8c2
JA
5159static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
5160 struct io_submit_state *state, struct io_kiocb **link)
9e645e11 5161{
a197f664 5162 struct io_ring_ctx *ctx = req->ctx;
32fe525b 5163 unsigned int sqe_flags;
75c6a039 5164 int ret, id;
9e645e11 5165
32fe525b 5166 sqe_flags = READ_ONCE(sqe->flags);
9e645e11
JA
5167
5168 /* enforce forwards compatibility on users */
32fe525b 5169 if (unlikely(sqe_flags & ~SQE_VALID_FLAGS)) {
9e645e11 5170 ret = -EINVAL;
196be95c 5171 goto err_req;
9e645e11
JA
5172 }
5173
75c6a039
JA
5174 id = READ_ONCE(sqe->personality);
5175 if (id) {
193155c8
JA
5176 req->work.creds = idr_find(&ctx->personality_idr, id);
5177 if (unlikely(!req->work.creds)) {
75c6a039
JA
5178 ret = -EINVAL;
5179 goto err_req;
5180 }
193155c8 5181 get_cred(req->work.creds);
75c6a039
JA
5182 }
5183
6b47ee6e 5184 /* same numerical values with corresponding REQ_F_*, safe to copy */
8da11c19
PB
5185 req->flags |= sqe_flags & (IOSQE_IO_DRAIN | IOSQE_IO_HARDLINK |
5186 IOSQE_ASYNC | IOSQE_FIXED_FILE);
9e645e11 5187
3529d8c2 5188 ret = io_req_set_file(state, req, sqe);
9e645e11
JA
5189 if (unlikely(ret)) {
5190err_req:
78e19bbe
JA
5191 io_cqring_add_event(req, ret);
5192 io_double_put_req(req);
2e6e1fde 5193 return false;
9e645e11
JA
5194 }
5195
9e645e11
JA
5196 /*
5197 * If we already have a head request, queue this one for async
5198 * submittal once the head completes. If we don't have a head but
5199 * IOSQE_IO_LINK is set in the sqe, start a new head. This one will be
5200 * submitted sync once the chain is complete. If none of those
5201 * conditions are true (normal request), then just queue it.
5202 */
5203 if (*link) {
9d76377f 5204 struct io_kiocb *head = *link;
4e88d6e7 5205
8cdf2193
PB
5206 /*
5207 * Taking sequential execution of a link, draining both sides
5208 * of the link also fullfils IOSQE_IO_DRAIN semantics for all
5209 * requests in the link. So, it drains the head and the
5210 * next after the link request. The last one is done via
5211 * drain_next flag to persist the effect across calls.
5212 */
711be031
PB
5213 if (sqe_flags & IOSQE_IO_DRAIN) {
5214 head->flags |= REQ_F_IO_DRAIN;
5215 ctx->drain_next = 1;
5216 }
b7bb4f7d 5217 if (io_alloc_async_ctx(req)) {
9e645e11
JA
5218 ret = -EAGAIN;
5219 goto err_req;
5220 }
5221
3529d8c2 5222 ret = io_req_defer_prep(req, sqe);
2d28390a 5223 if (ret) {
4e88d6e7 5224 /* fail even hard links since we don't submit */
9d76377f 5225 head->flags |= REQ_F_FAIL_LINK;
f67676d1 5226 goto err_req;
2d28390a 5227 }
9d76377f
PB
5228 trace_io_uring_link(ctx, req, head);
5229 list_add_tail(&req->link_list, &head->link_list);
32fe525b
PB
5230
5231 /* last request of a link, enqueue the link */
5232 if (!(sqe_flags & (IOSQE_IO_LINK|IOSQE_IO_HARDLINK))) {
5233 io_queue_link_head(head);
5234 *link = NULL;
5235 }
9e645e11 5236 } else {
711be031
PB
5237 if (unlikely(ctx->drain_next)) {
5238 req->flags |= REQ_F_IO_DRAIN;
5239 req->ctx->drain_next = 0;
5240 }
5241 if (sqe_flags & (IOSQE_IO_LINK|IOSQE_IO_HARDLINK)) {
5242 req->flags |= REQ_F_LINK;
711be031
PB
5243 INIT_LIST_HEAD(&req->link_list);
5244 ret = io_req_defer_prep(req, sqe);
5245 if (ret)
5246 req->flags |= REQ_F_FAIL_LINK;
5247 *link = req;
5248 } else {
5249 io_queue_sqe(req, sqe);
5250 }
9e645e11 5251 }
2e6e1fde
PB
5252
5253 return true;
9e645e11
JA
5254}
5255
9a56a232
JA
5256/*
5257 * Batched submission is done, ensure local IO is flushed out.
5258 */
5259static void io_submit_state_end(struct io_submit_state *state)
5260{
5261 blk_finish_plug(&state->plug);
3d6770fb 5262 io_file_put(state);
2579f913 5263 if (state->free_reqs)
6c8a3134 5264 kmem_cache_free_bulk(req_cachep, state->free_reqs, state->reqs);
9a56a232
JA
5265}
5266
5267/*
5268 * Start submission side cache.
5269 */
5270static void io_submit_state_start(struct io_submit_state *state,
22efde59 5271 unsigned int max_ios)
9a56a232
JA
5272{
5273 blk_start_plug(&state->plug);
2579f913 5274 state->free_reqs = 0;
9a56a232
JA
5275 state->file = NULL;
5276 state->ios_left = max_ios;
5277}
5278
2b188cc1
JA
5279static void io_commit_sqring(struct io_ring_ctx *ctx)
5280{
75b28aff 5281 struct io_rings *rings = ctx->rings;
2b188cc1 5282
caf582c6
PB
5283 /*
5284 * Ensure any loads from the SQEs are done at this point,
5285 * since once we write the new head, the application could
5286 * write new data to them.
5287 */
5288 smp_store_release(&rings->sq.head, ctx->cached_sq_head);
2b188cc1
JA
5289}
5290
2b188cc1 5291/*
3529d8c2 5292 * Fetch an sqe, if one is available. Note that sqe_ptr will point to memory
2b188cc1
JA
5293 * that is mapped by userspace. This means that care needs to be taken to
5294 * ensure that reads are stable, as we cannot rely on userspace always
5295 * being a good citizen. If members of the sqe are validated and then later
5296 * used, it's important that those reads are done through READ_ONCE() to
5297 * prevent a re-load down the line.
5298 */
3529d8c2
JA
5299static bool io_get_sqring(struct io_ring_ctx *ctx, struct io_kiocb *req,
5300 const struct io_uring_sqe **sqe_ptr)
2b188cc1 5301{
75b28aff 5302 u32 *sq_array = ctx->sq_array;
2b188cc1
JA
5303 unsigned head;
5304
5305 /*
5306 * The cached sq head (or cq tail) serves two purposes:
5307 *
5308 * 1) allows us to batch the cost of updating the user visible
5309 * head updates.
5310 * 2) allows the kernel side to track the head on its own, even
5311 * though the application is the one updating it.
5312 */
ee7d46d9 5313 head = READ_ONCE(sq_array[ctx->cached_sq_head & ctx->sq_mask]);
9835d6fa 5314 if (likely(head < ctx->sq_entries)) {
cf6fd4bd
PB
5315 /*
5316 * All io need record the previous position, if LINK vs DARIN,
5317 * it can be used to mark the position of the first IO in the
5318 * link list.
5319 */
5320 req->sequence = ctx->cached_sq_head;
3529d8c2
JA
5321 *sqe_ptr = &ctx->sq_sqes[head];
5322 req->opcode = READ_ONCE((*sqe_ptr)->opcode);
5323 req->user_data = READ_ONCE((*sqe_ptr)->user_data);
2b188cc1
JA
5324 ctx->cached_sq_head++;
5325 return true;
5326 }
5327
5328 /* drop invalid entries */
5329 ctx->cached_sq_head++;
498ccd9e 5330 ctx->cached_sq_dropped++;
ee7d46d9 5331 WRITE_ONCE(ctx->rings->sq_dropped, ctx->cached_sq_dropped);
2b188cc1
JA
5332 return false;
5333}
5334
fb5ccc98 5335static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
ae9428ca
PB
5336 struct file *ring_file, int ring_fd,
5337 struct mm_struct **mm, bool async)
6c271ce2
JA
5338{
5339 struct io_submit_state state, *statep = NULL;
9e645e11 5340 struct io_kiocb *link = NULL;
9e645e11 5341 int i, submitted = 0;
95a1b3ff 5342 bool mm_fault = false;
6c271ce2 5343
c4a2ed72 5344 /* if we have a backlog and couldn't flush it all, return BUSY */
ad3eb2c8
JA
5345 if (test_bit(0, &ctx->sq_check_overflow)) {
5346 if (!list_empty(&ctx->cq_overflow_list) &&
5347 !io_cqring_overflow_flush(ctx, false))
5348 return -EBUSY;
5349 }
6c271ce2 5350
ee7d46d9
PB
5351 /* make sure SQ entry isn't read before tail */
5352 nr = min3(nr, ctx->sq_entries, io_sqring_entries(ctx));
9ef4f124 5353
2b85edfc
PB
5354 if (!percpu_ref_tryget_many(&ctx->refs, nr))
5355 return -EAGAIN;
6c271ce2
JA
5356
5357 if (nr > IO_PLUG_THRESHOLD) {
22efde59 5358 io_submit_state_start(&state, nr);
6c271ce2
JA
5359 statep = &state;
5360 }
5361
b14cca0c
PB
5362 ctx->ring_fd = ring_fd;
5363 ctx->ring_file = ring_file;
5364
6c271ce2 5365 for (i = 0; i < nr; i++) {
3529d8c2 5366 const struct io_uring_sqe *sqe;
196be95c 5367 struct io_kiocb *req;
1cb1edb2 5368 int err;
fb5ccc98 5369
196be95c
PB
5370 req = io_get_req(ctx, statep);
5371 if (unlikely(!req)) {
5372 if (!submitted)
5373 submitted = -EAGAIN;
fb5ccc98 5374 break;
196be95c 5375 }
3529d8c2 5376 if (!io_get_sqring(ctx, req, &sqe)) {
2b85edfc 5377 __io_req_do_free(req);
196be95c
PB
5378 break;
5379 }
fb5ccc98 5380
d3656344
JA
5381 /* will complete beyond this point, count as submitted */
5382 submitted++;
5383
5384 if (unlikely(req->opcode >= IORING_OP_LAST)) {
1cb1edb2
PB
5385 err = -EINVAL;
5386fail_req:
5387 io_cqring_add_event(req, err);
d3656344 5388 io_double_put_req(req);
196be95c
PB
5389 break;
5390 }
fb5ccc98 5391
d3656344 5392 if (io_op_defs[req->opcode].needs_mm && !*mm) {
95a1b3ff 5393 mm_fault = mm_fault || !mmget_not_zero(ctx->sqo_mm);
1cb1edb2
PB
5394 if (unlikely(mm_fault)) {
5395 err = -EFAULT;
5396 goto fail_req;
95a1b3ff 5397 }
1cb1edb2
PB
5398 use_mm(ctx->sqo_mm);
5399 *mm = ctx->sqo_mm;
9e645e11 5400 }
9e645e11 5401
cf6fd4bd 5402 req->needs_fixed_file = async;
354420f7
JA
5403 trace_io_uring_submit_sqe(ctx, req->opcode, req->user_data,
5404 true, async);
3529d8c2 5405 if (!io_submit_sqe(req, sqe, statep, &link))
2e6e1fde 5406 break;
6c271ce2
JA
5407 }
5408
9466f437
PB
5409 if (unlikely(submitted != nr)) {
5410 int ref_used = (submitted == -EAGAIN) ? 0 : submitted;
5411
5412 percpu_ref_put_many(&ctx->refs, nr - ref_used);
5413 }
9e645e11 5414 if (link)
1b4a51b6 5415 io_queue_link_head(link);
6c271ce2
JA
5416 if (statep)
5417 io_submit_state_end(&state);
5418
ae9428ca
PB
5419 /* Commit SQ ring head once we've consumed and submitted all SQEs */
5420 io_commit_sqring(ctx);
5421
6c271ce2
JA
5422 return submitted;
5423}
5424
5425static int io_sq_thread(void *data)
5426{
6c271ce2
JA
5427 struct io_ring_ctx *ctx = data;
5428 struct mm_struct *cur_mm = NULL;
181e448d 5429 const struct cred *old_cred;
6c271ce2
JA
5430 mm_segment_t old_fs;
5431 DEFINE_WAIT(wait);
6c271ce2 5432 unsigned long timeout;
bdcd3eab 5433 int ret = 0;
6c271ce2 5434
206aefde 5435 complete(&ctx->completions[1]);
a4c0b3de 5436
6c271ce2
JA
5437 old_fs = get_fs();
5438 set_fs(USER_DS);
181e448d 5439 old_cred = override_creds(ctx->creds);
6c271ce2 5440
bdcd3eab 5441 timeout = jiffies + ctx->sq_thread_idle;
2bbcd6d3 5442 while (!kthread_should_park()) {
fb5ccc98 5443 unsigned int to_submit;
6c271ce2 5444
bdcd3eab 5445 if (!list_empty(&ctx->poll_list)) {
6c271ce2
JA
5446 unsigned nr_events = 0;
5447
bdcd3eab
XW
5448 mutex_lock(&ctx->uring_lock);
5449 if (!list_empty(&ctx->poll_list))
5450 io_iopoll_getevents(ctx, &nr_events, 0);
5451 else
6c271ce2 5452 timeout = jiffies + ctx->sq_thread_idle;
bdcd3eab 5453 mutex_unlock(&ctx->uring_lock);
6c271ce2
JA
5454 }
5455
fb5ccc98 5456 to_submit = io_sqring_entries(ctx);
c1edbf5f
JA
5457
5458 /*
5459 * If submit got -EBUSY, flag us as needing the application
5460 * to enter the kernel to reap and flush events.
5461 */
5462 if (!to_submit || ret == -EBUSY) {
7143b5ac
SG
5463 /*
5464 * Drop cur_mm before scheduling, we can't hold it for
5465 * long periods (or over schedule()). Do this before
5466 * adding ourselves to the waitqueue, as the unuse/drop
5467 * may sleep.
5468 */
5469 if (cur_mm) {
5470 unuse_mm(cur_mm);
5471 mmput(cur_mm);
5472 cur_mm = NULL;
5473 }
5474
6c271ce2
JA
5475 /*
5476 * We're polling. If we're within the defined idle
5477 * period, then let us spin without work before going
c1edbf5f
JA
5478 * to sleep. The exception is if we got EBUSY doing
5479 * more IO, we should wait for the application to
5480 * reap events and wake us up.
6c271ce2 5481 */
bdcd3eab 5482 if (!list_empty(&ctx->poll_list) ||
df069d80
JA
5483 (!time_after(jiffies, timeout) && ret != -EBUSY &&
5484 !percpu_ref_is_dying(&ctx->refs))) {
b41e9852
JA
5485 if (current->task_works)
5486 task_work_run();
9831a90c 5487 cond_resched();
6c271ce2
JA
5488 continue;
5489 }
5490
6c271ce2
JA
5491 prepare_to_wait(&ctx->sqo_wait, &wait,
5492 TASK_INTERRUPTIBLE);
5493
bdcd3eab
XW
5494 /*
5495 * While doing polled IO, before going to sleep, we need
5496 * to check if there are new reqs added to poll_list, it
5497 * is because reqs may have been punted to io worker and
5498 * will be added to poll_list later, hence check the
5499 * poll_list again.
5500 */
5501 if ((ctx->flags & IORING_SETUP_IOPOLL) &&
5502 !list_empty_careful(&ctx->poll_list)) {
5503 finish_wait(&ctx->sqo_wait, &wait);
5504 continue;
5505 }
5506
6c271ce2 5507 /* Tell userspace we may need a wakeup call */
75b28aff 5508 ctx->rings->sq_flags |= IORING_SQ_NEED_WAKEUP;
0d7bae69
SB
5509 /* make sure to read SQ tail after writing flags */
5510 smp_mb();
6c271ce2 5511
fb5ccc98 5512 to_submit = io_sqring_entries(ctx);
c1edbf5f 5513 if (!to_submit || ret == -EBUSY) {
2bbcd6d3 5514 if (kthread_should_park()) {
6c271ce2
JA
5515 finish_wait(&ctx->sqo_wait, &wait);
5516 break;
5517 }
b41e9852
JA
5518 if (current->task_works) {
5519 task_work_run();
5520 continue;
5521 }
6c271ce2
JA
5522 if (signal_pending(current))
5523 flush_signals(current);
5524 schedule();
5525 finish_wait(&ctx->sqo_wait, &wait);
5526
75b28aff 5527 ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
6c271ce2
JA
5528 continue;
5529 }
5530 finish_wait(&ctx->sqo_wait, &wait);
5531
75b28aff 5532 ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
6c271ce2
JA
5533 }
5534
8a4955ff 5535 mutex_lock(&ctx->uring_lock);
1d7bb1d5 5536 ret = io_submit_sqes(ctx, to_submit, NULL, -1, &cur_mm, true);
8a4955ff 5537 mutex_unlock(&ctx->uring_lock);
bdcd3eab 5538 timeout = jiffies + ctx->sq_thread_idle;
6c271ce2
JA
5539 }
5540
b41e9852
JA
5541 if (current->task_works)
5542 task_work_run();
5543
6c271ce2
JA
5544 set_fs(old_fs);
5545 if (cur_mm) {
5546 unuse_mm(cur_mm);
5547 mmput(cur_mm);
5548 }
181e448d 5549 revert_creds(old_cred);
06058632 5550
2bbcd6d3 5551 kthread_parkme();
06058632 5552
6c271ce2
JA
5553 return 0;
5554}
5555
bda52162
JA
5556struct io_wait_queue {
5557 struct wait_queue_entry wq;
5558 struct io_ring_ctx *ctx;
5559 unsigned to_wait;
5560 unsigned nr_timeouts;
5561};
5562
1d7bb1d5 5563static inline bool io_should_wake(struct io_wait_queue *iowq, bool noflush)
bda52162
JA
5564{
5565 struct io_ring_ctx *ctx = iowq->ctx;
5566
5567 /*
d195a66e 5568 * Wake up if we have enough events, or if a timeout occurred since we
bda52162
JA
5569 * started waiting. For timeouts, we always want to return to userspace,
5570 * regardless of event count.
5571 */
1d7bb1d5 5572 return io_cqring_events(ctx, noflush) >= iowq->to_wait ||
bda52162
JA
5573 atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts;
5574}
5575
5576static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
5577 int wake_flags, void *key)
5578{
5579 struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue,
5580 wq);
5581
1d7bb1d5
JA
5582 /* use noflush == true, as we can't safely rely on locking context */
5583 if (!io_should_wake(iowq, true))
bda52162
JA
5584 return -1;
5585
5586 return autoremove_wake_function(curr, mode, wake_flags, key);
5587}
5588
2b188cc1
JA
5589/*
5590 * Wait until events become available, if we don't already have some. The
5591 * application must reap them itself, as they reside on the shared cq ring.
5592 */
5593static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
5594 const sigset_t __user *sig, size_t sigsz)
5595{
bda52162
JA
5596 struct io_wait_queue iowq = {
5597 .wq = {
5598 .private = current,
5599 .func = io_wake_function,
5600 .entry = LIST_HEAD_INIT(iowq.wq.entry),
5601 },
5602 .ctx = ctx,
5603 .to_wait = min_events,
5604 };
75b28aff 5605 struct io_rings *rings = ctx->rings;
e9ffa5c2 5606 int ret = 0;
2b188cc1 5607
b41e9852
JA
5608 do {
5609 if (io_cqring_events(ctx, false) >= min_events)
5610 return 0;
5611 if (!current->task_works)
5612 break;
5613 task_work_run();
5614 } while (1);
2b188cc1
JA
5615
5616 if (sig) {
9e75ad5d
AB
5617#ifdef CONFIG_COMPAT
5618 if (in_compat_syscall())
5619 ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig,
b772434b 5620 sigsz);
9e75ad5d
AB
5621 else
5622#endif
b772434b 5623 ret = set_user_sigmask(sig, sigsz);
9e75ad5d 5624
2b188cc1
JA
5625 if (ret)
5626 return ret;
5627 }
5628
bda52162 5629 iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
c826bd7a 5630 trace_io_uring_cqring_wait(ctx, min_events);
bda52162
JA
5631 do {
5632 prepare_to_wait_exclusive(&ctx->wait, &iowq.wq,
5633 TASK_INTERRUPTIBLE);
b41e9852
JA
5634 if (current->task_works)
5635 task_work_run();
1d7bb1d5 5636 if (io_should_wake(&iowq, false))
bda52162
JA
5637 break;
5638 schedule();
5639 if (signal_pending(current)) {
e9ffa5c2 5640 ret = -EINTR;
bda52162
JA
5641 break;
5642 }
5643 } while (1);
5644 finish_wait(&ctx->wait, &iowq.wq);
5645
e9ffa5c2 5646 restore_saved_sigmask_unless(ret == -EINTR);
2b188cc1 5647
75b28aff 5648 return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
2b188cc1
JA
5649}
5650
6b06314c
JA
5651static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
5652{
5653#if defined(CONFIG_UNIX)
5654 if (ctx->ring_sock) {
5655 struct sock *sock = ctx->ring_sock->sk;
5656 struct sk_buff *skb;
5657
5658 while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL)
5659 kfree_skb(skb);
5660 }
5661#else
5662 int i;
5663
65e19f54
JA
5664 for (i = 0; i < ctx->nr_user_files; i++) {
5665 struct file *file;
5666
5667 file = io_file_from_index(ctx, i);
5668 if (file)
5669 fput(file);
5670 }
6b06314c
JA
5671#endif
5672}
5673
05f3fb3c
JA
5674static void io_file_ref_kill(struct percpu_ref *ref)
5675{
5676 struct fixed_file_data *data;
5677
5678 data = container_of(ref, struct fixed_file_data, refs);
5679 complete(&data->done);
5680}
5681
6b06314c
JA
5682static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
5683{
05f3fb3c 5684 struct fixed_file_data *data = ctx->file_data;
65e19f54
JA
5685 unsigned nr_tables, i;
5686
05f3fb3c 5687 if (!data)
6b06314c
JA
5688 return -ENXIO;
5689
05f3fb3c 5690 percpu_ref_kill_and_confirm(&data->refs, io_file_ref_kill);
e46a7950 5691 flush_work(&data->ref_work);
2faf852d
JA
5692 wait_for_completion(&data->done);
5693 io_ring_file_ref_flush(data);
05f3fb3c
JA
5694 percpu_ref_exit(&data->refs);
5695
6b06314c 5696 __io_sqe_files_unregister(ctx);
65e19f54
JA
5697 nr_tables = DIV_ROUND_UP(ctx->nr_user_files, IORING_MAX_FILES_TABLE);
5698 for (i = 0; i < nr_tables; i++)
05f3fb3c
JA
5699 kfree(data->table[i].files);
5700 kfree(data->table);
5701 kfree(data);
5702 ctx->file_data = NULL;
6b06314c
JA
5703 ctx->nr_user_files = 0;
5704 return 0;
5705}
5706
6c271ce2
JA
5707static void io_sq_thread_stop(struct io_ring_ctx *ctx)
5708{
5709 if (ctx->sqo_thread) {
206aefde 5710 wait_for_completion(&ctx->completions[1]);
2bbcd6d3
RP
5711 /*
5712 * The park is a bit of a work-around, without it we get
5713 * warning spews on shutdown with SQPOLL set and affinity
5714 * set to a single CPU.
5715 */
06058632 5716 kthread_park(ctx->sqo_thread);
6c271ce2
JA
5717 kthread_stop(ctx->sqo_thread);
5718 ctx->sqo_thread = NULL;
5719 }
5720}
5721
6b06314c
JA
5722static void io_finish_async(struct io_ring_ctx *ctx)
5723{
6c271ce2
JA
5724 io_sq_thread_stop(ctx);
5725
561fb04a
JA
5726 if (ctx->io_wq) {
5727 io_wq_destroy(ctx->io_wq);
5728 ctx->io_wq = NULL;
6b06314c
JA
5729 }
5730}
5731
5732#if defined(CONFIG_UNIX)
6b06314c
JA
5733/*
5734 * Ensure the UNIX gc is aware of our file set, so we are certain that
5735 * the io_uring can be safely unregistered on process exit, even if we have
5736 * loops in the file referencing.
5737 */
5738static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset)
5739{
5740 struct sock *sk = ctx->ring_sock->sk;
5741 struct scm_fp_list *fpl;
5742 struct sk_buff *skb;
08a45173 5743 int i, nr_files;
6b06314c
JA
5744
5745 if (!capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) {
5746 unsigned long inflight = ctx->user->unix_inflight + nr;
5747
5748 if (inflight > task_rlimit(current, RLIMIT_NOFILE))
5749 return -EMFILE;
5750 }
5751
5752 fpl = kzalloc(sizeof(*fpl), GFP_KERNEL);
5753 if (!fpl)
5754 return -ENOMEM;
5755
5756 skb = alloc_skb(0, GFP_KERNEL);
5757 if (!skb) {
5758 kfree(fpl);
5759 return -ENOMEM;
5760 }
5761
5762 skb->sk = sk;
6b06314c 5763
08a45173 5764 nr_files = 0;
6b06314c
JA
5765 fpl->user = get_uid(ctx->user);
5766 for (i = 0; i < nr; i++) {
65e19f54
JA
5767 struct file *file = io_file_from_index(ctx, i + offset);
5768
5769 if (!file)
08a45173 5770 continue;
65e19f54 5771 fpl->fp[nr_files] = get_file(file);
08a45173
JA
5772 unix_inflight(fpl->user, fpl->fp[nr_files]);
5773 nr_files++;
6b06314c
JA
5774 }
5775
08a45173
JA
5776 if (nr_files) {
5777 fpl->max = SCM_MAX_FD;
5778 fpl->count = nr_files;
5779 UNIXCB(skb).fp = fpl;
05f3fb3c 5780 skb->destructor = unix_destruct_scm;
08a45173
JA
5781 refcount_add(skb->truesize, &sk->sk_wmem_alloc);
5782 skb_queue_head(&sk->sk_receive_queue, skb);
6b06314c 5783
08a45173
JA
5784 for (i = 0; i < nr_files; i++)
5785 fput(fpl->fp[i]);
5786 } else {
5787 kfree_skb(skb);
5788 kfree(fpl);
5789 }
6b06314c
JA
5790
5791 return 0;
5792}
5793
5794/*
5795 * If UNIX sockets are enabled, fd passing can cause a reference cycle which
5796 * causes regular reference counting to break down. We rely on the UNIX
5797 * garbage collection to take care of this problem for us.
5798 */
5799static int io_sqe_files_scm(struct io_ring_ctx *ctx)
5800{
5801 unsigned left, total;
5802 int ret = 0;
5803
5804 total = 0;
5805 left = ctx->nr_user_files;
5806 while (left) {
5807 unsigned this_files = min_t(unsigned, left, SCM_MAX_FD);
6b06314c
JA
5808
5809 ret = __io_sqe_files_scm(ctx, this_files, total);
5810 if (ret)
5811 break;
5812 left -= this_files;
5813 total += this_files;
5814 }
5815
5816 if (!ret)
5817 return 0;
5818
5819 while (total < ctx->nr_user_files) {
65e19f54
JA
5820 struct file *file = io_file_from_index(ctx, total);
5821
5822 if (file)
5823 fput(file);
6b06314c
JA
5824 total++;
5825 }
5826
5827 return ret;
5828}
5829#else
5830static int io_sqe_files_scm(struct io_ring_ctx *ctx)
5831{
5832 return 0;
5833}
5834#endif
5835
65e19f54
JA
5836static int io_sqe_alloc_file_tables(struct io_ring_ctx *ctx, unsigned nr_tables,
5837 unsigned nr_files)
5838{
5839 int i;
5840
5841 for (i = 0; i < nr_tables; i++) {
05f3fb3c 5842 struct fixed_file_table *table = &ctx->file_data->table[i];
65e19f54
JA
5843 unsigned this_files;
5844
5845 this_files = min(nr_files, IORING_MAX_FILES_TABLE);
5846 table->files = kcalloc(this_files, sizeof(struct file *),
5847 GFP_KERNEL);
5848 if (!table->files)
5849 break;
5850 nr_files -= this_files;
5851 }
5852
5853 if (i == nr_tables)
5854 return 0;
5855
5856 for (i = 0; i < nr_tables; i++) {
05f3fb3c 5857 struct fixed_file_table *table = &ctx->file_data->table[i];
65e19f54
JA
5858 kfree(table->files);
5859 }
5860 return 1;
5861}
5862
05f3fb3c
JA
5863static void io_ring_file_put(struct io_ring_ctx *ctx, struct file *file)
5864{
5865#if defined(CONFIG_UNIX)
5866 struct sock *sock = ctx->ring_sock->sk;
5867 struct sk_buff_head list, *head = &sock->sk_receive_queue;
5868 struct sk_buff *skb;
5869 int i;
5870
5871 __skb_queue_head_init(&list);
5872
5873 /*
5874 * Find the skb that holds this file in its SCM_RIGHTS. When found,
5875 * remove this entry and rearrange the file array.
5876 */
5877 skb = skb_dequeue(head);
5878 while (skb) {
5879 struct scm_fp_list *fp;
5880
5881 fp = UNIXCB(skb).fp;
5882 for (i = 0; i < fp->count; i++) {
5883 int left;
5884
5885 if (fp->fp[i] != file)
5886 continue;
5887
5888 unix_notinflight(fp->user, fp->fp[i]);
5889 left = fp->count - 1 - i;
5890 if (left) {
5891 memmove(&fp->fp[i], &fp->fp[i + 1],
5892 left * sizeof(struct file *));
5893 }
5894 fp->count--;
5895 if (!fp->count) {
5896 kfree_skb(skb);
5897 skb = NULL;
5898 } else {
5899 __skb_queue_tail(&list, skb);
5900 }
5901 fput(file);
5902 file = NULL;
5903 break;
5904 }
5905
5906 if (!file)
5907 break;
5908
5909 __skb_queue_tail(&list, skb);
5910
5911 skb = skb_dequeue(head);
5912 }
5913
5914 if (skb_peek(&list)) {
5915 spin_lock_irq(&head->lock);
5916 while ((skb = __skb_dequeue(&list)) != NULL)
5917 __skb_queue_tail(head, skb);
5918 spin_unlock_irq(&head->lock);
5919 }
5920#else
5921 fput(file);
5922#endif
5923}
5924
5925struct io_file_put {
5926 struct llist_node llist;
5927 struct file *file;
5928 struct completion *done;
5929};
5930
2faf852d 5931static void io_ring_file_ref_flush(struct fixed_file_data *data)
65e19f54 5932{
05f3fb3c 5933 struct io_file_put *pfile, *tmp;
05f3fb3c 5934 struct llist_node *node;
65e19f54 5935
05f3fb3c
JA
5936 while ((node = llist_del_all(&data->put_llist)) != NULL) {
5937 llist_for_each_entry_safe(pfile, tmp, node, llist) {
5938 io_ring_file_put(data->ctx, pfile->file);
5939 if (pfile->done)
5940 complete(pfile->done);
5941 else
5942 kfree(pfile);
5943 }
65e19f54 5944 }
2faf852d 5945}
65e19f54 5946
2faf852d
JA
5947static void io_ring_file_ref_switch(struct work_struct *work)
5948{
5949 struct fixed_file_data *data;
65e19f54 5950
2faf852d
JA
5951 data = container_of(work, struct fixed_file_data, ref_work);
5952 io_ring_file_ref_flush(data);
05f3fb3c
JA
5953 percpu_ref_switch_to_percpu(&data->refs);
5954}
65e19f54 5955
05f3fb3c
JA
5956static void io_file_data_ref_zero(struct percpu_ref *ref)
5957{
5958 struct fixed_file_data *data;
5959
5960 data = container_of(ref, struct fixed_file_data, refs);
5961
2faf852d
JA
5962 /*
5963 * We can't safely switch from inside this context, punt to wq. If
5964 * the table ref is going away, the table is being unregistered.
5965 * Don't queue up the async work for that case, the caller will
5966 * handle it.
5967 */
5968 if (!percpu_ref_is_dying(&data->refs))
5969 queue_work(system_wq, &data->ref_work);
65e19f54
JA
5970}
5971
6b06314c
JA
5972static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
5973 unsigned nr_args)
5974{
5975 __s32 __user *fds = (__s32 __user *) arg;
65e19f54 5976 unsigned nr_tables;
05f3fb3c 5977 struct file *file;
6b06314c
JA
5978 int fd, ret = 0;
5979 unsigned i;
5980
05f3fb3c 5981 if (ctx->file_data)
6b06314c
JA
5982 return -EBUSY;
5983 if (!nr_args)
5984 return -EINVAL;
5985 if (nr_args > IORING_MAX_FIXED_FILES)
5986 return -EMFILE;
5987
05f3fb3c
JA
5988 ctx->file_data = kzalloc(sizeof(*ctx->file_data), GFP_KERNEL);
5989 if (!ctx->file_data)
5990 return -ENOMEM;
5991 ctx->file_data->ctx = ctx;
5992 init_completion(&ctx->file_data->done);
5993
65e19f54 5994 nr_tables = DIV_ROUND_UP(nr_args, IORING_MAX_FILES_TABLE);
05f3fb3c
JA
5995 ctx->file_data->table = kcalloc(nr_tables,
5996 sizeof(struct fixed_file_table),
65e19f54 5997 GFP_KERNEL);
05f3fb3c
JA
5998 if (!ctx->file_data->table) {
5999 kfree(ctx->file_data);
6000 ctx->file_data = NULL;
6b06314c 6001 return -ENOMEM;
05f3fb3c
JA
6002 }
6003
6004 if (percpu_ref_init(&ctx->file_data->refs, io_file_data_ref_zero,
6005 PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) {
6006 kfree(ctx->file_data->table);
6007 kfree(ctx->file_data);
6008 ctx->file_data = NULL;
6b06314c 6009 return -ENOMEM;
05f3fb3c
JA
6010 }
6011 ctx->file_data->put_llist.first = NULL;
6012 INIT_WORK(&ctx->file_data->ref_work, io_ring_file_ref_switch);
6b06314c 6013
65e19f54 6014 if (io_sqe_alloc_file_tables(ctx, nr_tables, nr_args)) {
05f3fb3c
JA
6015 percpu_ref_exit(&ctx->file_data->refs);
6016 kfree(ctx->file_data->table);
6017 kfree(ctx->file_data);
6018 ctx->file_data = NULL;
65e19f54
JA
6019 return -ENOMEM;
6020 }
6021
08a45173 6022 for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
65e19f54
JA
6023 struct fixed_file_table *table;
6024 unsigned index;
6025
6b06314c
JA
6026 ret = -EFAULT;
6027 if (copy_from_user(&fd, &fds[i], sizeof(fd)))
6028 break;
08a45173
JA
6029 /* allow sparse sets */
6030 if (fd == -1) {
6031 ret = 0;
6032 continue;
6033 }
6b06314c 6034
05f3fb3c 6035 table = &ctx->file_data->table[i >> IORING_FILE_TABLE_SHIFT];
65e19f54 6036 index = i & IORING_FILE_TABLE_MASK;
05f3fb3c 6037 file = fget(fd);
6b06314c
JA
6038
6039 ret = -EBADF;
05f3fb3c 6040 if (!file)
6b06314c 6041 break;
05f3fb3c 6042
6b06314c
JA
6043 /*
6044 * Don't allow io_uring instances to be registered. If UNIX
6045 * isn't enabled, then this causes a reference cycle and this
6046 * instance can never get freed. If UNIX is enabled we'll
6047 * handle it just fine, but there's still no point in allowing
6048 * a ring fd as it doesn't support regular read/write anyway.
6049 */
05f3fb3c
JA
6050 if (file->f_op == &io_uring_fops) {
6051 fput(file);
6b06314c
JA
6052 break;
6053 }
6b06314c 6054 ret = 0;
05f3fb3c 6055 table->files[index] = file;
6b06314c
JA
6056 }
6057
6058 if (ret) {
65e19f54 6059 for (i = 0; i < ctx->nr_user_files; i++) {
65e19f54
JA
6060 file = io_file_from_index(ctx, i);
6061 if (file)
6062 fput(file);
6063 }
6064 for (i = 0; i < nr_tables; i++)
05f3fb3c 6065 kfree(ctx->file_data->table[i].files);
6b06314c 6066
05f3fb3c
JA
6067 kfree(ctx->file_data->table);
6068 kfree(ctx->file_data);
6069 ctx->file_data = NULL;
6b06314c
JA
6070 ctx->nr_user_files = 0;
6071 return ret;
6072 }
6073
6074 ret = io_sqe_files_scm(ctx);
6075 if (ret)
6076 io_sqe_files_unregister(ctx);
6077
6078 return ret;
6079}
6080
c3a31e60
JA
6081static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file,
6082 int index)
6083{
6084#if defined(CONFIG_UNIX)
6085 struct sock *sock = ctx->ring_sock->sk;
6086 struct sk_buff_head *head = &sock->sk_receive_queue;
6087 struct sk_buff *skb;
6088
6089 /*
6090 * See if we can merge this file into an existing skb SCM_RIGHTS
6091 * file set. If there's no room, fall back to allocating a new skb
6092 * and filling it in.
6093 */
6094 spin_lock_irq(&head->lock);
6095 skb = skb_peek(head);
6096 if (skb) {
6097 struct scm_fp_list *fpl = UNIXCB(skb).fp;
6098
6099 if (fpl->count < SCM_MAX_FD) {
6100 __skb_unlink(skb, head);
6101 spin_unlock_irq(&head->lock);
6102 fpl->fp[fpl->count] = get_file(file);
6103 unix_inflight(fpl->user, fpl->fp[fpl->count]);
6104 fpl->count++;
6105 spin_lock_irq(&head->lock);
6106 __skb_queue_head(head, skb);
6107 } else {
6108 skb = NULL;
6109 }
6110 }
6111 spin_unlock_irq(&head->lock);
6112
6113 if (skb) {
6114 fput(file);
6115 return 0;
6116 }
6117
6118 return __io_sqe_files_scm(ctx, 1, index);
6119#else
6120 return 0;
6121#endif
6122}
6123
05f3fb3c 6124static void io_atomic_switch(struct percpu_ref *ref)
c3a31e60 6125{
05f3fb3c
JA
6126 struct fixed_file_data *data;
6127
dd3db2a3
JA
6128 /*
6129 * Juggle reference to ensure we hit zero, if needed, so we can
6130 * switch back to percpu mode
6131 */
05f3fb3c 6132 data = container_of(ref, struct fixed_file_data, refs);
dd3db2a3
JA
6133 percpu_ref_put(&data->refs);
6134 percpu_ref_get(&data->refs);
05f3fb3c
JA
6135}
6136
6137static bool io_queue_file_removal(struct fixed_file_data *data,
6138 struct file *file)
6139{
6140 struct io_file_put *pfile, pfile_stack;
6141 DECLARE_COMPLETION_ONSTACK(done);
6142
6143 /*
6144 * If we fail allocating the struct we need for doing async reomval
6145 * of this file, just punt to sync and wait for it.
6146 */
6147 pfile = kzalloc(sizeof(*pfile), GFP_KERNEL);
6148 if (!pfile) {
6149 pfile = &pfile_stack;
6150 pfile->done = &done;
6151 }
6152
6153 pfile->file = file;
6154 llist_add(&pfile->llist, &data->put_llist);
6155
6156 if (pfile == &pfile_stack) {
dd3db2a3 6157 percpu_ref_switch_to_atomic(&data->refs, io_atomic_switch);
05f3fb3c
JA
6158 wait_for_completion(&done);
6159 flush_work(&data->ref_work);
6160 return false;
6161 }
6162
6163 return true;
6164}
6165
6166static int __io_sqe_files_update(struct io_ring_ctx *ctx,
6167 struct io_uring_files_update *up,
6168 unsigned nr_args)
6169{
6170 struct fixed_file_data *data = ctx->file_data;
6171 bool ref_switch = false;
6172 struct file *file;
c3a31e60
JA
6173 __s32 __user *fds;
6174 int fd, i, err;
6175 __u32 done;
6176
05f3fb3c 6177 if (check_add_overflow(up->offset, nr_args, &done))
c3a31e60
JA
6178 return -EOVERFLOW;
6179 if (done > ctx->nr_user_files)
6180 return -EINVAL;
6181
6182 done = 0;
05f3fb3c 6183 fds = u64_to_user_ptr(up->fds);
c3a31e60 6184 while (nr_args) {
65e19f54
JA
6185 struct fixed_file_table *table;
6186 unsigned index;
6187
c3a31e60
JA
6188 err = 0;
6189 if (copy_from_user(&fd, &fds[done], sizeof(fd))) {
6190 err = -EFAULT;
6191 break;
6192 }
05f3fb3c
JA
6193 i = array_index_nospec(up->offset, ctx->nr_user_files);
6194 table = &ctx->file_data->table[i >> IORING_FILE_TABLE_SHIFT];
65e19f54
JA
6195 index = i & IORING_FILE_TABLE_MASK;
6196 if (table->files[index]) {
05f3fb3c 6197 file = io_file_from_index(ctx, index);
65e19f54 6198 table->files[index] = NULL;
05f3fb3c
JA
6199 if (io_queue_file_removal(data, file))
6200 ref_switch = true;
c3a31e60
JA
6201 }
6202 if (fd != -1) {
c3a31e60
JA
6203 file = fget(fd);
6204 if (!file) {
6205 err = -EBADF;
6206 break;
6207 }
6208 /*
6209 * Don't allow io_uring instances to be registered. If
6210 * UNIX isn't enabled, then this causes a reference
6211 * cycle and this instance can never get freed. If UNIX
6212 * is enabled we'll handle it just fine, but there's
6213 * still no point in allowing a ring fd as it doesn't
6214 * support regular read/write anyway.
6215 */
6216 if (file->f_op == &io_uring_fops) {
6217 fput(file);
6218 err = -EBADF;
6219 break;
6220 }
65e19f54 6221 table->files[index] = file;
c3a31e60
JA
6222 err = io_sqe_file_register(ctx, file, i);
6223 if (err)
6224 break;
6225 }
6226 nr_args--;
6227 done++;
05f3fb3c
JA
6228 up->offset++;
6229 }
6230
dd3db2a3 6231 if (ref_switch)
05f3fb3c 6232 percpu_ref_switch_to_atomic(&data->refs, io_atomic_switch);
c3a31e60
JA
6233
6234 return done ? done : err;
6235}
05f3fb3c
JA
6236static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg,
6237 unsigned nr_args)
6238{
6239 struct io_uring_files_update up;
6240
6241 if (!ctx->file_data)
6242 return -ENXIO;
6243 if (!nr_args)
6244 return -EINVAL;
6245 if (copy_from_user(&up, arg, sizeof(up)))
6246 return -EFAULT;
6247 if (up.resv)
6248 return -EINVAL;
6249
6250 return __io_sqe_files_update(ctx, &up, nr_args);
6251}
c3a31e60 6252
e9fd9396 6253static void io_free_work(struct io_wq_work *work)
7d723065
JA
6254{
6255 struct io_kiocb *req = container_of(work, struct io_kiocb, work);
6256
e9fd9396 6257 /* Consider that io_steal_work() relies on this ref */
7d723065
JA
6258 io_put_req(req);
6259}
6260
24369c2e
PB
6261static int io_init_wq_offload(struct io_ring_ctx *ctx,
6262 struct io_uring_params *p)
6263{
6264 struct io_wq_data data;
6265 struct fd f;
6266 struct io_ring_ctx *ctx_attach;
6267 unsigned int concurrency;
6268 int ret = 0;
6269
6270 data.user = ctx->user;
e9fd9396 6271 data.free_work = io_free_work;
24369c2e
PB
6272
6273 if (!(p->flags & IORING_SETUP_ATTACH_WQ)) {
6274 /* Do QD, or 4 * CPUS, whatever is smallest */
6275 concurrency = min(ctx->sq_entries, 4 * num_online_cpus());
6276
6277 ctx->io_wq = io_wq_create(concurrency, &data);
6278 if (IS_ERR(ctx->io_wq)) {
6279 ret = PTR_ERR(ctx->io_wq);
6280 ctx->io_wq = NULL;
6281 }
6282 return ret;
6283 }
6284
6285 f = fdget(p->wq_fd);
6286 if (!f.file)
6287 return -EBADF;
6288
6289 if (f.file->f_op != &io_uring_fops) {
6290 ret = -EINVAL;
6291 goto out_fput;
6292 }
6293
6294 ctx_attach = f.file->private_data;
6295 /* @io_wq is protected by holding the fd */
6296 if (!io_wq_get(ctx_attach->io_wq, &data)) {
6297 ret = -EINVAL;
6298 goto out_fput;
6299 }
6300
6301 ctx->io_wq = ctx_attach->io_wq;
6302out_fput:
6303 fdput(f);
6304 return ret;
6305}
6306
6c271ce2
JA
6307static int io_sq_offload_start(struct io_ring_ctx *ctx,
6308 struct io_uring_params *p)
2b188cc1
JA
6309{
6310 int ret;
6311
6c271ce2 6312 init_waitqueue_head(&ctx->sqo_wait);
2b188cc1
JA
6313 mmgrab(current->mm);
6314 ctx->sqo_mm = current->mm;
6315
6c271ce2 6316 if (ctx->flags & IORING_SETUP_SQPOLL) {
3ec482d1
JA
6317 ret = -EPERM;
6318 if (!capable(CAP_SYS_ADMIN))
6319 goto err;
6320
917257da
JA
6321 ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle);
6322 if (!ctx->sq_thread_idle)
6323 ctx->sq_thread_idle = HZ;
6324
6c271ce2 6325 if (p->flags & IORING_SETUP_SQ_AFF) {
44a9bd18 6326 int cpu = p->sq_thread_cpu;
6c271ce2 6327
917257da 6328 ret = -EINVAL;
44a9bd18
JA
6329 if (cpu >= nr_cpu_ids)
6330 goto err;
7889f44d 6331 if (!cpu_online(cpu))
917257da
JA
6332 goto err;
6333
6c271ce2
JA
6334 ctx->sqo_thread = kthread_create_on_cpu(io_sq_thread,
6335 ctx, cpu,
6336 "io_uring-sq");
6337 } else {
6338 ctx->sqo_thread = kthread_create(io_sq_thread, ctx,
6339 "io_uring-sq");
6340 }
6341 if (IS_ERR(ctx->sqo_thread)) {
6342 ret = PTR_ERR(ctx->sqo_thread);
6343 ctx->sqo_thread = NULL;
6344 goto err;
6345 }
6346 wake_up_process(ctx->sqo_thread);
6347 } else if (p->flags & IORING_SETUP_SQ_AFF) {
6348 /* Can't have SQ_AFF without SQPOLL */
6349 ret = -EINVAL;
6350 goto err;
6351 }
6352
24369c2e
PB
6353 ret = io_init_wq_offload(ctx, p);
6354 if (ret)
2b188cc1 6355 goto err;
2b188cc1
JA
6356
6357 return 0;
6358err:
54a91f3b 6359 io_finish_async(ctx);
2b188cc1
JA
6360 mmdrop(ctx->sqo_mm);
6361 ctx->sqo_mm = NULL;
6362 return ret;
6363}
6364
6365static void io_unaccount_mem(struct user_struct *user, unsigned long nr_pages)
6366{
6367 atomic_long_sub(nr_pages, &user->locked_vm);
6368}
6369
6370static int io_account_mem(struct user_struct *user, unsigned long nr_pages)
6371{
6372 unsigned long page_limit, cur_pages, new_pages;
6373
6374 /* Don't allow more pages than we can safely lock */
6375 page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
6376
6377 do {
6378 cur_pages = atomic_long_read(&user->locked_vm);
6379 new_pages = cur_pages + nr_pages;
6380 if (new_pages > page_limit)
6381 return -ENOMEM;
6382 } while (atomic_long_cmpxchg(&user->locked_vm, cur_pages,
6383 new_pages) != cur_pages);
6384
6385 return 0;
6386}
6387
6388static void io_mem_free(void *ptr)
6389{
52e04ef4
MR
6390 struct page *page;
6391
6392 if (!ptr)
6393 return;
2b188cc1 6394
52e04ef4 6395 page = virt_to_head_page(ptr);
2b188cc1
JA
6396 if (put_page_testzero(page))
6397 free_compound_page(page);
6398}
6399
6400static void *io_mem_alloc(size_t size)
6401{
6402 gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP |
6403 __GFP_NORETRY;
6404
6405 return (void *) __get_free_pages(gfp_flags, get_order(size));
6406}
6407
75b28aff
HV
6408static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries,
6409 size_t *sq_offset)
6410{
6411 struct io_rings *rings;
6412 size_t off, sq_array_size;
6413
6414 off = struct_size(rings, cqes, cq_entries);
6415 if (off == SIZE_MAX)
6416 return SIZE_MAX;
6417
6418#ifdef CONFIG_SMP
6419 off = ALIGN(off, SMP_CACHE_BYTES);
6420 if (off == 0)
6421 return SIZE_MAX;
6422#endif
6423
6424 sq_array_size = array_size(sizeof(u32), sq_entries);
6425 if (sq_array_size == SIZE_MAX)
6426 return SIZE_MAX;
6427
6428 if (check_add_overflow(off, sq_array_size, &off))
6429 return SIZE_MAX;
6430
6431 if (sq_offset)
6432 *sq_offset = off;
6433
6434 return off;
6435}
6436
2b188cc1
JA
6437static unsigned long ring_pages(unsigned sq_entries, unsigned cq_entries)
6438{
75b28aff 6439 size_t pages;
2b188cc1 6440
75b28aff
HV
6441 pages = (size_t)1 << get_order(
6442 rings_size(sq_entries, cq_entries, NULL));
6443 pages += (size_t)1 << get_order(
6444 array_size(sizeof(struct io_uring_sqe), sq_entries));
2b188cc1 6445
75b28aff 6446 return pages;
2b188cc1
JA
6447}
6448
edafccee
JA
6449static int io_sqe_buffer_unregister(struct io_ring_ctx *ctx)
6450{
6451 int i, j;
6452
6453 if (!ctx->user_bufs)
6454 return -ENXIO;
6455
6456 for (i = 0; i < ctx->nr_user_bufs; i++) {
6457 struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
6458
6459 for (j = 0; j < imu->nr_bvecs; j++)
f1f6a7dd 6460 unpin_user_page(imu->bvec[j].bv_page);
edafccee
JA
6461
6462 if (ctx->account_mem)
6463 io_unaccount_mem(ctx->user, imu->nr_bvecs);
d4ef6475 6464 kvfree(imu->bvec);
edafccee
JA
6465 imu->nr_bvecs = 0;
6466 }
6467
6468 kfree(ctx->user_bufs);
6469 ctx->user_bufs = NULL;
6470 ctx->nr_user_bufs = 0;
6471 return 0;
6472}
6473
6474static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
6475 void __user *arg, unsigned index)
6476{
6477 struct iovec __user *src;
6478
6479#ifdef CONFIG_COMPAT
6480 if (ctx->compat) {
6481 struct compat_iovec __user *ciovs;
6482 struct compat_iovec ciov;
6483
6484 ciovs = (struct compat_iovec __user *) arg;
6485 if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov)))
6486 return -EFAULT;
6487
d55e5f5b 6488 dst->iov_base = u64_to_user_ptr((u64)ciov.iov_base);
edafccee
JA
6489 dst->iov_len = ciov.iov_len;
6490 return 0;
6491 }
6492#endif
6493 src = (struct iovec __user *) arg;
6494 if (copy_from_user(dst, &src[index], sizeof(*dst)))
6495 return -EFAULT;
6496 return 0;
6497}
6498
6499static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
6500 unsigned nr_args)
6501{
6502 struct vm_area_struct **vmas = NULL;
6503 struct page **pages = NULL;
6504 int i, j, got_pages = 0;
6505 int ret = -EINVAL;
6506
6507 if (ctx->user_bufs)
6508 return -EBUSY;
6509 if (!nr_args || nr_args > UIO_MAXIOV)
6510 return -EINVAL;
6511
6512 ctx->user_bufs = kcalloc(nr_args, sizeof(struct io_mapped_ubuf),
6513 GFP_KERNEL);
6514 if (!ctx->user_bufs)
6515 return -ENOMEM;
6516
6517 for (i = 0; i < nr_args; i++) {
6518 struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
6519 unsigned long off, start, end, ubuf;
6520 int pret, nr_pages;
6521 struct iovec iov;
6522 size_t size;
6523
6524 ret = io_copy_iov(ctx, &iov, arg, i);
6525 if (ret)
a278682d 6526 goto err;
edafccee
JA
6527
6528 /*
6529 * Don't impose further limits on the size and buffer
6530 * constraints here, we'll -EINVAL later when IO is
6531 * submitted if they are wrong.
6532 */
6533 ret = -EFAULT;
6534 if (!iov.iov_base || !iov.iov_len)
6535 goto err;
6536
6537 /* arbitrary limit, but we need something */
6538 if (iov.iov_len > SZ_1G)
6539 goto err;
6540
6541 ubuf = (unsigned long) iov.iov_base;
6542 end = (ubuf + iov.iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
6543 start = ubuf >> PAGE_SHIFT;
6544 nr_pages = end - start;
6545
6546 if (ctx->account_mem) {
6547 ret = io_account_mem(ctx->user, nr_pages);
6548 if (ret)
6549 goto err;
6550 }
6551
6552 ret = 0;
6553 if (!pages || nr_pages > got_pages) {
6554 kfree(vmas);
6555 kfree(pages);
d4ef6475 6556 pages = kvmalloc_array(nr_pages, sizeof(struct page *),
edafccee 6557 GFP_KERNEL);
d4ef6475 6558 vmas = kvmalloc_array(nr_pages,
edafccee
JA
6559 sizeof(struct vm_area_struct *),
6560 GFP_KERNEL);
6561 if (!pages || !vmas) {
6562 ret = -ENOMEM;
6563 if (ctx->account_mem)
6564 io_unaccount_mem(ctx->user, nr_pages);
6565 goto err;
6566 }
6567 got_pages = nr_pages;
6568 }
6569
d4ef6475 6570 imu->bvec = kvmalloc_array(nr_pages, sizeof(struct bio_vec),
edafccee
JA
6571 GFP_KERNEL);
6572 ret = -ENOMEM;
6573 if (!imu->bvec) {
6574 if (ctx->account_mem)
6575 io_unaccount_mem(ctx->user, nr_pages);
6576 goto err;
6577 }
6578
6579 ret = 0;
6580 down_read(&current->mm->mmap_sem);
2113b05d 6581 pret = pin_user_pages(ubuf, nr_pages,
932f4a63
IW
6582 FOLL_WRITE | FOLL_LONGTERM,
6583 pages, vmas);
edafccee
JA
6584 if (pret == nr_pages) {
6585 /* don't support file backed memory */
6586 for (j = 0; j < nr_pages; j++) {
6587 struct vm_area_struct *vma = vmas[j];
6588
6589 if (vma->vm_file &&
6590 !is_file_hugepages(vma->vm_file)) {
6591 ret = -EOPNOTSUPP;
6592 break;
6593 }
6594 }
6595 } else {
6596 ret = pret < 0 ? pret : -EFAULT;
6597 }
6598 up_read(&current->mm->mmap_sem);
6599 if (ret) {
6600 /*
6601 * if we did partial map, or found file backed vmas,
6602 * release any pages we did get
6603 */
27c4d3a3 6604 if (pret > 0)
f1f6a7dd 6605 unpin_user_pages(pages, pret);
edafccee
JA
6606 if (ctx->account_mem)
6607 io_unaccount_mem(ctx->user, nr_pages);
d4ef6475 6608 kvfree(imu->bvec);
edafccee
JA
6609 goto err;
6610 }
6611
6612 off = ubuf & ~PAGE_MASK;
6613 size = iov.iov_len;
6614 for (j = 0; j < nr_pages; j++) {
6615 size_t vec_len;
6616
6617 vec_len = min_t(size_t, size, PAGE_SIZE - off);
6618 imu->bvec[j].bv_page = pages[j];
6619 imu->bvec[j].bv_len = vec_len;
6620 imu->bvec[j].bv_offset = off;
6621 off = 0;
6622 size -= vec_len;
6623 }
6624 /* store original address for later verification */
6625 imu->ubuf = ubuf;
6626 imu->len = iov.iov_len;
6627 imu->nr_bvecs = nr_pages;
6628
6629 ctx->nr_user_bufs++;
6630 }
d4ef6475
MR
6631 kvfree(pages);
6632 kvfree(vmas);
edafccee
JA
6633 return 0;
6634err:
d4ef6475
MR
6635 kvfree(pages);
6636 kvfree(vmas);
edafccee
JA
6637 io_sqe_buffer_unregister(ctx);
6638 return ret;
6639}
6640
9b402849
JA
6641static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg)
6642{
6643 __s32 __user *fds = arg;
6644 int fd;
6645
6646 if (ctx->cq_ev_fd)
6647 return -EBUSY;
6648
6649 if (copy_from_user(&fd, fds, sizeof(*fds)))
6650 return -EFAULT;
6651
6652 ctx->cq_ev_fd = eventfd_ctx_fdget(fd);
6653 if (IS_ERR(ctx->cq_ev_fd)) {
6654 int ret = PTR_ERR(ctx->cq_ev_fd);
6655 ctx->cq_ev_fd = NULL;
6656 return ret;
6657 }
6658
6659 return 0;
6660}
6661
6662static int io_eventfd_unregister(struct io_ring_ctx *ctx)
6663{
6664 if (ctx->cq_ev_fd) {
6665 eventfd_ctx_put(ctx->cq_ev_fd);
6666 ctx->cq_ev_fd = NULL;
6667 return 0;
6668 }
6669
6670 return -ENXIO;
6671}
6672
5a2e745d
JA
6673static int __io_destroy_buffers(int id, void *p, void *data)
6674{
6675 struct io_ring_ctx *ctx = data;
6676 struct io_buffer *buf = p;
6677
6678 /* the head kbuf is the list itself */
6679 while (!list_empty(&buf->list)) {
6680 struct io_buffer *nxt;
6681
6682 nxt = list_first_entry(&buf->list, struct io_buffer, list);
6683 list_del(&nxt->list);
6684 kfree(nxt);
6685 }
6686 kfree(buf);
6687 idr_remove(&ctx->io_buffer_idr, id);
6688 return 0;
6689}
6690
6691static void io_destroy_buffers(struct io_ring_ctx *ctx)
6692{
6693 idr_for_each(&ctx->io_buffer_idr, __io_destroy_buffers, ctx);
6694 idr_destroy(&ctx->io_buffer_idr);
6695}
6696
2b188cc1
JA
6697static void io_ring_ctx_free(struct io_ring_ctx *ctx)
6698{
6b06314c 6699 io_finish_async(ctx);
2b188cc1
JA
6700 if (ctx->sqo_mm)
6701 mmdrop(ctx->sqo_mm);
def596e9
JA
6702
6703 io_iopoll_reap_events(ctx);
edafccee 6704 io_sqe_buffer_unregister(ctx);
6b06314c 6705 io_sqe_files_unregister(ctx);
9b402849 6706 io_eventfd_unregister(ctx);
5a2e745d 6707 io_destroy_buffers(ctx);
41726c9a 6708 idr_destroy(&ctx->personality_idr);
def596e9 6709
2b188cc1 6710#if defined(CONFIG_UNIX)
355e8d26
EB
6711 if (ctx->ring_sock) {
6712 ctx->ring_sock->file = NULL; /* so that iput() is called */
2b188cc1 6713 sock_release(ctx->ring_sock);
355e8d26 6714 }
2b188cc1
JA
6715#endif
6716
75b28aff 6717 io_mem_free(ctx->rings);
2b188cc1 6718 io_mem_free(ctx->sq_sqes);
2b188cc1
JA
6719
6720 percpu_ref_exit(&ctx->refs);
6721 if (ctx->account_mem)
6722 io_unaccount_mem(ctx->user,
6723 ring_pages(ctx->sq_entries, ctx->cq_entries));
6724 free_uid(ctx->user);
181e448d 6725 put_cred(ctx->creds);
206aefde 6726 kfree(ctx->completions);
78076bb6 6727 kfree(ctx->cancel_hash);
0ddf92e8 6728 kmem_cache_free(req_cachep, ctx->fallback_req);
2b188cc1
JA
6729 kfree(ctx);
6730}
6731
6732static __poll_t io_uring_poll(struct file *file, poll_table *wait)
6733{
6734 struct io_ring_ctx *ctx = file->private_data;
6735 __poll_t mask = 0;
6736
6737 poll_wait(file, &ctx->cq_wait, wait);
4f7067c3
SB
6738 /*
6739 * synchronizes with barrier from wq_has_sleeper call in
6740 * io_commit_cqring
6741 */
2b188cc1 6742 smp_rmb();
75b28aff
HV
6743 if (READ_ONCE(ctx->rings->sq.tail) - ctx->cached_sq_head !=
6744 ctx->rings->sq_ring_entries)
2b188cc1 6745 mask |= EPOLLOUT | EPOLLWRNORM;
63e5d81f 6746 if (io_cqring_events(ctx, false))
2b188cc1
JA
6747 mask |= EPOLLIN | EPOLLRDNORM;
6748
6749 return mask;
6750}
6751
6752static int io_uring_fasync(int fd, struct file *file, int on)
6753{
6754 struct io_ring_ctx *ctx = file->private_data;
6755
6756 return fasync_helper(fd, file, on, &ctx->cq_fasync);
6757}
6758
071698e1
JA
6759static int io_remove_personalities(int id, void *p, void *data)
6760{
6761 struct io_ring_ctx *ctx = data;
6762 const struct cred *cred;
6763
6764 cred = idr_remove(&ctx->personality_idr, id);
6765 if (cred)
6766 put_cred(cred);
6767 return 0;
6768}
6769
2b188cc1
JA
6770static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
6771{
6772 mutex_lock(&ctx->uring_lock);
6773 percpu_ref_kill(&ctx->refs);
6774 mutex_unlock(&ctx->uring_lock);
6775
df069d80
JA
6776 /*
6777 * Wait for sq thread to idle, if we have one. It won't spin on new
6778 * work after we've killed the ctx ref above. This is important to do
6779 * before we cancel existing commands, as the thread could otherwise
6780 * be queueing new work post that. If that's work we need to cancel,
6781 * it could cause shutdown to hang.
6782 */
6783 while (ctx->sqo_thread && !wq_has_sleeper(&ctx->sqo_wait))
6784 cpu_relax();
6785
5262f567 6786 io_kill_timeouts(ctx);
221c5eb2 6787 io_poll_remove_all(ctx);
561fb04a
JA
6788
6789 if (ctx->io_wq)
6790 io_wq_cancel_all(ctx->io_wq);
6791
def596e9 6792 io_iopoll_reap_events(ctx);
15dff286
JA
6793 /* if we failed setting up the ctx, we might not have any rings */
6794 if (ctx->rings)
6795 io_cqring_overflow_flush(ctx, true);
071698e1 6796 idr_for_each(&ctx->personality_idr, io_remove_personalities, ctx);
206aefde 6797 wait_for_completion(&ctx->completions[0]);
2b188cc1
JA
6798 io_ring_ctx_free(ctx);
6799}
6800
6801static int io_uring_release(struct inode *inode, struct file *file)
6802{
6803 struct io_ring_ctx *ctx = file->private_data;
6804
6805 file->private_data = NULL;
6806 io_ring_ctx_wait_and_kill(ctx);
6807 return 0;
6808}
6809
fcb323cc
JA
6810static void io_uring_cancel_files(struct io_ring_ctx *ctx,
6811 struct files_struct *files)
6812{
6813 struct io_kiocb *req;
6814 DEFINE_WAIT(wait);
6815
6816 while (!list_empty_careful(&ctx->inflight_list)) {
768134d4 6817 struct io_kiocb *cancel_req = NULL;
fcb323cc
JA
6818
6819 spin_lock_irq(&ctx->inflight_lock);
6820 list_for_each_entry(req, &ctx->inflight_list, inflight_entry) {
768134d4
JA
6821 if (req->work.files != files)
6822 continue;
6823 /* req is being completed, ignore */
6824 if (!refcount_inc_not_zero(&req->refs))
6825 continue;
6826 cancel_req = req;
6827 break;
fcb323cc 6828 }
768134d4 6829 if (cancel_req)
fcb323cc 6830 prepare_to_wait(&ctx->inflight_wait, &wait,
768134d4 6831 TASK_UNINTERRUPTIBLE);
fcb323cc
JA
6832 spin_unlock_irq(&ctx->inflight_lock);
6833
768134d4
JA
6834 /* We need to keep going until we don't find a matching req */
6835 if (!cancel_req)
fcb323cc 6836 break;
2f6d9b9d 6837
2ca10259
JA
6838 if (cancel_req->flags & REQ_F_OVERFLOW) {
6839 spin_lock_irq(&ctx->completion_lock);
6840 list_del(&cancel_req->list);
6841 cancel_req->flags &= ~REQ_F_OVERFLOW;
6842 if (list_empty(&ctx->cq_overflow_list)) {
6843 clear_bit(0, &ctx->sq_check_overflow);
6844 clear_bit(0, &ctx->cq_check_overflow);
6845 }
6846 spin_unlock_irq(&ctx->completion_lock);
6847
6848 WRITE_ONCE(ctx->rings->cq_overflow,
6849 atomic_inc_return(&ctx->cached_cq_overflow));
6850
6851 /*
6852 * Put inflight ref and overflow ref. If that's
6853 * all we had, then we're done with this request.
6854 */
6855 if (refcount_sub_and_test(2, &cancel_req->refs)) {
6856 io_put_req(cancel_req);
6857 continue;
6858 }
6859 }
6860
2f6d9b9d
BL
6861 io_wq_cancel_work(ctx->io_wq, &cancel_req->work);
6862 io_put_req(cancel_req);
fcb323cc
JA
6863 schedule();
6864 }
768134d4 6865 finish_wait(&ctx->inflight_wait, &wait);
fcb323cc
JA
6866}
6867
6868static int io_uring_flush(struct file *file, void *data)
6869{
6870 struct io_ring_ctx *ctx = file->private_data;
6871
6872 io_uring_cancel_files(ctx, data);
6ab23144
JA
6873
6874 /*
6875 * If the task is going away, cancel work it may have pending
6876 */
6877 if (fatal_signal_pending(current) || (current->flags & PF_EXITING))
6878 io_wq_cancel_pid(ctx->io_wq, task_pid_vnr(current));
6879
fcb323cc
JA
6880 return 0;
6881}
6882
6c5c240e
RP
6883static void *io_uring_validate_mmap_request(struct file *file,
6884 loff_t pgoff, size_t sz)
2b188cc1 6885{
2b188cc1 6886 struct io_ring_ctx *ctx = file->private_data;
6c5c240e 6887 loff_t offset = pgoff << PAGE_SHIFT;
2b188cc1
JA
6888 struct page *page;
6889 void *ptr;
6890
6891 switch (offset) {
6892 case IORING_OFF_SQ_RING:
75b28aff
HV
6893 case IORING_OFF_CQ_RING:
6894 ptr = ctx->rings;
2b188cc1
JA
6895 break;
6896 case IORING_OFF_SQES:
6897 ptr = ctx->sq_sqes;
6898 break;
2b188cc1 6899 default:
6c5c240e 6900 return ERR_PTR(-EINVAL);
2b188cc1
JA
6901 }
6902
6903 page = virt_to_head_page(ptr);
a50b854e 6904 if (sz > page_size(page))
6c5c240e
RP
6905 return ERR_PTR(-EINVAL);
6906
6907 return ptr;
6908}
6909
6910#ifdef CONFIG_MMU
6911
6912static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
6913{
6914 size_t sz = vma->vm_end - vma->vm_start;
6915 unsigned long pfn;
6916 void *ptr;
6917
6918 ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz);
6919 if (IS_ERR(ptr))
6920 return PTR_ERR(ptr);
2b188cc1
JA
6921
6922 pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
6923 return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
6924}
6925
6c5c240e
RP
6926#else /* !CONFIG_MMU */
6927
6928static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
6929{
6930 return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -EINVAL;
6931}
6932
6933static unsigned int io_uring_nommu_mmap_capabilities(struct file *file)
6934{
6935 return NOMMU_MAP_DIRECT | NOMMU_MAP_READ | NOMMU_MAP_WRITE;
6936}
6937
6938static unsigned long io_uring_nommu_get_unmapped_area(struct file *file,
6939 unsigned long addr, unsigned long len,
6940 unsigned long pgoff, unsigned long flags)
6941{
6942 void *ptr;
6943
6944 ptr = io_uring_validate_mmap_request(file, pgoff, len);
6945 if (IS_ERR(ptr))
6946 return PTR_ERR(ptr);
6947
6948 return (unsigned long) ptr;
6949}
6950
6951#endif /* !CONFIG_MMU */
6952
2b188cc1
JA
6953SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
6954 u32, min_complete, u32, flags, const sigset_t __user *, sig,
6955 size_t, sigsz)
6956{
6957 struct io_ring_ctx *ctx;
6958 long ret = -EBADF;
6959 int submitted = 0;
6960 struct fd f;
6961
b41e9852
JA
6962 if (current->task_works)
6963 task_work_run();
6964
6c271ce2 6965 if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP))
2b188cc1
JA
6966 return -EINVAL;
6967
6968 f = fdget(fd);
6969 if (!f.file)
6970 return -EBADF;
6971
6972 ret = -EOPNOTSUPP;
6973 if (f.file->f_op != &io_uring_fops)
6974 goto out_fput;
6975
6976 ret = -ENXIO;
6977 ctx = f.file->private_data;
6978 if (!percpu_ref_tryget(&ctx->refs))
6979 goto out_fput;
6980
6c271ce2
JA
6981 /*
6982 * For SQ polling, the thread will do all submissions and completions.
6983 * Just return the requested submit count, and wake the thread if
6984 * we were asked to.
6985 */
b2a9eada 6986 ret = 0;
6c271ce2 6987 if (ctx->flags & IORING_SETUP_SQPOLL) {
c1edbf5f
JA
6988 if (!list_empty_careful(&ctx->cq_overflow_list))
6989 io_cqring_overflow_flush(ctx, false);
6c271ce2
JA
6990 if (flags & IORING_ENTER_SQ_WAKEUP)
6991 wake_up(&ctx->sqo_wait);
6992 submitted = to_submit;
b2a9eada 6993 } else if (to_submit) {
ae9428ca 6994 struct mm_struct *cur_mm;
2b188cc1
JA
6995
6996 mutex_lock(&ctx->uring_lock);
ae9428ca
PB
6997 /* already have mm, so io_submit_sqes() won't try to grab it */
6998 cur_mm = ctx->sqo_mm;
6999 submitted = io_submit_sqes(ctx, to_submit, f.file, fd,
7000 &cur_mm, false);
2b188cc1 7001 mutex_unlock(&ctx->uring_lock);
7c504e65
PB
7002
7003 if (submitted != to_submit)
7004 goto out;
2b188cc1
JA
7005 }
7006 if (flags & IORING_ENTER_GETEVENTS) {
def596e9
JA
7007 unsigned nr_events = 0;
7008
2b188cc1
JA
7009 min_complete = min(min_complete, ctx->cq_entries);
7010
def596e9 7011 if (ctx->flags & IORING_SETUP_IOPOLL) {
def596e9 7012 ret = io_iopoll_check(ctx, &nr_events, min_complete);
def596e9
JA
7013 } else {
7014 ret = io_cqring_wait(ctx, min_complete, sig, sigsz);
7015 }
2b188cc1
JA
7016 }
7017
7c504e65 7018out:
6805b32e 7019 percpu_ref_put(&ctx->refs);
2b188cc1
JA
7020out_fput:
7021 fdput(f);
7022 return submitted ? submitted : ret;
7023}
7024
bebdb65e 7025#ifdef CONFIG_PROC_FS
87ce955b
JA
7026static int io_uring_show_cred(int id, void *p, void *data)
7027{
7028 const struct cred *cred = p;
7029 struct seq_file *m = data;
7030 struct user_namespace *uns = seq_user_ns(m);
7031 struct group_info *gi;
7032 kernel_cap_t cap;
7033 unsigned __capi;
7034 int g;
7035
7036 seq_printf(m, "%5d\n", id);
7037 seq_put_decimal_ull(m, "\tUid:\t", from_kuid_munged(uns, cred->uid));
7038 seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->euid));
7039 seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->suid));
7040 seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->fsuid));
7041 seq_put_decimal_ull(m, "\n\tGid:\t", from_kgid_munged(uns, cred->gid));
7042 seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->egid));
7043 seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->sgid));
7044 seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->fsgid));
7045 seq_puts(m, "\n\tGroups:\t");
7046 gi = cred->group_info;
7047 for (g = 0; g < gi->ngroups; g++) {
7048 seq_put_decimal_ull(m, g ? " " : "",
7049 from_kgid_munged(uns, gi->gid[g]));
7050 }
7051 seq_puts(m, "\n\tCapEff:\t");
7052 cap = cred->cap_effective;
7053 CAP_FOR_EACH_U32(__capi)
7054 seq_put_hex_ll(m, NULL, cap.cap[CAP_LAST_U32 - __capi], 8);
7055 seq_putc(m, '\n');
7056 return 0;
7057}
7058
7059static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
7060{
7061 int i;
7062
7063 mutex_lock(&ctx->uring_lock);
7064 seq_printf(m, "UserFiles:\t%u\n", ctx->nr_user_files);
7065 for (i = 0; i < ctx->nr_user_files; i++) {
7066 struct fixed_file_table *table;
7067 struct file *f;
7068
7069 table = &ctx->file_data->table[i >> IORING_FILE_TABLE_SHIFT];
7070 f = table->files[i & IORING_FILE_TABLE_MASK];
7071 if (f)
7072 seq_printf(m, "%5u: %s\n", i, file_dentry(f)->d_iname);
7073 else
7074 seq_printf(m, "%5u: <none>\n", i);
7075 }
7076 seq_printf(m, "UserBufs:\t%u\n", ctx->nr_user_bufs);
7077 for (i = 0; i < ctx->nr_user_bufs; i++) {
7078 struct io_mapped_ubuf *buf = &ctx->user_bufs[i];
7079
7080 seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf,
7081 (unsigned int) buf->len);
7082 }
7083 if (!idr_is_empty(&ctx->personality_idr)) {
7084 seq_printf(m, "Personalities:\n");
7085 idr_for_each(&ctx->personality_idr, io_uring_show_cred, m);
7086 }
d7718a9d
JA
7087 seq_printf(m, "PollList:\n");
7088 spin_lock_irq(&ctx->completion_lock);
7089 for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
7090 struct hlist_head *list = &ctx->cancel_hash[i];
7091 struct io_kiocb *req;
7092
7093 hlist_for_each_entry(req, list, hash_node)
7094 seq_printf(m, " op=%d, task_works=%d\n", req->opcode,
7095 req->task->task_works != NULL);
7096 }
7097 spin_unlock_irq(&ctx->completion_lock);
87ce955b
JA
7098 mutex_unlock(&ctx->uring_lock);
7099}
7100
7101static void io_uring_show_fdinfo(struct seq_file *m, struct file *f)
7102{
7103 struct io_ring_ctx *ctx = f->private_data;
7104
7105 if (percpu_ref_tryget(&ctx->refs)) {
7106 __io_uring_show_fdinfo(ctx, m);
7107 percpu_ref_put(&ctx->refs);
7108 }
7109}
bebdb65e 7110#endif
87ce955b 7111
2b188cc1
JA
7112static const struct file_operations io_uring_fops = {
7113 .release = io_uring_release,
fcb323cc 7114 .flush = io_uring_flush,
2b188cc1 7115 .mmap = io_uring_mmap,
6c5c240e
RP
7116#ifndef CONFIG_MMU
7117 .get_unmapped_area = io_uring_nommu_get_unmapped_area,
7118 .mmap_capabilities = io_uring_nommu_mmap_capabilities,
7119#endif
2b188cc1
JA
7120 .poll = io_uring_poll,
7121 .fasync = io_uring_fasync,
bebdb65e 7122#ifdef CONFIG_PROC_FS
87ce955b 7123 .show_fdinfo = io_uring_show_fdinfo,
bebdb65e 7124#endif
2b188cc1
JA
7125};
7126
7127static int io_allocate_scq_urings(struct io_ring_ctx *ctx,
7128 struct io_uring_params *p)
7129{
75b28aff
HV
7130 struct io_rings *rings;
7131 size_t size, sq_array_offset;
2b188cc1 7132
75b28aff
HV
7133 size = rings_size(p->sq_entries, p->cq_entries, &sq_array_offset);
7134 if (size == SIZE_MAX)
7135 return -EOVERFLOW;
7136
7137 rings = io_mem_alloc(size);
7138 if (!rings)
2b188cc1
JA
7139 return -ENOMEM;
7140
75b28aff
HV
7141 ctx->rings = rings;
7142 ctx->sq_array = (u32 *)((char *)rings + sq_array_offset);
7143 rings->sq_ring_mask = p->sq_entries - 1;
7144 rings->cq_ring_mask = p->cq_entries - 1;
7145 rings->sq_ring_entries = p->sq_entries;
7146 rings->cq_ring_entries = p->cq_entries;
7147 ctx->sq_mask = rings->sq_ring_mask;
7148 ctx->cq_mask = rings->cq_ring_mask;
7149 ctx->sq_entries = rings->sq_ring_entries;
7150 ctx->cq_entries = rings->cq_ring_entries;
2b188cc1
JA
7151
7152 size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
eb065d30
JA
7153 if (size == SIZE_MAX) {
7154 io_mem_free(ctx->rings);
7155 ctx->rings = NULL;
2b188cc1 7156 return -EOVERFLOW;
eb065d30 7157 }
2b188cc1
JA
7158
7159 ctx->sq_sqes = io_mem_alloc(size);
eb065d30
JA
7160 if (!ctx->sq_sqes) {
7161 io_mem_free(ctx->rings);
7162 ctx->rings = NULL;
2b188cc1 7163 return -ENOMEM;
eb065d30 7164 }
2b188cc1 7165
2b188cc1
JA
7166 return 0;
7167}
7168
7169/*
7170 * Allocate an anonymous fd, this is what constitutes the application
7171 * visible backing of an io_uring instance. The application mmaps this
7172 * fd to gain access to the SQ/CQ ring details. If UNIX sockets are enabled,
7173 * we have to tie this fd to a socket for file garbage collection purposes.
7174 */
7175static int io_uring_get_fd(struct io_ring_ctx *ctx)
7176{
7177 struct file *file;
7178 int ret;
7179
7180#if defined(CONFIG_UNIX)
7181 ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP,
7182 &ctx->ring_sock);
7183 if (ret)
7184 return ret;
7185#endif
7186
7187 ret = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
7188 if (ret < 0)
7189 goto err;
7190
7191 file = anon_inode_getfile("[io_uring]", &io_uring_fops, ctx,
7192 O_RDWR | O_CLOEXEC);
7193 if (IS_ERR(file)) {
7194 put_unused_fd(ret);
7195 ret = PTR_ERR(file);
7196 goto err;
7197 }
7198
7199#if defined(CONFIG_UNIX)
7200 ctx->ring_sock->file = file;
7201#endif
7202 fd_install(ret, file);
7203 return ret;
7204err:
7205#if defined(CONFIG_UNIX)
7206 sock_release(ctx->ring_sock);
7207 ctx->ring_sock = NULL;
7208#endif
7209 return ret;
7210}
7211
7212static int io_uring_create(unsigned entries, struct io_uring_params *p)
7213{
7214 struct user_struct *user = NULL;
7215 struct io_ring_ctx *ctx;
7216 bool account_mem;
7217 int ret;
7218
8110c1a6 7219 if (!entries)
2b188cc1 7220 return -EINVAL;
8110c1a6
JA
7221 if (entries > IORING_MAX_ENTRIES) {
7222 if (!(p->flags & IORING_SETUP_CLAMP))
7223 return -EINVAL;
7224 entries = IORING_MAX_ENTRIES;
7225 }
2b188cc1
JA
7226
7227 /*
7228 * Use twice as many entries for the CQ ring. It's possible for the
7229 * application to drive a higher depth than the size of the SQ ring,
7230 * since the sqes are only used at submission time. This allows for
33a107f0
JA
7231 * some flexibility in overcommitting a bit. If the application has
7232 * set IORING_SETUP_CQSIZE, it will have passed in the desired number
7233 * of CQ ring entries manually.
2b188cc1
JA
7234 */
7235 p->sq_entries = roundup_pow_of_two(entries);
33a107f0
JA
7236 if (p->flags & IORING_SETUP_CQSIZE) {
7237 /*
7238 * If IORING_SETUP_CQSIZE is set, we do the same roundup
7239 * to a power-of-two, if it isn't already. We do NOT impose
7240 * any cq vs sq ring sizing.
7241 */
8110c1a6 7242 if (p->cq_entries < p->sq_entries)
33a107f0 7243 return -EINVAL;
8110c1a6
JA
7244 if (p->cq_entries > IORING_MAX_CQ_ENTRIES) {
7245 if (!(p->flags & IORING_SETUP_CLAMP))
7246 return -EINVAL;
7247 p->cq_entries = IORING_MAX_CQ_ENTRIES;
7248 }
33a107f0
JA
7249 p->cq_entries = roundup_pow_of_two(p->cq_entries);
7250 } else {
7251 p->cq_entries = 2 * p->sq_entries;
7252 }
2b188cc1
JA
7253
7254 user = get_uid(current_user());
7255 account_mem = !capable(CAP_IPC_LOCK);
7256
7257 if (account_mem) {
7258 ret = io_account_mem(user,
7259 ring_pages(p->sq_entries, p->cq_entries));
7260 if (ret) {
7261 free_uid(user);
7262 return ret;
7263 }
7264 }
7265
7266 ctx = io_ring_ctx_alloc(p);
7267 if (!ctx) {
7268 if (account_mem)
7269 io_unaccount_mem(user, ring_pages(p->sq_entries,
7270 p->cq_entries));
7271 free_uid(user);
7272 return -ENOMEM;
7273 }
7274 ctx->compat = in_compat_syscall();
7275 ctx->account_mem = account_mem;
7276 ctx->user = user;
0b8c0ec7 7277 ctx->creds = get_current_cred();
2b188cc1
JA
7278
7279 ret = io_allocate_scq_urings(ctx, p);
7280 if (ret)
7281 goto err;
7282
6c271ce2 7283 ret = io_sq_offload_start(ctx, p);
2b188cc1
JA
7284 if (ret)
7285 goto err;
7286
2b188cc1 7287 memset(&p->sq_off, 0, sizeof(p->sq_off));
75b28aff
HV
7288 p->sq_off.head = offsetof(struct io_rings, sq.head);
7289 p->sq_off.tail = offsetof(struct io_rings, sq.tail);
7290 p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask);
7291 p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries);
7292 p->sq_off.flags = offsetof(struct io_rings, sq_flags);
7293 p->sq_off.dropped = offsetof(struct io_rings, sq_dropped);
7294 p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings;
2b188cc1
JA
7295
7296 memset(&p->cq_off, 0, sizeof(p->cq_off));
75b28aff
HV
7297 p->cq_off.head = offsetof(struct io_rings, cq.head);
7298 p->cq_off.tail = offsetof(struct io_rings, cq.tail);
7299 p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask);
7300 p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries);
7301 p->cq_off.overflow = offsetof(struct io_rings, cq_overflow);
7302 p->cq_off.cqes = offsetof(struct io_rings, cqes);
ac90f249 7303
044c1ab3
JA
7304 /*
7305 * Install ring fd as the very last thing, so we don't risk someone
7306 * having closed it before we finish setup
7307 */
7308 ret = io_uring_get_fd(ctx);
7309 if (ret < 0)
7310 goto err;
7311
da8c9690 7312 p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
cccf0ee8 7313 IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS |
d7718a9d 7314 IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL;
c826bd7a 7315 trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags);
2b188cc1
JA
7316 return ret;
7317err:
7318 io_ring_ctx_wait_and_kill(ctx);
7319 return ret;
7320}
7321
7322/*
7323 * Sets up an aio uring context, and returns the fd. Applications asks for a
7324 * ring size, we return the actual sq/cq ring sizes (among other things) in the
7325 * params structure passed in.
7326 */
7327static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
7328{
7329 struct io_uring_params p;
7330 long ret;
7331 int i;
7332
7333 if (copy_from_user(&p, params, sizeof(p)))
7334 return -EFAULT;
7335 for (i = 0; i < ARRAY_SIZE(p.resv); i++) {
7336 if (p.resv[i])
7337 return -EINVAL;
7338 }
7339
6c271ce2 7340 if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
8110c1a6 7341 IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE |
24369c2e 7342 IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ))
2b188cc1
JA
7343 return -EINVAL;
7344
7345 ret = io_uring_create(entries, &p);
7346 if (ret < 0)
7347 return ret;
7348
7349 if (copy_to_user(params, &p, sizeof(p)))
7350 return -EFAULT;
7351
7352 return ret;
7353}
7354
7355SYSCALL_DEFINE2(io_uring_setup, u32, entries,
7356 struct io_uring_params __user *, params)
7357{
7358 return io_uring_setup(entries, params);
7359}
7360
66f4af93
JA
7361static int io_probe(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args)
7362{
7363 struct io_uring_probe *p;
7364 size_t size;
7365 int i, ret;
7366
7367 size = struct_size(p, ops, nr_args);
7368 if (size == SIZE_MAX)
7369 return -EOVERFLOW;
7370 p = kzalloc(size, GFP_KERNEL);
7371 if (!p)
7372 return -ENOMEM;
7373
7374 ret = -EFAULT;
7375 if (copy_from_user(p, arg, size))
7376 goto out;
7377 ret = -EINVAL;
7378 if (memchr_inv(p, 0, size))
7379 goto out;
7380
7381 p->last_op = IORING_OP_LAST - 1;
7382 if (nr_args > IORING_OP_LAST)
7383 nr_args = IORING_OP_LAST;
7384
7385 for (i = 0; i < nr_args; i++) {
7386 p->ops[i].op = i;
7387 if (!io_op_defs[i].not_supported)
7388 p->ops[i].flags = IO_URING_OP_SUPPORTED;
7389 }
7390 p->ops_len = i;
7391
7392 ret = 0;
7393 if (copy_to_user(arg, p, size))
7394 ret = -EFAULT;
7395out:
7396 kfree(p);
7397 return ret;
7398}
7399
071698e1
JA
7400static int io_register_personality(struct io_ring_ctx *ctx)
7401{
7402 const struct cred *creds = get_current_cred();
7403 int id;
7404
7405 id = idr_alloc_cyclic(&ctx->personality_idr, (void *) creds, 1,
7406 USHRT_MAX, GFP_KERNEL);
7407 if (id < 0)
7408 put_cred(creds);
7409 return id;
7410}
7411
7412static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
7413{
7414 const struct cred *old_creds;
7415
7416 old_creds = idr_remove(&ctx->personality_idr, id);
7417 if (old_creds) {
7418 put_cred(old_creds);
7419 return 0;
7420 }
7421
7422 return -EINVAL;
7423}
7424
7425static bool io_register_op_must_quiesce(int op)
7426{
7427 switch (op) {
7428 case IORING_UNREGISTER_FILES:
7429 case IORING_REGISTER_FILES_UPDATE:
7430 case IORING_REGISTER_PROBE:
7431 case IORING_REGISTER_PERSONALITY:
7432 case IORING_UNREGISTER_PERSONALITY:
7433 return false;
7434 default:
7435 return true;
7436 }
7437}
7438
edafccee
JA
7439static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
7440 void __user *arg, unsigned nr_args)
b19062a5
JA
7441 __releases(ctx->uring_lock)
7442 __acquires(ctx->uring_lock)
edafccee
JA
7443{
7444 int ret;
7445
35fa71a0
JA
7446 /*
7447 * We're inside the ring mutex, if the ref is already dying, then
7448 * someone else killed the ctx or is already going through
7449 * io_uring_register().
7450 */
7451 if (percpu_ref_is_dying(&ctx->refs))
7452 return -ENXIO;
7453
071698e1 7454 if (io_register_op_must_quiesce(opcode)) {
05f3fb3c 7455 percpu_ref_kill(&ctx->refs);
b19062a5 7456
05f3fb3c
JA
7457 /*
7458 * Drop uring mutex before waiting for references to exit. If
7459 * another thread is currently inside io_uring_enter() it might
7460 * need to grab the uring_lock to make progress. If we hold it
7461 * here across the drain wait, then we can deadlock. It's safe
7462 * to drop the mutex here, since no new references will come in
7463 * after we've killed the percpu ref.
7464 */
7465 mutex_unlock(&ctx->uring_lock);
c150368b 7466 ret = wait_for_completion_interruptible(&ctx->completions[0]);
05f3fb3c 7467 mutex_lock(&ctx->uring_lock);
c150368b
JA
7468 if (ret) {
7469 percpu_ref_resurrect(&ctx->refs);
7470 ret = -EINTR;
7471 goto out;
7472 }
05f3fb3c 7473 }
edafccee
JA
7474
7475 switch (opcode) {
7476 case IORING_REGISTER_BUFFERS:
7477 ret = io_sqe_buffer_register(ctx, arg, nr_args);
7478 break;
7479 case IORING_UNREGISTER_BUFFERS:
7480 ret = -EINVAL;
7481 if (arg || nr_args)
7482 break;
7483 ret = io_sqe_buffer_unregister(ctx);
7484 break;
6b06314c
JA
7485 case IORING_REGISTER_FILES:
7486 ret = io_sqe_files_register(ctx, arg, nr_args);
7487 break;
7488 case IORING_UNREGISTER_FILES:
7489 ret = -EINVAL;
7490 if (arg || nr_args)
7491 break;
7492 ret = io_sqe_files_unregister(ctx);
7493 break;
c3a31e60
JA
7494 case IORING_REGISTER_FILES_UPDATE:
7495 ret = io_sqe_files_update(ctx, arg, nr_args);
7496 break;
9b402849 7497 case IORING_REGISTER_EVENTFD:
f2842ab5 7498 case IORING_REGISTER_EVENTFD_ASYNC:
9b402849
JA
7499 ret = -EINVAL;
7500 if (nr_args != 1)
7501 break;
7502 ret = io_eventfd_register(ctx, arg);
f2842ab5
JA
7503 if (ret)
7504 break;
7505 if (opcode == IORING_REGISTER_EVENTFD_ASYNC)
7506 ctx->eventfd_async = 1;
7507 else
7508 ctx->eventfd_async = 0;
9b402849
JA
7509 break;
7510 case IORING_UNREGISTER_EVENTFD:
7511 ret = -EINVAL;
7512 if (arg || nr_args)
7513 break;
7514 ret = io_eventfd_unregister(ctx);
7515 break;
66f4af93
JA
7516 case IORING_REGISTER_PROBE:
7517 ret = -EINVAL;
7518 if (!arg || nr_args > 256)
7519 break;
7520 ret = io_probe(ctx, arg, nr_args);
7521 break;
071698e1
JA
7522 case IORING_REGISTER_PERSONALITY:
7523 ret = -EINVAL;
7524 if (arg || nr_args)
7525 break;
7526 ret = io_register_personality(ctx);
7527 break;
7528 case IORING_UNREGISTER_PERSONALITY:
7529 ret = -EINVAL;
7530 if (arg)
7531 break;
7532 ret = io_unregister_personality(ctx, nr_args);
7533 break;
edafccee
JA
7534 default:
7535 ret = -EINVAL;
7536 break;
7537 }
7538
071698e1 7539 if (io_register_op_must_quiesce(opcode)) {
05f3fb3c 7540 /* bring the ctx back to life */
05f3fb3c 7541 percpu_ref_reinit(&ctx->refs);
c150368b
JA
7542out:
7543 reinit_completion(&ctx->completions[0]);
05f3fb3c 7544 }
edafccee
JA
7545 return ret;
7546}
7547
7548SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
7549 void __user *, arg, unsigned int, nr_args)
7550{
7551 struct io_ring_ctx *ctx;
7552 long ret = -EBADF;
7553 struct fd f;
7554
7555 f = fdget(fd);
7556 if (!f.file)
7557 return -EBADF;
7558
7559 ret = -EOPNOTSUPP;
7560 if (f.file->f_op != &io_uring_fops)
7561 goto out_fput;
7562
7563 ctx = f.file->private_data;
7564
7565 mutex_lock(&ctx->uring_lock);
7566 ret = __io_uring_register(ctx, opcode, arg, nr_args);
7567 mutex_unlock(&ctx->uring_lock);
c826bd7a
DD
7568 trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs,
7569 ctx->cq_ev_fd != NULL, ret);
edafccee
JA
7570out_fput:
7571 fdput(f);
7572 return ret;
7573}
7574
2b188cc1
JA
7575static int __init io_uring_init(void)
7576{
d7f62e82
SM
7577#define __BUILD_BUG_VERIFY_ELEMENT(stype, eoffset, etype, ename) do { \
7578 BUILD_BUG_ON(offsetof(stype, ename) != eoffset); \
7579 BUILD_BUG_ON(sizeof(etype) != sizeof_field(stype, ename)); \
7580} while (0)
7581
7582#define BUILD_BUG_SQE_ELEM(eoffset, etype, ename) \
7583 __BUILD_BUG_VERIFY_ELEMENT(struct io_uring_sqe, eoffset, etype, ename)
7584 BUILD_BUG_ON(sizeof(struct io_uring_sqe) != 64);
7585 BUILD_BUG_SQE_ELEM(0, __u8, opcode);
7586 BUILD_BUG_SQE_ELEM(1, __u8, flags);
7587 BUILD_BUG_SQE_ELEM(2, __u16, ioprio);
7588 BUILD_BUG_SQE_ELEM(4, __s32, fd);
7589 BUILD_BUG_SQE_ELEM(8, __u64, off);
7590 BUILD_BUG_SQE_ELEM(8, __u64, addr2);
7591 BUILD_BUG_SQE_ELEM(16, __u64, addr);
7d67af2c 7592 BUILD_BUG_SQE_ELEM(16, __u64, splice_off_in);
d7f62e82
SM
7593 BUILD_BUG_SQE_ELEM(24, __u32, len);
7594 BUILD_BUG_SQE_ELEM(28, __kernel_rwf_t, rw_flags);
7595 BUILD_BUG_SQE_ELEM(28, /* compat */ int, rw_flags);
7596 BUILD_BUG_SQE_ELEM(28, /* compat */ __u32, rw_flags);
7597 BUILD_BUG_SQE_ELEM(28, __u32, fsync_flags);
7598 BUILD_BUG_SQE_ELEM(28, __u16, poll_events);
7599 BUILD_BUG_SQE_ELEM(28, __u32, sync_range_flags);
7600 BUILD_BUG_SQE_ELEM(28, __u32, msg_flags);
7601 BUILD_BUG_SQE_ELEM(28, __u32, timeout_flags);
7602 BUILD_BUG_SQE_ELEM(28, __u32, accept_flags);
7603 BUILD_BUG_SQE_ELEM(28, __u32, cancel_flags);
7604 BUILD_BUG_SQE_ELEM(28, __u32, open_flags);
7605 BUILD_BUG_SQE_ELEM(28, __u32, statx_flags);
7606 BUILD_BUG_SQE_ELEM(28, __u32, fadvise_advice);
7d67af2c 7607 BUILD_BUG_SQE_ELEM(28, __u32, splice_flags);
d7f62e82
SM
7608 BUILD_BUG_SQE_ELEM(32, __u64, user_data);
7609 BUILD_BUG_SQE_ELEM(40, __u16, buf_index);
7610 BUILD_BUG_SQE_ELEM(42, __u16, personality);
7d67af2c 7611 BUILD_BUG_SQE_ELEM(44, __s32, splice_fd_in);
d7f62e82 7612
d3656344 7613 BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST);
2b188cc1
JA
7614 req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC);
7615 return 0;
7616};
7617__initcall(io_uring_init);