// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/blk-mq.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/fsnotify.h>
#include <linux/poll.h>
#include <linux/nospec.h>
#include <linux/compat.h>
#include <linux/io_uring/cmd.h>
#include <linux/indirect_call_wrapper.h>

#include <uapi/linux/io_uring.h>

#include "io_uring.h"
#include "opdef.h"
#include "kbuf.h"
#include "rsrc.h"
#include "poll.h"
#include "rw.h"

struct io_rw {
	/* NOTE: kiocb has the file as the first member, so don't do it here */
	struct kiocb			kiocb;
	u64				addr;
	u32				len;
	rwf_t				flags;
};

static inline bool io_file_supports_nowait(struct io_kiocb *req)
{
	return req->flags & REQ_F_SUPPORT_NOWAIT;
}

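/*
 * For a compat task using readv/writev with buffer selection, the single
 * iovec passed in is a compat_iovec. Pull the length out of it so the
 * provided buffer selection can be clamped against it.
 */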
#ifdef CONFIG_COMPAT
static int io_iov_compat_buffer_select_prep(struct io_rw *rw)
{
	struct compat_iovec __user *uiov;
	compat_ssize_t clen;

	uiov = u64_to_user_ptr(rw->addr);
	if (!access_ok(uiov, sizeof(*uiov)))
		return -EFAULT;
	if (__get_user(clen, &uiov->iov_len))
		return -EFAULT;
	if (clen < 0)
		return -EINVAL;

	rw->len = clen;
	return 0;
}
#endif

static int io_iov_buffer_select_prep(struct io_kiocb *req)
{
	struct iovec __user *uiov;
	struct iovec iov;
	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);

	if (rw->len != 1)
		return -EINVAL;

#ifdef CONFIG_COMPAT
	if (req->ctx->compat)
		return io_iov_compat_buffer_select_prep(rw);
#endif

	uiov = u64_to_user_ptr(rw->addr);
	if (copy_from_user(&iov, uiov, sizeof(*uiov)))
		return -EFAULT;
	rw->len = iov.iov_len;
	return 0;
}

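/*
 * Common SQE prep for all read/write variants: fetch the offset, buffer
 * index, ioprio and rw flags from the SQE. READ_ONCE() is used since the
 * SQE lives in memory shared with userspace.
 */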
int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
	unsigned ioprio;
	int ret;

	rw->kiocb.ki_pos = READ_ONCE(sqe->off);
	/* used for fixed read/write too - just read unconditionally */
	req->buf_index = READ_ONCE(sqe->buf_index);

	ioprio = READ_ONCE(sqe->ioprio);
	if (ioprio) {
		ret = ioprio_check_cap(ioprio);
		if (ret)
			return ret;

		rw->kiocb.ki_ioprio = ioprio;
	} else {
		rw->kiocb.ki_ioprio = get_current_ioprio();
	}
	rw->kiocb.dio_complete = NULL;

	rw->addr = READ_ONCE(sqe->addr);
	rw->len = READ_ONCE(sqe->len);
	rw->flags = READ_ONCE(sqe->rw_flags);
	return 0;
}

int io_prep_rwv(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	int ret;

	ret = io_prep_rw(req, sqe);
	if (unlikely(ret))
		return ret;

	/*
	 * Have to do this validation here, as in io_read() rw->len might
	 * have changed due to buffer selection
	 */
	if (req->flags & REQ_F_BUFFER_SELECT)
		return io_iov_buffer_select_prep(req);

	return 0;
}

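/*
 * Fixed reads/writes resolve the registered buffer at prep time. The
 * buf_index comes from userspace, so it's clamped with
 * array_index_nospec() to avoid speculation outside ->user_bufs[].
 */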
int io_prep_rw_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_ring_ctx *ctx = req->ctx;
	u16 index;
	int ret;

	ret = io_prep_rw(req, sqe);
	if (unlikely(ret))
		return ret;

	if (unlikely(req->buf_index >= ctx->nr_user_bufs))
		return -EFAULT;
	index = array_index_nospec(req->buf_index, ctx->nr_user_bufs);
	req->imu = ctx->user_bufs[index];
	io_req_set_rsrc_node(req, ctx, 0);
	return 0;
}

/*
 * Multishot read is prepared just like a normal read/write request, only
 * difference is that we set the MULTISHOT flag.
 */
int io_read_mshot_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
	int ret;

	/* must be used with provided buffers */
	if (!(req->flags & REQ_F_BUFFER_SELECT))
		return -EINVAL;

	ret = io_prep_rw(req, sqe);
	if (unlikely(ret))
		return ret;

	if (rw->addr || rw->len)
		return -EINVAL;

	req->flags |= REQ_F_APOLL_MULTISHOT;
	return 0;
}

void io_readv_writev_cleanup(struct io_kiocb *req)
{
	struct io_async_rw *io = req->async_data;

	kfree(io->free_iovec);
}

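/*
 * A ki_pos of -1 means no offset was given. For non-stream files that
 * translates to "use and update the current file position", for stream
 * files there is no position at all.
 */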
static inline loff_t *io_kiocb_update_pos(struct io_kiocb *req)
{
	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);

	if (rw->kiocb.ki_pos != -1)
		return &rw->kiocb.ki_pos;

	if (!(req->file->f_mode & FMODE_STREAM)) {
		req->flags |= REQ_F_CUR_POS;
		rw->kiocb.ki_pos = req->file->f_pos;
		return &rw->kiocb.ki_pos;
	}

	rw->kiocb.ki_pos = 0;
	return NULL;
}

#ifdef CONFIG_BLOCK
static void io_req_task_queue_reissue(struct io_kiocb *req)
{
	req->io_task_work.func = io_queue_iowq;
	io_req_task_work_add(req);
}

static bool io_resubmit_prep(struct io_kiocb *req)
{
	struct io_async_rw *io = req->async_data;

	if (!req_has_async_data(req))
		return !io_req_prep_async(req);
	iov_iter_restore(&io->s.iter, &io->s.iter_state);
	return true;
}

static bool io_rw_should_reissue(struct io_kiocb *req)
{
	umode_t mode = file_inode(req->file)->i_mode;
	struct io_ring_ctx *ctx = req->ctx;

	if (!S_ISBLK(mode) && !S_ISREG(mode))
		return false;
	if ((req->flags & REQ_F_NOWAIT) || (io_wq_current_is_worker() &&
	    !(ctx->flags & IORING_SETUP_IOPOLL)))
		return false;
	/*
	 * If ref is dying, we might be running poll reap from the exit work.
	 * Don't attempt to reissue from that path, just let it fail with
	 * -EAGAIN.
	 */
	if (percpu_ref_is_dying(&ctx->refs))
		return false;
	/*
	 * Play it safe and assume not safe to re-import and reissue if we're
	 * not in the original thread group (or in task context).
	 */
	if (!same_thread_group(req->task, current) || !in_task())
		return false;
	return true;
}
#else
static bool io_resubmit_prep(struct io_kiocb *req)
{
	return false;
}
static bool io_rw_should_reissue(struct io_kiocb *req)
{
	return false;
}
#endif

static void io_req_end_write(struct io_kiocb *req)
{
	if (req->flags & REQ_F_ISREG) {
		struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);

		kiocb_end_write(&rw->kiocb);
	}
}

/*
 * Trigger the notifications after having done some IO, and finish the write
 * accounting, if any.
 */
static void io_req_io_end(struct io_kiocb *req)
{
	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);

	if (rw->kiocb.ki_flags & IOCB_WRITE) {
		io_req_end_write(req);
		fsnotify_modify(req->file);
	} else {
		fsnotify_access(req->file);
	}
}

static bool __io_complete_rw_common(struct io_kiocb *req, long res)
{
	if (unlikely(res != req->cqe.res)) {
		if ((res == -EAGAIN || res == -EOPNOTSUPP) &&
		    io_rw_should_reissue(req)) {
			/*
			 * Reissue will start accounting again, finish the
			 * current cycle.
			 */
			io_req_end_write(req);
			req->flags |= REQ_F_REISSUE | REQ_F_BL_NO_RECYCLE;
			return true;
		}
		req_set_fail(req);
		req->cqe.res = res;
	}
	return false;
}

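/*
 * A retried request may already have transferred some bytes. Fold the
 * count stored in ->bytes_done into the final result so earlier partial
 * progress isn't lost.
 */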
static inline int io_fixup_rw_res(struct io_kiocb *req, long res)
{
	struct io_async_rw *io = req->async_data;

	/* add previously done IO, if any */
	if (req_has_async_data(req) && io->bytes_done > 0) {
		if (res < 0)
			res = io->bytes_done;
		else
			res += io->bytes_done;
	}
	return res;
}

void io_req_rw_complete(struct io_kiocb *req, struct io_tw_state *ts)
{
	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
	struct kiocb *kiocb = &rw->kiocb;

	if ((kiocb->ki_flags & IOCB_DIO_CALLER_COMP) && kiocb->dio_complete) {
		long res = kiocb->dio_complete(rw->kiocb.private);

		io_req_set_res(req, io_fixup_rw_res(req, res), 0);
	}

	io_req_io_end(req);

	if (req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)) {
		unsigned issue_flags = ts->locked ? 0 : IO_URING_F_UNLOCKED;

		req->cqe.flags |= io_put_kbuf(req, issue_flags);
	}
	io_req_task_complete(req, ts);
}

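/*
 * ->ki_complete handler for regular (non-IOPOLL) requests. The CQE is
 * posted from task_work via io_req_rw_complete().
 */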
static void io_complete_rw(struct kiocb *kiocb, long res)
{
	struct io_rw *rw = container_of(kiocb, struct io_rw, kiocb);
	struct io_kiocb *req = cmd_to_io_kiocb(rw);

	if (!kiocb->dio_complete || !(kiocb->ki_flags & IOCB_DIO_CALLER_COMP)) {
		if (__io_complete_rw_common(req, res))
			return;
		io_req_set_res(req, io_fixup_rw_res(req, res), 0);
	}
	req->io_task_work.func = io_req_rw_complete;
	__io_req_task_work_add(req, IOU_F_TWQ_LAZY_WAKE);
}

static void io_complete_rw_iopoll(struct kiocb *kiocb, long res)
{
	struct io_rw *rw = container_of(kiocb, struct io_rw, kiocb);
	struct io_kiocb *req = cmd_to_io_kiocb(rw);

	if (kiocb->ki_flags & IOCB_WRITE)
		io_req_end_write(req);
	if (unlikely(res != req->cqe.res)) {
		if (res == -EAGAIN && io_rw_should_reissue(req)) {
			req->flags |= REQ_F_REISSUE | REQ_F_BL_NO_RECYCLE;
			return;
		}
		req->cqe.res = res;
	}

	/* order with io_iopoll_complete() checking ->iopoll_completed */
	smp_store_release(&req->iopoll_completed, 1);
}

static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
{
	/* IO was queued async, completion will happen later */
	if (ret == -EIOCBQUEUED)
		return;

	/* transform internal restart error codes */
	if (unlikely(ret < 0)) {
		switch (ret) {
		case -ERESTARTSYS:
		case -ERESTARTNOINTR:
		case -ERESTARTNOHAND:
		case -ERESTART_RESTARTBLOCK:
			/*
			 * We can't just restart the syscall, since previously
			 * submitted sqes may already be in progress. Just fail
			 * this IO with EINTR.
			 */
			ret = -EINTR;
			break;
		}
	}

	INDIRECT_CALL_2(kiocb->ki_complete, io_complete_rw_iopoll,
			io_complete_rw, kiocb, ret);
}

static int kiocb_done(struct io_kiocb *req, ssize_t ret,
		      unsigned int issue_flags)
{
	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
	unsigned final_ret = io_fixup_rw_res(req, ret);

	if (ret >= 0 && req->flags & REQ_F_CUR_POS)
		req->file->f_pos = rw->kiocb.ki_pos;
	if (ret >= 0 && (rw->kiocb.ki_complete == io_complete_rw)) {
		if (!__io_complete_rw_common(req, ret)) {
			/*
			 * Safe to call io_end from here as we're inline
			 * from the submission path.
			 */
			io_req_io_end(req);
			io_req_set_res(req, final_ret,
				       io_put_kbuf(req, issue_flags));
			return IOU_OK;
		}
	} else {
		io_rw_done(&rw->kiocb, ret);
	}

	if (req->flags & REQ_F_REISSUE) {
		req->flags &= ~REQ_F_REISSUE;
		if (io_resubmit_prep(req))
			io_req_task_queue_reissue(req);
		else
			io_req_task_queue_fail(req, final_ret);
	}
	return IOU_ISSUE_SKIP_COMPLETE;
}

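/*
 * Import the memory for this request into an iov_iter. Three cases:
 * fixed buffers map a pre-registered buffer, non-vectored ops (and
 * buffer select) import a single user buffer, and vectored ops copy in
 * the iovec array, inline for up to UIO_FASTIOV entries.
 */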
static struct iovec *__io_import_iovec(int ddir, struct io_kiocb *req,
				       struct io_rw_state *s,
				       unsigned int issue_flags)
{
	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
	struct iov_iter *iter = &s->iter;
	u8 opcode = req->opcode;
	struct iovec *iovec;
	void __user *buf;
	size_t sqe_len;
	ssize_t ret;

	if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) {
		ret = io_import_fixed(ddir, iter, req->imu, rw->addr, rw->len);
		if (ret)
			return ERR_PTR(ret);
		return NULL;
	}

	buf = u64_to_user_ptr(rw->addr);
	sqe_len = rw->len;

	if (!io_issue_defs[opcode].vectored || req->flags & REQ_F_BUFFER_SELECT) {
		if (io_do_buffer_select(req)) {
			buf = io_buffer_select(req, &sqe_len, issue_flags);
			if (!buf)
				return ERR_PTR(-ENOBUFS);
			rw->addr = (unsigned long) buf;
			rw->len = sqe_len;
		}

		ret = import_ubuf(ddir, buf, sqe_len, iter);
		if (ret)
			return ERR_PTR(ret);
		return NULL;
	}

	iovec = s->fast_iov;
	ret = __import_iovec(ddir, buf, sqe_len, UIO_FASTIOV, &iovec, iter,
			     req->ctx->compat);
	if (unlikely(ret < 0))
		return ERR_PTR(ret);
	return iovec;
}

static inline int io_import_iovec(int rw, struct io_kiocb *req,
				  struct iovec **iovec, struct io_rw_state *s,
				  unsigned int issue_flags)
{
	*iovec = __io_import_iovec(rw, req, s, issue_flags);
	if (IS_ERR(*iovec))
		return PTR_ERR(*iovec);

	iov_iter_save_state(&s->iter, &s->iter_state);
	return 0;
}

static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb)
{
	return (kiocb->ki_filp->f_mode & FMODE_STREAM) ? NULL : &kiocb->ki_pos;
}

/*
 * For files that don't have ->read_iter() and ->write_iter(), handle them
 * by looping over ->read() or ->write() manually.
 */
static ssize_t loop_rw_iter(int ddir, struct io_rw *rw, struct iov_iter *iter)
{
	struct kiocb *kiocb = &rw->kiocb;
	struct file *file = kiocb->ki_filp;
	ssize_t ret = 0;
	loff_t *ppos;

	/*
	 * Don't support polled IO through this interface, and we can't
	 * support non-blocking either. For the latter, this just causes
	 * the kiocb to be handled from an async context.
	 */
	if (kiocb->ki_flags & IOCB_HIPRI)
		return -EOPNOTSUPP;
	if ((kiocb->ki_flags & IOCB_NOWAIT) &&
	    !(kiocb->ki_filp->f_flags & O_NONBLOCK))
		return -EAGAIN;

	ppos = io_kiocb_ppos(kiocb);

	while (iov_iter_count(iter)) {
		void __user *addr;
		size_t len;
		ssize_t nr;

		if (iter_is_ubuf(iter)) {
			addr = iter->ubuf + iter->iov_offset;
			len = iov_iter_count(iter);
		} else if (!iov_iter_is_bvec(iter)) {
			addr = iter_iov_addr(iter);
			len = iter_iov_len(iter);
		} else {
			addr = u64_to_user_ptr(rw->addr);
			len = rw->len;
		}

		if (ddir == READ)
			nr = file->f_op->read(file, addr, len, ppos);
		else
			nr = file->f_op->write(file, addr, len, ppos);

		if (nr < 0) {
			if (!ret)
				ret = nr;
			break;
		}
		ret += nr;
		if (!iov_iter_is_bvec(iter)) {
			iov_iter_advance(iter, nr);
		} else {
			rw->addr += nr;
			rw->len -= nr;
			if (!rw->len)
				break;
		}
		if (nr != len)
			break;
	}

	return ret;
}

static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec,
			  const struct iovec *fast_iov, struct iov_iter *iter)
{
	struct io_async_rw *io = req->async_data;

	memcpy(&io->s.iter, iter, sizeof(*iter));
	io->free_iovec = iovec;
	io->bytes_done = 0;
	/* can only be fixed buffers, no need to do anything */
	if (iov_iter_is_bvec(iter) || iter_is_ubuf(iter))
		return;
	if (!iovec) {
		unsigned iov_off = 0;

		io->s.iter.__iov = io->s.fast_iov;
		if (iter->__iov != fast_iov) {
			iov_off = iter_iov(iter) - fast_iov;
			io->s.iter.__iov += iov_off;
		}
		if (io->s.fast_iov != fast_iov)
			memcpy(io->s.fast_iov + iov_off, fast_iov + iov_off,
			       sizeof(struct iovec) * iter->nr_segs);
	} else {
		req->flags |= REQ_F_NEED_CLEANUP;
	}
}

static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
			     struct io_rw_state *s, bool force)
{
	if (!force && !io_cold_defs[req->opcode].prep_async)
		return 0;
	/* opcode type doesn't need async data */
	if (!io_cold_defs[req->opcode].async_size)
		return 0;
	if (!req_has_async_data(req)) {
		struct io_async_rw *iorw;

		if (io_alloc_async_data(req)) {
			kfree(iovec);
			return -ENOMEM;
		}

		io_req_map_rw(req, iovec, s->fast_iov, &s->iter);
		iorw = req->async_data;
		/* we've copied and mapped the iter, ensure state is saved */
		iov_iter_save_state(&iorw->s.iter, &iorw->s.iter_state);
	}
	return 0;
}

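/*
 * Import the iovec at prep time, while it can still be copied from the
 * submitting task, so the request can later execute from io-wq context.
 */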
static inline int io_rw_prep_async(struct io_kiocb *req, int rw)
{
	struct io_async_rw *iorw = req->async_data;
	struct iovec *iov;
	int ret;

	iorw->bytes_done = 0;
	iorw->free_iovec = NULL;

	/* submission path, ->uring_lock should already be taken */
	ret = io_import_iovec(rw, req, &iov, &iorw->s, 0);
	if (unlikely(ret < 0))
		return ret;

	if (iov) {
		iorw->free_iovec = iov;
		req->flags |= REQ_F_NEED_CLEANUP;
	}

	return 0;
}

int io_readv_prep_async(struct io_kiocb *req)
{
	return io_rw_prep_async(req, ITER_DEST);
}

int io_writev_prep_async(struct io_kiocb *req)
{
	return io_rw_prep_async(req, ITER_SOURCE);
}

/*
 * This is our waitqueue callback handler, registered through __folio_lock_async()
 * when we initially tried to do the IO with the waitqueue armed in the iocb.
 * This gets called when the page is unlocked, and we generally expect that to
 * happen when the page IO is completed and the page is now uptodate. This will
 * queue a task_work based retry of the operation, attempting to copy the data
 * again. If the latter fails because the page was NOT uptodate, then we will
 * do a thread based blocking retry of the operation. That's the unexpected
 * slow path.
 */
static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode,
			     int sync, void *arg)
{
	struct wait_page_queue *wpq;
	struct io_kiocb *req = wait->private;
	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
	struct wait_page_key *key = arg;

	wpq = container_of(wait, struct wait_page_queue, wait);

	if (!wake_page_match(wpq, key))
		return 0;

	rw->kiocb.ki_flags &= ~IOCB_WAITQ;
	list_del_init(&wait->entry);
	io_req_task_queue(req);
	return 1;
}

/*
 * This controls whether a given IO request should be armed for async page
 * based retry. If we return false here, the request is handed to the async
 * worker threads for retry. If we're doing buffered reads on a regular file,
 * we prepare a private wait_page_queue entry and retry the operation. This
 * will either succeed because the page is now uptodate and unlocked, or it
 * will register a callback when the page is unlocked at IO completion. Through
 * that callback, io_uring uses task_work to setup a retry of the operation.
 * That retry will attempt the buffered read again. The retry will generally
 * succeed, or in rare cases where it fails, we then fall back to using the
 * async worker threads for a blocking retry.
 */
static bool io_rw_should_retry(struct io_kiocb *req)
{
	struct io_async_rw *io = req->async_data;
	struct wait_page_queue *wait = &io->wpq;
	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
	struct kiocb *kiocb = &rw->kiocb;

	/* never retry for NOWAIT, we just complete with -EAGAIN */
	if (req->flags & REQ_F_NOWAIT)
		return false;

	/* Only for buffered IO */
	if (kiocb->ki_flags & (IOCB_DIRECT | IOCB_HIPRI))
		return false;

	/*
	 * just use poll if we can, and don't attempt if the fs doesn't
	 * support callback based unlocks
	 */
	if (io_file_can_poll(req) || !(req->file->f_mode & FMODE_BUF_RASYNC))
		return false;

	wait->wait.func = io_async_buf_func;
	wait->wait.private = req;
	wait->wait.flags = 0;
	INIT_LIST_HEAD(&wait->wait.entry);
	kiocb->ki_flags |= IOCB_WAITQ;
	kiocb->ki_flags &= ~IOCB_NOWAIT;
	kiocb->ki_waitq = wait;
	return true;
}

static inline int io_iter_do_read(struct io_rw *rw, struct iov_iter *iter)
{
	struct file *file = rw->kiocb.ki_filp;

	if (likely(file->f_op->read_iter))
		return call_read_iter(file, &rw->kiocb, iter);
	else if (file->f_op->read)
		return loop_rw_iter(READ, rw, iter);
	else
		return -EINVAL;
}

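/*
 * Regular files and block devices don't return short IO for transient
 * reasons, so a short result there means we should keep going and
 * complete the full IO rather than report the partial result.
 */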
static bool need_complete_io(struct io_kiocb *req)
{
	return req->flags & REQ_F_ISREG ||
		S_ISBLK(file_inode(req->file)->i_mode);
}

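/*
 * Per-issue kiocb setup: validate the file mode, apply the request's rw
 * flags, and pick the completion handler based on whether the ring runs
 * in IOPOLL mode.
 */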
static int io_rw_init_file(struct io_kiocb *req, fmode_t mode)
{
	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
	struct kiocb *kiocb = &rw->kiocb;
	struct io_ring_ctx *ctx = req->ctx;
	struct file *file = req->file;
	int ret;

	if (unlikely(!(file->f_mode & mode)))
		return -EBADF;

	if (!(req->flags & REQ_F_FIXED_FILE))
		req->flags |= io_file_get_flags(file);

	kiocb->ki_flags = file->f_iocb_flags;
	ret = kiocb_set_rw_flags(kiocb, rw->flags);
	if (unlikely(ret))
		return ret;
	kiocb->ki_flags |= IOCB_ALLOC_CACHE;

	/*
	 * If the file is marked O_NONBLOCK, still allow retry for it if it
	 * supports async. Otherwise it's impossible to use O_NONBLOCK files
	 * reliably. If not, or if IOCB_NOWAIT is set, don't retry.
	 */
	if ((kiocb->ki_flags & IOCB_NOWAIT) ||
	    ((file->f_flags & O_NONBLOCK) && !io_file_supports_nowait(req)))
		req->flags |= REQ_F_NOWAIT;

	if (ctx->flags & IORING_SETUP_IOPOLL) {
		if (!(kiocb->ki_flags & IOCB_DIRECT) || !file->f_op->iopoll)
			return -EOPNOTSUPP;

		kiocb->private = NULL;
		kiocb->ki_flags |= IOCB_HIPRI;
		kiocb->ki_complete = io_complete_rw_iopoll;
		req->iopoll_completed = 0;
	} else {
		if (kiocb->ki_flags & IOCB_HIPRI)
			return -EINVAL;
		kiocb->ki_complete = io_complete_rw;
	}

	return 0;
}

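/*
 * Core read path, shared by io_read() and io_read_mshot(): do the
 * nonblocking first attempt, punt to async context on -EAGAIN where
 * required, and loop to finish short buffered reads.
 */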
static int __io_read(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
	struct io_rw_state __s, *s = &__s;
	struct iovec *iovec;
	struct kiocb *kiocb = &rw->kiocb;
	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
	struct io_async_rw *io;
	ssize_t ret, ret2;
	loff_t *ppos;

	if (!req_has_async_data(req)) {
		ret = io_import_iovec(ITER_DEST, req, &iovec, s, issue_flags);
		if (unlikely(ret < 0))
			return ret;
	} else {
		io = req->async_data;
		s = &io->s;

		/*
		 * Safe and required to re-import if we're using provided
		 * buffers, as we dropped the selected one before retry.
		 */
		if (io_do_buffer_select(req)) {
			ret = io_import_iovec(ITER_DEST, req, &iovec, s, issue_flags);
			if (unlikely(ret < 0))
				return ret;
		}

		/*
		 * We come here from an earlier attempt, restore our state to
		 * match in case it doesn't. It's cheap enough that we don't
		 * need to make this conditional.
		 */
		iov_iter_restore(&s->iter, &s->iter_state);
		iovec = NULL;
	}
	ret = io_rw_init_file(req, FMODE_READ);
	if (unlikely(ret)) {
		kfree(iovec);
		return ret;
	}
	req->cqe.res = iov_iter_count(&s->iter);

	if (force_nonblock) {
		/* If the file doesn't support async, just async punt */
		if (unlikely(!io_file_supports_nowait(req))) {
			ret = io_setup_async_rw(req, iovec, s, true);
			return ret ?: -EAGAIN;
		}
		kiocb->ki_flags |= IOCB_NOWAIT;
	} else {
		/* Ensure we clear previously set non-block flag */
		kiocb->ki_flags &= ~IOCB_NOWAIT;
	}

	ppos = io_kiocb_update_pos(req);

	ret = rw_verify_area(READ, req->file, ppos, req->cqe.res);
	if (unlikely(ret)) {
		kfree(iovec);
		return ret;
	}

	ret = io_iter_do_read(rw, &s->iter);

	if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) {
		req->flags &= ~REQ_F_REISSUE;
		/*
		 * If we can poll, just do that. For a vectored read, we'll
		 * need to copy state first.
		 */
		if (io_file_can_poll(req) && !io_issue_defs[req->opcode].vectored)
			return -EAGAIN;
		/* IOPOLL retry should happen for io-wq threads */
		if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL))
			goto done;
		/* no retry on NONBLOCK nor RWF_NOWAIT */
		if (req->flags & REQ_F_NOWAIT)
			goto done;
		ret = 0;
	} else if (ret == -EIOCBQUEUED) {
		if (iovec)
			kfree(iovec);
		return IOU_ISSUE_SKIP_COMPLETE;
	} else if (ret == req->cqe.res || ret <= 0 || !force_nonblock ||
		   (req->flags & REQ_F_NOWAIT) || !need_complete_io(req)) {
		/* read all, failed, already did sync or don't want to retry */
		goto done;
	}

	/*
	 * Don't depend on the iter state matching what was consumed, or being
	 * untouched in case of error. Restore it and we'll advance it
	 * manually if we need to.
	 */
	iov_iter_restore(&s->iter, &s->iter_state);

	ret2 = io_setup_async_rw(req, iovec, s, true);
	iovec = NULL;
	if (ret2) {
		ret = ret > 0 ? ret : ret2;
		goto done;
	}

	io = req->async_data;
	s = &io->s;
	/*
	 * Now use our persistent iterator and state, if we aren't already.
	 * We've restored and mapped the iter to match.
	 */

	do {
		/*
		 * We end up here because of a partial read, either from
		 * above or inside this loop. Advance the iter by the bytes
		 * that were consumed.
		 */
		iov_iter_advance(&s->iter, ret);
		if (!iov_iter_count(&s->iter))
			break;
		io->bytes_done += ret;
		iov_iter_save_state(&s->iter, &s->iter_state);

		/* if we can retry, do so with the callbacks armed */
		if (!io_rw_should_retry(req)) {
			kiocb->ki_flags &= ~IOCB_WAITQ;
			return -EAGAIN;
		}

		req->cqe.res = iov_iter_count(&s->iter);
		/*
		 * Now retry read with the IOCB_WAITQ parts set in the iocb. If
		 * we get -EIOCBQUEUED, then we'll get a notification when the
		 * desired page gets unlocked. We can also get a partial read
		 * here, and if we do, then just retry at the new offset.
		 */
		ret = io_iter_do_read(rw, &s->iter);
		if (ret == -EIOCBQUEUED)
			return IOU_ISSUE_SKIP_COMPLETE;
		/* we got some bytes, but not all. retry. */
		kiocb->ki_flags &= ~IOCB_WAITQ;
		iov_iter_restore(&s->iter, &s->iter_state);
	} while (ret > 0);
done:
	/* it's faster to check here than delegate to kfree */
	if (iovec)
		kfree(iovec);
	return ret;
}

int io_read(struct io_kiocb *req, unsigned int issue_flags)
{
	int ret;

	ret = __io_read(req, issue_flags);
	if (ret >= 0)
		return kiocb_done(req, ret, issue_flags);

	return ret;
}

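/*
 * Multishot reads post a CQE flagged IORING_CQE_F_MORE for each
 * successful read and keep the request armed, relying on poll to
 * trigger a retry once more data arrives.
 */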
int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
	unsigned int cflags = 0;
	int ret;

	/*
	 * Multishot MUST be used on a pollable file
	 */
	if (!io_file_can_poll(req))
		return -EBADFD;

	ret = __io_read(req, issue_flags);

	/*
	 * If the file doesn't support proper NOWAIT, then disable multishot
	 * and stay in single shot mode.
	 */
	if (!io_file_supports_nowait(req))
		req->flags &= ~REQ_F_APOLL_MULTISHOT;

	/*
	 * If we get -EAGAIN, recycle our buffer and just let normal poll
	 * handle it.
	 */
	if (ret == -EAGAIN) {
		/*
		 * Reset rw->len to 0 again to avoid clamping future mshot
		 * reads, in case the buffer size varies.
		 */
		if (io_kbuf_recycle(req, issue_flags))
			rw->len = 0;
		if (issue_flags & IO_URING_F_MULTISHOT)
			return IOU_ISSUE_SKIP_COMPLETE;
		return -EAGAIN;
	}

	/*
	 * Any successful return value will keep the multishot read armed.
	 */
	if (ret > 0 && req->flags & REQ_F_APOLL_MULTISHOT) {
		/*
		 * Put our buffer and post a CQE. If we fail to post a CQE, then
		 * jump to the termination path. This request is then done.
		 */
		cflags = io_put_kbuf(req, issue_flags);
		rw->len = 0; /* similarly to above, reset len to 0 */

		if (io_fill_cqe_req_aux(req,
					issue_flags & IO_URING_F_COMPLETE_DEFER,
					ret, cflags | IORING_CQE_F_MORE)) {
			if (issue_flags & IO_URING_F_MULTISHOT) {
				/*
				 * Force retry, as we might have more data to
				 * be read and otherwise it won't get retried
				 * until (if ever) another poll is triggered.
				 */
				io_poll_multishot_retry(req);
				return IOU_ISSUE_SKIP_COMPLETE;
			}
			return -EAGAIN;
		}
	}

	/*
	 * Either an error, or we've hit overflow posting the CQE. For any
	 * multishot request, hitting overflow will terminate it.
	 */
	io_req_set_res(req, ret, cflags);
	if (issue_flags & IO_URING_F_MULTISHOT)
		return IOU_STOP_MULTISHOT;
	return IOU_OK;
}

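/*
 * For writes, the nonblocking attempt is only made when the file can
 * reliably support it; a short write to a regular file or block device
 * is finished from io-wq, with ->bytes_done carrying the partial count.
 */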
int io_write(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
	struct io_rw_state __s, *s = &__s;
	struct iovec *iovec;
	struct kiocb *kiocb = &rw->kiocb;
	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
	ssize_t ret, ret2;
	loff_t *ppos;

	if (!req_has_async_data(req)) {
		ret = io_import_iovec(ITER_SOURCE, req, &iovec, s, issue_flags);
		if (unlikely(ret < 0))
			return ret;
	} else {
		struct io_async_rw *io = req->async_data;

		s = &io->s;
		iov_iter_restore(&s->iter, &s->iter_state);
		iovec = NULL;
	}
	ret = io_rw_init_file(req, FMODE_WRITE);
	if (unlikely(ret)) {
		kfree(iovec);
		return ret;
	}
	req->cqe.res = iov_iter_count(&s->iter);

	if (force_nonblock) {
		/* If the file doesn't support async, just async punt */
		if (unlikely(!io_file_supports_nowait(req)))
			goto copy_iov;

		/* File path supports NOWAIT for non-direct_IO only for block devices. */
		if (!(kiocb->ki_flags & IOCB_DIRECT) &&
		    !(kiocb->ki_filp->f_mode & FMODE_BUF_WASYNC) &&
		    (req->flags & REQ_F_ISREG))
			goto copy_iov;

		kiocb->ki_flags |= IOCB_NOWAIT;
	} else {
		/* Ensure we clear previously set non-block flag */
		kiocb->ki_flags &= ~IOCB_NOWAIT;
	}

	ppos = io_kiocb_update_pos(req);

	ret = rw_verify_area(WRITE, req->file, ppos, req->cqe.res);
	if (unlikely(ret)) {
		kfree(iovec);
		return ret;
	}

	if (req->flags & REQ_F_ISREG)
		kiocb_start_write(kiocb);
	kiocb->ki_flags |= IOCB_WRITE;

	if (likely(req->file->f_op->write_iter))
		ret2 = call_write_iter(req->file, kiocb, &s->iter);
	else if (req->file->f_op->write)
		ret2 = loop_rw_iter(WRITE, rw, &s->iter);
	else
		ret2 = -EINVAL;

	if (req->flags & REQ_F_REISSUE) {
		req->flags &= ~REQ_F_REISSUE;
		ret2 = -EAGAIN;
	}

	/*
	 * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just
	 * retry them without IOCB_NOWAIT.
	 */
	if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT))
		ret2 = -EAGAIN;
	/* no retry on NONBLOCK nor RWF_NOWAIT */
	if (ret2 == -EAGAIN && (req->flags & REQ_F_NOWAIT))
		goto done;
	if (!force_nonblock || ret2 != -EAGAIN) {
		/* IOPOLL retry should happen for io-wq threads */
		if (ret2 == -EAGAIN && (req->ctx->flags & IORING_SETUP_IOPOLL))
			goto copy_iov;

		if (ret2 != req->cqe.res && ret2 >= 0 && need_complete_io(req)) {
			struct io_async_rw *io;

			trace_io_uring_short_write(req->ctx, kiocb->ki_pos - ret2,
						   req->cqe.res, ret2);

			/* This is a partial write. The file pos has already been
			 * updated, setup the async struct to complete the request
			 * in the worker. Also update bytes_done to account for
			 * the bytes already written.
			 */
			iov_iter_save_state(&s->iter, &s->iter_state);
			ret = io_setup_async_rw(req, iovec, s, true);

			io = req->async_data;
			if (io)
				io->bytes_done += ret2;

			if (kiocb->ki_flags & IOCB_WRITE)
				io_req_end_write(req);
			return ret ? ret : -EAGAIN;
		}
done:
		ret = kiocb_done(req, ret2, issue_flags);
	} else {
copy_iov:
		iov_iter_restore(&s->iter, &s->iter_state);
		ret = io_setup_async_rw(req, iovec, s, false);
		if (!ret) {
			if (kiocb->ki_flags & IOCB_WRITE)
				io_req_end_write(req);
			return -EAGAIN;
		}
		return ret;
	}
	/* it's reportedly faster than delegating the null check to kfree() */
	if (iovec)
		kfree(iovec);
	return ret;
}

void io_rw_fail(struct io_kiocb *req)
{
	int res;

	res = io_fixup_rw_res(req, req->cqe.res);
	io_req_set_res(req, res, req->cqe.flags);
}

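/*
 * Reap IOPOLL completions: poll each file on the iopoll list until
 * something completes, then flush the leading run of requests that have
 * ->iopoll_completed set.
 */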
int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
{
	struct io_wq_work_node *pos, *start, *prev;
	unsigned int poll_flags = 0;
	DEFINE_IO_COMP_BATCH(iob);
	int nr_events = 0;

	/*
	 * Only spin for completions if we don't have multiple devices hanging
	 * off our complete list.
	 */
	if (ctx->poll_multi_queue || force_nonspin)
		poll_flags |= BLK_POLL_ONESHOT;

	wq_list_for_each(pos, start, &ctx->iopoll_list) {
		struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list);
		struct file *file = req->file;
		int ret;

		/*
		 * Move completed and retryable entries to our local lists.
		 * If we find a request that requires polling, break out
		 * and complete those lists first, if we have entries there.
		 */
		if (READ_ONCE(req->iopoll_completed))
			break;

		if (req->opcode == IORING_OP_URING_CMD) {
			struct io_uring_cmd *ioucmd;

			ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
			ret = file->f_op->uring_cmd_iopoll(ioucmd, &iob,
							   poll_flags);
		} else {
			struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);

			ret = file->f_op->iopoll(&rw->kiocb, &iob, poll_flags);
		}
		if (unlikely(ret < 0))
			return ret;
		else if (ret)
			poll_flags |= BLK_POLL_ONESHOT;

		/* iopoll may have completed current req */
		if (!rq_list_empty(iob.req_list) ||
		    READ_ONCE(req->iopoll_completed))
			break;
	}

	if (!rq_list_empty(iob.req_list))
		iob.complete(&iob);
	else if (!pos)
		return 0;

	prev = start;
	wq_list_for_each_resume(pos, prev) {
		struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list);

		/* order with io_complete_rw_iopoll(), e.g. ->result updates */
		if (!smp_load_acquire(&req->iopoll_completed))
			break;
		nr_events++;
		req->cqe.flags = io_put_kbuf(req, 0);
	}
	if (unlikely(!nr_events))
		return 0;

	pos = start ? start->next : ctx->iopoll_list.first;
	wq_list_cut(&ctx->iopoll_list, prev, start);

	if (WARN_ON_ONCE(!wq_list_empty(&ctx->submit_state.compl_reqs)))
		return 0;
	ctx->submit_state.compl_reqs.first = pos;
	__io_submit_flush_completions(ctx);
	return nr_events;
}