io_uring: IOPOLL polling improvements
author Jens Axboe <axboe@kernel.dk>
Thu, 11 Dec 2025 10:25:41 +0000 (03:25 -0700)
committer Jens Axboe <axboe@kernel.dk>
Sun, 28 Dec 2025 22:54:45 +0000 (15:54 -0700)
io_uring manages issued and pending IOPOLL read/write requests in a
singly linked list. One downside of that is that individual items
cannot easily be removed from that list, and as a result, io_uring
will only complete a completed request N in that list if 0..N-1 are
also complete. For homogeneous IO this isn't necessarily an issue,
but if different devices are being polled on the same ring, or
if disparate IO from the same device is being polled for, this can
defer completion of some requests unnecessarily.

Move to a doubly linked list for iopoll completions instead, making it
possible to complete any request whose poll has finished, regardless of
where it sits in the list.

Co-developed-by: Fengnan Chang <fengnanchang@gmail.com>
Link: https://lore.kernel.org/io-uring/20251210085501.84261-1-changfengnan@bytedance.com/
Signed-off-by: Jens Axboe <axboe@kernel.dk>
include/linux/io_uring_types.h
io_uring/cancel.c
io_uring/io_uring.c
io_uring/rw.c
io_uring/slist.h
io_uring/sqpoll.c

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index e1adb0d20a0af2131c145f3f69efadffb34b47cd..54fd30abf2b81ba56b3b1d2491fc0887f0fe91e5 100644
@@ -316,7 +316,7 @@ struct io_ring_ctx {
                 * manipulate the list, hence no extra locking is needed there.
                 */
                bool                    poll_multi_queue;
-               struct io_wq_work_list  iopoll_list;
+               struct list_head        iopoll_list;
 
                struct io_file_table    file_table;
                struct io_rsrc_data     buf_table;
@@ -708,7 +708,16 @@ struct io_kiocb {
 
        atomic_t                        refs;
        bool                            cancel_seq_set;
-       struct io_task_work             io_task_work;
+
+       /*
+        * IOPOLL doesn't use task_work, so use the ->iopoll_node list
+        * entry to manage pending iopoll requests.
+        */
+       union {
+               struct io_task_work     io_task_work;
+               struct list_head        iopoll_node;
+       };
+
        union {
                /*
                 * for polled requests, i.e. IORING_OP_POLL_ADD and async armed
diff --git a/io_uring/cancel.c b/io_uring/cancel.c
index ca12ac10c0ae9e7da128a5e983c5dfa8504be4da..4136bf04464afb3b997a5efb24fd2bc33817c594 100644
@@ -534,7 +534,7 @@ __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
        /* SQPOLL thread does its own polling */
        if ((!(ctx->flags & IORING_SETUP_SQPOLL) && cancel_all) ||
            is_sqpoll_thread) {
-               while (!wq_list_empty(&ctx->iopoll_list)) {
+               while (!list_empty(&ctx->iopoll_list)) {
                        io_iopoll_try_reap_events(ctx);
                        ret = true;
                        cond_resched();
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 6cb24cdf8e684661dbe2c43a06ac18724b45b794..05a660c9731663b546943dd8a5896125d878a842 100644
@@ -334,7 +334,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
        init_waitqueue_head(&ctx->poll_wq);
        spin_lock_init(&ctx->completion_lock);
        raw_spin_lock_init(&ctx->timeout_lock);
-       INIT_WQ_LIST(&ctx->iopoll_list);
+       INIT_LIST_HEAD(&ctx->iopoll_list);
        INIT_LIST_HEAD(&ctx->defer_list);
        INIT_LIST_HEAD(&ctx->timeout_list);
        INIT_LIST_HEAD(&ctx->ltimeout_list);
@@ -1561,7 +1561,7 @@ __cold void io_iopoll_try_reap_events(struct io_ring_ctx *ctx)
                return;
 
        mutex_lock(&ctx->uring_lock);
-       while (!wq_list_empty(&ctx->iopoll_list)) {
+       while (!list_empty(&ctx->iopoll_list)) {
                /* let it sleep and repeat later if can't complete a request */
                if (io_do_iopoll(ctx, true) == 0)
                        break;
@@ -1626,21 +1626,18 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned int min_events)
                 * forever, while the workqueue is stuck trying to acquire the
                 * very same mutex.
                 */
-               if (wq_list_empty(&ctx->iopoll_list) ||
-                   io_task_work_pending(ctx)) {
+               if (list_empty(&ctx->iopoll_list) || io_task_work_pending(ctx)) {
                        u32 tail = ctx->cached_cq_tail;
 
                        (void) io_run_local_work_locked(ctx, min_events);
 
-                       if (task_work_pending(current) ||
-                           wq_list_empty(&ctx->iopoll_list)) {
+                       if (task_work_pending(current) || list_empty(&ctx->iopoll_list)) {
                                mutex_unlock(&ctx->uring_lock);
                                io_run_task_work();
                                mutex_lock(&ctx->uring_lock);
                        }
                        /* some requests don't go through iopoll_list */
-                       if (tail != ctx->cached_cq_tail ||
-                           wq_list_empty(&ctx->iopoll_list))
+                       if (tail != ctx->cached_cq_tail || list_empty(&ctx->iopoll_list))
                                break;
                }
                ret = io_do_iopoll(ctx, !min_events);
@@ -1683,25 +1680,17 @@ static void io_iopoll_req_issued(struct io_kiocb *req, unsigned int issue_flags)
         * how we do polling eventually, not spinning if we're on potentially
         * different devices.
         */
-       if (wq_list_empty(&ctx->iopoll_list)) {
+       if (list_empty(&ctx->iopoll_list)) {
                ctx->poll_multi_queue = false;
        } else if (!ctx->poll_multi_queue) {
                struct io_kiocb *list_req;
 
-               list_req = container_of(ctx->iopoll_list.first, struct io_kiocb,
-                                       comp_list);
+               list_req = list_first_entry(&ctx->iopoll_list, struct io_kiocb, iopoll_node);
                if (list_req->file != req->file)
                        ctx->poll_multi_queue = true;
        }
 
-       /*
-        * For fast devices, IO may have already completed. If it has, add
-        * it to the front so we find it first.
-        */
-       if (READ_ONCE(req->iopoll_completed))
-               wq_list_add_head(&req->comp_list, &ctx->iopoll_list);
-       else
-               wq_list_add_tail(&req->comp_list, &ctx->iopoll_list);
+       list_add_tail(&req->iopoll_node, &ctx->iopoll_list);
 
        if (unlikely(needs_lock)) {
                /*
diff --git a/io_uring/rw.c b/io_uring/rw.c
index 70ca88cc1f547144e916c839849112cfde8137d5..307f1f39d9f391b6b87ce1767c92f4653706a0e0 100644
@@ -1315,9 +1315,9 @@ static int io_uring_hybrid_poll(struct io_kiocb *req,
 
 int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
 {
-       struct io_wq_work_node *pos, *start, *prev;
        unsigned int poll_flags = 0;
        DEFINE_IO_COMP_BATCH(iob);
+       struct io_kiocb *req, *tmp;
        int nr_events = 0;
 
        /*
@@ -1327,8 +1327,7 @@ int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
        if (ctx->poll_multi_queue || force_nonspin)
                poll_flags |= BLK_POLL_ONESHOT;
 
-       wq_list_for_each(pos, start, &ctx->iopoll_list) {
-               struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list);
+       list_for_each_entry(req, &ctx->iopoll_list, iopoll_node) {
                int ret;
 
                /*
@@ -1357,31 +1356,20 @@ int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
 
        if (!rq_list_empty(&iob.req_list))
                iob.complete(&iob);
-       else if (!pos)
-               return 0;
-
-       prev = start;
-       wq_list_for_each_resume(pos, prev) {
-               struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list);
 
+       list_for_each_entry_safe(req, tmp, &ctx->iopoll_list, iopoll_node) {
                /* order with io_complete_rw_iopoll(), e.g. ->result updates */
                if (!smp_load_acquire(&req->iopoll_completed))
-                       break;
+                       continue;
+               list_del(&req->iopoll_node);
+               wq_list_add_tail(&req->comp_list, &ctx->submit_state.compl_reqs);
                nr_events++;
                req->cqe.flags = io_put_kbuf(req, req->cqe.res, NULL);
                if (req->opcode != IORING_OP_URING_CMD)
                        io_req_rw_cleanup(req, 0);
        }
-       if (unlikely(!nr_events))
-               return 0;
-
-       pos = start ? start->next : ctx->iopoll_list.first;
-       wq_list_cut(&ctx->iopoll_list, prev, start);
-
-       if (WARN_ON_ONCE(!wq_list_empty(&ctx->submit_state.compl_reqs)))
-               return 0;
-       ctx->submit_state.compl_reqs.first = pos;
-       __io_submit_flush_completions(ctx);
+       if (nr_events)
+               __io_submit_flush_completions(ctx);
        return nr_events;
 }
 
diff --git a/io_uring/slist.h b/io_uring/slist.h
index 7ef747442754819b38d5e280cddea915404fb1d8..0aec51076bad3de4e0890b24ca4b9af6115ce1f2 100644
@@ -9,9 +9,6 @@
 #define wq_list_for_each(pos, prv, head)                       \
        for (pos = (head)->first, prv = NULL; pos; prv = pos, pos = (pos)->next)
 
-#define wq_list_for_each_resume(pos, prv)                      \
-       for (; pos; prv = pos, pos = (pos)->next)
-
 #define wq_list_empty(list)    (READ_ONCE((list)->first) == NULL)
 
 #define INIT_WQ_LIST(list)     do {                            \
@@ -43,15 +40,6 @@ static inline void wq_list_add_tail(struct io_wq_work_node *node,
        }
 }
 
-static inline void wq_list_add_head(struct io_wq_work_node *node,
-                                   struct io_wq_work_list *list)
-{
-       node->next = list->first;
-       if (!node->next)
-               list->last = node;
-       WRITE_ONCE(list->first, node);
-}
-
 static inline void wq_list_cut(struct io_wq_work_list *list,
                               struct io_wq_work_node *last,
                               struct io_wq_work_node *prev)
diff --git a/io_uring/sqpoll.c b/io_uring/sqpoll.c
index 74c1a130cd87445a68fe080331693f2ae7822629..becdfdd323a94890c5ef064be0ced44628a56d82 100644
@@ -212,7 +212,7 @@ static int __io_sq_thread(struct io_ring_ctx *ctx, struct io_sq_data *sqd,
        if (cap_entries && to_submit > IORING_SQPOLL_CAP_ENTRIES_VALUE)
                to_submit = IORING_SQPOLL_CAP_ENTRIES_VALUE;
 
-       if (to_submit || !wq_list_empty(&ctx->iopoll_list)) {
+       if (to_submit || !list_empty(&ctx->iopoll_list)) {
                const struct cred *creds = NULL;
 
                io_sq_start_worktime(ist);
@@ -221,7 +221,7 @@ static int __io_sq_thread(struct io_ring_ctx *ctx, struct io_sq_data *sqd,
                        creds = override_creds(ctx->sq_creds);
 
                mutex_lock(&ctx->uring_lock);
-               if (!wq_list_empty(&ctx->iopoll_list))
+               if (!list_empty(&ctx->iopoll_list))
                        io_do_iopoll(ctx, true);
 
                /*
@@ -344,7 +344,7 @@ static int io_sq_thread(void *data)
                list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
                        int ret = __io_sq_thread(ctx, sqd, cap_entries, &ist);
 
-                       if (!sqt_spin && (ret > 0 || !wq_list_empty(&ctx->iopoll_list)))
+                       if (!sqt_spin && (ret > 0 || !list_empty(&ctx->iopoll_list)))
                                sqt_spin = true;
                }
                if (io_sq_tw(&retry_list, IORING_TW_CAP_ENTRIES_VALUE))
@@ -379,7 +379,7 @@ static int io_sq_thread(void *data)
                                atomic_or(IORING_SQ_NEED_WAKEUP,
                                                &ctx->rings->sq_flags);
                                if ((ctx->flags & IORING_SETUP_IOPOLL) &&
-                                   !wq_list_empty(&ctx->iopoll_list)) {
+                                   !list_empty(&ctx->iopoll_list)) {
                                        needs_sched = false;
                                        break;
                                }