io_uring: drain based on allocated reqs

author Pavel Begunkov <asml.silence@gmail.com>

Fri, 9 May 2025 11:12:54 +0000 (12:12 +0100)

committer Jens Axboe <axboe@kernel.dk>

Mon, 12 May 2025 13:52:52 +0000 (07:52 -0600)
author Pavel Begunkov <asml.silence@gmail.com>
Fri, 9 May 2025 11:12:54 +0000 (12:12 +0100)
committer Jens Axboe <axboe@kernel.dk>
Mon, 12 May 2025 13:52:52 +0000 (07:52 -0600)
diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h

index 73b289b48280db5d709edd7e028593575ed7b7b4..00dbd7cd0e7db7f64c70db1e61dd69234bf1a573 100644 (file)
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -341,7 +341,6 @@ struct io_ring_ctx {
                 unsigned                cached_cq_tail;
                 unsigned                cq_entries;
                 struct io_ev_fd __rcu   *io_ev_fd;
-               unsigned                cq_extra;
  
                 void                    *cq_wait_arg;
                 size_t                  cq_wait_size;
@@ -417,6 +416,7 @@ struct io_ring_ctx {
  
         struct callback_head            poll_wq_task_work;
         struct list_head                defer_list;
+       unsigned                        nr_drained;
  
         struct io_alloc_cache           msg_cache;
         spinlock_t                      msg_lock;
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c

index 714b66ab34b0eff928b1ea0da10f3336438950f4..9a9b8d35349b519b918ee8a942ef209db3f8dcce 100644 (file)
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -129,7 +129,6 @@
  struct io_defer_entry {
         struct list_head        list;
         struct io_kiocb         *req;
-       u32                     seq;
  };
  
  /* requests with any of those set should undergo io_disarm_next() */
@@ -149,6 +148,7 @@ static bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
                                          bool is_sqpoll_thread);
  
  static void io_queue_sqe(struct io_kiocb *req);
+static void __io_req_caches_free(struct io_ring_ctx *ctx);
  
  static __read_mostly DEFINE_STATIC_KEY_FALSE(io_key_has_sqarray);
  
@@ -540,46 +540,45 @@ void io_req_queue_iowq(struct io_kiocb *req)
         io_req_task_work_add(req);
  }
  
-static bool io_drain_defer_seq(struct io_kiocb *req, u32 seq)
+static unsigned io_linked_nr(struct io_kiocb *req)
  {
-       struct io_ring_ctx *ctx = req->ctx;
+       struct io_kiocb *tmp;
+       unsigned nr = 0;
  
-       return seq + READ_ONCE(ctx->cq_extra) != ctx->cached_cq_tail;
+       io_for_each_link(tmp, req)
+               nr++;
+       return nr;
  }
  
-static __cold noinline void __io_queue_deferred(struct io_ring_ctx *ctx)
+static __cold noinline void io_queue_deferred(struct io_ring_ctx *ctx)
  {
         bool drain_seen = false, first = true;
  
+       lockdep_assert_held(&ctx->uring_lock);
+       __io_req_caches_free(ctx);
+
         while (!list_empty(&ctx->defer_list)) {
                 struct io_defer_entry *de = list_first_entry(&ctx->defer_list,
                                                 struct io_defer_entry, list);
  
                 drain_seen |= de->req->flags & REQ_F_IO_DRAIN;
-               if ((drain_seen || first) && io_drain_defer_seq(de->req, de->seq))
-                       break;
+               if ((drain_seen || first) && ctx->nr_req_allocated != ctx->nr_drained)
+                       return;
  
                 list_del_init(&de->list);
+               ctx->nr_drained -= io_linked_nr(de->req);
                 io_req_task_queue(de->req);
                 kfree(de);
                 first = false;
         }
  }
  
-static __cold noinline void io_queue_deferred(struct io_ring_ctx *ctx)
-{
-       guard(spinlock)(&ctx->completion_lock);
-       __io_queue_deferred(ctx);
-}
-
  void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
  {
         if (ctx->poll_activated)
                 io_poll_wq_wake(ctx);
         if (ctx->off_timeout_used)
                 io_flush_timeouts(ctx);
-       if (ctx->drain_active)
-               io_queue_deferred(ctx);
         if (ctx->has_evfd)
                 io_eventfd_signal(ctx, true);
  }
@@ -742,7 +741,6 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
                  * on the floor.
                  */
                 WRITE_ONCE(r->cq_overflow, READ_ONCE(r->cq_overflow) + 1);
-               ctx->cq_extra--;
                 set_bit(IO_CHECK_CQ_DROPPED_BIT, &ctx->check_cq);
                 return false;
         }
@@ -812,8 +810,6 @@ static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res,
  {
         struct io_uring_cqe *cqe;
  
-       ctx->cq_extra++;
-
         if (likely(io_get_cqe(ctx, &cqe))) {
                 WRITE_ONCE(cqe->user_data, user_data);
                 WRITE_ONCE(cqe->res, res);
@@ -1459,6 +1455,10 @@ void __io_submit_flush_completions(struct io_ring_ctx *ctx)
                 io_free_batch_list(ctx, state->compl_reqs.first);
                 INIT_WQ_LIST(&state->compl_reqs);
         }
+
+       if (unlikely(ctx->drain_active))
+               io_queue_deferred(ctx);
+
         ctx->submit_state.cq_flush = false;
  }
  
@@ -1646,17 +1646,6 @@ io_req_flags_t io_file_get_flags(struct file *file)
         return res;
  }
  
-static u32 io_get_sequence(struct io_kiocb *req)
-{
-       u32 seq = req->ctx->cached_sq_head;
-       struct io_kiocb *cur;
-
-       /* need original cached_sq_head, but it was increased for each req */
-       io_for_each_link(cur, req)
-               seq--;
-       return seq;
-}
-
  static __cold void io_drain_req(struct io_kiocb *req)
         __must_hold(&ctx->uring_lock)
  {
@@ -1673,14 +1662,12 @@ static __cold void io_drain_req(struct io_kiocb *req)
         io_prep_async_link(req);
         trace_io_uring_defer(req);
         de->req = req;
-       de->seq = io_get_sequence(req);
  
-       scoped_guard(spinlock, &ctx->completion_lock) {
-               list_add_tail(&de->list, &ctx->defer_list);
-               __io_queue_deferred(ctx);
-               if (!drain && list_empty(&ctx->defer_list))
-                       ctx->drain_active = false;
-       }
+       ctx->nr_drained += io_linked_nr(req);
+       list_add_tail(&de->list, &ctx->defer_list);
+       io_queue_deferred(ctx);
+       if (!drain && list_empty(&ctx->defer_list))
+               ctx->drain_active = false;
  }
  
  static bool io_assign_file(struct io_kiocb *req, const struct io_issue_def *def,
@@ -2263,10 +2250,6 @@ static bool io_get_sqe(struct io_ring_ctx *ctx, const struct io_uring_sqe **sqe)
             (!(ctx->flags & IORING_SETUP_NO_SQARRAY))) {
                 head = READ_ONCE(ctx->sq_array[head]);
                 if (unlikely(head >= ctx->sq_entries)) {
-                       /* drop invalid entries */
-                       spin_lock(&ctx->completion_lock);
-                       ctx->cq_extra--;
-                       spin_unlock(&ctx->completion_lock);
                         WRITE_ONCE(ctx->rings->sq_dropped,
                                    READ_ONCE(ctx->rings->sq_dropped) + 1);
                         return false;
@@ -2684,13 +2667,11 @@ unsigned long rings_size(unsigned int flags, unsigned int sq_entries,
         return off;
  }
  
-static void io_req_caches_free(struct io_ring_ctx *ctx)
+static __cold void __io_req_caches_free(struct io_ring_ctx *ctx)
  {
         struct io_kiocb *req;
         int nr = 0;
  
-       mutex_lock(&ctx->uring_lock);
-
         while (!io_req_cache_empty(ctx)) {
                 req = io_extract_req(ctx);
                 kmem_cache_free(req_cachep, req);
@@ -2700,7 +2681,12 @@ static void io_req_caches_free(struct io_ring_ctx *ctx)
                 ctx->nr_req_allocated -= nr;
                 percpu_ref_put_many(&ctx->refs, nr);
         }
-       mutex_unlock(&ctx->uring_lock);
+}
+
+static __cold void io_req_caches_free(struct io_ring_ctx *ctx)
+{
+       guard(mutex)(&ctx->uring_lock);
+       __io_req_caches_free(ctx);
  }
  
  static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
@@ -3005,20 +2991,19 @@ static __cold bool io_cancel_defer_files(struct io_ring_ctx *ctx,
         struct io_defer_entry *de;
         LIST_HEAD(list);
  
-       spin_lock(&ctx->completion_lock);
         list_for_each_entry_reverse(de, &ctx->defer_list, list) {
                 if (io_match_task_safe(de->req, tctx, cancel_all)) {
                         list_cut_position(&list, &ctx->defer_list, &de->list);
                         break;
                 }
         }
-       spin_unlock(&ctx->completion_lock);
         if (list_empty(&list))
                 return false;
  
         while (!list_empty(&list)) {
                 de = list_first_entry(&list, struct io_defer_entry, list);
                 list_del_init(&de->list);
+               ctx->nr_drained -= io_linked_nr(de->req);
                 io_req_task_queue_fail(de->req, -ECANCELED);
                 kfree(de);
         }
@@ -3093,8 +3078,8 @@ static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
         if ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) &&
             io_allowed_defer_tw_run(ctx))
                 ret |= io_run_local_work(ctx, INT_MAX, INT_MAX) > 0;
-       ret |= io_cancel_defer_files(ctx, tctx, cancel_all);
         mutex_lock(&ctx->uring_lock);
+       ret |= io_cancel_defer_files(ctx, tctx, cancel_all);
         ret |= io_poll_remove_all(ctx, tctx, cancel_all);
         ret |= io_waitid_remove_all(ctx, tctx, cancel_all);
         ret |= io_futex_remove_all(ctx, tctx, cancel_all);
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h

index e4050b2d0821f529585ab0ba10f8cd383acfa34a..81f22196a57de69f88edc2fe3660af6d3b3f470d 100644 (file)
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -196,7 +196,6 @@ static inline bool io_defer_get_uncommited_cqe(struct io_ring_ctx *ctx,
  {
         io_lockdep_assert_cq_locked(ctx);
  
-       ctx->cq_extra++;
         ctx->submit_state.cq_flush = true;
         return io_get_cqe(ctx, cqe_ret);
  }
@@ -414,7 +413,7 @@ static inline void io_req_complete_defer(struct io_kiocb *req)
  
  static inline void io_commit_cqring_flush(struct io_ring_ctx *ctx)
  {
-       if (unlikely(ctx->off_timeout_used || ctx->drain_active ||
+       if (unlikely(ctx->off_timeout_used ||
                      ctx->has_evfd || ctx->poll_activated))
                 __io_commit_cqring_flush(ctx);
  }
author	Pavel Begunkov <asml.silence@gmail.com>
	Fri, 9 May 2025 11:12:54 +0000 (12:12 +0100)
committer	Jens Axboe <axboe@kernel.dk>
	Mon, 12 May 2025 13:52:52 +0000 (07:52 -0600)
include/linux/io_uring_types.h		patch | blob | blame | history
io_uring/io_uring.c		patch | blob | blame | history
io_uring/io_uring.h		patch | blob | blame | history