io_uring/kbuf: switch to storing struct io_buffer_list locally

author Jens Axboe <axboe@kernel.dk>

Thu, 21 Aug 2025 02:03:39 +0000 (20:03 -0600)

committer Jens Axboe <axboe@kernel.dk>

Sun, 24 Aug 2025 17:41:12 +0000 (11:41 -0600)
author Jens Axboe <axboe@kernel.dk>
Thu, 21 Aug 2025 02:03:39 +0000 (20:03 -0600)
committer Jens Axboe <axboe@kernel.dk>
Sun, 24 Aug 2025 17:41:12 +0000 (11:41 -0600)
diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h

index 80a178f3d89688e1cc3cef2a24df9428ad9c700d..1d33984611bc9546de6721e0962964766e839ffc 100644 (file)
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -674,12 +674,6 @@ struct io_kiocb {
                 /* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */
                 struct io_buffer        *kbuf;
  
-               /*
-                * stores buffer ID for ring provided buffers, valid IFF
-                * REQ_F_BUFFER_RING is set.
-                */
-               struct io_buffer_list   *buf_list;
-
                 struct io_rsrc_node     *buf_node;
         };
  
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c

index 27bc1486f07b6f1f6d1e645eabc2f88fd3e8f02c..985b4681e5131f06eb48959949669b06dc7feb2e 100644 (file)
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -1007,7 +1007,7 @@ void io_req_defer_failed(struct io_kiocb *req, s32 res)
         lockdep_assert_held(&req->ctx->uring_lock);
  
         req_set_fail(req);
-       io_req_set_res(req, res, io_put_kbuf(req, res, req->buf_list));
+       io_req_set_res(req, res, io_put_kbuf(req, res, NULL));
         if (def->fail)
                 def->fail(req);
         io_req_complete_defer(req);
@@ -2025,11 +2025,11 @@ fail:
  
         switch (io_arm_poll_handler(req, 0)) {
         case IO_APOLL_READY:
-               io_kbuf_recycle(req, req->buf_list, 0);
+               io_kbuf_recycle(req, NULL, 0);
                 io_req_task_queue(req);
                 break;
         case IO_APOLL_ABORTED:
-               io_kbuf_recycle(req, req->buf_list, 0);
+               io_kbuf_recycle(req, NULL, 0);
                 io_queue_iowq(req);
                 break;
         case IO_APOLL_OK:
diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c

index 21c12c437ab9cb2cb3a52de6602a125c38831d3c..3e9aab21af9d474abdee2f5dccd09d6cb7ae277a 100644 (file)
--- a/io_uring/kbuf.c
+++ b/io_uring/kbuf.c
@@ -171,8 +171,8 @@ static struct io_br_sel io_ring_buffer_select(struct io_kiocb *req, size_t *len,
         if (*len == 0 || *len > buf->len)
                 *len = buf->len;
         req->flags |= REQ_F_BUFFER_RING | REQ_F_BUFFERS_COMMIT;
-       req->buf_list = bl;
         req->buf_index = buf->bid;
+       sel.buf_list = bl;
         sel.addr = u64_to_user_ptr(buf->addr);
  
         if (issue_flags & IO_URING_F_UNLOCKED || !io_file_can_poll(req)) {
@@ -186,8 +186,8 @@ static struct io_br_sel io_ring_buffer_select(struct io_kiocb *req, size_t *len,
                  * the transfer completes (or if we get -EAGAIN and must poll of
                  * retry).
                  */
-               io_kbuf_commit(req, bl, *len, 1);
-               req->buf_list = NULL;
+               io_kbuf_commit(req, sel.buf_list, *len, 1);
+               sel.buf_list = NULL;
         }
         return sel;
  }
@@ -294,7 +294,6 @@ static int io_ring_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg,
                 req->flags |= REQ_F_BL_EMPTY;
  
         req->flags |= REQ_F_BUFFER_RING;
-       req->buf_list = bl;
         return iov - arg->iovs;
  }
  
@@ -302,16 +301,15 @@ int io_buffers_select(struct io_kiocb *req, struct buf_sel_arg *arg,
                       struct io_br_sel *sel, unsigned int issue_flags)
  {
         struct io_ring_ctx *ctx = req->ctx;
-       struct io_buffer_list *bl;
         int ret = -ENOENT;
  
         io_ring_submit_lock(ctx, issue_flags);
-       bl = io_buffer_get_list(ctx, arg->buf_group);
-       if (unlikely(!bl))
+       sel->buf_list = io_buffer_get_list(ctx, arg->buf_group);
+       if (unlikely(!sel->buf_list))
                 goto out_unlock;
  
-       if (bl->flags & IOBL_BUF_RING) {
-               ret = io_ring_buffers_peek(req, arg, bl);
+       if (sel->buf_list->flags & IOBL_BUF_RING) {
+               ret = io_ring_buffers_peek(req, arg, sel->buf_list);
                 /*
                  * Don't recycle these buffers if we need to go through poll.
                  * Nobody else can use them anyway, and holding on to provided
@@ -321,13 +319,16 @@ int io_buffers_select(struct io_kiocb *req, struct buf_sel_arg *arg,
                  */
                 if (ret > 0) {
                         req->flags |= REQ_F_BUFFERS_COMMIT | REQ_F_BL_NO_RECYCLE;
-                       io_kbuf_commit(req, bl, arg->out_len, ret);
+                       io_kbuf_commit(req, sel->buf_list, arg->out_len, ret);
                 }
         } else {
-               ret = io_provided_buffers_select(req, &arg->out_len, bl, arg->iovs);
+               ret = io_provided_buffers_select(req, &arg->out_len, sel->buf_list, arg->iovs);
         }
  out_unlock:
-       io_ring_submit_unlock(ctx, issue_flags);
+       if (issue_flags & IO_URING_F_UNLOCKED) {
+               sel->buf_list = NULL;
+               mutex_unlock(&ctx->uring_lock);
+       }
         return ret;
  }
  
@@ -348,10 +349,12 @@ int io_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg,
                 ret = io_ring_buffers_peek(req, arg, bl);
                 if (ret > 0)
                         req->flags |= REQ_F_BUFFERS_COMMIT;
+               sel->buf_list = bl;
                 return ret;
         }
  
         /* don't support multiple buffer selections for legacy */
+       sel->buf_list = NULL;
         return io_provided_buffers_select(req, &arg->max_len, bl, arg->iovs);
  }
  
diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h

index b1723c2620dae53d80e2a947887f07567bdda6d7..1a539969fc9c36a983a2cee8398a224336002487 100644 (file)
--- a/io_uring/kbuf.h
+++ b/io_uring/kbuf.h
@@ -63,11 +63,14 @@ struct buf_sel_arg {
  };
  
  /*
- * Return value from io_buffer_list selection. Just returns the error or
- * user address for now, will be extended to return the buffer list in the
- * future.
+ * Return value from io_buffer_list selection, to avoid stashing it in
+ * struct io_kiocb. For legacy/classic provided buffers, keeping a reference
+ * across execution contexts is fine. But for ring provided buffers, the
+ * list may go away as soon as ->uring_lock is dropped. As the io_kiocb
+ * persists, it's better to just keep the buffer local for those cases.
   */
  struct io_br_sel {
+       struct io_buffer_list *buf_list;
         /*
          * Some selection parts return the user address, others return an error.
          */
@@ -107,13 +110,6 @@ struct io_mapped_region *io_pbuf_get_region(struct io_ring_ctx *ctx,
  static inline bool io_kbuf_recycle_ring(struct io_kiocb *req,
                                         struct io_buffer_list *bl)
  {
-       /*
-        * We don't need to recycle for REQ_F_BUFFER_RING, we can just clear
-        * the flag and hence ensure that bl->head doesn't get incremented.
-        * If the tail has already been incremented, hang on to it.
-        * The exception is partial io, that case we should increment bl->head
-        * to monopolize the buffer.
-        */
         if (bl) {
                 req->flags &= ~(REQ_F_BUFFER_RING|REQ_F_BUFFERS_COMMIT);
                 return true;
diff --git a/io_uring/net.c b/io_uring/net.c

index 4eb208d241690dd721787d34e7f45db4498adf3b..b00cd2f59091fa78b0bc2b85c395e3858a2c9ce2 100644 (file)
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -433,7 +433,6 @@ int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
                 if (req->opcode == IORING_OP_SENDMSG)
                         return -EINVAL;
                 sr->msg_flags |= MSG_WAITALL;
-               req->buf_list = NULL;
                 req->flags |= REQ_F_MULTISHOT;
         }
  
@@ -512,11 +511,11 @@ static inline bool io_send_finish(struct io_kiocb *req,
         unsigned int cflags;
  
         if (!(sr->flags & IORING_RECVSEND_BUNDLE)) {
-               cflags = io_put_kbuf(req, sel->val, req->buf_list);
+               cflags = io_put_kbuf(req, sel->val, sel->buf_list);
                 goto finish;
         }
  
-       cflags = io_put_kbufs(req, sel->val, req->buf_list, io_bundle_nbufs(kmsg, sel->val));
+       cflags = io_put_kbufs(req, sel->val, sel->buf_list, io_bundle_nbufs(kmsg, sel->val));
  
         if (bundle_finished || req->flags & REQ_F_BL_EMPTY)
                 goto finish;
@@ -657,6 +656,7 @@ int io_send(struct io_kiocb *req, unsigned int issue_flags)
                 flags |= MSG_DONTWAIT;
  
  retry_bundle:
+       sel.buf_list = NULL;
         if (io_do_buffer_select(req)) {
                 ret = io_send_select_buffer(req, issue_flags, &sel, kmsg);
                 if (ret)
@@ -682,7 +682,7 @@ retry_bundle:
                         sr->len -= ret;
                         sr->buf += ret;
                         sr->done_io += ret;
-                       return io_net_kbuf_recyle(req, req->buf_list, kmsg, ret);
+                       return io_net_kbuf_recyle(req, sel.buf_list, kmsg, ret);
                 }
                 if (ret == -ERESTARTSYS)
                         ret = -EINTR;
@@ -795,18 +795,8 @@ int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
                 req->flags |= REQ_F_NOWAIT;
         if (sr->msg_flags & MSG_ERRQUEUE)
                 req->flags |= REQ_F_CLEAR_POLLIN;
-       if (req->flags & REQ_F_BUFFER_SELECT) {
-               /*
-                * Store the buffer group for this multishot receive separately,
-                * as if we end up doing an io-wq based issue that selects a
-                * buffer, it has to be committed immediately and that will
-                * clear ->buf_list. This means we lose the link to the buffer
-                * list, and the eventual buffer put on completion then cannot
-                * restore it.
-                */
+       if (req->flags & REQ_F_BUFFER_SELECT)
                 sr->buf_group = req->buf_index;
-               req->buf_list = NULL;
-       }
         sr->mshot_total_len = sr->mshot_len = 0;
         if (sr->flags & IORING_RECV_MULTISHOT) {
                 if (!(req->flags & REQ_F_BUFFER_SELECT))
@@ -874,7 +864,7 @@ static inline bool io_recv_finish(struct io_kiocb *req,
         if (sr->flags & IORING_RECVSEND_BUNDLE) {
                 size_t this_ret = sel->val - sr->done_io;
  
-               cflags |= io_put_kbufs(req, this_ret, req->buf_list, io_bundle_nbufs(kmsg, this_ret));
+               cflags |= io_put_kbufs(req, this_ret, sel->buf_list, io_bundle_nbufs(kmsg, this_ret));
                 if (sr->flags & IORING_RECV_RETRY)
                         cflags = req->cqe.flags | (cflags & CQE_F_MASK);
                 if (sr->mshot_len && sel->val >= sr->mshot_len)
@@ -896,7 +886,7 @@ static inline bool io_recv_finish(struct io_kiocb *req,
                         return false;
                 }
         } else {
-               cflags |= io_put_kbuf(req, sel->val, req->buf_list);
+               cflags |= io_put_kbuf(req, sel->val, sel->buf_list);
         }
  
         /*
@@ -1038,6 +1028,7 @@ int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
                 flags |= MSG_DONTWAIT;
  
  retry_multishot:
+       sel.buf_list = NULL;
         if (io_do_buffer_select(req)) {
                 size_t len = sr->len;
  
@@ -1048,7 +1039,7 @@ retry_multishot:
                 if (req->flags & REQ_F_APOLL_MULTISHOT) {
                         ret = io_recvmsg_prep_multishot(kmsg, sr, &sel.addr, &len);
                         if (ret) {
-                               io_kbuf_recycle(req, req->buf_list, issue_flags);
+                               io_kbuf_recycle(req, sel.buf_list, issue_flags);
                                 return ret;
                         }
                 }
@@ -1072,14 +1063,12 @@ retry_multishot:
  
         if (ret < min_ret) {
                 if (ret == -EAGAIN && force_nonblock) {
-                       if (issue_flags & IO_URING_F_MULTISHOT)
-                               io_kbuf_recycle(req, req->buf_list, issue_flags);
-
+                       io_kbuf_recycle(req, sel.buf_list, issue_flags);
                         return IOU_RETRY;
                 }
                 if (ret > 0 && io_net_retry(sock, flags)) {
                         sr->done_io += ret;
-                       return io_net_kbuf_recyle(req, req->buf_list, kmsg, ret);
+                       return io_net_kbuf_recyle(req, sel.buf_list, kmsg, ret);
                 }
                 if (ret == -ERESTARTSYS)
                         ret = -EINTR;
@@ -1093,7 +1082,7 @@ retry_multishot:
         else if (sr->done_io)
                 ret = sr->done_io;
         else
-               io_kbuf_recycle(req, req->buf_list, issue_flags);
+               io_kbuf_recycle(req, sel.buf_list, issue_flags);
  
         sel.val = ret;
         if (!io_recv_finish(req, kmsg, &sel, mshot_finished, issue_flags))
@@ -1178,7 +1167,7 @@ int io_recv(struct io_kiocb *req, unsigned int issue_flags)
  {
         struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
         struct io_async_msghdr *kmsg = req->async_data;
-       struct io_br_sel sel = { };
+       struct io_br_sel sel;
         struct socket *sock;
         unsigned flags;
         int ret, min_ret = 0;
@@ -1198,6 +1187,7 @@ int io_recv(struct io_kiocb *req, unsigned int issue_flags)
                 flags |= MSG_DONTWAIT;
  
  retry_multishot:
+       sel.buf_list = NULL;
         if (io_do_buffer_select(req)) {
                 sel.val = sr->len;
                 ret = io_recv_buf_select(req, kmsg, &sel, issue_flags);
@@ -1217,16 +1207,14 @@ retry_multishot:
         ret = sock_recvmsg(sock, &kmsg->msg, flags);
         if (ret < min_ret) {
                 if (ret == -EAGAIN && force_nonblock) {
-                       if (issue_flags & IO_URING_F_MULTISHOT)
-                               io_kbuf_recycle(req, req->buf_list, issue_flags);
-
+                       io_kbuf_recycle(req, sel.buf_list, issue_flags);
                         return IOU_RETRY;
                 }
                 if (ret > 0 && io_net_retry(sock, flags)) {
                         sr->len -= ret;
                         sr->buf += ret;
                         sr->done_io += ret;
-                       return io_net_kbuf_recyle(req, req->buf_list, kmsg, ret);
+                       return io_net_kbuf_recyle(req, sel.buf_list, kmsg, ret);
                 }
                 if (ret == -ERESTARTSYS)
                         ret = -EINTR;
@@ -1242,7 +1230,7 @@ out_free:
         else if (sr->done_io)
                 ret = sr->done_io;
         else
-               io_kbuf_recycle(req, req->buf_list, issue_flags);
+               io_kbuf_recycle(req, sel.buf_list, issue_flags);
  
         sel.val = ret;
         if (!io_recv_finish(req, kmsg, &sel, mshot_finished, issue_flags))
diff --git a/io_uring/poll.c b/io_uring/poll.c

index 07ab22380c7857c442a250a43c6bab081bc04a49..f3852bf7627b16d6332139af9540fa8341f9e021 100644 (file)
--- a/io_uring/poll.c
+++ b/io_uring/poll.c
@@ -316,10 +316,10 @@ void io_poll_task_func(struct io_kiocb *req, io_tw_token_t tw)
  
         ret = io_poll_check_events(req, tw);
         if (ret == IOU_POLL_NO_ACTION) {
-               io_kbuf_recycle(req, req->buf_list, 0);
+               io_kbuf_recycle(req, NULL, 0);
                 return;
         } else if (ret == IOU_POLL_REQUEUE) {
-               io_kbuf_recycle(req, req->buf_list, 0);
+               io_kbuf_recycle(req, NULL, 0);
                 __io_poll_execute(req, 0);
                 return;
         }
@@ -686,7 +686,7 @@ int io_arm_apoll(struct io_kiocb *req, unsigned issue_flags, __poll_t mask)
         req->flags |= REQ_F_POLLED;
         ipt.pt._qproc = io_async_queue_proc;
  
-       io_kbuf_recycle(req, req->buf_list, issue_flags);
+       io_kbuf_recycle(req, NULL, issue_flags);
  
         ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask, issue_flags);
         if (ret)
diff --git a/io_uring/rw.c b/io_uring/rw.c

index 2b106f644383f755a46511f99ecb3f4fc9e40862..906e869d330add1c0627c6e702a09b05529791f1 100644 (file)
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -579,7 +579,7 @@ void io_req_rw_complete(struct io_kiocb *req, io_tw_token_t tw)
         io_req_io_end(req);
  
         if (req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING))
-               req->cqe.flags |= io_put_kbuf(req, req->cqe.res, req->buf_list);
+               req->cqe.flags |= io_put_kbuf(req, req->cqe.res, NULL);
  
         io_req_rw_cleanup(req, 0);
         io_req_task_complete(req, tw);
@@ -648,7 +648,7 @@ static inline void io_rw_done(struct io_kiocb *req, ssize_t ret)
  }
  
  static int kiocb_done(struct io_kiocb *req, ssize_t ret,
-                      unsigned int issue_flags)
+                     struct io_br_sel *sel, unsigned int issue_flags)
  {
         struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
         unsigned final_ret = io_fixup_rw_res(req, ret);
@@ -662,7 +662,7 @@ static int kiocb_done(struct io_kiocb *req, ssize_t ret,
                  * from the submission path.
                  */
                 io_req_io_end(req);
-               io_req_set_res(req, final_ret, io_put_kbuf(req, ret, req->buf_list));
+               io_req_set_res(req, final_ret, io_put_kbuf(req, ret, sel->buf_list));
                 io_req_rw_cleanup(req, issue_flags);
                 return IOU_COMPLETE;
         } else {
@@ -1024,10 +1024,10 @@ int io_read(struct io_kiocb *req, unsigned int issue_flags)
  
         ret = __io_read(req, &sel, issue_flags);
         if (ret >= 0)
-               return kiocb_done(req, ret, issue_flags);
+               return kiocb_done(req, ret, &sel, issue_flags);
  
         if (req->flags & REQ_F_BUFFERS_COMMIT)
-               io_kbuf_recycle(req, req->buf_list, issue_flags);
+               io_kbuf_recycle(req, sel.buf_list, issue_flags);
         return ret;
  }
  
@@ -1057,15 +1057,15 @@ int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags)
                  * Reset rw->len to 0 again to avoid clamping future mshot
                  * reads, in case the buffer size varies.
                  */
-               if (io_kbuf_recycle(req, req->buf_list, issue_flags))
+               if (io_kbuf_recycle(req, sel.buf_list, issue_flags))
                         rw->len = 0;
                 return IOU_RETRY;
         } else if (ret <= 0) {
-               io_kbuf_recycle(req, req->buf_list, issue_flags);
+               io_kbuf_recycle(req, sel.buf_list, issue_flags);
                 if (ret < 0)
                         req_set_fail(req);
         } else if (!(req->flags & REQ_F_APOLL_MULTISHOT)) {
-               cflags = io_put_kbuf(req, ret, req->buf_list);
+               cflags = io_put_kbuf(req, ret, sel.buf_list);
         } else {
                 /*
                  * Any successful return value will keep the multishot read
@@ -1073,7 +1073,7 @@ int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags)
                  * we fail to post a CQE, or multishot is no longer set, then
                  * jump to the termination path. This request is then done.
                  */
-               cflags = io_put_kbuf(req, ret, req->buf_list);
+               cflags = io_put_kbuf(req, ret, sel.buf_list);
                 rw->len = 0; /* similarly to above, reset len to 0 */
  
                 if (io_req_post_cqe(req, ret, cflags | IORING_CQE_F_MORE)) {
@@ -1202,7 +1202,7 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags)
                         return -EAGAIN;
                 }
  done:
-               return kiocb_done(req, ret2, issue_flags);
+               return kiocb_done(req, ret2, NULL, issue_flags);
         } else {
  ret_eagain:
                 iov_iter_restore(&io->iter, &io->iter_state);
@@ -1370,7 +1370,7 @@ int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
                 if (!smp_load_acquire(&req->iopoll_completed))
                         break;
                 nr_events++;
-               req->cqe.flags = io_put_kbuf(req, req->cqe.res, req->buf_list);
+               req->cqe.flags = io_put_kbuf(req, req->cqe.res, NULL);
                 if (req->opcode != IORING_OP_URING_CMD)
                         io_req_rw_cleanup(req, 0);
         }
author	Jens Axboe <axboe@kernel.dk>
	Thu, 21 Aug 2025 02:03:39 +0000 (20:03 -0600)
committer	Jens Axboe <axboe@kernel.dk>
	Sun, 24 Aug 2025 17:41:12 +0000 (11:41 -0600)
include/linux/io_uring_types.h		patch \| blob \| blame \| history
io_uring/io_uring.c		patch \| blob \| blame \| history
io_uring/kbuf.c		patch \| blob \| blame \| history
io_uring/kbuf.h		patch \| blob \| blame \| history
io_uring/net.c		patch \| blob \| blame \| history
io_uring/poll.c		patch \| blob \| blame \| history
io_uring/rw.c		patch \| blob \| blame \| history