Pull io_uring updates from Jens Axboe:
- Rework the task_work infrastructure.
Both the local (DEFER_TASKRUN) and the normal (tctx) task_work lists
were llist based, which is LIFO ordered, and hence each run had to do
an O(n) list reversal pass first to restore queue order.
Additionally, to cap the amount of task_work run, each method needed
a retry list as well.
Add a lockless MPSC FIFO queue (based on Dmitry Vyukov's intrusive
MPSC algorithm) and switch both task_work lists to it. It performs
better than llists, and the retry lists can be dropped as well, since
entries are popped one at a time.
On top of those changes, run the tctx fallback task_work directly and
remove the now-unused per-ctx fallback machinery entirely.
- zcrx user notifications.
Add a mechanism for zcrx to communicate conditions back to userspace
via a dedicated CQE, with the initial users being notification on
running out of buffers and on a frag copy fallback, plus
shared-memory notification statistics.
Alongside that, a series of zcrx reliability and cleanup fixes: more
reliable scrubbing, poisoning pointers on unregistration, dropping an
extra ifq close, adding a ctx back-pointer, reordering fd allocation
in the export path, and killing a dead 'sock' member.
- Allow using io_uring registered buffers for plain SEND and RECV, not
just for the zero-copy send path.
This enables targets like ublk's NBD backend to push/pull IO data
directly to/from a registered buffer over a plain send/recv on a TCP
socket.
- Registered buffer improvements: account huge pages correctly, bump
the io_mapped_ubuf length field to size_t, and raise the previous 1GB
registered buffer size limit.
- Restrict the ctx access exposed to io_uring BPF struct_ops programs
by handing them an opaque type rather than the full io_ring_ctx, and
add a separate MAINTAINERS entry for the bpf-ops code.
- Allow opcode filtering on IORING_OP_CONNECT.
- Validate ring-provided buffer addresses with access_ok(), and align
the legacy buffer add limit with MAX_BIDS_PER_BGID.
- Various other cleanups and minor fixes, including avoiding msghdr
async data on connect/bind, dropping async_size for OP_LISTEN, making
the POLL_FIRST receive side checks consistent, re-checking
IO_WQ_BIT_EXIT for each linked work item, and using
trace_call__##name() at guarded tracepoint call sites.
* tag 'for-7.2/io_uring-20260615' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux: (31 commits)
io_uring/bpf-ops: add a separate maintainer entry
io_uring/net: make POLL_FIRST receive side checks consistent
io_uring: remove the per-ctx fallback task_work machinery
io_uring: run the tctx task_work fallback directly
io_uring: switch normal task_work to a mpscq
io_uring: switch local task_work to a mpscq
io_uring/mpscq: add lockless multi-producer, single-consumer FIFO queue
io_uring: grab RCU read lock marking task run
io_uring/zcrx: kill dead 'sock' member in struct io_zcrx_args
io_uring/kbuf: validate ring provided buffer addresses with access_ok()
io_uring/net: support registered buffer for plain send and recv
io_uring/nop: Drop a wrong comment in struct io_nop
io_uring/net: Remove async_size for OP_LISTEN
io_uring/net: Avoid msghdr on op_connect/op_bind async data
io_uring/bpf-ops: restrict ctx access to BPF
io_uring/io-wq: re-check IO_WQ_BIT_EXIT for each linked work item
io_uring/kbuf: align legacy buffer add limit with MAX_BIDS_PER_BGID
io_uring/zcrx: add shared-memory notification statistics
io_uring/zcrx: notify user on frag copy fallback
io_uring/zcrx: notify user when out of buffers
...
ret |= io_waitid_remove_all(ctx, tctx, cancel_all);
ret |= io_futex_remove_all(ctx, tctx, cancel_all);
ret |= io_uring_try_cancel_uring_cmd(ctx, tctx, cancel_all);
- mutex_unlock(&ctx->uring_lock);
ret |= io_kill_timeouts(ctx, tctx, cancel_all);
+ mutex_unlock(&ctx->uring_lock);
if (tctx)
ret |= io_run_task_work() > 0;
- else
- ret |= flush_delayed_work(&ctx->fallback_work);
return ret;
}
return IOU_COMPLETE;
}
- static int io_bind_file_create(const struct io_async_msghdr *io, int addr_len)
+/*
+ * Check if bind request would potentially end up with filename_create(),
+ * which in turn end up in mnt_want_write() which will grab the fs
+ * percpu start write sem. This can trigger a lockdep warning.
+ */
- if (io->addr.ss_family != AF_UNIX)
++static int io_bind_file_create(const struct sockaddr_storage *addr, int addr_len)
+{
+ const struct sockaddr_un *sun;
+
- sun = (const struct sockaddr_un *) &io->addr;
++ if (addr->ss_family != AF_UNIX)
+ return 0;
+ if (addr_len <= offsetof(struct sockaddr_un, sun_path))
+ return 0;
++ sun = (const struct sockaddr_un *) addr;
+ return sun->sun_path[0] != '\0';
+}
+
int io_bind_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_bind *bind = io_kiocb_to_cmd(req, struct io_bind);
struct sockaddr __user *uaddr;
- struct io_async_msghdr *io;
+ struct sockaddr_storage *addr;
+ int ret;
if (sqe->len || sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in)
return -EINVAL;
uaddr = u64_to_user_ptr(READ_ONCE(sqe->addr));
bind->addr_len = READ_ONCE(sqe->addr2);
- io = io_msg_alloc_async(req);
- if (unlikely(!io))
+ addr = io_uring_alloc_async_data(NULL, req);
+ if (unlikely(!addr))
return -ENOMEM;
- ret = move_addr_to_kernel(uaddr, bind->addr_len, &io->addr);
- return move_addr_to_kernel(uaddr, bind->addr_len, addr);
++ ret = move_addr_to_kernel(uaddr, bind->addr_len, addr);
+ if (unlikely(ret))
+ return ret;
- if (io_bind_file_create(io, bind->addr_len))
++ if (io_bind_file_create(addr, bind->addr_len))
+ req->flags |= REQ_F_FORCE_ASYNC;
+ return 0;
}
+
int io_bind(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_bind *bind = io_kiocb_to_cmd(req, struct io_bind);