// SPDX-License-Identifier: GPL-2.0
/*
 * Code related to the io_uring_register() syscall
 *
 * Copyright (C) 2023 Jens Axboe
 */
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/syscalls.h>
#include <linux/refcount.h>
#include <linux/bits.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/nospec.h>
#include <linux/compat.h>
#include <linux/io_uring.h>
#include <linux/io_uring_types.h>

#define IORING_MAX_RESTRICTIONS	(IORING_RESTRICTION_LAST + \
				 IORING_REGISTER_LAST + IORING_OP_LAST)
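
/*
 * Eventfd registration (IORING_REGISTER_EVENTFD / _EVENTFD_ASYNC): read a
 * single eventfd descriptor from userspace, grab its eventfd context and
 * publish it under RCU so CQE posting can signal it. Only one eventfd may
 * be registered per ring at a time; callers hold ctx->uring_lock.
 */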
static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,
			       unsigned int eventfd_async)
{
	struct io_ev_fd *ev_fd;
	__s32 __user *fds = arg;
	int fd;

	ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
					lockdep_is_held(&ctx->uring_lock));
	if (ev_fd)
		return -EBUSY;

	if (copy_from_user(&fd, fds, sizeof(*fds)))
		return -EFAULT;

	ev_fd = kmalloc(sizeof(*ev_fd), GFP_KERNEL);
	if (!ev_fd)
		return -ENOMEM;

	ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd);
	if (IS_ERR(ev_fd->cq_ev_fd)) {
		int ret = PTR_ERR(ev_fd->cq_ev_fd);

		kfree(ev_fd);
		return ret;
	}

	spin_lock(&ctx->completion_lock);
	ctx->evfd_last_cq_tail = ctx->cached_cq_tail;
	spin_unlock(&ctx->completion_lock);

	ev_fd->eventfd_async = eventfd_async;
	ctx->has_evfd = true;
	rcu_assign_pointer(ctx->io_ev_fd, ev_fd);
	atomic_set(&ev_fd->refs, 1);
	atomic_set(&ev_fd->ops, 0);
	return 0;
}
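
/*
 * Tear down a previously registered eventfd. The pointer is cleared under
 * uring_lock and the final free is deferred to an RCU callback so lockless
 * readers in the completion path stay safe.
 */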
int io_eventfd_unregister(struct io_ring_ctx *ctx)
{
	struct io_ev_fd *ev_fd;

	ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
					lockdep_is_held(&ctx->uring_lock));
	if (ev_fd) {
		ctx->has_evfd = false;
		rcu_assign_pointer(ctx->io_ev_fd, NULL);
		if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_FREE_BIT), &ev_fd->ops))
			call_rcu(&ev_fd->rcu, io_eventfd_ops);
		return 0;
	}

	return -ENXIO;
}
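
/*
 * IORING_REGISTER_PROBE: report which opcodes this kernel supports. The
 * user-supplied buffer must be zeroed; it is filled with one
 * io_uring_probe_op entry per opcode and copied back. A minimal userspace
 * sketch (illustrative only, not part of this file; ring_fd is assumed to
 * be an io_uring file descriptor):
 *
 *	struct io_uring_probe *p;
 *
 *	p = calloc(1, sizeof(*p) + 256 * sizeof(p->ops[0]));
 *	io_uring_register(ring_fd, IORING_REGISTER_PROBE, p, 256);
 *	if (p->ops[IORING_OP_READV].flags & IO_URING_OP_SUPPORTED)
 *		...
 */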
static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
			   unsigned nr_args)
{
	struct io_uring_probe *p;
	size_t size;
	int i, ret;

	size = struct_size(p, ops, nr_args);
	if (size == SIZE_MAX)
		return -EOVERFLOW;
	p = kzalloc(size, GFP_KERNEL);
	if (!p)
		return -ENOMEM;

	ret = -EFAULT;
	if (copy_from_user(p, arg, size))
		goto out;
	ret = -EINVAL;
	if (memchr_inv(p, 0, size))
		goto out;

	p->last_op = IORING_OP_LAST - 1;
	if (nr_args > IORING_OP_LAST)
		nr_args = IORING_OP_LAST;

	for (i = 0; i < nr_args; i++) {
		p->ops[i].op = i;
		if (!io_issue_defs[i].not_supported)
			p->ops[i].flags = IO_URING_OP_SUPPORTED;
	}
	p->ops_len = i;

	ret = 0;
	if (copy_to_user(arg, p, size))
		ret = -EFAULT;
out:
	kfree(p);
	return ret;
}
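
/*
 * Personalities: IORING_REGISTER_PERSONALITY snapshots the calling task's
 * credentials and returns an id that later SQEs can reference via
 * sqe->personality; unregistering drops the stored credential reference.
 */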
int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
{
	const struct cred *creds;

	creds = xa_erase(&ctx->personalities, id);
	if (creds) {
		put_cred(creds);
		return 0;
	}

	return -EINVAL;
}
static int io_register_personality(struct io_ring_ctx *ctx)
{
	const struct cred *creds;
	u32 id;
	int ret;

	creds = get_current_cred();

	ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds,
			XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL);
	if (ret < 0) {
		put_cred(creds);
		return ret;
	}
	return id;
}
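
/*
 * IORING_REGISTER_RESTRICTIONS: record which register opcodes, SQE opcodes
 * and SQE flags remain usable once the ring is enabled. Only permitted
 * while the ring is still IORING_SETUP_R_DISABLED, and only once. A minimal
 * userspace sketch (illustrative only, not part of this file; assumes a
 * ring created with IORING_SETUP_R_DISABLED):
 *
 *	struct io_uring_restriction res[2] = {
 *		{ .opcode = IORING_RESTRICTION_SQE_OP,
 *		  .sqe_op = IORING_OP_READV },
 *		{ .opcode = IORING_RESTRICTION_REGISTER_OP,
 *		  .register_op = IORING_REGISTER_ENABLE_RINGS },
 *	};
 *
 *	io_uring_register(ring_fd, IORING_REGISTER_RESTRICTIONS, res, 2);
 *	io_uring_register(ring_fd, IORING_REGISTER_ENABLE_RINGS, NULL, 0);
 */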
static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
					   void __user *arg, unsigned int nr_args)
{
	struct io_uring_restriction *res;
	size_t size;
	int i, ret;

	/* Restrictions allowed only if rings started disabled */
	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EBADFD;

	/* We allow only a single restrictions registration */
	if (ctx->restrictions.registered)
		return -EBUSY;

	if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
		return -EINVAL;

	size = array_size(nr_args, sizeof(*res));
	if (size == SIZE_MAX)
		return -EOVERFLOW;

	res = memdup_user(arg, size);
	if (IS_ERR(res))
		return PTR_ERR(res);

	ret = 0;

	for (i = 0; i < nr_args; i++) {
		switch (res[i].opcode) {
		case IORING_RESTRICTION_REGISTER_OP:
			if (res[i].register_op >= IORING_REGISTER_LAST) {
				ret = -EINVAL;
				goto out;
			}

			__set_bit(res[i].register_op,
				  ctx->restrictions.register_op);
			break;
		case IORING_RESTRICTION_SQE_OP:
			if (res[i].sqe_op >= IORING_OP_LAST) {
				ret = -EINVAL;
				goto out;
			}

			__set_bit(res[i].sqe_op, ctx->restrictions.sqe_op);
			break;
		case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
			ctx->restrictions.sqe_flags_allowed = res[i].sqe_flags;
			break;
		case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
			ctx->restrictions.sqe_flags_required = res[i].sqe_flags;
			break;
		default:
			ret = -EINVAL;
			goto out;
		}
	}

out:
	/* Reset all restrictions if an error happened */
	if (ret != 0)
		memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
	else
		ctx->restrictions.registered = true;

	kfree(res);
	return ret;
}
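
/*
 * IORING_REGISTER_ENABLE_RINGS: lift IORING_SETUP_R_DISABLED so submissions
 * are accepted, latch the submitter task for single-issuer rings, turn on
 * restriction enforcement if restrictions were registered, and kick a
 * waiting SQPOLL thread.
 */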
static int io_register_enable_rings(struct io_ring_ctx *ctx)
{
	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EBADFD;

	if (ctx->flags & IORING_SETUP_SINGLE_ISSUER && !ctx->submitter_task) {
		WRITE_ONCE(ctx->submitter_task, get_task_struct(current));
		/*
		 * Lazy activation attempts would fail if it was polled before
		 * submitter_task is set.
		 */
		if (wq_has_sleeper(&ctx->poll_wq))
			io_activate_pollwq(ctx);
	}

	if (ctx->restrictions.registered)
		ctx->restricted = 1;

	ctx->flags &= ~IORING_SETUP_R_DISABLED;
	if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait))
		wake_up(&ctx->sq_data->wait);
	return 0;
}
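
/*
 * io-wq CPU affinity: for plain rings the mask applies to the current
 * task's io-wq; for IORING_SETUP_SQPOLL it applies to the io-wq backing
 * the SQPOLL thread. uring_lock is dropped across the SQPOLL call to avoid
 * inverting the sqd->lock -> uring_lock ordering.
 */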
static __cold int __io_register_iowq_aff(struct io_ring_ctx *ctx,
					 cpumask_var_t new_mask)
{
	int ret;

	if (!(ctx->flags & IORING_SETUP_SQPOLL)) {
		ret = io_wq_cpu_affinity(current->io_uring, new_mask);
	} else {
		mutex_unlock(&ctx->uring_lock);
		ret = io_sqpoll_wq_cpu_affinity(ctx, new_mask);
		mutex_lock(&ctx->uring_lock);
	}

	return ret;
}
static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx,
				       void __user *arg, unsigned len)
{
	cpumask_var_t new_mask;
	int ret;

	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
		return -ENOMEM;

	cpumask_clear(new_mask);
	if (len > cpumask_size())
		len = cpumask_size();

	if (in_compat_syscall())
		ret = compat_get_bitmap(cpumask_bits(new_mask),
					(const compat_ulong_t __user *)arg,
					len * 8 /* CHAR_BIT */);
	else
		ret = copy_from_user(new_mask, arg, len);

	if (ret) {
		free_cpumask_var(new_mask);
		return -EFAULT;
	}

	ret = __io_register_iowq_aff(ctx, new_mask);
	free_cpumask_var(new_mask);
	return ret;
}
static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
{
	return __io_register_iowq_aff(ctx, NULL);
}
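
/*
 * IORING_REGISTER_IOWQ_MAX_WORKERS: copy in the two worker limits (bounded,
 * unbounded), remember them on the ctx, apply them to the issuing task's
 * io-wq (or the SQPOLL task's), hand the previous values back to userspace,
 * then propagate the new limits to every task attached to the ring.
 */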
static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
					       void __user *arg)
	__must_hold(&ctx->uring_lock)
{
	struct io_tctx_node *node;
	struct io_uring_task *tctx = NULL;
	struct io_sq_data *sqd = NULL;
	__u32 new_count[2];
	int i, ret;

	if (copy_from_user(new_count, arg, sizeof(new_count)))
		return -EFAULT;
	for (i = 0; i < ARRAY_SIZE(new_count); i++)
		if (new_count[i] > INT_MAX)
			return -EINVAL;

	if (ctx->flags & IORING_SETUP_SQPOLL) {
		sqd = ctx->sq_data;
		if (sqd) {
			/*
			 * Observe the correct sqd->lock -> ctx->uring_lock
			 * ordering. Fine to drop uring_lock here, we hold
			 * a ref to the ctx.
			 */
			refcount_inc(&sqd->refs);
			mutex_unlock(&ctx->uring_lock);
			mutex_lock(&sqd->lock);
			mutex_lock(&ctx->uring_lock);
			if (sqd->thread)
				tctx = sqd->thread->io_uring;
		}
	} else {
		tctx = current->io_uring;
	}

	BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits));

	for (i = 0; i < ARRAY_SIZE(new_count); i++)
		if (new_count[i])
			ctx->iowq_limits[i] = new_count[i];
	ctx->iowq_limits_set = true;

	if (tctx && tctx->io_wq) {
		ret = io_wq_max_workers(tctx->io_wq, new_count);
		if (ret)
			goto err;
	} else {
		memset(new_count, 0, sizeof(new_count));
	}

	if (sqd) {
		mutex_unlock(&sqd->lock);
		io_put_sq_data(sqd);
	}

	if (copy_to_user(arg, new_count, sizeof(new_count)))
		return -EFAULT;

	/* that's it for SQPOLL, only the SQPOLL task creates requests */
	if (sqd)
		return 0;

	/* now propagate the restriction to all registered users */
	list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
		struct io_uring_task *tctx = node->task->io_uring;

		if (WARN_ON_ONCE(!tctx->io_wq))
			continue;

		for (i = 0; i < ARRAY_SIZE(new_count); i++)
			new_count[i] = ctx->iowq_limits[i];
		/* ignore errors, it always returns zero anyway */
		(void)io_wq_max_workers(tctx->io_wq, new_count);
	}
	return 0;
err:
	if (sqd) {
		mutex_unlock(&sqd->lock);
		io_put_sq_data(sqd);
	}
	return ret;
}
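
/*
 * Central dispatcher for io_uring_register(2). Runs with ctx->uring_lock
 * held; if the ring is restricted, the opcode must have been allowed via
 * IORING_REGISTER_RESTRICTIONS before the per-opcode helper is invoked.
 */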
static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
			       void __user *arg, unsigned nr_args)
	__releases(ctx->uring_lock)
	__acquires(ctx->uring_lock)
{
	int ret;

	/*
	 * We don't quiesce the refs for register anymore and so it can't be
	 * dying as we're holding a file ref here.
	 */
	if (WARN_ON_ONCE(percpu_ref_is_dying(&ctx->refs)))
		return -ENXIO;

	if (ctx->submitter_task && ctx->submitter_task != current)
		return -EEXIST;

	if (ctx->restricted) {
		opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
		if (!test_bit(opcode, ctx->restrictions.register_op))
			return -EACCES;
	}

	switch (opcode) {
	case IORING_REGISTER_BUFFERS:
		ret = -EFAULT;
		if (!arg)
			break;
		ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL);
		break;
	case IORING_UNREGISTER_BUFFERS:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_sqe_buffers_unregister(ctx);
		break;
	case IORING_REGISTER_FILES:
		ret = -EFAULT;
		if (!arg)
			break;
		ret = io_sqe_files_register(ctx, arg, nr_args, NULL);
		break;
	case IORING_UNREGISTER_FILES:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_sqe_files_unregister(ctx);
		break;
	case IORING_REGISTER_FILES_UPDATE:
		ret = io_register_files_update(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_EVENTFD:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_eventfd_register(ctx, arg, 0);
		break;
	case IORING_REGISTER_EVENTFD_ASYNC:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_eventfd_register(ctx, arg, 1);
		break;
	case IORING_UNREGISTER_EVENTFD:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_eventfd_unregister(ctx);
		break;
	case IORING_REGISTER_PROBE:
		ret = -EINVAL;
		if (!arg || nr_args > 256)
			break;
		ret = io_probe(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_PERSONALITY:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_register_personality(ctx);
		break;
	case IORING_UNREGISTER_PERSONALITY:
		ret = -EINVAL;
		if (arg)
			break;
		ret = io_unregister_personality(ctx, nr_args);
		break;
	case IORING_REGISTER_ENABLE_RINGS:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_register_enable_rings(ctx);
		break;
	case IORING_REGISTER_RESTRICTIONS:
		ret = io_register_restrictions(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_FILES2:
		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE);
		break;
	case IORING_REGISTER_FILES_UPDATE2:
		ret = io_register_rsrc_update(ctx, arg, nr_args,
					      IORING_RSRC_FILE);
		break;
	case IORING_REGISTER_BUFFERS2:
		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER);
		break;
	case IORING_REGISTER_BUFFERS_UPDATE:
		ret = io_register_rsrc_update(ctx, arg, nr_args,
					      IORING_RSRC_BUFFER);
		break;
	case IORING_REGISTER_IOWQ_AFF:
		ret = -EINVAL;
		if (!arg || !nr_args)
			break;
		ret = io_register_iowq_aff(ctx, arg, nr_args);
		break;
	case IORING_UNREGISTER_IOWQ_AFF:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_unregister_iowq_aff(ctx);
		break;
	case IORING_REGISTER_IOWQ_MAX_WORKERS:
		ret = -EINVAL;
		if (!arg || nr_args != 2)
			break;
		ret = io_register_iowq_max_workers(ctx, arg);
		break;
	case IORING_REGISTER_RING_FDS:
		ret = io_ringfd_register(ctx, arg, nr_args);
		break;
	case IORING_UNREGISTER_RING_FDS:
		ret = io_ringfd_unregister(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_PBUF_RING:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_pbuf_ring(ctx, arg);
		break;
	case IORING_UNREGISTER_PBUF_RING:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_unregister_pbuf_ring(ctx, arg);
		break;
	case IORING_REGISTER_SYNC_CANCEL:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_sync_cancel(ctx, arg);
		break;
	case IORING_REGISTER_FILE_ALLOC_RANGE:
		ret = -EINVAL;
		if (!arg || nr_args)
			break;
		ret = io_register_file_alloc_range(ctx, arg);
		break;
	case IORING_REGISTER_PBUF_STATUS:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_pbuf_status(ctx, arg);
		break;
	case IORING_REGISTER_NAPI:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_napi(ctx, arg);
		break;
	case IORING_UNREGISTER_NAPI:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_unregister_napi(ctx, arg);
		break;
	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}
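
/*
 * Syscall entry point. The IORING_REGISTER_USE_REGISTERED_RING bit in the
 * opcode selects whether 'fd' is a normal file descriptor or an index into
 * the task's registered-ring array set up via IORING_REGISTER_RING_FDS.
 */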
SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
		void __user *, arg, unsigned int, nr_args)
{
	struct io_ring_ctx *ctx;
	long ret = -EBADF;
	struct file *file;
	bool use_registered_ring;

	use_registered_ring = !!(opcode & IORING_REGISTER_USE_REGISTERED_RING);
	opcode &= ~IORING_REGISTER_USE_REGISTERED_RING;

	if (opcode >= IORING_REGISTER_LAST)
		return -EINVAL;

	if (use_registered_ring) {
		/*
		 * Ring fd has been registered via IORING_REGISTER_RING_FDS, we
		 * need only dereference our task private array to find it.
		 */
		struct io_uring_task *tctx = current->io_uring;

		if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX))
			return -EINVAL;
		fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
		file = tctx->registered_rings[fd];
		if (unlikely(!file))
			return -EBADF;
	} else {
		file = fget(fd);
		if (unlikely(!file))
			return -EBADF;
	}

	ret = -EOPNOTSUPP;
	if (!io_is_uring_fops(file))
		goto out_fput;

	ctx = file->private_data;

	mutex_lock(&ctx->uring_lock);
	ret = __io_uring_register(ctx, opcode, arg, nr_args);
	mutex_unlock(&ctx->uring_lock);
	trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs, ret);
out_fput:
	if (!use_registered_ring)
		fput(file);
	return ret;
}