// SPDX-License-Identifier: GPL-2.0
/*
 * Code related to the io_uring_register() syscall
 *
 * Copyright (C) 2023 Jens Axboe
 */
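
/*
 * Userspace reaches this code through the io_uring_register(2) syscall:
 * a ring fd, an opcode, an optional argument pointer and an argument
 * count. A minimal raw-syscall sketch is shown below; ring_register()
 * is a name made up for these examples (most applications would use
 * liburing instead), illustrative only:
 *
 *	#include <linux/io_uring.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	static int ring_register(int ring_fd, unsigned int opcode,
 *				 void *arg, unsigned int nr_args)
 *	{
 *		return syscall(__NR_io_uring_register, ring_fd, opcode,
 *			       arg, nr_args);
 *	}
 */
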
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/syscalls.h>
#include <linux/refcount.h>
#include <linux/bits.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/nospec.h>
#include <linux/compat.h>
#include <linux/io_uring.h>
#include <linux/io_uring_types.h>

#include "io_uring.h"
#include "opdef.h"
#include "tctx.h"
#include "rsrc.h"
#include "sqpoll.h"
#include "register.h"
#include "cancel.h"
#include "kbuf.h"

#define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \
                                 IORING_REGISTER_LAST + IORING_OP_LAST)

static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,
                               unsigned int eventfd_async)
{
        struct io_ev_fd *ev_fd;
        __s32 __user *fds = arg;
        int fd;

        ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
                                          lockdep_is_held(&ctx->uring_lock));
        if (ev_fd)
                return -EBUSY;

        if (copy_from_user(&fd, fds, sizeof(*fds)))
                return -EFAULT;

        ev_fd = kmalloc(sizeof(*ev_fd), GFP_KERNEL);
        if (!ev_fd)
                return -ENOMEM;

        ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd);
        if (IS_ERR(ev_fd->cq_ev_fd)) {
                int ret = PTR_ERR(ev_fd->cq_ev_fd);

                kfree(ev_fd);
                return ret;
        }

        spin_lock(&ctx->completion_lock);
        ctx->evfd_last_cq_tail = ctx->cached_cq_tail;
        spin_unlock(&ctx->completion_lock);

        ev_fd->eventfd_async = eventfd_async;
        ctx->has_evfd = true;
        rcu_assign_pointer(ctx->io_ev_fd, ev_fd);
        atomic_set(&ev_fd->refs, 1);
        atomic_set(&ev_fd->ops, 0);
        return 0;
}
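
/*
 * A minimal userspace sketch of the eventfd registration handled above,
 * assuming the hypothetical ring_register() wrapper from the top of this
 * file: pass a pointer to a single __s32 eventfd descriptor with
 * nr_args == 1, using IORING_REGISTER_EVENTFD (or
 * IORING_REGISTER_EVENTFD_ASYNC to only signal for async completions).
 *
 *	#include <sys/eventfd.h>
 *
 *	int evfd = eventfd(0, EFD_CLOEXEC);
 *	int ret = ring_register(ring_fd, IORING_REGISTER_EVENTFD,
 *				&evfd, 1);
 *	// ret == 0 on success; the eventfd is then signalled on CQE posts
 */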

int io_eventfd_unregister(struct io_ring_ctx *ctx)
{
        struct io_ev_fd *ev_fd;

        ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
                                          lockdep_is_held(&ctx->uring_lock));
        if (ev_fd) {
                ctx->has_evfd = false;
                rcu_assign_pointer(ctx->io_ev_fd, NULL);
                if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_FREE_BIT), &ev_fd->ops))
                        call_rcu(&ev_fd->rcu, io_eventfd_ops);
                return 0;
        }

        return -ENXIO;
}

static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
                           unsigned nr_args)
{
        struct io_uring_probe *p;
        size_t size;
        int i, ret;

        size = struct_size(p, ops, nr_args);
        if (size == SIZE_MAX)
                return -EOVERFLOW;
        p = kzalloc(size, GFP_KERNEL);
        if (!p)
                return -ENOMEM;

        ret = -EFAULT;
        if (copy_from_user(p, arg, size))
                goto out;
        ret = -EINVAL;
        if (memchr_inv(p, 0, size))
                goto out;

        p->last_op = IORING_OP_LAST - 1;
        if (nr_args > IORING_OP_LAST)
                nr_args = IORING_OP_LAST;

        for (i = 0; i < nr_args; i++) {
                p->ops[i].op = i;
                if (!io_issue_defs[i].not_supported)
                        p->ops[i].flags = IO_URING_OP_SUPPORTED;
        }
        p->ops_len = i;

        ret = 0;
        if (copy_to_user(arg, p, size))
                ret = -EFAULT;
out:
        kfree(p);
        return ret;
}
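
/*
 * Probe usage from userspace, sketched under the same assumption as above
 * (hypothetical ring_register() wrapper): the caller supplies a zeroed
 * struct io_uring_probe followed by nr_args probe_op slots, and the
 * kernel fills in which opcodes are supported.
 *
 *	size_t len = sizeof(struct io_uring_probe) +
 *		     IORING_OP_LAST * sizeof(struct io_uring_probe_op);
 *	struct io_uring_probe *probe = calloc(1, len);
 *
 *	if (!ring_register(ring_fd, IORING_REGISTER_PROBE, probe,
 *			   IORING_OP_LAST))
 *		printf("last opcode known to kernel: %d\n", probe->last_op);
 *	free(probe);
 */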

int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
{
        const struct cred *creds;

        creds = xa_erase(&ctx->personalities, id);
        if (creds) {
                put_cred(creds);
                return 0;
        }

        return -EINVAL;
}

static int io_register_personality(struct io_ring_ctx *ctx)
{
        const struct cred *creds;
        u32 id;
        int ret;

        creds = get_current_cred();

        ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds,
                        XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL);
        if (ret < 0) {
                put_cred(creds);
                return ret;
        }
        return id;
}

static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
                                           void __user *arg, unsigned int nr_args)
{
        struct io_uring_restriction *res;
        size_t size;
        int i, ret;

        /* Restrictions allowed only if rings started disabled */
        if (!(ctx->flags & IORING_SETUP_R_DISABLED))
                return -EBADFD;

        /* We allow only a single restrictions registration */
        if (ctx->restrictions.registered)
                return -EBUSY;

        if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
                return -EINVAL;

        size = array_size(nr_args, sizeof(*res));
        if (size == SIZE_MAX)
                return -EOVERFLOW;

        res = memdup_user(arg, size);
        if (IS_ERR(res))
                return PTR_ERR(res);

        ret = 0;

        for (i = 0; i < nr_args; i++) {
                switch (res[i].opcode) {
                case IORING_RESTRICTION_REGISTER_OP:
                        if (res[i].register_op >= IORING_REGISTER_LAST) {
                                ret = -EINVAL;
                                goto out;
                        }

                        __set_bit(res[i].register_op,
                                  ctx->restrictions.register_op);
                        break;
                case IORING_RESTRICTION_SQE_OP:
                        if (res[i].sqe_op >= IORING_OP_LAST) {
                                ret = -EINVAL;
                                goto out;
                        }

                        __set_bit(res[i].sqe_op, ctx->restrictions.sqe_op);
                        break;
                case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
                        ctx->restrictions.sqe_flags_allowed = res[i].sqe_flags;
                        break;
                case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
                        ctx->restrictions.sqe_flags_required = res[i].sqe_flags;
                        break;
                default:
                        ret = -EINVAL;
                        goto out;
                }
        }

out:
        /* Reset all restrictions if an error happened */
        if (ret != 0)
                memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
        else
                ctx->restrictions.registered = true;

        kfree(res);
        return ret;
}
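
/*
 * The intended userspace flow, sketched with the hypothetical
 * ring_register() wrapper from above: create the ring with
 * IORING_SETUP_R_DISABLED, install the allow-list while it is still
 * disabled, then enable the ring. After that, only the listed register
 * opcodes and SQE opcodes are accepted.
 *
 *	struct io_uring_restriction res[2] = {
 *		{ .opcode = IORING_RESTRICTION_SQE_OP,
 *		  .sqe_op = IORING_OP_READV },
 *		{ .opcode = IORING_RESTRICTION_REGISTER_OP,
 *		  .register_op = IORING_REGISTER_BUFFERS },
 *	};
 *
 *	ring_register(ring_fd, IORING_REGISTER_RESTRICTIONS, res, 2);
 *	ring_register(ring_fd, IORING_REGISTER_ENABLE_RINGS, NULL, 0);
 */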

static int io_register_enable_rings(struct io_ring_ctx *ctx)
{
        if (!(ctx->flags & IORING_SETUP_R_DISABLED))
                return -EBADFD;

        if (ctx->flags & IORING_SETUP_SINGLE_ISSUER && !ctx->submitter_task) {
                WRITE_ONCE(ctx->submitter_task, get_task_struct(current));
                /*
                 * Lazy activation attempts would fail if it was polled before
                 * submitter_task is set.
                 */
                if (wq_has_sleeper(&ctx->poll_wq))
                        io_activate_pollwq(ctx);
        }

        if (ctx->restrictions.registered)
                ctx->restricted = 1;

        ctx->flags &= ~IORING_SETUP_R_DISABLED;
        if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait))
                wake_up(&ctx->sq_data->wait);
        return 0;
}

static __cold int __io_register_iowq_aff(struct io_ring_ctx *ctx,
                                         cpumask_var_t new_mask)
{
        int ret;

        if (!(ctx->flags & IORING_SETUP_SQPOLL)) {
                ret = io_wq_cpu_affinity(current->io_uring, new_mask);
        } else {
                mutex_unlock(&ctx->uring_lock);
                ret = io_sqpoll_wq_cpu_affinity(ctx, new_mask);
                mutex_lock(&ctx->uring_lock);
        }

        return ret;
}

static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx,
                                       void __user *arg, unsigned len)
{
        cpumask_var_t new_mask;
        int ret;

        if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
                return -ENOMEM;

        cpumask_clear(new_mask);
        if (len > cpumask_size())
                len = cpumask_size();

#ifdef CONFIG_COMPAT
        if (in_compat_syscall())
                ret = compat_get_bitmap(cpumask_bits(new_mask),
                                        (const compat_ulong_t __user *)arg,
                                        len * 8 /* CHAR_BIT */);
        else
#endif
                ret = copy_from_user(new_mask, arg, len);

        if (ret) {
                free_cpumask_var(new_mask);
                return -EFAULT;
        }

        ret = __io_register_iowq_aff(ctx, new_mask);
        free_cpumask_var(new_mask);
        return ret;
}
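
/*
 * Affinity registration sketch (same hypothetical ring_register()
 * wrapper): the argument is a CPU bitmap and nr_args is its size in
 * bytes, so a plain cpu_set_t from <sched.h> works (glibc needs
 * _GNU_SOURCE for the CPU_* macros).
 *
 *	cpu_set_t mask;
 *
 *	CPU_ZERO(&mask);
 *	CPU_SET(2, &mask);
 *	CPU_SET(3, &mask);
 *	ring_register(ring_fd, IORING_REGISTER_IOWQ_AFF, &mask,
 *		      sizeof(mask));
 */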

static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
{
        return __io_register_iowq_aff(ctx, NULL);
}

static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
                                               void __user *arg)
        __must_hold(&ctx->uring_lock)
{
        struct io_tctx_node *node;
        struct io_uring_task *tctx = NULL;
        struct io_sq_data *sqd = NULL;
        __u32 new_count[2];
        int i, ret;

        if (copy_from_user(new_count, arg, sizeof(new_count)))
                return -EFAULT;
        for (i = 0; i < ARRAY_SIZE(new_count); i++)
                if (new_count[i] > INT_MAX)
                        return -EINVAL;

        if (ctx->flags & IORING_SETUP_SQPOLL) {
                sqd = ctx->sq_data;
                if (sqd) {
                        /*
                         * Observe the correct sqd->lock -> ctx->uring_lock
                         * ordering. Fine to drop uring_lock here, we hold
                         * a ref to the ctx.
                         */
                        refcount_inc(&sqd->refs);
                        mutex_unlock(&ctx->uring_lock);
                        mutex_lock(&sqd->lock);
                        mutex_lock(&ctx->uring_lock);
                        if (sqd->thread)
                                tctx = sqd->thread->io_uring;
                }
        } else {
                tctx = current->io_uring;
        }

        BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits));

        for (i = 0; i < ARRAY_SIZE(new_count); i++)
                if (new_count[i])
                        ctx->iowq_limits[i] = new_count[i];
        ctx->iowq_limits_set = true;

        if (tctx && tctx->io_wq) {
                ret = io_wq_max_workers(tctx->io_wq, new_count);
                if (ret)
                        goto err;
        } else {
                memset(new_count, 0, sizeof(new_count));
        }

        if (sqd) {
                mutex_unlock(&sqd->lock);
                io_put_sq_data(sqd);
        }

        if (copy_to_user(arg, new_count, sizeof(new_count)))
                return -EFAULT;

        /* that's it for SQPOLL, only the SQPOLL task creates requests */
        if (sqd)
                return 0;

        /* now propagate the restriction to all registered users */
        list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
                struct io_uring_task *tctx = node->task->io_uring;

                if (WARN_ON_ONCE(!tctx->io_wq))
                        continue;

                for (i = 0; i < ARRAY_SIZE(new_count); i++)
                        new_count[i] = ctx->iowq_limits[i];
                /* ignore errors, it always returns zero anyway */
                (void)io_wq_max_workers(tctx->io_wq, new_count);
        }
        return 0;
err:
        if (sqd) {
                mutex_unlock(&sqd->lock);
                io_put_sq_data(sqd);
        }
        return ret;
}
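
/*
 * Worker-limit sketch (hypothetical ring_register() wrapper again):
 * nr_args must be 2 and the argument is a pair of __u32 counts, index 0
 * for the bounded and index 1 for the unbounded io-wq pool; a zero
 * leaves that limit untouched, and the previous values are copied back
 * to userspace.
 *
 *	unsigned int counts[2] = { 4, 16 };
 *
 *	if (!ring_register(ring_fd, IORING_REGISTER_IOWQ_MAX_WORKERS,
 *			   counts, 2))
 *		printf("old limits: bounded %u, unbounded %u\n",
 *		       counts[0], counts[1]);
 */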

static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
                               void __user *arg, unsigned nr_args)
        __releases(ctx->uring_lock)
        __acquires(ctx->uring_lock)
{
        int ret;

        /*
         * We don't quiesce the refs for register anymore and so it can't be
         * dying as we're holding a file ref here.
         */
        if (WARN_ON_ONCE(percpu_ref_is_dying(&ctx->refs)))
                return -ENXIO;

        if (ctx->submitter_task && ctx->submitter_task != current)
                return -EEXIST;

        if (ctx->restricted) {
                opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
                if (!test_bit(opcode, ctx->restrictions.register_op))
                        return -EACCES;
        }

        switch (opcode) {
        case IORING_REGISTER_BUFFERS:
                ret = -EFAULT;
                if (!arg)
                        break;
                ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL);
                break;
        case IORING_UNREGISTER_BUFFERS:
                ret = -EINVAL;
                if (arg || nr_args)
                        break;
                ret = io_sqe_buffers_unregister(ctx);
                break;
        case IORING_REGISTER_FILES:
                ret = -EFAULT;
                if (!arg)
                        break;
                ret = io_sqe_files_register(ctx, arg, nr_args, NULL);
                break;
        case IORING_UNREGISTER_FILES:
                ret = -EINVAL;
                if (arg || nr_args)
                        break;
                ret = io_sqe_files_unregister(ctx);
                break;
        case IORING_REGISTER_FILES_UPDATE:
                ret = io_register_files_update(ctx, arg, nr_args);
                break;
        case IORING_REGISTER_EVENTFD:
                ret = -EINVAL;
                if (nr_args != 1)
                        break;
                ret = io_eventfd_register(ctx, arg, 0);
                break;
        case IORING_REGISTER_EVENTFD_ASYNC:
                ret = -EINVAL;
                if (nr_args != 1)
                        break;
                ret = io_eventfd_register(ctx, arg, 1);
                break;
        case IORING_UNREGISTER_EVENTFD:
                ret = -EINVAL;
                if (arg || nr_args)
                        break;
                ret = io_eventfd_unregister(ctx);
                break;
        case IORING_REGISTER_PROBE:
                ret = -EINVAL;
                if (!arg || nr_args > 256)
                        break;
                ret = io_probe(ctx, arg, nr_args);
                break;
        case IORING_REGISTER_PERSONALITY:
                ret = -EINVAL;
                if (arg || nr_args)
                        break;
                ret = io_register_personality(ctx);
                break;
        case IORING_UNREGISTER_PERSONALITY:
                ret = -EINVAL;
                if (arg)
                        break;
                ret = io_unregister_personality(ctx, nr_args);
                break;
        case IORING_REGISTER_ENABLE_RINGS:
                ret = -EINVAL;
                if (arg || nr_args)
                        break;
                ret = io_register_enable_rings(ctx);
                break;
        case IORING_REGISTER_RESTRICTIONS:
                ret = io_register_restrictions(ctx, arg, nr_args);
                break;
        case IORING_REGISTER_FILES2:
                ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE);
                break;
        case IORING_REGISTER_FILES_UPDATE2:
                ret = io_register_rsrc_update(ctx, arg, nr_args,
                                              IORING_RSRC_FILE);
                break;
        case IORING_REGISTER_BUFFERS2:
                ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER);
                break;
        case IORING_REGISTER_BUFFERS_UPDATE:
                ret = io_register_rsrc_update(ctx, arg, nr_args,
                                              IORING_RSRC_BUFFER);
                break;
        case IORING_REGISTER_IOWQ_AFF:
                ret = -EINVAL;
                if (!arg || !nr_args)
                        break;
                ret = io_register_iowq_aff(ctx, arg, nr_args);
                break;
        case IORING_UNREGISTER_IOWQ_AFF:
                ret = -EINVAL;
                if (arg || nr_args)
                        break;
                ret = io_unregister_iowq_aff(ctx);
                break;
        case IORING_REGISTER_IOWQ_MAX_WORKERS:
                ret = -EINVAL;
                if (!arg || nr_args != 2)
                        break;
                ret = io_register_iowq_max_workers(ctx, arg);
                break;
        case IORING_REGISTER_RING_FDS:
                ret = io_ringfd_register(ctx, arg, nr_args);
                break;
        case IORING_UNREGISTER_RING_FDS:
                ret = io_ringfd_unregister(ctx, arg, nr_args);
                break;
        case IORING_REGISTER_PBUF_RING:
                ret = -EINVAL;
                if (!arg || nr_args != 1)
                        break;
                ret = io_register_pbuf_ring(ctx, arg);
                break;
        case IORING_UNREGISTER_PBUF_RING:
                ret = -EINVAL;
                if (!arg || nr_args != 1)
                        break;
                ret = io_unregister_pbuf_ring(ctx, arg);
                break;
        case IORING_REGISTER_SYNC_CANCEL:
                ret = -EINVAL;
                if (!arg || nr_args != 1)
                        break;
                ret = io_sync_cancel(ctx, arg);
                break;
        case IORING_REGISTER_FILE_ALLOC_RANGE:
                ret = -EINVAL;
                if (!arg || nr_args)
                        break;
                ret = io_register_file_alloc_range(ctx, arg);
                break;
        case IORING_REGISTER_PBUF_STATUS:
                ret = -EINVAL;
                if (!arg || nr_args != 1)
                        break;
                ret = io_register_pbuf_status(ctx, arg);
                break;
        default:
                ret = -EINVAL;
                break;
        }

        return ret;
}

SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
                void __user *, arg, unsigned int, nr_args)
{
        struct io_ring_ctx *ctx;
        long ret = -EBADF;
        struct file *file;
        bool use_registered_ring;

        use_registered_ring = !!(opcode & IORING_REGISTER_USE_REGISTERED_RING);
        opcode &= ~IORING_REGISTER_USE_REGISTERED_RING;

        if (opcode >= IORING_REGISTER_LAST)
                return -EINVAL;

        if (use_registered_ring) {
                /*
                 * Ring fd has been registered via IORING_REGISTER_RING_FDS, we
                 * need only dereference our task private array to find it.
                 */
                struct io_uring_task *tctx = current->io_uring;

                if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX))
                        return -EINVAL;
                fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
                file = tctx->registered_rings[fd];
                if (unlikely(!file))
                        return -EBADF;
        } else {
                file = fget(fd);
                if (unlikely(!file))
                        return -EBADF;
                ret = -EOPNOTSUPP;
                if (!io_is_uring_fops(file))
                        goto out_fput;
        }

        ctx = file->private_data;

        mutex_lock(&ctx->uring_lock);
        ret = __io_uring_register(ctx, opcode, arg, nr_args);
        mutex_unlock(&ctx->uring_lock);
        trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs, ret);
out_fput:
        if (!use_registered_ring)
                fput(file);
        return ret;
}
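
/*
 * The registered-ring path above can be exercised from userspace with a
 * sketch like the following (hypothetical ring_register() wrapper, evfd
 * as in the earlier eventfd sketch): register the ring fd once, then OR
 * IORING_REGISTER_USE_REGISTERED_RING into the opcode and pass the
 * returned offset instead of the real fd on later calls, which skips the
 * fget()/fput() pair per syscall.
 *
 *	struct io_uring_rsrc_update upd = {
 *		.offset = -1U,		// let the kernel pick a slot
 *		.data = ring_fd,
 *	};
 *
 *	ring_register(ring_fd, IORING_REGISTER_RING_FDS, &upd, 1);
 *	ring_register(upd.offset,
 *		      IORING_REGISTER_EVENTFD |
 *		      IORING_REGISTER_USE_REGISTERED_RING, &evfd, 1);
 */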