// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/nospec.h>
#include <linux/hugetlb.h>
#include <linux/compat.h>
#include <linux/io_uring.h>

#include <uapi/linux/io_uring.h>

#include "io_uring.h"
#include "openclose.h"
#include "rsrc.h"
#include "notif.h"

struct io_rsrc_update {
	struct file *file;
	u64 arg;
	u32 nr_args;
	u32 offset;
	int type;
};

static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
				  struct io_mapped_ubuf **pimu,
				  struct page **last_hpage);

#define IO_RSRC_REF_BATCH 100

/* only define max */
#define IORING_MAX_FIXED_FILES (1U << 20)
#define IORING_MAX_REG_BUFFERS (1U << 14)

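/*
 * Drop any references still cached for the current rsrc node, e.g. before
 * the node is switched out. Must be called with ->uring_lock held.
 */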
void io_rsrc_refs_drop(struct io_ring_ctx *ctx)
	__must_hold(&ctx->uring_lock)
{
	if (ctx->rsrc_cached_refs) {
		io_rsrc_put_node(ctx->rsrc_node, ctx->rsrc_cached_refs);
		ctx->rsrc_cached_refs = 0;
	}
}

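/*
 * Charge nr_pages against the user's RLIMIT_MEMLOCK allowance, failing with
 * -ENOMEM if the new total would exceed the limit.
 */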
int __io_account_mem(struct user_struct *user, unsigned long nr_pages)
{
	unsigned long page_limit, cur_pages, new_pages;

	if (!nr_pages)
		return 0;

	/* Don't allow more pages than we can safely lock */
	page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;

	cur_pages = atomic_long_read(&user->locked_vm);
	do {
		new_pages = cur_pages + nr_pages;
		if (new_pages > page_limit)
			return -ENOMEM;
	} while (!atomic_long_try_cmpxchg(&user->locked_vm,
					  &cur_pages, new_pages));
	return 0;
}

static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
{
	if (ctx->user)
		__io_unaccount_mem(ctx->user, nr_pages);

	if (ctx->mm_account)
		atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm);
}

static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
{
	int ret;

	if (ctx->user) {
		ret = __io_account_mem(ctx->user, nr_pages);
		if (ret)
			return ret;
	}

	if (ctx->mm_account)
		atomic64_add(nr_pages, &ctx->mm_account->pinned_vm);

	return 0;
}

static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
		       void __user *arg, unsigned index)
{
	struct iovec __user *src;

#ifdef CONFIG_COMPAT
	if (ctx->compat) {
		struct compat_iovec __user *ciovs;
		struct compat_iovec ciov;

		ciovs = (struct compat_iovec __user *) arg;
		if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov)))
			return -EFAULT;

		dst->iov_base = u64_to_user_ptr((u64)ciov.iov_base);
		dst->iov_len = ciov.iov_len;
		return 0;
	}
#endif
	src = (struct iovec __user *) arg;
	if (copy_from_user(dst, &src[index], sizeof(*dst)))
		return -EFAULT;
	return 0;
}

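/*
 * Basic sanity checks on a buffer being registered: a non-NULL base unless
 * the slot is left empty, a non-zero length capped at 1G, and no address
 * wraparound once the length is rounded up to page granularity.
 */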
static int io_buffer_validate(struct iovec *iov)
{
	unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1);

	/*
	 * Don't impose further limits on the size and buffer
	 * constraints here, we'll -EINVAL later when IO is
	 * submitted if they are wrong.
	 */
	if (!iov->iov_base)
		return iov->iov_len ? -EFAULT : 0;
	if (!iov->iov_len)
		return -EFAULT;

	/* arbitrary limit, but we need something */
	if (iov->iov_len > SZ_1G)
		return -EFAULT;

	if (check_add_overflow((unsigned long)iov->iov_base, acct_len, &tmp))
		return -EOVERFLOW;

	return 0;
}

static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slot)
{
	struct io_mapped_ubuf *imu = *slot;
	unsigned int i;

	if (imu != ctx->dummy_ubuf) {
		for (i = 0; i < imu->nr_bvecs; i++)
			unpin_user_page(imu->bvec[i].bv_page);
		if (imu->acct_pages)
			io_unaccount_mem(ctx, imu->acct_pages);
		kvfree(imu);
	}
	*slot = NULL;
}

void io_rsrc_refs_refill(struct io_ring_ctx *ctx)
	__must_hold(&ctx->uring_lock)
{
	ctx->rsrc_cached_refs += IO_RSRC_REF_BATCH;
	percpu_ref_get_many(&ctx->rsrc_node->refs, IO_RSRC_REF_BATCH);
}

static void __io_rsrc_put_work(struct io_rsrc_node *ref_node)
{
	struct io_rsrc_data *rsrc_data = ref_node->rsrc_data;
	struct io_ring_ctx *ctx = rsrc_data->ctx;
	struct io_rsrc_put *prsrc, *tmp;

	list_for_each_entry_safe(prsrc, tmp, &ref_node->rsrc_list, list) {
		list_del(&prsrc->list);

		if (prsrc->tag) {
			if (ctx->flags & IORING_SETUP_IOPOLL) {
				mutex_lock(&ctx->uring_lock);
				io_post_aux_cqe(ctx, prsrc->tag, 0, 0, true);
				mutex_unlock(&ctx->uring_lock);
			} else {
				io_post_aux_cqe(ctx, prsrc->tag, 0, 0, true);
			}
		}

		rsrc_data->do_put(ctx, prsrc);
		kfree(prsrc);
	}

	io_rsrc_node_destroy(ref_node);
	if (atomic_dec_and_test(&rsrc_data->refs))
		complete(&rsrc_data->done);
}

void io_rsrc_put_work(struct work_struct *work)
{
	struct io_ring_ctx *ctx;
	struct llist_node *node;

	ctx = container_of(work, struct io_ring_ctx, rsrc_put_work.work);
	node = llist_del_all(&ctx->rsrc_put_llist);

	while (node) {
		struct io_rsrc_node *ref_node;
		struct llist_node *next = node->next;

		ref_node = llist_entry(node, struct io_rsrc_node, llist);
		__io_rsrc_put_work(ref_node);
		node = next;
	}
}

void io_wait_rsrc_data(struct io_rsrc_data *data)
{
	if (data && !atomic_dec_and_test(&data->refs))
		wait_for_completion(&data->done);
}

void io_rsrc_node_destroy(struct io_rsrc_node *ref_node)
{
	percpu_ref_exit(&ref_node->refs);
	kfree(ref_node);
}

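/*
 * Called when a node's percpu ref hits zero. Mark the node done and move any
 * completed nodes, in order, onto the put llist; the delayed work then drops
 * their resources. The delay is skipped if a quiesce is already in progress.
 */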
static __cold void io_rsrc_node_ref_zero(struct percpu_ref *ref)
{
	struct io_rsrc_node *node = container_of(ref, struct io_rsrc_node, refs);
	struct io_ring_ctx *ctx = node->rsrc_data->ctx;
	unsigned long flags;
	bool first_add = false;
	unsigned long delay = HZ;

	spin_lock_irqsave(&ctx->rsrc_ref_lock, flags);
	node->done = true;

	/* if we are mid-quiesce then do not delay */
	if (node->rsrc_data->quiesce)
		delay = 0;

	while (!list_empty(&ctx->rsrc_ref_list)) {
		node = list_first_entry(&ctx->rsrc_ref_list,
					struct io_rsrc_node, node);
		/* recycle ref nodes in order */
		if (!node->done)
			break;
		list_del(&node->node);
		first_add |= llist_add(&node->llist, &ctx->rsrc_put_llist);
	}
	spin_unlock_irqrestore(&ctx->rsrc_ref_lock, flags);

	if (first_add)
		mod_delayed_work(system_wq, &ctx->rsrc_put_work, delay);
}

static struct io_rsrc_node *io_rsrc_node_alloc(void)
{
	struct io_rsrc_node *ref_node;

	ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL);
	if (!ref_node)
		return NULL;

	if (percpu_ref_init(&ref_node->refs, io_rsrc_node_ref_zero,
			    0, GFP_KERNEL)) {
		kfree(ref_node);
		return NULL;
	}
	INIT_LIST_HEAD(&ref_node->node);
	INIT_LIST_HEAD(&ref_node->rsrc_list);
	ref_node->done = false;
	return ref_node;
}

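/*
 * Retire the current rsrc node (killing its percpu ref and queueing it on
 * the ref list if data_to_kill is given) and install the pre-allocated
 * backup node as the active one. Callers must have run
 * io_rsrc_node_switch_start() first so the backup node exists.
 */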
void io_rsrc_node_switch(struct io_ring_ctx *ctx,
			 struct io_rsrc_data *data_to_kill)
	__must_hold(&ctx->uring_lock)
{
	WARN_ON_ONCE(!ctx->rsrc_backup_node);
	WARN_ON_ONCE(data_to_kill && !ctx->rsrc_node);

	io_rsrc_refs_drop(ctx);

	if (data_to_kill) {
		struct io_rsrc_node *rsrc_node = ctx->rsrc_node;

		rsrc_node->rsrc_data = data_to_kill;
		spin_lock_irq(&ctx->rsrc_ref_lock);
		list_add_tail(&rsrc_node->node, &ctx->rsrc_ref_list);
		spin_unlock_irq(&ctx->rsrc_ref_lock);

		atomic_inc(&data_to_kill->refs);
		percpu_ref_kill(&rsrc_node->refs);
		ctx->rsrc_node = NULL;
	}

	if (!ctx->rsrc_node) {
		ctx->rsrc_node = ctx->rsrc_backup_node;
		ctx->rsrc_backup_node = NULL;
	}
}

int io_rsrc_node_switch_start(struct io_ring_ctx *ctx)
{
	if (ctx->rsrc_backup_node)
		return 0;
	ctx->rsrc_backup_node = io_rsrc_node_alloc();
	return ctx->rsrc_backup_node ? 0 : -ENOMEM;
}

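/*
 * Wait for all outstanding references to @data to be dropped so the table
 * can be torn down. May drop and retake ->uring_lock while waiting, and
 * bails out early if interrupted by a signal.
 */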
__cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data,
				      struct io_ring_ctx *ctx)
{
	int ret;

	/* As we may drop ->uring_lock, other task may have started quiesce */
	if (data->quiesce)
		return -ENXIO;

	data->quiesce = true;
	do {
		ret = io_rsrc_node_switch_start(ctx);
		if (ret)
			break;
		io_rsrc_node_switch(ctx, data);

		/* kill initial ref, already quiesced if zero */
		if (atomic_dec_and_test(&data->refs))
			break;
		mutex_unlock(&ctx->uring_lock);
		flush_delayed_work(&ctx->rsrc_put_work);
		ret = wait_for_completion_interruptible(&data->done);
		if (!ret) {
			mutex_lock(&ctx->uring_lock);
			if (atomic_read(&data->refs) > 0) {
				/*
				 * it has been revived by another thread while
				 * we were unlocked
				 */
				mutex_unlock(&ctx->uring_lock);
			} else {
				break;
			}
		}

		atomic_inc(&data->refs);
		/* wait for all works potentially completing data->done */
		flush_delayed_work(&ctx->rsrc_put_work);
		reinit_completion(&data->done);

		ret = io_run_task_work_sig();
		mutex_lock(&ctx->uring_lock);
	} while (ret >= 0);
	data->quiesce = false;

	return ret;
}

static void io_free_page_table(void **table, size_t size)
{
	unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);

	for (i = 0; i < nr_tables; i++)
		kfree(table[i]);
	kfree(table);
}

static void io_rsrc_data_free(struct io_rsrc_data *data)
{
	size_t size = data->nr * sizeof(data->tags[0][0]);

	if (data->tags)
		io_free_page_table((void **)data->tags, size);
	kfree(data);
}

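/*
 * Allocate a two-level table: an array of pointers to page-sized chunks that
 * together cover @size bytes. Keeping each allocation at most PAGE_SIZE
 * avoids needing high-order pages for large tables.
 */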
static __cold void **io_alloc_page_table(size_t size)
{
	unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
	size_t init_size = size;
	void **table;

	table = kcalloc(nr_tables, sizeof(*table), GFP_KERNEL_ACCOUNT);
	if (!table)
		return NULL;

	for (i = 0; i < nr_tables; i++) {
		unsigned int this_size = min_t(size_t, size, PAGE_SIZE);

		table[i] = kzalloc(this_size, GFP_KERNEL_ACCOUNT);
		if (!table[i]) {
			io_free_page_table(table, init_size);
			return NULL;
		}
		size -= this_size;
	}
	return table;
}

__cold static int io_rsrc_data_alloc(struct io_ring_ctx *ctx,
				     rsrc_put_fn *do_put, u64 __user *utags,
				     unsigned nr, struct io_rsrc_data **pdata)
{
	struct io_rsrc_data *data;
	int ret = -ENOMEM;
	unsigned i;

	data = kzalloc(sizeof(*data), GFP_KERNEL);
	if (!data)
		return -ENOMEM;
	data->tags = (u64 **)io_alloc_page_table(nr * sizeof(data->tags[0][0]));
	if (!data->tags) {
		kfree(data);
		return -ENOMEM;
	}

	data->nr = nr;
	data->ctx = ctx;
	data->do_put = do_put;
	if (utags) {
		ret = -EFAULT;
		for (i = 0; i < nr; i++) {
			u64 *tag_slot = io_get_tag_slot(data, i);

			if (copy_from_user(tag_slot, &utags[i],
					   sizeof(*tag_slot)))
				goto fail;
		}
	}

	atomic_set(&data->refs, 1);
	init_completion(&data->done);
	*pdata = data;
	return 0;
fail:
	io_rsrc_data_free(data);
	return ret;
}

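/*
 * Apply a fixed file table update: for each entry, queue removal of any file
 * currently occupying the slot and install the new fd. An fd of -1 leaves
 * the slot empty, IORING_REGISTER_FILES_SKIP leaves it untouched.
 */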
static int __io_sqe_files_update(struct io_ring_ctx *ctx,
				 struct io_uring_rsrc_update2 *up,
				 unsigned nr_args)
{
	u64 __user *tags = u64_to_user_ptr(up->tags);
	__s32 __user *fds = u64_to_user_ptr(up->data);
	struct io_rsrc_data *data = ctx->file_data;
	struct io_fixed_file *file_slot;
	struct file *file;
	int fd, i, err = 0;
	unsigned int done;
	bool needs_switch = false;

	if (!ctx->file_data)
		return -ENXIO;
	if (up->offset + nr_args > ctx->nr_user_files)
		return -EINVAL;

	for (done = 0; done < nr_args; done++) {
		u64 tag = 0;

		if ((tags && copy_from_user(&tag, &tags[done], sizeof(tag))) ||
		    copy_from_user(&fd, &fds[done], sizeof(fd))) {
			err = -EFAULT;
			break;
		}
		if ((fd == IORING_REGISTER_FILES_SKIP || fd == -1) && tag) {
			err = -EINVAL;
			break;
		}
		if (fd == IORING_REGISTER_FILES_SKIP)
			continue;

		i = array_index_nospec(up->offset + done, ctx->nr_user_files);
		file_slot = io_fixed_file_slot(&ctx->file_table, i);

		if (file_slot->file_ptr) {
			file = (struct file *)(file_slot->file_ptr & FFS_MASK);
			err = io_queue_rsrc_removal(data, i, ctx->rsrc_node, file);
			if (err)
				break;
			file_slot->file_ptr = 0;
			io_file_bitmap_clear(&ctx->file_table, i);
			needs_switch = true;
		}
		if (fd != -1) {
			file = fget(fd);
			if (!file) {
				err = -EBADF;
				break;
			}
			/*
			 * Don't allow io_uring instances to be registered. If
			 * UNIX isn't enabled, then this causes a reference
			 * cycle and this instance can never get freed. If UNIX
			 * is enabled we'll handle it just fine, but there's
			 * still no point in allowing a ring fd as it doesn't
			 * support regular read/write anyway.
			 */
			if (io_is_uring_fops(file)) {
				fput(file);
				err = -EBADF;
				break;
			}
			err = io_scm_file_account(ctx, file);
			if (err) {
				fput(file);
				break;
			}
			*io_get_tag_slot(data, i) = tag;
			io_fixed_file_set(file_slot, file);
			io_file_bitmap_set(&ctx->file_table, i);
		}
	}

	if (needs_switch)
		io_rsrc_node_switch(ctx, data);
	return done ? done : err;
}

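/*
 * Same as the file update above, but for registered buffers: validate and
 * pin the replacement iovec, queue removal of the old mapping, and store the
 * new io_mapped_ubuf plus its tag in the slot.
 */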
static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
				   struct io_uring_rsrc_update2 *up,
				   unsigned int nr_args)
{
	u64 __user *tags = u64_to_user_ptr(up->tags);
	struct iovec iov, __user *iovs = u64_to_user_ptr(up->data);
	struct page *last_hpage = NULL;
	bool needs_switch = false;
	__u32 done;
	int i, err;

	if (!ctx->buf_data)
		return -ENXIO;
	if (up->offset + nr_args > ctx->nr_user_bufs)
		return -EINVAL;

	for (done = 0; done < nr_args; done++) {
		struct io_mapped_ubuf *imu;
		int offset = up->offset + done;
		u64 tag = 0;

		err = io_copy_iov(ctx, &iov, iovs, done);
		if (err)
			break;
		if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) {
			err = -EFAULT;
			break;
		}
		err = io_buffer_validate(&iov);
		if (err)
			break;
		if (!iov.iov_base && tag) {
			err = -EINVAL;
			break;
		}
		err = io_sqe_buffer_register(ctx, &iov, &imu, &last_hpage);
		if (err)
			break;

		i = array_index_nospec(offset, ctx->nr_user_bufs);
		if (ctx->user_bufs[i] != ctx->dummy_ubuf) {
			err = io_queue_rsrc_removal(ctx->buf_data, i,
						    ctx->rsrc_node, ctx->user_bufs[i]);
			if (unlikely(err)) {
				io_buffer_unmap(ctx, &imu);
				break;
			}
			ctx->user_bufs[i] = ctx->dummy_ubuf;
			needs_switch = true;
		}

		ctx->user_bufs[i] = imu;
		*io_get_tag_slot(ctx->buf_data, offset) = tag;
	}

	if (needs_switch)
		io_rsrc_node_switch(ctx, ctx->buf_data);
	return done ? done : err;
}

static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
				     struct io_uring_rsrc_update2 *up,
				     unsigned nr_args)
{
	__u32 tmp;
	int err;

	if (check_add_overflow(up->offset, nr_args, &tmp))
		return -EOVERFLOW;
	err = io_rsrc_node_switch_start(ctx);
	if (err)
		return err;

	switch (type) {
	case IORING_RSRC_FILE:
		return __io_sqe_files_update(ctx, up, nr_args);
	case IORING_RSRC_BUFFER:
		return __io_sqe_buffers_update(ctx, up, nr_args);
	}
	return -EINVAL;
}

int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg,
			     unsigned nr_args)
{
	struct io_uring_rsrc_update2 up;

	if (!nr_args)
		return -EINVAL;
	memset(&up, 0, sizeof(up));
	if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update)))
		return -EFAULT;
	if (up.resv || up.resv2)
		return -EINVAL;
	return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args);
}

int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
			    unsigned size, unsigned type)
{
	struct io_uring_rsrc_update2 up;

	if (size != sizeof(up))
		return -EINVAL;
	if (copy_from_user(&up, arg, sizeof(up)))
		return -EFAULT;
	if (!up.nr || up.resv || up.resv2)
		return -EINVAL;
	return __io_register_rsrc_update(ctx, type, &up, up.nr);
}

__cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
			    unsigned int size, unsigned int type)
{
	struct io_uring_rsrc_register rr;

	/* keep it extendible */
	if (size != sizeof(rr))
		return -EINVAL;

	memset(&rr, 0, sizeof(rr));
	if (copy_from_user(&rr, arg, size))
		return -EFAULT;
	if (!rr.nr || rr.resv2)
		return -EINVAL;
	if (rr.flags & ~IORING_RSRC_REGISTER_SPARSE)
		return -EINVAL;

	switch (type) {
	case IORING_RSRC_FILE:
		if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data)
			break;
		return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data),
					     rr.nr, u64_to_user_ptr(rr.tags));
	case IORING_RSRC_BUFFER:
		if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data)
			break;
		return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data),
					       rr.nr, u64_to_user_ptr(rr.tags));
	}
	return -EINVAL;
}

int io_rsrc_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);

	if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
		return -EINVAL;
	if (sqe->rw_flags || sqe->splice_fd_in)
		return -EINVAL;

	up->offset = READ_ONCE(sqe->off);
	up->nr_args = READ_ONCE(sqe->len);
	if (!up->nr_args)
		return -EINVAL;
	up->arg = READ_ONCE(sqe->addr);
	up->type = READ_ONCE(sqe->ioprio);
	return 0;
}

static int io_files_update_with_index_alloc(struct io_kiocb *req,
					    unsigned int issue_flags)
{
	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
	__s32 __user *fds = u64_to_user_ptr(up->arg);
	unsigned int done;
	struct file *file;
	int ret, fd;

	if (!req->ctx->file_data)
		return -ENXIO;

	for (done = 0; done < up->nr_args; done++) {
		if (copy_from_user(&fd, &fds[done], sizeof(fd))) {
			ret = -EFAULT;
			break;
		}

		file = fget(fd);
		if (!file) {
			ret = -EBADF;
			break;
		}
		ret = io_fixed_fd_install(req, issue_flags, file,
					  IORING_FILE_INDEX_ALLOC);
		if (ret < 0)
			break;
		if (copy_to_user(&fds[done], &ret, sizeof(ret))) {
			__io_close_fixed(req->ctx, issue_flags, ret);
			ret = -EFAULT;
			break;
		}
	}

	if (done)
		return done;
	return ret;
}

static int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
	struct io_ring_ctx *ctx = req->ctx;
	struct io_uring_rsrc_update2 up2;
	int ret;

	up2.offset = up->offset;
	up2.data = up->arg;
	up2.nr = 0;
	up2.tags = 0;
	up2.resv = 0;
	up2.resv2 = 0;

	if (up->offset == IORING_FILE_INDEX_ALLOC) {
		ret = io_files_update_with_index_alloc(req, issue_flags);
	} else {
		io_ring_submit_lock(ctx, issue_flags);
		ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE,
						&up2, up->nr_args);
		io_ring_submit_unlock(ctx, issue_flags);
	}

	if (ret < 0)
		req_set_fail(req);
	io_req_set_res(req, ret, 0);
	return IOU_OK;
}

static int io_notif_update(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
	struct io_ring_ctx *ctx = req->ctx;
	unsigned len = up->nr_args;
	unsigned idx_end, idx = up->offset;
	int ret = 0;

	io_ring_submit_lock(ctx, issue_flags);
	if (unlikely(check_add_overflow(idx, len, &idx_end))) {
		ret = -EOVERFLOW;
		goto out;
	}
	if (unlikely(idx_end > ctx->nr_notif_slots)) {
		ret = -EINVAL;
		goto out;
	}

	for (; idx < idx_end; idx++) {
		struct io_notif_slot *slot = &ctx->notif_slots[idx];

		if (!slot->notif)
			continue;
		if (up->arg)
			slot->tag = up->arg;
		io_notif_slot_flush_submit(slot, issue_flags);
	}
out:
	io_ring_submit_unlock(ctx, issue_flags);
	if (ret < 0)
		req_set_fail(req);
	io_req_set_res(req, ret, 0);
	return IOU_OK;
}

int io_rsrc_update(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);

	switch (up->type) {
	case IORING_RSRC_UPDATE_FILES:
		return io_files_update(req, issue_flags);
	case IORING_RSRC_UPDATE_NOTIF:
		return io_notif_update(req, issue_flags);
	}
	return -EINVAL;
}

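/*
 * Queue a resource (file or buffer) for deferred put on the given rsrc node.
 * The slot's tag is moved into the io_rsrc_put so a CQE can be posted for it
 * once the node's references drop to zero.
 */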
int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx,
			  struct io_rsrc_node *node, void *rsrc)
{
	u64 *tag_slot = io_get_tag_slot(data, idx);
	struct io_rsrc_put *prsrc;

	prsrc = kzalloc(sizeof(*prsrc), GFP_KERNEL);
	if (!prsrc)
		return -ENOMEM;

	prsrc->tag = *tag_slot;
	*tag_slot = 0;
	prsrc->rsrc = rsrc;
	list_add(&prsrc->list, &node->rsrc_list);
	return 0;
}

void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
{
#if !defined(IO_URING_SCM_ALL)
	int i;

	for (i = 0; i < ctx->nr_user_files; i++) {
		struct file *file = io_file_from_index(&ctx->file_table, i);

		if (!file)
			continue;
		if (io_fixed_file_slot(&ctx->file_table, i)->file_ptr & FFS_SCM)
			continue;
		io_file_bitmap_clear(&ctx->file_table, i);
		fput(file);
	}
#endif

#if defined(CONFIG_UNIX)
	if (ctx->ring_sock) {
		struct sock *sock = ctx->ring_sock->sk;
		struct sk_buff *skb;

		while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL)
			kfree_skb(skb);
	}
#endif
	io_free_file_tables(&ctx->file_table);
	io_rsrc_data_free(ctx->file_data);
	ctx->file_data = NULL;
	ctx->nr_user_files = 0;
}

int io_sqe_files_unregister(struct io_ring_ctx *ctx)
{
	unsigned nr = ctx->nr_user_files;
	int ret;

	if (!ctx->file_data)
		return -ENXIO;

	/*
	 * Quiesce may unlock ->uring_lock, and while it's not held,
	 * prevent new requests from using the table.
	 */
	ctx->nr_user_files = 0;
	ret = io_rsrc_ref_quiesce(ctx->file_data, ctx);
	ctx->nr_user_files = nr;
	if (!ret)
		__io_sqe_files_unregister(ctx);
	return ret;
}

/*
 * Ensure the UNIX gc is aware of our file set, so we are certain that
 * the io_uring can be safely unregistered on process exit, even if we have
 * loops in the file referencing. We account only files that can hold other
 * files because otherwise they can't form a loop and so are not interesting
 * for GC.
 */
int __io_scm_file_account(struct io_ring_ctx *ctx, struct file *file)
{
#if defined(CONFIG_UNIX)
	struct sock *sk = ctx->ring_sock->sk;
	struct sk_buff_head *head = &sk->sk_receive_queue;
	struct scm_fp_list *fpl;
	struct sk_buff *skb;

	if (likely(!io_file_need_scm(file)))
		return 0;

	/*
	 * See if we can merge this file into an existing skb SCM_RIGHTS
	 * file set. If there's no room, fall back to allocating a new skb
	 * and filling it in.
	 */
	spin_lock_irq(&head->lock);
	skb = skb_peek(head);
	if (skb && UNIXCB(skb).fp->count < SCM_MAX_FD)
		__skb_unlink(skb, head);
	else
		skb = NULL;
	spin_unlock_irq(&head->lock);

	if (!skb) {
		fpl = kzalloc(sizeof(*fpl), GFP_KERNEL);
		if (!fpl)
			return -ENOMEM;

		skb = alloc_skb(0, GFP_KERNEL);
		if (!skb) {
			kfree(fpl);
			return -ENOMEM;
		}

		fpl->user = get_uid(current_user());
		fpl->max = SCM_MAX_FD;
		fpl->count = 0;

		UNIXCB(skb).fp = fpl;
		skb->sk = sk;
		skb->destructor = unix_destruct_scm;
		refcount_add(skb->truesize, &sk->sk_wmem_alloc);
	}

	fpl = UNIXCB(skb).fp;
	fpl->fp[fpl->count++] = get_file(file);
	unix_inflight(fpl->user, file);
	skb_queue_head(head, skb);
	fput(file);
#endif
	return 0;
}

static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
{
	struct file *file = prsrc->file;
#if defined(CONFIG_UNIX)
	struct sock *sock = ctx->ring_sock->sk;
	struct sk_buff_head list, *head = &sock->sk_receive_queue;
	struct sk_buff *skb;
	int i;

	if (!io_file_need_scm(file)) {
		fput(file);
		return;
	}

	__skb_queue_head_init(&list);

	/*
	 * Find the skb that holds this file in its SCM_RIGHTS. When found,
	 * remove this entry and rearrange the file array.
	 */
	skb = skb_dequeue(head);
	while (skb) {
		struct scm_fp_list *fp;

		fp = UNIXCB(skb).fp;
		for (i = 0; i < fp->count; i++) {
			int left;

			if (fp->fp[i] != file)
				continue;

			unix_notinflight(fp->user, fp->fp[i]);
			left = fp->count - 1 - i;
			if (left) {
				memmove(&fp->fp[i], &fp->fp[i + 1],
					left * sizeof(struct file *));
			}
			fp->count--;
			if (!fp->count) {
				kfree_skb(skb);
				skb = NULL;
			} else {
				__skb_queue_tail(&list, skb);
			}
			fput(file);
			file = NULL;
			break;
		}

		if (!file)
			break;

		__skb_queue_tail(&list, skb);

		skb = skb_dequeue(head);
	}

	if (skb_peek(&list)) {
		spin_lock_irq(&head->lock);
		while ((skb = __skb_dequeue(&list)) != NULL)
			__skb_queue_tail(head, skb);
		spin_unlock_irq(&head->lock);
	}
#else
	fput(file);
#endif
}

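/*
 * Register a fixed file table of nr_args entries. An fd of -1 (or a NULL fds
 * array) leaves a slot sparse; otherwise the fd is grabbed, accounted with
 * the UNIX SCM machinery if needed, and stored in the fixed file table.
 */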
int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
			  unsigned nr_args, u64 __user *tags)
{
	__s32 __user *fds = (__s32 __user *) arg;
	struct file *file;
	int fd, ret;
	unsigned i;

	if (ctx->file_data)
		return -EBUSY;
	if (!nr_args)
		return -EINVAL;
	if (nr_args > IORING_MAX_FIXED_FILES)
		return -EMFILE;
	if (nr_args > rlimit(RLIMIT_NOFILE))
		return -EMFILE;
	ret = io_rsrc_node_switch_start(ctx);
	if (ret)
		return ret;
	ret = io_rsrc_data_alloc(ctx, io_rsrc_file_put, tags, nr_args,
				 &ctx->file_data);
	if (ret)
		return ret;

	if (!io_alloc_file_tables(&ctx->file_table, nr_args)) {
		io_rsrc_data_free(ctx->file_data);
		ctx->file_data = NULL;
		return -ENOMEM;
	}

	for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
		struct io_fixed_file *file_slot;

		if (fds && copy_from_user(&fd, &fds[i], sizeof(fd))) {
			ret = -EFAULT;
			goto fail;
		}
		/* allow sparse sets */
		if (!fds || fd == -1) {
			ret = -EINVAL;
			if (unlikely(*io_get_tag_slot(ctx->file_data, i)))
				goto fail;
			continue;
		}

		file = fget(fd);
		ret = -EBADF;
		if (unlikely(!file))
			goto fail;

		/*
		 * Don't allow io_uring instances to be registered. If UNIX
		 * isn't enabled, then this causes a reference cycle and this
		 * instance can never get freed. If UNIX is enabled we'll
		 * handle it just fine, but there's still no point in allowing
		 * a ring fd as it doesn't support regular read/write anyway.
		 */
		if (io_is_uring_fops(file)) {
			fput(file);
			goto fail;
		}
		ret = io_scm_file_account(ctx, file);
		if (ret) {
			fput(file);
			goto fail;
		}
		file_slot = io_fixed_file_slot(&ctx->file_table, i);
		io_fixed_file_set(file_slot, file);
		io_file_bitmap_set(&ctx->file_table, i);
	}

	/* default it to the whole table */
	io_file_table_set_alloc_range(ctx, 0, ctx->nr_user_files);
	io_rsrc_node_switch(ctx, NULL);
	return 0;
fail:
	__io_sqe_files_unregister(ctx);
	return ret;
}

static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
{
	io_buffer_unmap(ctx, &prsrc->buf);
	prsrc->buf = NULL;
}

void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
{
	unsigned int i;

	for (i = 0; i < ctx->nr_user_bufs; i++)
		io_buffer_unmap(ctx, &ctx->user_bufs[i]);
	kfree(ctx->user_bufs);
	io_rsrc_data_free(ctx->buf_data);
	ctx->user_bufs = NULL;
	ctx->buf_data = NULL;
	ctx->nr_user_bufs = 0;
}

int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
{
	unsigned nr = ctx->nr_user_bufs;
	int ret;

	if (!ctx->buf_data)
		return -ENXIO;

	/*
	 * Quiesce may unlock ->uring_lock, and while it's not held,
	 * prevent new requests from using the table.
	 */
	ctx->nr_user_bufs = 0;
	ret = io_rsrc_ref_quiesce(ctx->buf_data, ctx);
	ctx->nr_user_bufs = nr;
	if (!ret)
		__io_sqe_buffers_unregister(ctx);
	return ret;
}

/*
 * Not super efficient, but this is only done at registration time. And we do
 * cache the last compound head, so generally we'll only do a full search if
 * we don't match that one.
 *
 * We check if the given compound head page has already been accounted, to
 * avoid double accounting it. This allows us to account the full size of the
 * page, not just the constituent pages of a huge page.
 */
static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages,
				  int nr_pages, struct page *hpage)
{
	int i, j;

	/* check current page array */
	for (i = 0; i < nr_pages; i++) {
		if (!PageCompound(pages[i]))
			continue;
		if (compound_head(pages[i]) == hpage)
			return true;
	}

	/* check previously registered pages */
	for (i = 0; i < ctx->nr_user_bufs; i++) {
		struct io_mapped_ubuf *imu = ctx->user_bufs[i];

		for (j = 0; j < imu->nr_bvecs; j++) {
			if (!PageCompound(imu->bvec[j].bv_page))
				continue;
			if (compound_head(imu->bvec[j].bv_page) == hpage)
				return true;
		}
	}

	return false;
}

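/*
 * Work out how many pages to charge against RLIMIT_MEMLOCK for this buffer.
 * Normal pages count individually; a compound (huge) page is charged once at
 * its full size, and only if it hasn't been accounted already.
 */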
static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
				 int nr_pages, struct io_mapped_ubuf *imu,
				 struct page **last_hpage)
{
	int i, ret;

	imu->acct_pages = 0;
	for (i = 0; i < nr_pages; i++) {
		if (!PageCompound(pages[i])) {
			imu->acct_pages++;
		} else {
			struct page *hpage;

			hpage = compound_head(pages[i]);
			if (hpage == *last_hpage)
				continue;
			*last_hpage = hpage;
			if (headpage_already_acct(ctx, pages, i, hpage))
				continue;
			imu->acct_pages += page_size(hpage) >> PAGE_SHIFT;
		}
	}

	if (!imu->acct_pages)
		return 0;

	ret = io_account_mem(ctx, imu->acct_pages);
	if (ret)
		imu->acct_pages = 0;
	return ret;
}

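/*
 * Pin the user pages backing [ubuf, ubuf + len). File-backed mappings other
 * than shmem and hugetlbfs are rejected with -EOPNOTSUPP. Returns the pinned
 * page array (with its size via *npages) or an ERR_PTR().
 */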
struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages)
{
	unsigned long start, end, nr_pages;
	struct vm_area_struct **vmas = NULL;
	struct page **pages = NULL;
	int i, pret, ret = -ENOMEM;

	end = (ubuf + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
	start = ubuf >> PAGE_SHIFT;
	nr_pages = end - start;

	pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL);
	if (!pages)
		goto done;

	vmas = kvmalloc_array(nr_pages, sizeof(struct vm_area_struct *),
			      GFP_KERNEL);
	if (!vmas)
		goto done;

	ret = 0;
	mmap_read_lock(current->mm);
	pret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
			      pages, vmas);
	if (pret == nr_pages) {
		/* don't support file backed memory */
		for (i = 0; i < nr_pages; i++) {
			struct vm_area_struct *vma = vmas[i];

			if (vma_is_shmem(vma))
				continue;
			if (vma->vm_file &&
			    !is_file_hugepages(vma->vm_file)) {
				ret = -EOPNOTSUPP;
				break;
			}
		}
		*npages = nr_pages;
	} else {
		ret = pret < 0 ? pret : -EFAULT;
	}
	mmap_read_unlock(current->mm);
	if (ret) {
		/*
		 * if we did partial map, or found file backed vmas,
		 * release any pages we did get
		 */
		if (pret > 0)
			unpin_user_pages(pages, pret);
		goto done;
	}
	ret = 0;
done:
	kvfree(vmas);
	if (ret < 0) {
		kvfree(pages);
		pages = ERR_PTR(ret);
	}
	return pages;
}

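/*
 * Pin and map one user buffer into an io_mapped_ubuf: account the pinned
 * pages, then describe the buffer as a bvec array for later fixed-buffer IO.
 * A NULL iov_base registers the dummy (sparse) buffer instead.
 */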
static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
				  struct io_mapped_ubuf **pimu,
				  struct page **last_hpage)
{
	struct io_mapped_ubuf *imu = NULL;
	struct page **pages = NULL;
	unsigned long off;
	size_t size;
	int ret, nr_pages, i;

	*pimu = ctx->dummy_ubuf;
	if (!iov->iov_base)
		return 0;

	ret = -ENOMEM;
	pages = io_pin_pages((unsigned long) iov->iov_base, iov->iov_len,
			     &nr_pages);
	if (IS_ERR(pages)) {
		ret = PTR_ERR(pages);
		pages = NULL;
		goto done;
	}

	imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);
	if (!imu)
		goto done;

	ret = io_buffer_account_pin(ctx, pages, nr_pages, imu, last_hpage);
	if (ret) {
		unpin_user_pages(pages, nr_pages);
		goto done;
	}

	off = (unsigned long) iov->iov_base & ~PAGE_MASK;
	size = iov->iov_len;
	for (i = 0; i < nr_pages; i++) {
		size_t vec_len;

		vec_len = min_t(size_t, size, PAGE_SIZE - off);
		imu->bvec[i].bv_page = pages[i];
		imu->bvec[i].bv_len = vec_len;
		imu->bvec[i].bv_offset = off;
		off = 0;
		size -= vec_len;
	}
	/* store original address for later verification */
	imu->ubuf = (unsigned long) iov->iov_base;
	imu->ubuf_end = imu->ubuf + iov->iov_len;
	imu->nr_bvecs = nr_pages;
	*pimu = imu;
	ret = 0;
done:
	if (ret)
		kvfree(imu);
	kvfree(pages);
	return ret;
}

static int io_buffers_map_alloc(struct io_ring_ctx *ctx, unsigned int nr_args)
{
	ctx->user_bufs = kcalloc(nr_args, sizeof(*ctx->user_bufs), GFP_KERNEL);
	return ctx->user_bufs ? 0 : -ENOMEM;
}

int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
			    unsigned int nr_args, u64 __user *tags)
{
	struct page *last_hpage = NULL;
	struct io_rsrc_data *data;
	int i, ret;
	struct iovec iov;

	BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16));

	if (ctx->user_bufs)
		return -EBUSY;
	if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS)
		return -EINVAL;
	ret = io_rsrc_node_switch_start(ctx);
	if (ret)
		return ret;
	ret = io_rsrc_data_alloc(ctx, io_rsrc_buf_put, tags, nr_args, &data);
	if (ret)
		return ret;
	ret = io_buffers_map_alloc(ctx, nr_args);
	if (ret) {
		io_rsrc_data_free(data);
		return ret;
	}

	for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) {
		if (arg) {
			ret = io_copy_iov(ctx, &iov, arg, i);
			if (ret)
				break;
			ret = io_buffer_validate(&iov);
			if (ret)
				break;
		} else {
			memset(&iov, 0, sizeof(iov));
		}

		if (!iov.iov_base && *io_get_tag_slot(data, i)) {
			ret = -EINVAL;
			break;
		}

		ret = io_sqe_buffer_register(ctx, &iov, &ctx->user_bufs[i],
					     &last_hpage);
		if (ret)
			break;
	}

	WARN_ON_ONCE(ctx->buf_data);

	ctx->buf_data = data;
	if (ret)
		__io_sqe_buffers_unregister(ctx);
	else
		io_rsrc_node_switch(ctx, NULL);
	return ret;
}

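/*
 * Set up an iov_iter over a registered buffer for a fixed read/write. The
 * requested [buf_addr, buf_addr + len) range must sit entirely inside the
 * registered region; the iterator is then advanced to the right bvec without
 * walking every segment.
 */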
int io_import_fixed(int ddir, struct iov_iter *iter,
		    struct io_mapped_ubuf *imu,
		    u64 buf_addr, size_t len)
{
	u64 buf_end;
	size_t offset;

	if (WARN_ON_ONCE(!imu))
		return -EFAULT;
	if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end)))
		return -EFAULT;
	/* not inside the mapped region */
	if (unlikely(buf_addr < imu->ubuf || buf_end > imu->ubuf_end))
		return -EFAULT;

	/*
	 * May not be a start of buffer, set size appropriately
	 * and advance us to the beginning.
	 */
	offset = buf_addr - imu->ubuf;
	iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, offset + len);

	if (offset) {
		/*
		 * Don't use iov_iter_advance() here, as it's really slow for
		 * using the latter parts of a big fixed buffer - it iterates
		 * over each segment manually. We can cheat a bit here, because
		 * we know that:
		 *
		 * 1) it's a BVEC iter, we set it up
		 * 2) all bvecs are PAGE_SIZE in size, except potentially the
		 *    first and last bvec
		 *
		 * So just find our index, and adjust the iterator afterwards.
		 * If the offset is within the first bvec (or the whole first
		 * bvec), just use iov_iter_advance(). This makes it easier
		 * since we can just skip the first segment, which may not
		 * be PAGE_SIZE aligned.
		 */
		const struct bio_vec *bvec = imu->bvec;

		if (offset <= bvec->bv_len) {
			iov_iter_advance(iter, offset);
		} else {
			unsigned long seg_skip;

			/* skip first vec */
			offset -= bvec->bv_len;
			seg_skip = 1 + (offset >> PAGE_SHIFT);

			iter->bvec = bvec + seg_skip;
			iter->nr_segs -= seg_skip;
			iter->count -= bvec->bv_len + offset;
			iter->iov_offset = offset & ~PAGE_MASK;
		}
	}

	return 0;
}