From a9367a82eff50489fbf4de20bb7e2c4512a10a1a Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Thu, 29 Oct 2020 13:18:17 +0100 Subject: [PATCH] 5.8-stable patches added patches: io-wq-fix-use-after-free-in-io_wq_worker_running.patch io_uring-allow-timeout-poll-files-killing-to-take-task-into-account.patch io_uring-convert-advanced-xarray-uses-to-the-normal-api.patch io_uring-don-t-rely-on-weak-files-references.patch io_uring-don-t-run-task-work-on-an-exiting-task.patch io_uring-enable-task-files-specific-overflow-flushing.patch io_uring-fix-use-of-xarray-in-__io_uring_files_cancel.patch io_uring-fix-xarray-usage-in-io_uring_add_task_file.patch io_uring-move-dropping-of-files-into-separate-helper.patch io_uring-no-need-to-call-xa_destroy-on-empty-xarray.patch io_uring-reference-nsproxy-for-file-table-commands.patch io_uring-return-cancelation-status-from-poll-timeout-files-handlers.patch io_uring-stash-ctx-task-reference-for-sqpoll.patch io_uring-unconditionally-grab-req-task.patch io_wq-make-io_wqe-lock-a-raw_spinlock_t.patch --- ...e-after-free-in-io_wq_worker_running.patch | 310 +++++++++ ...es-killing-to-take-task-into-account.patch | 92 +++ ...vanced-xarray-uses-to-the-normal-api.patch | 52 ++ ...-don-t-rely-on-weak-files-references.patch | 648 ++++++++++++++++++ ...n-t-run-task-work-on-an-exiting-task.patch | 43 ++ ...ask-files-specific-overflow-flushing.patch | 131 ++++ ...of-xarray-in-__io_uring_files_cancel.patch | 56 ++ ...rray-usage-in-io_uring_add_task_file.patch | 64 ++ ...opping-of-files-into-separate-helper.patch | 61 ++ ...d-to-call-xa_destroy-on-empty-xarray.patch | 105 +++ ...ence-nsproxy-for-file-table-commands.patch | 87 +++ ...tus-from-poll-timeout-files-handlers.patch | 96 +++ ...-stash-ctx-task-reference-for-sqpoll.patch | 104 +++ ..._uring-unconditionally-grab-req-task.patch | 104 +++ ...wq-make-io_wqe-lock-a-raw_spinlock_t.patch | 253 +++++++ queue-5.8/series | 15 + 16 files changed, 2221 insertions(+) create mode 100644 queue-5.8/io-wq-fix-use-after-free-in-io_wq_worker_running.patch create mode 100644 queue-5.8/io_uring-allow-timeout-poll-files-killing-to-take-task-into-account.patch create mode 100644 queue-5.8/io_uring-convert-advanced-xarray-uses-to-the-normal-api.patch create mode 100644 queue-5.8/io_uring-don-t-rely-on-weak-files-references.patch create mode 100644 queue-5.8/io_uring-don-t-run-task-work-on-an-exiting-task.patch create mode 100644 queue-5.8/io_uring-enable-task-files-specific-overflow-flushing.patch create mode 100644 queue-5.8/io_uring-fix-use-of-xarray-in-__io_uring_files_cancel.patch create mode 100644 queue-5.8/io_uring-fix-xarray-usage-in-io_uring_add_task_file.patch create mode 100644 queue-5.8/io_uring-move-dropping-of-files-into-separate-helper.patch create mode 100644 queue-5.8/io_uring-no-need-to-call-xa_destroy-on-empty-xarray.patch create mode 100644 queue-5.8/io_uring-reference-nsproxy-for-file-table-commands.patch create mode 100644 queue-5.8/io_uring-return-cancelation-status-from-poll-timeout-files-handlers.patch create mode 100644 queue-5.8/io_uring-stash-ctx-task-reference-for-sqpoll.patch create mode 100644 queue-5.8/io_uring-unconditionally-grab-req-task.patch create mode 100644 queue-5.8/io_wq-make-io_wqe-lock-a-raw_spinlock_t.patch diff --git a/queue-5.8/io-wq-fix-use-after-free-in-io_wq_worker_running.patch b/queue-5.8/io-wq-fix-use-after-free-in-io_wq_worker_running.patch new file mode 100644 index 00000000000..d6b4164f92c --- /dev/null +++ b/queue-5.8/io-wq-fix-use-after-free-in-io_wq_worker_running.patch @@ 
-0,0 +1,310 @@ +From foo@baz Thu Oct 29 01:16:54 PM CET 2020 +From: Hillf Danton +Date: Sat, 26 Sep 2020 21:26:55 +0800 +Subject: io-wq: fix use-after-free in io_wq_worker_running + +From: Hillf Danton + +commit c4068bf898ddaef791049a366828d9b84b467bda upstream. + +The smart syzbot has found a reproducer for the following issue: + + ================================================================== + BUG: KASAN: use-after-free in instrument_atomic_write include/linux/instrumented.h:71 [inline] + BUG: KASAN: use-after-free in atomic_inc include/asm-generic/atomic-instrumented.h:240 [inline] + BUG: KASAN: use-after-free in io_wqe_inc_running fs/io-wq.c:301 [inline] + BUG: KASAN: use-after-free in io_wq_worker_running+0xde/0x110 fs/io-wq.c:613 + Write of size 4 at addr ffff8882183db08c by task io_wqe_worker-0/7771 + + CPU: 0 PID: 7771 Comm: io_wqe_worker-0 Not tainted 5.9.0-rc4-syzkaller #0 + Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 + Call Trace: + __dump_stack lib/dump_stack.c:77 [inline] + dump_stack+0x198/0x1fd lib/dump_stack.c:118 + print_address_description.constprop.0.cold+0xae/0x497 mm/kasan/report.c:383 + __kasan_report mm/kasan/report.c:513 [inline] + kasan_report.cold+0x1f/0x37 mm/kasan/report.c:530 + check_memory_region_inline mm/kasan/generic.c:186 [inline] + check_memory_region+0x13d/0x180 mm/kasan/generic.c:192 + instrument_atomic_write include/linux/instrumented.h:71 [inline] + atomic_inc include/asm-generic/atomic-instrumented.h:240 [inline] + io_wqe_inc_running fs/io-wq.c:301 [inline] + io_wq_worker_running+0xde/0x110 fs/io-wq.c:613 + schedule_timeout+0x148/0x250 kernel/time/timer.c:1879 + io_wqe_worker+0x517/0x10e0 fs/io-wq.c:580 + kthread+0x3b5/0x4a0 kernel/kthread.c:292 + ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:294 + + Allocated by task 7768: + kasan_save_stack+0x1b/0x40 mm/kasan/common.c:48 + kasan_set_track mm/kasan/common.c:56 [inline] + __kasan_kmalloc.constprop.0+0xbf/0xd0 mm/kasan/common.c:461 + kmem_cache_alloc_node_trace+0x17b/0x3f0 mm/slab.c:3594 + kmalloc_node include/linux/slab.h:572 [inline] + kzalloc_node include/linux/slab.h:677 [inline] + io_wq_create+0x57b/0xa10 fs/io-wq.c:1064 + io_init_wq_offload fs/io_uring.c:7432 [inline] + io_sq_offload_start fs/io_uring.c:7504 [inline] + io_uring_create fs/io_uring.c:8625 [inline] + io_uring_setup+0x1836/0x28e0 fs/io_uring.c:8694 + do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46 + entry_SYSCALL_64_after_hwframe+0x44/0xa9 + + Freed by task 21: + kasan_save_stack+0x1b/0x40 mm/kasan/common.c:48 + kasan_set_track+0x1c/0x30 mm/kasan/common.c:56 + kasan_set_free_info+0x1b/0x30 mm/kasan/generic.c:355 + __kasan_slab_free+0xd8/0x120 mm/kasan/common.c:422 + __cache_free mm/slab.c:3418 [inline] + kfree+0x10e/0x2b0 mm/slab.c:3756 + __io_wq_destroy fs/io-wq.c:1138 [inline] + io_wq_destroy+0x2af/0x460 fs/io-wq.c:1146 + io_finish_async fs/io_uring.c:6836 [inline] + io_ring_ctx_free fs/io_uring.c:7870 [inline] + io_ring_exit_work+0x1e4/0x6d0 fs/io_uring.c:7954 + process_one_work+0x94c/0x1670 kernel/workqueue.c:2269 + worker_thread+0x64c/0x1120 kernel/workqueue.c:2415 + kthread+0x3b5/0x4a0 kernel/kthread.c:292 + ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:294 + + The buggy address belongs to the object at ffff8882183db000 + which belongs to the cache kmalloc-1k of size 1024 + The buggy address is located 140 bytes inside of + 1024-byte region [ffff8882183db000, ffff8882183db400) + The buggy address belongs to the page: + page:000000009bada22b refcount:1 mapcount:0 
mapping:0000000000000000 index:0x0 pfn:0x2183db + flags: 0x57ffe0000000200(slab) + raw: 057ffe0000000200 ffffea0008604c48 ffffea00086a8648 ffff8880aa040700 + raw: 0000000000000000 ffff8882183db000 0000000100000002 0000000000000000 + page dumped because: kasan: bad access detected + + Memory state around the buggy address: + ffff8882183daf80: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff + ffff8882183db000: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb + >ffff8882183db080: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb + ^ + ffff8882183db100: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb + ffff8882183db180: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb + ================================================================== + +which is down to the comment below, + + /* all workers gone, wq exit can proceed */ + if (!nr_workers && refcount_dec_and_test(&wqe->wq->refs)) + complete(&wqe->wq->done); + +because there might be multiple cases of wqe in a wq and we would wait +for every worker in every wqe to go home before releasing wq's resources +on destroying. + +To that end, rework wq's refcount by making it independent of the tracking +of workers because after all they are two different things, and keeping +it balanced when workers come and go. Note the manager kthread, like +other workers, now holds a grab to wq during its lifetime. + +Finally to help destroy wq, check IO_WQ_BIT_EXIT upon creating worker +and do nothing for exiting wq. + +Cc: stable@vger.kernel.org # v5.5+ +Reported-by: syzbot+45fa0a195b941764e0f0@syzkaller.appspotmail.com +Reported-by: syzbot+9af99580130003da82b1@syzkaller.appspotmail.com +Cc: Pavel Begunkov +Signed-off-by: Hillf Danton +Signed-off-by: Jens Axboe +Signed-off-by: Greg Kroah-Hartman +--- + fs/io-wq.c | 116 ++++++++++++++++++++++++++++++------------------------------- + 1 file changed, 58 insertions(+), 58 deletions(-) + +--- a/fs/io-wq.c ++++ b/fs/io-wq.c +@@ -202,7 +202,6 @@ static void io_worker_exit(struct io_wor + { + struct io_wqe *wqe = worker->wqe; + struct io_wqe_acct *acct = io_wqe_get_acct(wqe, worker); +- unsigned nr_workers; + + /* + * If we're not at zero, someone else is holding a brief reference +@@ -230,15 +229,11 @@ static void io_worker_exit(struct io_wor + raw_spin_lock_irq(&wqe->lock); + } + acct->nr_workers--; +- nr_workers = wqe->acct[IO_WQ_ACCT_BOUND].nr_workers + +- wqe->acct[IO_WQ_ACCT_UNBOUND].nr_workers; + raw_spin_unlock_irq(&wqe->lock); + +- /* all workers gone, wq exit can proceed */ +- if (!nr_workers && refcount_dec_and_test(&wqe->wq->refs)) +- complete(&wqe->wq->done); +- + kfree_rcu(worker, rcu); ++ if (refcount_dec_and_test(&wqe->wq->refs)) ++ complete(&wqe->wq->done); + } + + static inline bool io_wqe_run_queue(struct io_wqe *wqe) +@@ -644,7 +639,7 @@ void io_wq_worker_sleeping(struct task_s + + static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index) + { +- struct io_wqe_acct *acct =&wqe->acct[index]; ++ struct io_wqe_acct *acct = &wqe->acct[index]; + struct io_worker *worker; + + worker = kzalloc_node(sizeof(*worker), GFP_KERNEL, wqe->node); +@@ -677,6 +672,7 @@ static bool create_io_worker(struct io_w + if (index == IO_WQ_ACCT_UNBOUND) + atomic_inc(&wq->user->processes); + ++ refcount_inc(&wq->refs); + wake_up_process(worker->task); + return true; + } +@@ -692,28 +688,63 @@ static inline bool io_wqe_need_worker(st + return acct->nr_workers < acct->max_workers; + } + ++static bool io_wqe_worker_send_sig(struct io_worker *worker, void *data) ++{ ++ send_sig(SIGINT, worker->task, 1); ++ return 
false; ++} ++ ++/* ++ * Iterate the passed in list and call the specific function for each ++ * worker that isn't exiting ++ */ ++static bool io_wq_for_each_worker(struct io_wqe *wqe, ++ bool (*func)(struct io_worker *, void *), ++ void *data) ++{ ++ struct io_worker *worker; ++ bool ret = false; ++ ++ list_for_each_entry_rcu(worker, &wqe->all_list, all_list) { ++ if (io_worker_get(worker)) { ++ /* no task if node is/was offline */ ++ if (worker->task) ++ ret = func(worker, data); ++ io_worker_release(worker); ++ if (ret) ++ break; ++ } ++ } ++ ++ return ret; ++} ++ ++static bool io_wq_worker_wake(struct io_worker *worker, void *data) ++{ ++ wake_up_process(worker->task); ++ return false; ++} ++ + /* + * Manager thread. Tasked with creating new workers, if we need them. + */ + static int io_wq_manager(void *data) + { + struct io_wq *wq = data; +- int workers_to_create = num_possible_nodes(); + int node; + + /* create fixed workers */ +- refcount_set(&wq->refs, workers_to_create); ++ refcount_set(&wq->refs, 1); + for_each_node(node) { + if (!node_online(node)) + continue; +- if (!create_io_worker(wq, wq->wqes[node], IO_WQ_ACCT_BOUND)) +- goto err; +- workers_to_create--; ++ if (create_io_worker(wq, wq->wqes[node], IO_WQ_ACCT_BOUND)) ++ continue; ++ set_bit(IO_WQ_BIT_ERROR, &wq->state); ++ set_bit(IO_WQ_BIT_EXIT, &wq->state); ++ goto out; + } + +- while (workers_to_create--) +- refcount_dec(&wq->refs); +- + complete(&wq->done); + + while (!kthread_should_stop()) { +@@ -745,12 +776,18 @@ static int io_wq_manager(void *data) + if (current->task_works) + task_work_run(); + +- return 0; +-err: +- set_bit(IO_WQ_BIT_ERROR, &wq->state); +- set_bit(IO_WQ_BIT_EXIT, &wq->state); +- if (refcount_sub_and_test(workers_to_create, &wq->refs)) ++out: ++ if (refcount_dec_and_test(&wq->refs)) { + complete(&wq->done); ++ return 0; ++ } ++ /* if ERROR is set and we get here, we have workers to wake */ ++ if (test_bit(IO_WQ_BIT_ERROR, &wq->state)) { ++ rcu_read_lock(); ++ for_each_node(node) ++ io_wq_for_each_worker(wq->wqes[node], io_wq_worker_wake, NULL); ++ rcu_read_unlock(); ++ } + return 0; + } + +@@ -858,37 +895,6 @@ void io_wq_hash_work(struct io_wq_work * + work->flags |= (IO_WQ_WORK_HASHED | (bit << IO_WQ_HASH_SHIFT)); + } + +-static bool io_wqe_worker_send_sig(struct io_worker *worker, void *data) +-{ +- send_sig(SIGINT, worker->task, 1); +- return false; +-} +- +-/* +- * Iterate the passed in list and call the specific function for each +- * worker that isn't exiting +- */ +-static bool io_wq_for_each_worker(struct io_wqe *wqe, +- bool (*func)(struct io_worker *, void *), +- void *data) +-{ +- struct io_worker *worker; +- bool ret = false; +- +- list_for_each_entry_rcu(worker, &wqe->all_list, all_list) { +- if (io_worker_get(worker)) { +- /* no task if node is/was offline */ +- if (worker->task) +- ret = func(worker, data); +- io_worker_release(worker); +- if (ret) +- break; +- } +- } +- +- return ret; +-} +- + void io_wq_cancel_all(struct io_wq *wq) + { + int node; +@@ -1121,12 +1127,6 @@ bool io_wq_get(struct io_wq *wq, struct + return refcount_inc_not_zero(&wq->use_refs); + } + +-static bool io_wq_worker_wake(struct io_worker *worker, void *data) +-{ +- wake_up_process(worker->task); +- return false; +-} +- + static void __io_wq_destroy(struct io_wq *wq) + { + int node; diff --git a/queue-5.8/io_uring-allow-timeout-poll-files-killing-to-take-task-into-account.patch b/queue-5.8/io_uring-allow-timeout-poll-files-killing-to-take-task-into-account.patch new file mode 100644 index 
00000000000..824498b2a01 --- /dev/null +++ b/queue-5.8/io_uring-allow-timeout-poll-files-killing-to-take-task-into-account.patch @@ -0,0 +1,92 @@ +From foo@baz Thu Oct 29 01:16:54 PM CET 2020 +From: Jens Axboe +Date: Tue, 22 Sep 2020 08:18:24 -0600 +Subject: io_uring: allow timeout/poll/files killing to take task into account + +From: Jens Axboe + +commit f3606e3a92ddd36299642c78592fc87609abb1f6 upstream. + +We currently cancel these when the ring exits, and we cancel all of +them. This is in preparation for killing only the ones associated +with a given task. + +Reviewed-by: Pavel Begunkov +Signed-off-by: Jens Axboe +Signed-off-by: Greg Kroah-Hartman +--- + fs/io_uring.c | 30 ++++++++++++++++++++++-------- + 1 file changed, 22 insertions(+), 8 deletions(-) + +--- a/fs/io_uring.c ++++ b/fs/io_uring.c +@@ -1141,13 +1141,25 @@ static void io_kill_timeout(struct io_ki + } + } + +-static void io_kill_timeouts(struct io_ring_ctx *ctx) ++static bool io_task_match(struct io_kiocb *req, struct task_struct *tsk) ++{ ++ struct io_ring_ctx *ctx = req->ctx; ++ ++ if (!tsk || req->task == tsk) ++ return true; ++ if ((ctx->flags & IORING_SETUP_SQPOLL) && req->task == ctx->sqo_thread) ++ return true; ++ return false; ++} ++ ++static void io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk) + { + struct io_kiocb *req, *tmp; + + spin_lock_irq(&ctx->completion_lock); + list_for_each_entry_safe(req, tmp, &ctx->timeout_list, list) +- io_kill_timeout(req); ++ if (io_task_match(req, tsk)) ++ io_kill_timeout(req); + spin_unlock_irq(&ctx->completion_lock); + } + +@@ -4641,7 +4653,7 @@ static bool io_poll_remove_one(struct io + return do_complete; + } + +-static void io_poll_remove_all(struct io_ring_ctx *ctx) ++static void io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk) + { + struct hlist_node *tmp; + struct io_kiocb *req; +@@ -4652,8 +4664,10 @@ static void io_poll_remove_all(struct io + struct hlist_head *list; + + list = &ctx->cancel_hash[i]; +- hlist_for_each_entry_safe(req, tmp, list, hash_node) +- posted += io_poll_remove_one(req); ++ hlist_for_each_entry_safe(req, tmp, list, hash_node) { ++ if (io_task_match(req, tsk)) ++ posted += io_poll_remove_one(req); ++ } + } + spin_unlock_irq(&ctx->completion_lock); + +@@ -7556,8 +7570,8 @@ static void io_ring_ctx_wait_and_kill(st + percpu_ref_kill(&ctx->refs); + mutex_unlock(&ctx->uring_lock); + +- io_kill_timeouts(ctx); +- io_poll_remove_all(ctx); ++ io_kill_timeouts(ctx, NULL); ++ io_poll_remove_all(ctx, NULL); + + if (ctx->io_wq) + io_wq_cancel_all(ctx->io_wq); +@@ -7809,7 +7823,7 @@ static bool io_cancel_task_cb(struct io_ + struct io_kiocb *req = container_of(work, struct io_kiocb, work); + struct task_struct *task = data; + +- return req->task == task; ++ return io_task_match(req, task); + } + + static int io_uring_flush(struct file *file, void *data) diff --git a/queue-5.8/io_uring-convert-advanced-xarray-uses-to-the-normal-api.patch b/queue-5.8/io_uring-convert-advanced-xarray-uses-to-the-normal-api.patch new file mode 100644 index 00000000000..eac61ac7519 --- /dev/null +++ b/queue-5.8/io_uring-convert-advanced-xarray-uses-to-the-normal-api.patch @@ -0,0 +1,52 @@ +From foo@baz Thu Oct 29 01:16:54 PM CET 2020 +From: "Matthew Wilcox (Oracle)" +Date: Fri, 9 Oct 2020 13:49:53 +0100 +Subject: io_uring: Convert advanced XArray uses to the normal API + +From: "Matthew Wilcox (Oracle)" + +commit 5e2ed8c4f45093698855b1f45cdf43efbf6dd498 upstream. 
+ +There are no bugs here that I've spotted, it's just easier to use the +normal API and there are no performance advantages to using the more +verbose advanced API. + +Signed-off-by: Matthew Wilcox (Oracle) +Signed-off-by: Jens Axboe +Signed-off-by: Greg Kroah-Hartman +--- + fs/io_uring.c | 14 ++------------ + 1 file changed, 2 insertions(+), 12 deletions(-) + +--- a/fs/io_uring.c ++++ b/fs/io_uring.c +@@ -7958,27 +7958,17 @@ static int io_uring_add_task_file(struct + static void io_uring_del_task_file(struct file *file) + { + struct io_uring_task *tctx = current->io_uring; +- XA_STATE(xas, &tctx->xa, (unsigned long) file); + + if (tctx->last == file) + tctx->last = NULL; +- +- xas_lock(&xas); +- file = xas_store(&xas, NULL); +- xas_unlock(&xas); +- ++ file = xa_erase(&tctx->xa, (unsigned long)file); + if (file) + fput(file); + } + + static void __io_uring_attempt_task_drop(struct file *file) + { +- XA_STATE(xas, ¤t->io_uring->xa, (unsigned long) file); +- struct file *old; +- +- rcu_read_lock(); +- old = xas_load(&xas); +- rcu_read_unlock(); ++ struct file *old = xa_load(¤t->io_uring->xa, (unsigned long)file); + + if (old == file) + io_uring_del_task_file(file); diff --git a/queue-5.8/io_uring-don-t-rely-on-weak-files-references.patch b/queue-5.8/io_uring-don-t-rely-on-weak-files-references.patch new file mode 100644 index 00000000000..c0f8a770971 --- /dev/null +++ b/queue-5.8/io_uring-don-t-rely-on-weak-files-references.patch @@ -0,0 +1,648 @@ +From foo@baz Thu Oct 29 01:16:54 PM CET 2020 +From: Jens Axboe +Date: Sun, 13 Sep 2020 13:09:39 -0600 +Subject: io_uring: don't rely on weak ->files references + +From: Jens Axboe + +commit 0f2122045b946241a9e549c2a76cea54fa58a7ff upstream. + +Grab actual references to the files_struct. To avoid circular references +issues due to this, we add a per-task note that keeps track of what +io_uring contexts a task has used. When the tasks execs or exits its +assigned files, we cancel requests based on this tracking. + +With that, we can grab proper references to the files table, and no +longer need to rely on stashing away ring_fd and ring_file to check +if the ring_fd may have been closed. + +Cc: stable@vger.kernel.org # v5.5+ +Reviewed-by: Pavel Begunkov +Signed-off-by: Jens Axboe +Signed-off-by: Greg Kroah-Hartman +--- + fs/exec.c | 6 + fs/file.c | 2 + fs/io_uring.c | 301 +++++++++++++++++++++++++++++++++++++++++------ + include/linux/io_uring.h | 53 ++++++++ + include/linux/sched.h | 5 + init/init_task.c | 3 + kernel/fork.c | 6 + 7 files changed, 340 insertions(+), 36 deletions(-) + create mode 100644 include/linux/io_uring.h + +--- a/fs/exec.c ++++ b/fs/exec.c +@@ -62,6 +62,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -1847,6 +1848,11 @@ static int __do_execve_file(int fd, stru + * further execve() calls fail. 
*/ + current->flags &= ~PF_NPROC_EXCEEDED; + ++ /* ++ * Cancel any io_uring activity across execve ++ */ ++ io_uring_task_cancel(); ++ + retval = unshare_files(&displaced); + if (retval) + goto out_ret; +--- a/fs/file.c ++++ b/fs/file.c +@@ -18,6 +18,7 @@ + #include + #include + #include ++#include + + unsigned int sysctl_nr_open __read_mostly = 1024*1024; + unsigned int sysctl_nr_open_min = BITS_PER_LONG; +@@ -439,6 +440,7 @@ void exit_files(struct task_struct *tsk) + struct files_struct * files = tsk->files; + + if (files) { ++ io_uring_files_cancel(files); + task_lock(tsk); + tsk->files = NULL; + task_unlock(tsk); +--- a/fs/io_uring.c ++++ b/fs/io_uring.c +@@ -78,6 +78,7 @@ + #include + #include + #include ++#include + + #define CREATE_TRACE_POINTS + #include +@@ -283,8 +284,6 @@ struct io_ring_ctx { + */ + struct fixed_file_data *file_data; + unsigned nr_user_files; +- int ring_fd; +- struct file *ring_file; + + /* if used, fixed mapped user buffers */ + unsigned nr_user_bufs; +@@ -1335,7 +1334,12 @@ static void __io_cqring_fill_event(struc + WRITE_ONCE(cqe->user_data, req->user_data); + WRITE_ONCE(cqe->res, res); + WRITE_ONCE(cqe->flags, cflags); +- } else if (ctx->cq_overflow_flushed) { ++ } else if (ctx->cq_overflow_flushed || req->task->io_uring->in_idle) { ++ /* ++ * If we're in ring overflow flush mode, or in task cancel mode, ++ * then we cannot store the request for later flushing, we need ++ * to drop it on the floor. ++ */ + WRITE_ONCE(ctx->rings->cq_overflow, + atomic_inc_return(&ctx->cached_cq_overflow)); + } else { +@@ -1451,17 +1455,22 @@ static void io_req_drop_files(struct io_ + wake_up(&ctx->inflight_wait); + spin_unlock_irqrestore(&ctx->inflight_lock, flags); + req->flags &= ~REQ_F_INFLIGHT; ++ put_files_struct(req->work.files); + req->work.files = NULL; + } + + static void __io_req_aux_free(struct io_kiocb *req) + { ++ struct io_uring_task *tctx = req->task->io_uring; + if (req->flags & REQ_F_NEED_CLEANUP) + io_cleanup_req(req); + + kfree(req->io); + if (req->file) + io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE)); ++ atomic_long_inc(&tctx->req_complete); ++ if (tctx->in_idle) ++ wake_up(&tctx->wait); + put_task_struct(req->task); + io_req_work_drop_env(req); + } +@@ -3532,8 +3541,7 @@ static int io_close_prep(struct io_kiocb + return -EBADF; + + req->close.fd = READ_ONCE(sqe->fd); +- if ((req->file && req->file->f_op == &io_uring_fops) || +- req->close.fd == req->ctx->ring_fd) ++ if ((req->file && req->file->f_op == &io_uring_fops)) + return -EBADF; + + req->close.put_file = NULL; +@@ -5671,32 +5679,18 @@ static int io_req_set_file(struct io_sub + + static int io_grab_files(struct io_kiocb *req) + { +- int ret = -EBADF; + struct io_ring_ctx *ctx = req->ctx; + + if (req->work.files || (req->flags & REQ_F_NO_FILE_TABLE)) + return 0; +- if (!ctx->ring_file) +- return -EBADF; + +- rcu_read_lock(); ++ req->work.files = get_files_struct(current); ++ req->flags |= REQ_F_INFLIGHT; ++ + spin_lock_irq(&ctx->inflight_lock); +- /* +- * We use the f_ops->flush() handler to ensure that we can flush +- * out work accessing these files if the fd is closed. Check if +- * the fd has changed since we started down this path, and disallow +- * this operation if it has. 
+- */ +- if (fcheck(ctx->ring_fd) == ctx->ring_file) { +- list_add(&req->inflight_entry, &ctx->inflight_list); +- req->flags |= REQ_F_INFLIGHT; +- req->work.files = current->files; +- ret = 0; +- } ++ list_add(&req->inflight_entry, &ctx->inflight_list); + spin_unlock_irq(&ctx->inflight_lock); +- rcu_read_unlock(); +- +- return ret; ++ return 0; + } + + static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) +@@ -6067,6 +6061,7 @@ static int io_init_req(struct io_ring_ct + refcount_set(&req->refs, 2); + req->task = current; + get_task_struct(req->task); ++ atomic_long_inc(&req->task->io_uring->req_issue); + req->result = 0; + + if (unlikely(req->opcode >= IORING_OP_LAST)) +@@ -6102,8 +6097,7 @@ static int io_init_req(struct io_ring_ct + return io_req_set_file(state, req, READ_ONCE(sqe->fd)); + } + +-static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, +- struct file *ring_file, int ring_fd) ++static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) + { + struct io_submit_state state, *statep = NULL; + struct io_kiocb *link = NULL; +@@ -6127,9 +6121,6 @@ static int io_submit_sqes(struct io_ring + statep = &state; + } + +- ctx->ring_fd = ring_fd; +- ctx->ring_file = ring_file; +- + for (i = 0; i < nr; i++) { + const struct io_uring_sqe *sqe; + struct io_kiocb *req; +@@ -6290,7 +6281,7 @@ static int io_sq_thread(void *data) + + mutex_lock(&ctx->uring_lock); + if (likely(!percpu_ref_is_dying(&ctx->refs))) +- ret = io_submit_sqes(ctx, to_submit, NULL, -1); ++ ret = io_submit_sqes(ctx, to_submit); + mutex_unlock(&ctx->uring_lock); + timeout = jiffies + ctx->sq_thread_idle; + } +@@ -7119,6 +7110,34 @@ out_fput: + return ret; + } + ++static int io_uring_alloc_task_context(struct task_struct *task) ++{ ++ struct io_uring_task *tctx; ++ ++ tctx = kmalloc(sizeof(*tctx), GFP_KERNEL); ++ if (unlikely(!tctx)) ++ return -ENOMEM; ++ ++ xa_init(&tctx->xa); ++ init_waitqueue_head(&tctx->wait); ++ tctx->last = NULL; ++ tctx->in_idle = 0; ++ atomic_long_set(&tctx->req_issue, 0); ++ atomic_long_set(&tctx->req_complete, 0); ++ task->io_uring = tctx; ++ return 0; ++} ++ ++void __io_uring_free(struct task_struct *tsk) ++{ ++ struct io_uring_task *tctx = tsk->io_uring; ++ ++ WARN_ON_ONCE(!xa_empty(&tctx->xa)); ++ xa_destroy(&tctx->xa); ++ kfree(tctx); ++ tsk->io_uring = NULL; ++} ++ + static int io_sq_offload_start(struct io_ring_ctx *ctx, + struct io_uring_params *p) + { +@@ -7154,6 +7173,9 @@ static int io_sq_offload_start(struct io + ctx->sqo_thread = NULL; + goto err; + } ++ ret = io_uring_alloc_task_context(ctx->sqo_thread); ++ if (ret) ++ goto err; + wake_up_process(ctx->sqo_thread); + } else if (p->flags & IORING_SETUP_SQ_AFF) { + /* Can't have SQ_AFF without SQPOLL */ +@@ -7633,7 +7655,7 @@ static bool io_wq_files_match(struct io_ + { + struct files_struct *files = data; + +- return work->files == files; ++ return !files || work->files == files; + } + + /* +@@ -7787,7 +7809,7 @@ static bool io_uring_cancel_files(struct + + spin_lock_irq(&ctx->inflight_lock); + list_for_each_entry(req, &ctx->inflight_list, inflight_entry) { +- if (req->work.files != files) ++ if (files && req->work.files != files) + continue; + /* req is being completed, ignore */ + if (!refcount_inc_not_zero(&req->refs)) +@@ -7850,18 +7872,217 @@ static bool io_cancel_task_cb(struct io_ + return io_task_match(req, task); + } + ++static bool __io_uring_cancel_task_requests(struct io_ring_ctx *ctx, ++ struct task_struct *task, ++ struct files_struct *files) ++{ ++ bool ret; ++ ++ ret = 
io_uring_cancel_files(ctx, files); ++ if (!files) { ++ enum io_wq_cancel cret; ++ ++ cret = io_wq_cancel_cb(ctx->io_wq, io_cancel_task_cb, task, true); ++ if (cret != IO_WQ_CANCEL_NOTFOUND) ++ ret = true; ++ ++ /* SQPOLL thread does its own polling */ ++ if (!(ctx->flags & IORING_SETUP_SQPOLL)) { ++ if (!list_empty_careful(&ctx->poll_list)) { ++ io_iopoll_reap_events(ctx); ++ ret = true; ++ } ++ } ++ ++ ret |= io_poll_remove_all(ctx, task); ++ ret |= io_kill_timeouts(ctx, task); ++ } ++ ++ return ret; ++} ++ ++/* ++ * We need to iteratively cancel requests, in case a request has dependent ++ * hard links. These persist even for failure of cancelations, hence keep ++ * looping until none are found. ++ */ ++static void io_uring_cancel_task_requests(struct io_ring_ctx *ctx, ++ struct files_struct *files) ++{ ++ struct task_struct *task = current; ++ ++ if (ctx->flags & IORING_SETUP_SQPOLL) ++ task = ctx->sqo_thread; ++ ++ io_cqring_overflow_flush(ctx, true, task, files); ++ ++ while (__io_uring_cancel_task_requests(ctx, task, files)) { ++ io_run_task_work(); ++ cond_resched(); ++ } ++} ++ ++/* ++ * Note that this task has used io_uring. We use it for cancelation purposes. ++ */ ++static int io_uring_add_task_file(struct file *file) ++{ ++ if (unlikely(!current->io_uring)) { ++ int ret; ++ ++ ret = io_uring_alloc_task_context(current); ++ if (unlikely(ret)) ++ return ret; ++ } ++ if (current->io_uring->last != file) { ++ XA_STATE(xas, ¤t->io_uring->xa, (unsigned long) file); ++ void *old; ++ ++ rcu_read_lock(); ++ old = xas_load(&xas); ++ if (old != file) { ++ get_file(file); ++ xas_lock(&xas); ++ xas_store(&xas, file); ++ xas_unlock(&xas); ++ } ++ rcu_read_unlock(); ++ current->io_uring->last = file; ++ } ++ ++ return 0; ++} ++ ++/* ++ * Remove this io_uring_file -> task mapping. ++ */ ++static void io_uring_del_task_file(struct file *file) ++{ ++ struct io_uring_task *tctx = current->io_uring; ++ XA_STATE(xas, &tctx->xa, (unsigned long) file); ++ ++ if (tctx->last == file) ++ tctx->last = NULL; ++ ++ xas_lock(&xas); ++ file = xas_store(&xas, NULL); ++ xas_unlock(&xas); ++ ++ if (file) ++ fput(file); ++} ++ ++static void __io_uring_attempt_task_drop(struct file *file) ++{ ++ XA_STATE(xas, ¤t->io_uring->xa, (unsigned long) file); ++ struct file *old; ++ ++ rcu_read_lock(); ++ old = xas_load(&xas); ++ rcu_read_unlock(); ++ ++ if (old == file) ++ io_uring_del_task_file(file); ++} ++ ++/* ++ * Drop task note for this file if we're the only ones that hold it after ++ * pending fput() ++ */ ++static void io_uring_attempt_task_drop(struct file *file, bool exiting) ++{ ++ if (!current->io_uring) ++ return; ++ /* ++ * fput() is pending, will be 2 if the only other ref is our potential ++ * task file note. If the task is exiting, drop regardless of count. 
++ */ ++ if (!exiting && atomic_long_read(&file->f_count) != 2) ++ return; ++ ++ __io_uring_attempt_task_drop(file); ++} ++ ++void __io_uring_files_cancel(struct files_struct *files) ++{ ++ struct io_uring_task *tctx = current->io_uring; ++ XA_STATE(xas, &tctx->xa, 0); ++ ++ /* make sure overflow events are dropped */ ++ tctx->in_idle = true; ++ ++ do { ++ struct io_ring_ctx *ctx; ++ struct file *file; ++ ++ xas_lock(&xas); ++ file = xas_next_entry(&xas, ULONG_MAX); ++ xas_unlock(&xas); ++ ++ if (!file) ++ break; ++ ++ ctx = file->private_data; ++ ++ io_uring_cancel_task_requests(ctx, files); ++ if (files) ++ io_uring_del_task_file(file); ++ } while (1); ++} ++ ++static inline bool io_uring_task_idle(struct io_uring_task *tctx) ++{ ++ return atomic_long_read(&tctx->req_issue) == ++ atomic_long_read(&tctx->req_complete); ++} ++ ++/* ++ * Find any io_uring fd that this task has registered or done IO on, and cancel ++ * requests. ++ */ ++void __io_uring_task_cancel(void) ++{ ++ struct io_uring_task *tctx = current->io_uring; ++ DEFINE_WAIT(wait); ++ long completions; ++ ++ /* make sure overflow events are dropped */ ++ tctx->in_idle = true; ++ ++ while (!io_uring_task_idle(tctx)) { ++ /* read completions before cancelations */ ++ completions = atomic_long_read(&tctx->req_complete); ++ __io_uring_files_cancel(NULL); ++ ++ prepare_to_wait(&tctx->wait, &wait, TASK_UNINTERRUPTIBLE); ++ ++ /* ++ * If we've seen completions, retry. This avoids a race where ++ * a completion comes in before we did prepare_to_wait(). ++ */ ++ if (completions != atomic_long_read(&tctx->req_complete)) ++ continue; ++ if (io_uring_task_idle(tctx)) ++ break; ++ schedule(); ++ } ++ ++ finish_wait(&tctx->wait, &wait); ++ tctx->in_idle = false; ++} ++ + static int io_uring_flush(struct file *file, void *data) + { + struct io_ring_ctx *ctx = file->private_data; + +- io_uring_cancel_files(ctx, data); +- + /* + * If the task is going away, cancel work it may have pending + */ + if (fatal_signal_pending(current) || (current->flags & PF_EXITING)) +- io_wq_cancel_cb(ctx->io_wq, io_cancel_task_cb, current, true); ++ data = NULL; + ++ io_uring_cancel_task_requests(ctx, data); ++ io_uring_attempt_task_drop(file, !data); + return 0; + } + +@@ -7975,8 +8196,11 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned + wake_up(&ctx->sqo_wait); + submitted = to_submit; + } else if (to_submit) { ++ ret = io_uring_add_task_file(f.file); ++ if (unlikely(ret)) ++ goto out; + mutex_lock(&ctx->uring_lock); +- submitted = io_submit_sqes(ctx, to_submit, f.file, fd); ++ submitted = io_submit_sqes(ctx, to_submit); + mutex_unlock(&ctx->uring_lock); + + if (submitted != to_submit) +@@ -8188,6 +8412,7 @@ static int io_uring_get_fd(struct io_rin + file = anon_inode_getfile("[io_uring]", &io_uring_fops, ctx, + O_RDWR | O_CLOEXEC); + if (IS_ERR(file)) { ++err_fd: + put_unused_fd(ret); + ret = PTR_ERR(file); + goto err; +@@ -8196,6 +8421,10 @@ static int io_uring_get_fd(struct io_rin + #if defined(CONFIG_UNIX) + ctx->ring_sock->file = file; + #endif ++ if (unlikely(io_uring_add_task_file(file))) { ++ file = ERR_PTR(-ENOMEM); ++ goto err_fd; ++ } + fd_install(ret, file); + return ret; + err: +--- /dev/null ++++ b/include/linux/io_uring.h +@@ -0,0 +1,53 @@ ++/* SPDX-License-Identifier: GPL-2.0-or-later */ ++#ifndef _LINUX_IO_URING_H ++#define _LINUX_IO_URING_H ++ ++#include ++#include ++#include ++ ++struct io_uring_task { ++ /* submission side */ ++ struct xarray xa; ++ struct wait_queue_head wait; ++ struct file *last; ++ atomic_long_t req_issue; ++ ++ /* completion 
side */ ++ bool in_idle ____cacheline_aligned_in_smp; ++ atomic_long_t req_complete; ++}; ++ ++#if defined(CONFIG_IO_URING) ++void __io_uring_task_cancel(void); ++void __io_uring_files_cancel(struct files_struct *files); ++void __io_uring_free(struct task_struct *tsk); ++ ++static inline void io_uring_task_cancel(void) ++{ ++ if (current->io_uring && !xa_empty(¤t->io_uring->xa)) ++ __io_uring_task_cancel(); ++} ++static inline void io_uring_files_cancel(struct files_struct *files) ++{ ++ if (current->io_uring && !xa_empty(¤t->io_uring->xa)) ++ __io_uring_files_cancel(files); ++} ++static inline void io_uring_free(struct task_struct *tsk) ++{ ++ if (tsk->io_uring) ++ __io_uring_free(tsk); ++} ++#else ++static inline void io_uring_task_cancel(void) ++{ ++} ++static inline void io_uring_files_cancel(struct files_struct *files) ++{ ++} ++static inline void io_uring_free(struct task_struct *tsk) ++{ ++} ++#endif ++ ++#endif +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -61,6 +61,7 @@ struct sighand_struct; + struct signal_struct; + struct task_delay_info; + struct task_group; ++struct io_uring_task; + + /* + * Task state bitmask. NOTE! These bits are also +@@ -923,6 +924,10 @@ struct task_struct { + /* Open file information: */ + struct files_struct *files; + ++#ifdef CONFIG_IO_URING ++ struct io_uring_task *io_uring; ++#endif ++ + /* Namespaces: */ + struct nsproxy *nsproxy; + +--- a/init/init_task.c ++++ b/init/init_task.c +@@ -113,6 +113,9 @@ struct task_struct init_task + .thread = INIT_THREAD, + .fs = &init_fs, + .files = &init_files, ++#ifdef CONFIG_IO_URING ++ .io_uring = NULL, ++#endif + .signal = &init_signals, + .sighand = &init_sighand, + .nsproxy = &init_nsproxy, +--- a/kernel/fork.c ++++ b/kernel/fork.c +@@ -95,6 +95,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -745,6 +746,7 @@ void __put_task_struct(struct task_struc + WARN_ON(refcount_read(&tsk->usage)); + WARN_ON(tsk == current); + ++ io_uring_free(tsk); + cgroup_free(tsk); + task_numa_free(tsk, true); + security_task_free(tsk); +@@ -2022,6 +2024,10 @@ static __latent_entropy struct task_stru + p->vtime.state = VTIME_INACTIVE; + #endif + ++#ifdef CONFIG_IO_URING ++ p->io_uring = NULL; ++#endif ++ + #if defined(SPLIT_RSS_COUNTING) + memset(&p->rss_stat, 0, sizeof(p->rss_stat)); + #endif diff --git a/queue-5.8/io_uring-don-t-run-task-work-on-an-exiting-task.patch b/queue-5.8/io_uring-don-t-run-task-work-on-an-exiting-task.patch new file mode 100644 index 00000000000..e61b12717dd --- /dev/null +++ b/queue-5.8/io_uring-don-t-run-task-work-on-an-exiting-task.patch @@ -0,0 +1,43 @@ +From foo@baz Thu Oct 29 01:16:54 PM CET 2020 +From: Jens Axboe +Date: Mon, 12 Oct 2020 11:53:29 -0600 +Subject: io_uring: don't run task work on an exiting task + +From: Jens Axboe + +commit 6200b0ae4ea28a4bfd8eb434e33e6201b7a6a282 upstream. + +This isn't safe, and isn't needed either. We are guaranteed that any +work we queue is on a live task (and will be run), or it goes to +our backup io-wq threads if the task is exiting. + +Signed-off-by: Jens Axboe +Signed-off-by: Greg Kroah-Hartman +--- + fs/io_uring.c | 8 ++++++++ + 1 file changed, 8 insertions(+) + +--- a/fs/io_uring.c ++++ b/fs/io_uring.c +@@ -1762,6 +1762,12 @@ static int io_put_kbuf(struct io_kiocb * + + static inline bool io_run_task_work(void) + { ++ /* ++ * Not safe to run on exiting task, and the task_work handling will ++ * not add work to such a task. 
++ */ ++ if (unlikely(current->flags & PF_EXITING)) ++ return false; + if (current->task_works) { + __set_current_state(TASK_RUNNING); + task_work_run(); +@@ -7791,6 +7797,8 @@ static void io_uring_cancel_files(struct + io_put_req(cancel_req); + } + ++ /* cancellations _may_ trigger task work */ ++ io_run_task_work(); + schedule(); + finish_wait(&ctx->inflight_wait, &wait); + } diff --git a/queue-5.8/io_uring-enable-task-files-specific-overflow-flushing.patch b/queue-5.8/io_uring-enable-task-files-specific-overflow-flushing.patch new file mode 100644 index 00000000000..cad20791fb2 --- /dev/null +++ b/queue-5.8/io_uring-enable-task-files-specific-overflow-flushing.patch @@ -0,0 +1,131 @@ +From foo@baz Thu Oct 29 01:16:54 PM CET 2020 +From: Jens Axboe +Date: Mon, 28 Sep 2020 13:10:13 -0600 +Subject: io_uring: enable task/files specific overflow flushing + +From: Jens Axboe + +commit e6c8aa9ac33bd7c968af7816240fc081401fddcd upstream. + +This allows us to selectively flush out pending overflows, depending on +the task and/or files_struct being passed in. + +No intended functional changes in this patch. + +Reviewed-by: Pavel Begunkov +Signed-off-by: Jens Axboe +Signed-off-by: Greg Kroah-Hartman +--- + fs/io_uring.c | 41 ++++++++++++++++++++++++++--------------- + 1 file changed, 26 insertions(+), 15 deletions(-) + +--- a/fs/io_uring.c ++++ b/fs/io_uring.c +@@ -1240,12 +1240,24 @@ static void io_cqring_ev_posted(struct i + eventfd_signal(ctx->cq_ev_fd, 1); + } + ++static inline bool io_match_files(struct io_kiocb *req, ++ struct files_struct *files) ++{ ++ if (!files) ++ return true; ++ if (req->flags & REQ_F_WORK_INITIALIZED) ++ return req->work.files == files; ++ return false; ++} ++ + /* Returns true if there are no backlogged entries after the flush */ +-static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) ++static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force, ++ struct task_struct *tsk, ++ struct files_struct *files) + { + struct io_rings *rings = ctx->rings; ++ struct io_kiocb *req, *tmp; + struct io_uring_cqe *cqe; +- struct io_kiocb *req; + unsigned long flags; + LIST_HEAD(list); + +@@ -1264,7 +1276,12 @@ static bool io_cqring_overflow_flush(str + ctx->cq_overflow_flushed = 1; + + cqe = NULL; +- while (!list_empty(&ctx->cq_overflow_list)) { ++ list_for_each_entry_safe(req, tmp, &ctx->cq_overflow_list, list) { ++ if (tsk && req->task != tsk) ++ continue; ++ if (!io_match_files(req, files)) ++ continue; ++ + cqe = io_get_cqring(ctx); + if (!cqe && !force) + break; +@@ -1734,7 +1751,7 @@ static unsigned io_cqring_events(struct + if (noflush && !list_empty(&ctx->cq_overflow_list)) + return -1U; + +- io_cqring_overflow_flush(ctx, false); ++ io_cqring_overflow_flush(ctx, false, NULL, NULL); + } + + /* See comment at the top of this file */ +@@ -6095,7 +6112,7 @@ static int io_submit_sqes(struct io_ring + /* if we have a backlog and couldn't flush it all, return BUSY */ + if (test_bit(0, &ctx->sq_check_overflow)) { + if (!list_empty(&ctx->cq_overflow_list) && +- !io_cqring_overflow_flush(ctx, false)) ++ !io_cqring_overflow_flush(ctx, false, NULL, NULL)) + return -EBUSY; + } + +@@ -7556,7 +7573,7 @@ static void io_ring_exit_work(struct wor + + ctx = container_of(work, struct io_ring_ctx, exit_work); + if (ctx->rings) +- io_cqring_overflow_flush(ctx, true); ++ io_cqring_overflow_flush(ctx, true, NULL, NULL); + + /* + * If we're doing polled IO and end up having requests being +@@ -7567,7 +7584,7 @@ static void io_ring_exit_work(struct wor + while 
(!wait_for_completion_timeout(&ctx->ref_comp, HZ/20)) { + io_iopoll_reap_events(ctx); + if (ctx->rings) +- io_cqring_overflow_flush(ctx, true); ++ io_cqring_overflow_flush(ctx, true, NULL, NULL); + } + io_ring_ctx_free(ctx); + } +@@ -7587,7 +7604,7 @@ static void io_ring_ctx_wait_and_kill(st + io_iopoll_reap_events(ctx); + /* if we failed setting up the ctx, we might not have any rings */ + if (ctx->rings) +- io_cqring_overflow_flush(ctx, true); ++ io_cqring_overflow_flush(ctx, true, NULL, NULL); + idr_for_each(&ctx->personality_idr, io_remove_personalities, ctx); + + /* +@@ -7637,12 +7654,6 @@ static bool io_match_link(struct io_kioc + return false; + } + +-static inline bool io_match_files(struct io_kiocb *req, +- struct files_struct *files) +-{ +- return (req->flags & REQ_F_WORK_INITIALIZED) && req->work.files == files; +-} +- + static bool io_match_link_files(struct io_kiocb *req, + struct files_struct *files) + { +@@ -7959,7 +7970,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned + ret = 0; + if (ctx->flags & IORING_SETUP_SQPOLL) { + if (!list_empty_careful(&ctx->cq_overflow_list)) +- io_cqring_overflow_flush(ctx, false); ++ io_cqring_overflow_flush(ctx, false, NULL, NULL); + if (flags & IORING_ENTER_SQ_WAKEUP) + wake_up(&ctx->sqo_wait); + submitted = to_submit; diff --git a/queue-5.8/io_uring-fix-use-of-xarray-in-__io_uring_files_cancel.patch b/queue-5.8/io_uring-fix-use-of-xarray-in-__io_uring_files_cancel.patch new file mode 100644 index 00000000000..72a693f66a7 --- /dev/null +++ b/queue-5.8/io_uring-fix-use-of-xarray-in-__io_uring_files_cancel.patch @@ -0,0 +1,56 @@ +From foo@baz Thu Oct 29 01:16:54 PM CET 2020 +From: "Matthew Wilcox (Oracle)" +Date: Fri, 9 Oct 2020 13:49:51 +0100 +Subject: io_uring: Fix use of XArray in __io_uring_files_cancel + +From: "Matthew Wilcox (Oracle)" + +commit ce765372bc443573d1d339a2bf4995de385dea3a upstream. + +We have to drop the lock during each iteration, so there's no advantage +to using the advanced API. Convert this to a standard xa_for_each() loop. 
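For reference, a minimal sketch of the plain-API loop this conversion ends up with (illustration only, not part of the upstream commit; names mirror the __io_uring_files_cancel() hunk further below):

	struct io_uring_task *tctx = current->io_uring;
	struct file *file;
	unsigned long index;

	/* xa_for_each() looks up each entry on its own, so no xa_lock is
	 * held across the loop body and the body is free to block. */
	xa_for_each(&tctx->xa, index, file) {
		struct io_ring_ctx *ctx = file->private_data;

		io_uring_cancel_task_requests(ctx, files);
		if (files)
			io_uring_del_task_file(file);
	}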
+ +Reported-by: syzbot+27c12725d8ff0bfe1a13@syzkaller.appspotmail.com +Signed-off-by: Matthew Wilcox (Oracle) +Signed-off-by: Jens Axboe +Signed-off-by: Greg Kroah-Hartman +--- + fs/io_uring.c | 19 +++++-------------- + 1 file changed, 5 insertions(+), 14 deletions(-) + +--- a/fs/io_uring.c ++++ b/fs/io_uring.c +@@ -8008,28 +8008,19 @@ static void io_uring_attempt_task_drop(s + void __io_uring_files_cancel(struct files_struct *files) + { + struct io_uring_task *tctx = current->io_uring; +- XA_STATE(xas, &tctx->xa, 0); ++ struct file *file; ++ unsigned long index; + + /* make sure overflow events are dropped */ + tctx->in_idle = true; + +- do { +- struct io_ring_ctx *ctx; +- struct file *file; +- +- xas_lock(&xas); +- file = xas_next_entry(&xas, ULONG_MAX); +- xas_unlock(&xas); +- +- if (!file) +- break; +- +- ctx = file->private_data; ++ xa_for_each(&tctx->xa, index, file) { ++ struct io_ring_ctx *ctx = file->private_data; + + io_uring_cancel_task_requests(ctx, files); + if (files) + io_uring_del_task_file(file); +- } while (1); ++ } + } + + static inline bool io_uring_task_idle(struct io_uring_task *tctx) diff --git a/queue-5.8/io_uring-fix-xarray-usage-in-io_uring_add_task_file.patch b/queue-5.8/io_uring-fix-xarray-usage-in-io_uring_add_task_file.patch new file mode 100644 index 00000000000..e8036233afa --- /dev/null +++ b/queue-5.8/io_uring-fix-xarray-usage-in-io_uring_add_task_file.patch @@ -0,0 +1,64 @@ +From foo@baz Thu Oct 29 01:16:54 PM CET 2020 +From: "Matthew Wilcox (Oracle)" +Date: Fri, 9 Oct 2020 13:49:52 +0100 +Subject: io_uring: Fix XArray usage in io_uring_add_task_file + +From: "Matthew Wilcox (Oracle)" + +commit 236434c3438c4da3dfbd6aeeab807577b85e951a upstream. + +The xas_store() wasn't paired with an xas_nomem() loop, so if it couldn't +allocate memory using GFP_NOWAIT, it would leak the reference to the file +descriptor. Also the node pointed to by the xas could be freed between +the call to xas_load() under the rcu_read_lock() and the acquisition of +the xa_lock. + +It's easier to just use the normal xa_load/xa_store interface here. 
+ +Signed-off-by: Matthew Wilcox (Oracle) +[axboe: fix missing assign after alloc, cur_uring -> tctx rename] +Signed-off-by: Jens Axboe +Signed-off-by: Greg Kroah-Hartman +--- + fs/io_uring.c | 21 +++++++++------------ + 1 file changed, 9 insertions(+), 12 deletions(-) + +--- a/fs/io_uring.c ++++ b/fs/io_uring.c +@@ -7929,27 +7929,24 @@ static void io_uring_cancel_task_request + */ + static int io_uring_add_task_file(struct file *file) + { +- if (unlikely(!current->io_uring)) { ++ struct io_uring_task *tctx = current->io_uring; ++ ++ if (unlikely(!tctx)) { + int ret; + + ret = io_uring_alloc_task_context(current); + if (unlikely(ret)) + return ret; ++ tctx = current->io_uring; + } +- if (current->io_uring->last != file) { +- XA_STATE(xas, ¤t->io_uring->xa, (unsigned long) file); +- void *old; ++ if (tctx->last != file) { ++ void *old = xa_load(&tctx->xa, (unsigned long)file); + +- rcu_read_lock(); +- old = xas_load(&xas); +- if (old != file) { ++ if (!old) { + get_file(file); +- xas_lock(&xas); +- xas_store(&xas, file); +- xas_unlock(&xas); ++ xa_store(&tctx->xa, (unsigned long)file, file, GFP_KERNEL); + } +- rcu_read_unlock(); +- current->io_uring->last = file; ++ tctx->last = file; + } + + return 0; diff --git a/queue-5.8/io_uring-move-dropping-of-files-into-separate-helper.patch b/queue-5.8/io_uring-move-dropping-of-files-into-separate-helper.patch new file mode 100644 index 00000000000..ecd4d40b777 --- /dev/null +++ b/queue-5.8/io_uring-move-dropping-of-files-into-separate-helper.patch @@ -0,0 +1,61 @@ +From foo@baz Thu Oct 29 01:16:54 PM CET 2020 +From: Jens Axboe +Date: Mon, 12 Oct 2020 11:03:18 -0600 +Subject: io_uring: move dropping of files into separate helper + +From: Jens Axboe + +commit f573d384456b3025d3f8e58b3eafaeeb0f510784 upstream. + +No functional changes in this patch, prep patch for grabbing references +to the files_struct. 
+ +Reviewed-by: Pavel Begunkov +Signed-off-by: Jens Axboe +Signed-off-by: Greg Kroah-Hartman +--- + fs/io_uring.c | 26 ++++++++++++++++---------- + 1 file changed, 16 insertions(+), 10 deletions(-) + +--- a/fs/io_uring.c ++++ b/fs/io_uring.c +@@ -1424,6 +1424,20 @@ static inline void io_put_file(struct io + fput(file); + } + ++static void io_req_drop_files(struct io_kiocb *req) ++{ ++ struct io_ring_ctx *ctx = req->ctx; ++ unsigned long flags; ++ ++ spin_lock_irqsave(&ctx->inflight_lock, flags); ++ list_del(&req->inflight_entry); ++ if (waitqueue_active(&ctx->inflight_wait)) ++ wake_up(&ctx->inflight_wait); ++ spin_unlock_irqrestore(&ctx->inflight_lock, flags); ++ req->flags &= ~REQ_F_INFLIGHT; ++ req->work.files = NULL; ++} ++ + static void __io_req_aux_free(struct io_kiocb *req) + { + if (req->flags & REQ_F_NEED_CLEANUP) +@@ -1440,16 +1454,8 @@ static void __io_free_req(struct io_kioc + { + __io_req_aux_free(req); + +- if (req->flags & REQ_F_INFLIGHT) { +- struct io_ring_ctx *ctx = req->ctx; +- unsigned long flags; +- +- spin_lock_irqsave(&ctx->inflight_lock, flags); +- list_del(&req->inflight_entry); +- if (waitqueue_active(&ctx->inflight_wait)) +- wake_up(&ctx->inflight_wait); +- spin_unlock_irqrestore(&ctx->inflight_lock, flags); +- } ++ if (req->flags & REQ_F_INFLIGHT) ++ io_req_drop_files(req); + + percpu_ref_put(&req->ctx->refs); + if (likely(!io_is_fallback_req(req))) diff --git a/queue-5.8/io_uring-no-need-to-call-xa_destroy-on-empty-xarray.patch b/queue-5.8/io_uring-no-need-to-call-xa_destroy-on-empty-xarray.patch new file mode 100644 index 00000000000..b74efdbda9f --- /dev/null +++ b/queue-5.8/io_uring-no-need-to-call-xa_destroy-on-empty-xarray.patch @@ -0,0 +1,105 @@ +From foo@baz Thu Oct 29 01:16:54 PM CET 2020 +From: Jens Axboe +Date: Thu, 8 Oct 2020 07:46:52 -0600 +Subject: io_uring: no need to call xa_destroy() on empty xarray + +From: Jens Axboe + +commit ca6484cd308a671811bf39f3119e81966eb476e3 upstream. + +The kernel test robot reports this lockdep issue: + +[child1:659] mbind (274) returned ENOSYS, marking as inactive. +[child1:659] mq_timedsend (279) returned ENOSYS, marking as inactive. +[main] 10175 iterations. [F:7781 S:2344 HI:2397] +[ 24.610601] +[ 24.610743] ================================ +[ 24.611083] WARNING: inconsistent lock state +[ 24.611437] 5.9.0-rc7-00017-g0f2122045b9462 #5 Not tainted +[ 24.611861] -------------------------------- +[ 24.612193] inconsistent {SOFTIRQ-ON-W} -> {IN-SOFTIRQ-W} usage. 
+[ 24.612660] ksoftirqd/0/7 [HC0[0]:SC1[3]:HE0:SE0] takes: +[ 24.613086] f00ed998 (&xa->xa_lock#4){+.?.}-{2:2}, at: xa_destroy+0x43/0xc1 +[ 24.613642] {SOFTIRQ-ON-W} state was registered at: +[ 24.614024] lock_acquire+0x20c/0x29b +[ 24.614341] _raw_spin_lock+0x21/0x30 +[ 24.614636] io_uring_add_task_file+0xe8/0x13a +[ 24.614987] io_uring_create+0x535/0x6bd +[ 24.615297] io_uring_setup+0x11d/0x136 +[ 24.615606] __ia32_sys_io_uring_setup+0xd/0xf +[ 24.615977] do_int80_syscall_32+0x53/0x6c +[ 24.616306] restore_all_switch_stack+0x0/0xb1 +[ 24.616677] irq event stamp: 939881 +[ 24.616968] hardirqs last enabled at (939880): [<8105592d>] __local_bh_enable_ip+0x13c/0x145 +[ 24.617642] hardirqs last disabled at (939881): [<81b6ace3>] _raw_spin_lock_irqsave+0x1b/0x4e +[ 24.618321] softirqs last enabled at (939738): [<81b6c7c8>] __do_softirq+0x3f0/0x45a +[ 24.618924] softirqs last disabled at (939743): [<81055741>] run_ksoftirqd+0x35/0x61 +[ 24.619521] +[ 24.619521] other info that might help us debug this: +[ 24.620028] Possible unsafe locking scenario: +[ 24.620028] +[ 24.620492] CPU0 +[ 24.620685] ---- +[ 24.620894] lock(&xa->xa_lock#4); +[ 24.621168] +[ 24.621381] lock(&xa->xa_lock#4); +[ 24.621695] +[ 24.621695] *** DEADLOCK *** +[ 24.621695] +[ 24.622154] 1 lock held by ksoftirqd/0/7: +[ 24.622468] #0: 823bfb94 (rcu_callback){....}-{0:0}, at: rcu_process_callbacks+0xc0/0x155 +[ 24.623106] +[ 24.623106] stack backtrace: +[ 24.623454] CPU: 0 PID: 7 Comm: ksoftirqd/0 Not tainted 5.9.0-rc7-00017-g0f2122045b9462 #5 +[ 24.624090] Call Trace: +[ 24.624284] ? show_stack+0x40/0x46 +[ 24.624551] dump_stack+0x1b/0x1d +[ 24.624809] print_usage_bug+0x17a/0x185 +[ 24.625142] mark_lock+0x11d/0x1db +[ 24.625474] ? print_shortest_lock_dependencies+0x121/0x121 +[ 24.625905] __lock_acquire+0x41e/0x7bf +[ 24.626206] lock_acquire+0x20c/0x29b +[ 24.626517] ? xa_destroy+0x43/0xc1 +[ 24.626810] ? lock_acquire+0x20c/0x29b +[ 24.627110] _raw_spin_lock_irqsave+0x3e/0x4e +[ 24.627450] ? xa_destroy+0x43/0xc1 +[ 24.627725] xa_destroy+0x43/0xc1 +[ 24.627989] __io_uring_free+0x57/0x71 +[ 24.628286] ? get_pid+0x22/0x22 +[ 24.628544] __put_task_struct+0xf2/0x163 +[ 24.628865] put_task_struct+0x1f/0x2a +[ 24.629161] delayed_put_task_struct+0xe2/0xe9 +[ 24.629509] rcu_process_callbacks+0x128/0x155 +[ 24.629860] __do_softirq+0x1a3/0x45a +[ 24.630151] run_ksoftirqd+0x35/0x61 +[ 24.630443] smpboot_thread_fn+0x304/0x31a +[ 24.630763] kthread+0x124/0x139 +[ 24.631016] ? sort_range+0x18/0x18 +[ 24.631290] ? kthread_create_worker_on_cpu+0x17/0x17 +[ 24.631682] ret_from_fork+0x1c/0x28 + +which is complaining about xa_destroy() grabbing the xa lock in an +IRQ disabling fashion, whereas the io_uring uses cases aren't interrupt +safe. This is really an xarray issue, since it should not assume the +lock type. But for our use case, since we know the xarray is empty at +this point, there's no need to actually call xa_destroy(). So just get +rid of it. 
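The resulting teardown in __io_uring_free() is then simply (sketch of the state after the one-line removal below):

	WARN_ON_ONCE(!xa_empty(&tctx->xa));
	/* no xa_destroy(): an empty xarray owns no nodes, so skipping it
	 * avoids taking xa_lock from the RCU free path entirely */
	kfree(tctx);
	tsk->io_uring = NULL;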
+ +Fixes: 0f2122045b94 ("io_uring: don't rely on weak ->files references") +Reported-by: kernel test robot +Signed-off-by: Jens Axboe +Signed-off-by: Greg Kroah-Hartman +--- + fs/io_uring.c | 1 - + 1 file changed, 1 deletion(-) + +--- a/fs/io_uring.c ++++ b/fs/io_uring.c +@@ -7136,7 +7136,6 @@ void __io_uring_free(struct task_struct + struct io_uring_task *tctx = tsk->io_uring; + + WARN_ON_ONCE(!xa_empty(&tctx->xa)); +- xa_destroy(&tctx->xa); + kfree(tctx); + tsk->io_uring = NULL; + } diff --git a/queue-5.8/io_uring-reference-nsproxy-for-file-table-commands.patch b/queue-5.8/io_uring-reference-nsproxy-for-file-table-commands.patch new file mode 100644 index 00000000000..f59e826eb3f --- /dev/null +++ b/queue-5.8/io_uring-reference-nsproxy-for-file-table-commands.patch @@ -0,0 +1,87 @@ +From foo@baz Thu Oct 29 01:16:54 PM CET 2020 +From: Jens Axboe +Date: Fri, 18 Sep 2020 20:13:06 -0600 +Subject: io_uring: reference ->nsproxy for file table commands + +From: Jens Axboe + +commit 9b8284921513fc1ea57d87777283a59b05862f03 upstream. + +If we don't get and assign the namespace for the async work, then certain +paths just don't work properly (like /dev/stdin, /proc/mounts, etc). +Anything that references the current namespace of the given task should +be assigned for async work on behalf of that task. + +Cc: stable@vger.kernel.org # v5.5+ +Reported-by: Al Viro +Signed-off-by: Jens Axboe +Signed-off-by: Greg Kroah-Hartman +--- + fs/io-wq.c | 4 ++++ + fs/io-wq.h | 1 + + fs/io_uring.c | 3 +++ + 3 files changed, 8 insertions(+) + +--- a/fs/io-wq.c ++++ b/fs/io-wq.c +@@ -60,6 +60,7 @@ struct io_worker { + const struct cred *cur_creds; + const struct cred *saved_creds; + struct files_struct *restore_files; ++ struct nsproxy *restore_nsproxy; + struct fs_struct *restore_fs; + }; + +@@ -153,6 +154,7 @@ static bool __io_worker_unuse(struct io_ + + task_lock(current); + current->files = worker->restore_files; ++ current->nsproxy = worker->restore_nsproxy; + task_unlock(current); + } + +@@ -318,6 +320,7 @@ static void io_worker_start(struct io_wq + + worker->flags |= (IO_WORKER_F_UP | IO_WORKER_F_RUNNING); + worker->restore_files = current->files; ++ worker->restore_nsproxy = current->nsproxy; + worker->restore_fs = current->fs; + io_wqe_inc_running(wqe, worker); + } +@@ -454,6 +457,7 @@ static void io_impersonate_work(struct i + if (work->files && current->files != work->files) { + task_lock(current); + current->files = work->files; ++ current->nsproxy = work->nsproxy; + task_unlock(current); + } + if (work->fs && current->fs != work->fs) +--- a/fs/io-wq.h ++++ b/fs/io-wq.h +@@ -88,6 +88,7 @@ struct io_wq_work { + struct files_struct *files; + struct mm_struct *mm; + const struct cred *creds; ++ struct nsproxy *nsproxy; + struct fs_struct *fs; + unsigned flags; + }; +--- a/fs/io_uring.c ++++ b/fs/io_uring.c +@@ -1456,6 +1456,7 @@ static void io_req_drop_files(struct io_ + spin_unlock_irqrestore(&ctx->inflight_lock, flags); + req->flags &= ~REQ_F_INFLIGHT; + put_files_struct(req->work.files); ++ put_nsproxy(req->work.nsproxy); + req->work.files = NULL; + } + +@@ -5685,6 +5686,8 @@ static int io_grab_files(struct io_kiocb + return 0; + + req->work.files = get_files_struct(current); ++ get_nsproxy(current->nsproxy); ++ req->work.nsproxy = current->nsproxy; + req->flags |= REQ_F_INFLIGHT; + + spin_lock_irq(&ctx->inflight_lock); diff --git a/queue-5.8/io_uring-return-cancelation-status-from-poll-timeout-files-handlers.patch b/queue-5.8/io_uring-return-cancelation-status-from-poll-timeout-files-handlers.patch new 
file mode 100644 index 00000000000..21bfa6fb379 --- /dev/null +++ b/queue-5.8/io_uring-return-cancelation-status-from-poll-timeout-files-handlers.patch @@ -0,0 +1,96 @@ +From foo@baz Thu Oct 29 01:16:54 PM CET 2020 +From: Jens Axboe +Date: Sat, 26 Sep 2020 15:05:03 -0600 +Subject: io_uring: return cancelation status from poll/timeout/files handlers + +From: Jens Axboe + +commit 76e1b6427fd8246376a97e3227049d49188dfb9c upstream. + +Return whether we found and canceled requests or not. This is in +preparation for using this information, no functional changes in this +patch. + +Reviewed-by: Pavel Begunkov +Signed-off-by: Jens Axboe +Signed-off-by: Greg Kroah-Hartman +--- + fs/io_uring.c | 30 ++++++++++++++++++++++++------ + 1 file changed, 24 insertions(+), 6 deletions(-) + +--- a/fs/io_uring.c ++++ b/fs/io_uring.c +@@ -1143,15 +1143,23 @@ static bool io_task_match(struct io_kioc + return false; + } + +-static void io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk) ++/* ++ * Returns true if we found and killed one or more timeouts ++ */ ++static bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk) + { + struct io_kiocb *req, *tmp; ++ int canceled = 0; + + spin_lock_irq(&ctx->completion_lock); +- list_for_each_entry_safe(req, tmp, &ctx->timeout_list, list) +- if (io_task_match(req, tsk)) ++ list_for_each_entry_safe(req, tmp, &ctx->timeout_list, list) { ++ if (io_task_match(req, tsk)) { + io_kill_timeout(req); ++ canceled++; ++ } ++ } + spin_unlock_irq(&ctx->completion_lock); ++ return canceled != 0; + } + + static void __io_queue_deferred(struct io_ring_ctx *ctx) +@@ -4650,7 +4658,10 @@ static bool io_poll_remove_one(struct io + return do_complete; + } + +-static void io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk) ++/* ++ * Returns true if we found and killed one or more poll requests ++ */ ++static bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk) + { + struct hlist_node *tmp; + struct io_kiocb *req; +@@ -4670,6 +4681,8 @@ static void io_poll_remove_all(struct io + + if (posted) + io_cqring_ev_posted(ctx); ++ ++ return posted != 0; + } + + static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr) +@@ -7744,11 +7757,14 @@ static void io_cancel_defer_files(struct + } + } + +-static void io_uring_cancel_files(struct io_ring_ctx *ctx, ++/* ++ * Returns true if we found and killed one or more files pinning requests ++ */ ++static bool io_uring_cancel_files(struct io_ring_ctx *ctx, + struct files_struct *files) + { + if (list_empty_careful(&ctx->inflight_list)) +- return; ++ return false; + + io_cancel_defer_files(ctx, files); + /* cancel all at once, should be faster than doing it one by one*/ +@@ -7811,6 +7827,8 @@ static void io_uring_cancel_files(struct + schedule(); + finish_wait(&ctx->inflight_wait, &wait); + } ++ ++ return true; + } + + static bool io_cancel_task_cb(struct io_wq_work *work, void *data) diff --git a/queue-5.8/io_uring-stash-ctx-task-reference-for-sqpoll.patch b/queue-5.8/io_uring-stash-ctx-task-reference-for-sqpoll.patch new file mode 100644 index 00000000000..a7241e757bd --- /dev/null +++ b/queue-5.8/io_uring-stash-ctx-task-reference-for-sqpoll.patch @@ -0,0 +1,104 @@ +From foo@baz Thu Oct 29 01:16:54 PM CET 2020 +From: Jens Axboe +Date: Mon, 12 Oct 2020 11:15:07 -0600 +Subject: io_uring: stash ctx task reference for SQPOLL + +From: Jens Axboe + +commit 2aede0e417db846793c276c7a1bbf7262c8349b0 upstream. 
+ +We can grab a reference to the task instead of stashing away the task +files_struct. This is doable without creating a circular reference +between the ring fd and the task itself. + +Reviewed-by: Pavel Begunkov +Signed-off-by: Jens Axboe +Signed-off-by: Greg Kroah-Hartman +--- + fs/io_uring.c | 39 +++++++++++++++++++++++++++++---------- + 1 file changed, 29 insertions(+), 10 deletions(-) + +--- a/fs/io_uring.c ++++ b/fs/io_uring.c +@@ -264,7 +264,16 @@ struct io_ring_ctx { + /* IO offload */ + struct io_wq *io_wq; + struct task_struct *sqo_thread; /* if using sq thread polling */ +- struct mm_struct *sqo_mm; ++ ++ /* ++ * For SQPOLL usage - we hold a reference to the parent task, so we ++ * have access to the ->files ++ */ ++ struct task_struct *sqo_task; ++ ++ /* Only used for accounting purposes */ ++ struct mm_struct *mm_account; ++ + wait_queue_head_t sqo_wait; + + /* +@@ -4421,9 +4430,10 @@ static int io_sq_thread_acquire_mm(struc + { + if (io_op_defs[req->opcode].needs_mm && !current->mm) { + if (unlikely(!(ctx->flags & IORING_SETUP_SQPOLL) || +- !mmget_not_zero(ctx->sqo_mm))) ++ !ctx->sqo_task->mm || ++ !mmget_not_zero(ctx->sqo_task->mm))) + return -EFAULT; +- kthread_use_mm(ctx->sqo_mm); ++ kthread_use_mm(ctx->sqo_task->mm); + } + + return 0; +@@ -7104,9 +7114,6 @@ static int io_sq_offload_start(struct io + { + int ret; + +- mmgrab(current->mm); +- ctx->sqo_mm = current->mm; +- + if (ctx->flags & IORING_SETUP_SQPOLL) { + ret = -EPERM; + if (!capable(CAP_SYS_ADMIN)) +@@ -7151,8 +7158,6 @@ static int io_sq_offload_start(struct io + return 0; + err: + io_finish_async(ctx); +- mmdrop(ctx->sqo_mm); +- ctx->sqo_mm = NULL; + return ret; + } + +@@ -7482,8 +7487,12 @@ static void io_destroy_buffers(struct io + static void io_ring_ctx_free(struct io_ring_ctx *ctx) + { + io_finish_async(ctx); +- if (ctx->sqo_mm) +- mmdrop(ctx->sqo_mm); ++ if (ctx->sqo_task) { ++ put_task_struct(ctx->sqo_task); ++ ctx->sqo_task = NULL; ++ mmdrop(ctx->mm_account); ++ ctx->mm_account = NULL; ++ } + + io_iopoll_reap_events(ctx); + io_sqe_buffer_unregister(ctx); +@@ -8256,6 +8265,16 @@ static int io_uring_create(unsigned entr + ctx->user = user; + ctx->creds = get_current_cred(); + ++ ctx->sqo_task = get_task_struct(current); ++ /* ++ * This is just grabbed for accounting purposes. When a process exits, ++ * the mm is exited and dropped before the files, hence we need to hang ++ * on to this mm purely for the purposes of being able to unaccount ++ * memory (locked/pinned vm). It's not used for anything else. ++ */ ++ mmgrab(current->mm); ++ ctx->mm_account = current->mm; ++ + ret = io_allocate_scq_urings(ctx, p); + if (ret) + goto err; diff --git a/queue-5.8/io_uring-unconditionally-grab-req-task.patch b/queue-5.8/io_uring-unconditionally-grab-req-task.patch new file mode 100644 index 00000000000..348ee93bc57 --- /dev/null +++ b/queue-5.8/io_uring-unconditionally-grab-req-task.patch @@ -0,0 +1,104 @@ +From foo@baz Thu Oct 29 01:16:54 PM CET 2020 +From: Jens Axboe +Date: Mon, 12 Oct 2020 11:25:39 -0600 +Subject: io_uring: unconditionally grab req->task + +From: Jens Axboe + +commit e3bc8e9dad7f2f83cc807111d4472164c9210153 upstream. + +Sometimes we assign a weak reference to it, sometimes we grab a +reference to it. Clean this up and make it unconditional, and drop the +flag related to tracking this state. 
+ +Reviewed-by: Pavel Begunkov +Signed-off-by: Jens Axboe +Signed-off-by: Greg Kroah-Hartman +--- + fs/io_uring.c | 26 +++----------------------- + 1 file changed, 3 insertions(+), 23 deletions(-) + +--- a/fs/io_uring.c ++++ b/fs/io_uring.c +@@ -550,7 +550,6 @@ enum { + REQ_F_NO_FILE_TABLE_BIT, + REQ_F_QUEUE_TIMEOUT_BIT, + REQ_F_WORK_INITIALIZED_BIT, +- REQ_F_TASK_PINNED_BIT, + + /* not a real bit, just to check we're not overflowing the space */ + __REQ_F_LAST_BIT, +@@ -608,8 +607,6 @@ enum { + REQ_F_QUEUE_TIMEOUT = BIT(REQ_F_QUEUE_TIMEOUT_BIT), + /* io_wq_work is initialized */ + REQ_F_WORK_INITIALIZED = BIT(REQ_F_WORK_INITIALIZED_BIT), +- /* req->task is refcounted */ +- REQ_F_TASK_PINNED = BIT(REQ_F_TASK_PINNED_BIT), + }; + + struct async_poll { +@@ -924,21 +921,6 @@ struct sock *io_uring_get_socket(struct + } + EXPORT_SYMBOL(io_uring_get_socket); + +-static void io_get_req_task(struct io_kiocb *req) +-{ +- if (req->flags & REQ_F_TASK_PINNED) +- return; +- get_task_struct(req->task); +- req->flags |= REQ_F_TASK_PINNED; +-} +- +-/* not idempotent -- it doesn't clear REQ_F_TASK_PINNED */ +-static void __io_put_req_task(struct io_kiocb *req) +-{ +- if (req->flags & REQ_F_TASK_PINNED) +- put_task_struct(req->task); +-} +- + static void io_file_put_work(struct work_struct *work); + + /* +@@ -1455,7 +1437,7 @@ static void __io_req_aux_free(struct io_ + kfree(req->io); + if (req->file) + io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE)); +- __io_put_req_task(req); ++ put_task_struct(req->task); + io_req_work_drop_env(req); + } + +@@ -1765,7 +1747,7 @@ static inline bool io_req_multi_free(str + if ((req->flags & REQ_F_LINK_HEAD) || io_is_fallback_req(req)) + return false; + +- if (req->file || req->io) ++ if (req->file || req->io || req->task) + rb->need_iter++; + + rb->reqs[rb->to_free++] = req; +@@ -4584,7 +4566,6 @@ static bool io_arm_poll_handler(struct i + if (req->flags & REQ_F_WORK_INITIALIZED) + memcpy(&apoll->work, &req->work, sizeof(req->work)); + +- io_get_req_task(req); + req->apoll = apoll; + INIT_HLIST_NODE(&req->hash_node); + +@@ -4774,8 +4755,6 @@ static int io_poll_add_prep(struct io_ki + + events = READ_ONCE(sqe->poll_events); + poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP; +- +- io_get_req_task(req); + return 0; + } + +@@ -6057,6 +6036,7 @@ static int io_init_req(struct io_ring_ct + /* one is dropped after submission, the other at completion */ + refcount_set(&req->refs, 2); + req->task = current; ++ get_task_struct(req->task); + req->result = 0; + + if (unlikely(req->opcode >= IORING_OP_LAST)) diff --git a/queue-5.8/io_wq-make-io_wqe-lock-a-raw_spinlock_t.patch b/queue-5.8/io_wq-make-io_wqe-lock-a-raw_spinlock_t.patch new file mode 100644 index 00000000000..95b29b7e0bf --- /dev/null +++ b/queue-5.8/io_wq-make-io_wqe-lock-a-raw_spinlock_t.patch @@ -0,0 +1,253 @@ +From foo@baz Thu Oct 29 01:16:54 PM CET 2020 +From: Sebastian Andrzej Siewior +Date: Tue, 1 Sep 2020 10:41:46 +0200 +Subject: io_wq: Make io_wqe::lock a raw_spinlock_t + +From: Sebastian Andrzej Siewior + +commit 95da84659226d75698a1ab958be0af21d9cc2a9c upstream. + +During a context switch the scheduler invokes wq_worker_sleeping() with +disabled preemption. Disabling preemption is needed because it protects +access to `worker->sleeping'. As an optimisation it avoids invoking +schedule() within the schedule path as part of possible wake up (thus +preempt_enable_no_resched() afterwards). + +The io-wq has been added to the mix in the same section with disabled +preemption. 
This breaks on PREEMPT_RT because io_wq_worker_sleeping() +acquires a spinlock_t. Also within the schedule() the spinlock_t must be +acquired after tsk_is_pi_blocked() otherwise it will block on the +sleeping lock again while scheduling out. + +While playing with `io_uring-bench' I didn't notice a significant +latency spike after converting io_wqe::lock to a raw_spinlock_t. The +latency was more or less the same. + +In order to keep the spinlock_t it would have to be moved after the +tsk_is_pi_blocked() check which would introduce a branch instruction +into the hot path. + +The lock is used to maintain the `work_list' and wakes one task up at +most. +Should io_wqe_cancel_pending_work() cause latency spikes, while +searching for a specific item, then it would need to drop the lock +during iterations. +revert_creds() is also invoked under the lock. According to debug +cred::non_rcu is 0. Otherwise it should be moved outside of the locked +section because put_cred_rcu()->free_uid() acquires a sleeping lock. + +Convert io_wqe::lock to a raw_spinlock_t.c + +Signed-off-by: Sebastian Andrzej Siewior +Signed-off-by: Jens Axboe +Signed-off-by: Greg Kroah-Hartman +--- + fs/io-wq.c | 52 ++++++++++++++++++++++++++-------------------------- + 1 file changed, 26 insertions(+), 26 deletions(-) + +--- a/fs/io-wq.c ++++ b/fs/io-wq.c +@@ -88,7 +88,7 @@ enum { + */ + struct io_wqe { + struct { +- spinlock_t lock; ++ raw_spinlock_t lock; + struct io_wq_work_list work_list; + unsigned long hash_map; + unsigned flags; +@@ -149,7 +149,7 @@ static bool __io_worker_unuse(struct io_ + + if (current->files != worker->restore_files) { + __acquire(&wqe->lock); +- spin_unlock_irq(&wqe->lock); ++ raw_spin_unlock_irq(&wqe->lock); + dropped_lock = true; + + task_lock(current); +@@ -168,7 +168,7 @@ static bool __io_worker_unuse(struct io_ + if (worker->mm) { + if (!dropped_lock) { + __acquire(&wqe->lock); +- spin_unlock_irq(&wqe->lock); ++ raw_spin_unlock_irq(&wqe->lock); + dropped_lock = true; + } + __set_current_state(TASK_RUNNING); +@@ -222,17 +222,17 @@ static void io_worker_exit(struct io_wor + worker->flags = 0; + preempt_enable(); + +- spin_lock_irq(&wqe->lock); ++ raw_spin_lock_irq(&wqe->lock); + hlist_nulls_del_rcu(&worker->nulls_node); + list_del_rcu(&worker->all_list); + if (__io_worker_unuse(wqe, worker)) { + __release(&wqe->lock); +- spin_lock_irq(&wqe->lock); ++ raw_spin_lock_irq(&wqe->lock); + } + acct->nr_workers--; + nr_workers = wqe->acct[IO_WQ_ACCT_BOUND].nr_workers + + wqe->acct[IO_WQ_ACCT_UNBOUND].nr_workers; +- spin_unlock_irq(&wqe->lock); ++ raw_spin_unlock_irq(&wqe->lock); + + /* all workers gone, wq exit can proceed */ + if (!nr_workers && refcount_dec_and_test(&wqe->wq->refs)) +@@ -508,7 +508,7 @@ get_next: + else if (!wq_list_empty(&wqe->work_list)) + wqe->flags |= IO_WQE_FLAG_STALLED; + +- spin_unlock_irq(&wqe->lock); ++ raw_spin_unlock_irq(&wqe->lock); + if (!work) + break; + io_assign_current_work(worker, work); +@@ -543,7 +543,7 @@ get_next: + io_wqe_enqueue(wqe, linked); + + if (hash != -1U && !next_hashed) { +- spin_lock_irq(&wqe->lock); ++ raw_spin_lock_irq(&wqe->lock); + wqe->hash_map &= ~BIT_ULL(hash); + wqe->flags &= ~IO_WQE_FLAG_STALLED; + /* dependent work is not hashed */ +@@ -551,11 +551,11 @@ get_next: + /* skip unnecessary unlock-lock wqe->lock */ + if (!work) + goto get_next; +- spin_unlock_irq(&wqe->lock); ++ raw_spin_unlock_irq(&wqe->lock); + } + } while (work); + +- spin_lock_irq(&wqe->lock); ++ raw_spin_lock_irq(&wqe->lock); + } while (1); + } + +@@ -570,7 +570,7 @@ static 
int io_wqe_worker(void *data) + while (!test_bit(IO_WQ_BIT_EXIT, &wq->state)) { + set_current_state(TASK_INTERRUPTIBLE); + loop: +- spin_lock_irq(&wqe->lock); ++ raw_spin_lock_irq(&wqe->lock); + if (io_wqe_run_queue(wqe)) { + __set_current_state(TASK_RUNNING); + io_worker_handle_work(worker); +@@ -581,7 +581,7 @@ loop: + __release(&wqe->lock); + goto loop; + } +- spin_unlock_irq(&wqe->lock); ++ raw_spin_unlock_irq(&wqe->lock); + if (signal_pending(current)) + flush_signals(current); + if (schedule_timeout(WORKER_IDLE_TIMEOUT)) +@@ -593,11 +593,11 @@ loop: + } + + if (test_bit(IO_WQ_BIT_EXIT, &wq->state)) { +- spin_lock_irq(&wqe->lock); ++ raw_spin_lock_irq(&wqe->lock); + if (!wq_list_empty(&wqe->work_list)) + io_worker_handle_work(worker); + else +- spin_unlock_irq(&wqe->lock); ++ raw_spin_unlock_irq(&wqe->lock); + } + + io_worker_exit(worker); +@@ -637,9 +637,9 @@ void io_wq_worker_sleeping(struct task_s + + worker->flags &= ~IO_WORKER_F_RUNNING; + +- spin_lock_irq(&wqe->lock); ++ raw_spin_lock_irq(&wqe->lock); + io_wqe_dec_running(wqe, worker); +- spin_unlock_irq(&wqe->lock); ++ raw_spin_unlock_irq(&wqe->lock); + } + + static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index) +@@ -663,7 +663,7 @@ static bool create_io_worker(struct io_w + return false; + } + +- spin_lock_irq(&wqe->lock); ++ raw_spin_lock_irq(&wqe->lock); + hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list); + list_add_tail_rcu(&worker->all_list, &wqe->all_list); + worker->flags |= IO_WORKER_F_FREE; +@@ -672,7 +672,7 @@ static bool create_io_worker(struct io_w + if (!acct->nr_workers && (worker->flags & IO_WORKER_F_BOUND)) + worker->flags |= IO_WORKER_F_FIXED; + acct->nr_workers++; +- spin_unlock_irq(&wqe->lock); ++ raw_spin_unlock_irq(&wqe->lock); + + if (index == IO_WQ_ACCT_UNBOUND) + atomic_inc(&wq->user->processes); +@@ -727,12 +727,12 @@ static int io_wq_manager(void *data) + if (!node_online(node)) + continue; + +- spin_lock_irq(&wqe->lock); ++ raw_spin_lock_irq(&wqe->lock); + if (io_wqe_need_worker(wqe, IO_WQ_ACCT_BOUND)) + fork_worker[IO_WQ_ACCT_BOUND] = true; + if (io_wqe_need_worker(wqe, IO_WQ_ACCT_UNBOUND)) + fork_worker[IO_WQ_ACCT_UNBOUND] = true; +- spin_unlock_irq(&wqe->lock); ++ raw_spin_unlock_irq(&wqe->lock); + if (fork_worker[IO_WQ_ACCT_BOUND]) + create_io_worker(wq, wqe, IO_WQ_ACCT_BOUND); + if (fork_worker[IO_WQ_ACCT_UNBOUND]) +@@ -829,10 +829,10 @@ static void io_wqe_enqueue(struct io_wqe + } + + work_flags = work->flags; +- spin_lock_irqsave(&wqe->lock, flags); ++ raw_spin_lock_irqsave(&wqe->lock, flags); + io_wqe_insert_work(wqe, work); + wqe->flags &= ~IO_WQE_FLAG_STALLED; +- spin_unlock_irqrestore(&wqe->lock, flags); ++ raw_spin_unlock_irqrestore(&wqe->lock, flags); + + if ((work_flags & IO_WQ_WORK_CONCURRENT) || + !atomic_read(&acct->nr_running)) +@@ -959,13 +959,13 @@ static void io_wqe_cancel_pending_work(s + unsigned long flags; + + retry: +- spin_lock_irqsave(&wqe->lock, flags); ++ raw_spin_lock_irqsave(&wqe->lock, flags); + wq_list_for_each(node, prev, &wqe->work_list) { + work = container_of(node, struct io_wq_work, list); + if (!match->fn(work, match->data)) + continue; + io_wqe_remove_pending(wqe, work, prev); +- spin_unlock_irqrestore(&wqe->lock, flags); ++ raw_spin_unlock_irqrestore(&wqe->lock, flags); + io_run_cancel(work, wqe); + match->nr_pending++; + if (!match->cancel_all) +@@ -974,7 +974,7 @@ retry: + /* not safe to continue after unlock */ + goto retry; + } +- spin_unlock_irqrestore(&wqe->lock, flags); ++ raw_spin_unlock_irqrestore(&wqe->lock, 
flags); + } + + static void io_wqe_cancel_running_work(struct io_wqe *wqe, +@@ -1082,7 +1082,7 @@ struct io_wq *io_wq_create(unsigned boun + } + atomic_set(&wqe->acct[IO_WQ_ACCT_UNBOUND].nr_running, 0); + wqe->wq = wq; +- spin_lock_init(&wqe->lock); ++ raw_spin_lock_init(&wqe->lock); + INIT_WQ_LIST(&wqe->work_list); + INIT_HLIST_NULLS_HEAD(&wqe->free_list, 0); + INIT_LIST_HEAD(&wqe->all_list); diff --git a/queue-5.8/series b/queue-5.8/series index a9871319fc4..2cb6815f349 100644 --- a/queue-5.8/series +++ b/queue-5.8/series @@ -1 +1,16 @@ netfilter-nftables_offload-kasan-slab-out-of-bounds-read-in-nft_flow_rule_create.patch +io_uring-don-t-run-task-work-on-an-exiting-task.patch +io_uring-allow-timeout-poll-files-killing-to-take-task-into-account.patch +io_uring-move-dropping-of-files-into-separate-helper.patch +io_uring-stash-ctx-task-reference-for-sqpoll.patch +io_uring-unconditionally-grab-req-task.patch +io_uring-return-cancelation-status-from-poll-timeout-files-handlers.patch +io_uring-enable-task-files-specific-overflow-flushing.patch +io_uring-don-t-rely-on-weak-files-references.patch +io_uring-reference-nsproxy-for-file-table-commands.patch +io_wq-make-io_wqe-lock-a-raw_spinlock_t.patch +io-wq-fix-use-after-free-in-io_wq_worker_running.patch +io_uring-no-need-to-call-xa_destroy-on-empty-xarray.patch +io_uring-fix-use-of-xarray-in-__io_uring_files_cancel.patch +io_uring-fix-xarray-usage-in-io_uring_add_task_file.patch +io_uring-convert-advanced-xarray-uses-to-the-normal-api.patch -- 2.47.3