--- /dev/null
+From foo@baz Thu Oct 29 01:16:54 PM CET 2020
+From: Hillf Danton <hdanton@sina.com>
+Date: Sat, 26 Sep 2020 21:26:55 +0800
+Subject: io-wq: fix use-after-free in io_wq_worker_running
+
+From: Hillf Danton <hdanton@sina.com>
+
+commit c4068bf898ddaef791049a366828d9b84b467bda upstream.
+
+The smart syzbot has found a reproducer for the following issue:
+
+ ==================================================================
+ BUG: KASAN: use-after-free in instrument_atomic_write include/linux/instrumented.h:71 [inline]
+ BUG: KASAN: use-after-free in atomic_inc include/asm-generic/atomic-instrumented.h:240 [inline]
+ BUG: KASAN: use-after-free in io_wqe_inc_running fs/io-wq.c:301 [inline]
+ BUG: KASAN: use-after-free in io_wq_worker_running+0xde/0x110 fs/io-wq.c:613
+ Write of size 4 at addr ffff8882183db08c by task io_wqe_worker-0/7771
+
+ CPU: 0 PID: 7771 Comm: io_wqe_worker-0 Not tainted 5.9.0-rc4-syzkaller #0
+ Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
+ Call Trace:
+ __dump_stack lib/dump_stack.c:77 [inline]
+ dump_stack+0x198/0x1fd lib/dump_stack.c:118
+ print_address_description.constprop.0.cold+0xae/0x497 mm/kasan/report.c:383
+ __kasan_report mm/kasan/report.c:513 [inline]
+ kasan_report.cold+0x1f/0x37 mm/kasan/report.c:530
+ check_memory_region_inline mm/kasan/generic.c:186 [inline]
+ check_memory_region+0x13d/0x180 mm/kasan/generic.c:192
+ instrument_atomic_write include/linux/instrumented.h:71 [inline]
+ atomic_inc include/asm-generic/atomic-instrumented.h:240 [inline]
+ io_wqe_inc_running fs/io-wq.c:301 [inline]
+ io_wq_worker_running+0xde/0x110 fs/io-wq.c:613
+ schedule_timeout+0x148/0x250 kernel/time/timer.c:1879
+ io_wqe_worker+0x517/0x10e0 fs/io-wq.c:580
+ kthread+0x3b5/0x4a0 kernel/kthread.c:292
+ ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:294
+
+ Allocated by task 7768:
+ kasan_save_stack+0x1b/0x40 mm/kasan/common.c:48
+ kasan_set_track mm/kasan/common.c:56 [inline]
+ __kasan_kmalloc.constprop.0+0xbf/0xd0 mm/kasan/common.c:461
+ kmem_cache_alloc_node_trace+0x17b/0x3f0 mm/slab.c:3594
+ kmalloc_node include/linux/slab.h:572 [inline]
+ kzalloc_node include/linux/slab.h:677 [inline]
+ io_wq_create+0x57b/0xa10 fs/io-wq.c:1064
+ io_init_wq_offload fs/io_uring.c:7432 [inline]
+ io_sq_offload_start fs/io_uring.c:7504 [inline]
+ io_uring_create fs/io_uring.c:8625 [inline]
+ io_uring_setup+0x1836/0x28e0 fs/io_uring.c:8694
+ do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46
+ entry_SYSCALL_64_after_hwframe+0x44/0xa9
+
+ Freed by task 21:
+ kasan_save_stack+0x1b/0x40 mm/kasan/common.c:48
+ kasan_set_track+0x1c/0x30 mm/kasan/common.c:56
+ kasan_set_free_info+0x1b/0x30 mm/kasan/generic.c:355
+ __kasan_slab_free+0xd8/0x120 mm/kasan/common.c:422
+ __cache_free mm/slab.c:3418 [inline]
+ kfree+0x10e/0x2b0 mm/slab.c:3756
+ __io_wq_destroy fs/io-wq.c:1138 [inline]
+ io_wq_destroy+0x2af/0x460 fs/io-wq.c:1146
+ io_finish_async fs/io_uring.c:6836 [inline]
+ io_ring_ctx_free fs/io_uring.c:7870 [inline]
+ io_ring_exit_work+0x1e4/0x6d0 fs/io_uring.c:7954
+ process_one_work+0x94c/0x1670 kernel/workqueue.c:2269
+ worker_thread+0x64c/0x1120 kernel/workqueue.c:2415
+ kthread+0x3b5/0x4a0 kernel/kthread.c:292
+ ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:294
+
+ The buggy address belongs to the object at ffff8882183db000
+ which belongs to the cache kmalloc-1k of size 1024
+ The buggy address is located 140 bytes inside of
+ 1024-byte region [ffff8882183db000, ffff8882183db400)
+ The buggy address belongs to the page:
+ page:000000009bada22b refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x2183db
+ flags: 0x57ffe0000000200(slab)
+ raw: 057ffe0000000200 ffffea0008604c48 ffffea00086a8648 ffff8880aa040700
+ raw: 0000000000000000 ffff8882183db000 0000000100000002 0000000000000000
+ page dumped because: kasan: bad access detected
+
+ Memory state around the buggy address:
+ ffff8882183daf80: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff
+ ffff8882183db000: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
+ >ffff8882183db080: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
+ ^
+ ffff8882183db100: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
+ ffff8882183db180: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
+ ==================================================================
+
+which is down to the comment below,
+
+ /* all workers gone, wq exit can proceed */
+ if (!nr_workers && refcount_dec_and_test(&wqe->wq->refs))
+ complete(&wqe->wq->done);
+
+because there may be multiple wqe instances in a wq, and we would wait
+for every worker in every wqe to go home before releasing the wq's
+resources on destroy.
+
+To that end, rework the wq's refcount: make it independent of worker
+tracking, since they are two different things after all, and keep it
+balanced as workers come and go. Note that the manager kthread, like the
+other workers, now holds a reference to the wq for its lifetime.
+
+Finally, to help destroy the wq, check IO_WQ_BIT_EXIT when creating a
+worker and do nothing if the wq is exiting.
+
+Cc: stable@vger.kernel.org # v5.5+
+Reported-by: syzbot+45fa0a195b941764e0f0@syzkaller.appspotmail.com
+Reported-by: syzbot+9af99580130003da82b1@syzkaller.appspotmail.com
+Cc: Pavel Begunkov <asml.silence@gmail.com>
+Signed-off-by: Hillf Danton <hdanton@sina.com>
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/io-wq.c | 116 ++++++++++++++++++++++++++++++-------------------------------
+ 1 file changed, 58 insertions(+), 58 deletions(-)
+
+--- a/fs/io-wq.c
++++ b/fs/io-wq.c
+@@ -202,7 +202,6 @@ static void io_worker_exit(struct io_wor
+ {
+ struct io_wqe *wqe = worker->wqe;
+ struct io_wqe_acct *acct = io_wqe_get_acct(wqe, worker);
+- unsigned nr_workers;
+
+ /*
+ * If we're not at zero, someone else is holding a brief reference
+@@ -230,15 +229,11 @@ static void io_worker_exit(struct io_wor
+ raw_spin_lock_irq(&wqe->lock);
+ }
+ acct->nr_workers--;
+- nr_workers = wqe->acct[IO_WQ_ACCT_BOUND].nr_workers +
+- wqe->acct[IO_WQ_ACCT_UNBOUND].nr_workers;
+ raw_spin_unlock_irq(&wqe->lock);
+
+- /* all workers gone, wq exit can proceed */
+- if (!nr_workers && refcount_dec_and_test(&wqe->wq->refs))
+- complete(&wqe->wq->done);
+-
+ kfree_rcu(worker, rcu);
++ if (refcount_dec_and_test(&wqe->wq->refs))
++ complete(&wqe->wq->done);
+ }
+
+ static inline bool io_wqe_run_queue(struct io_wqe *wqe)
+@@ -644,7 +639,7 @@ void io_wq_worker_sleeping(struct task_s
+
+ static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index)
+ {
+- struct io_wqe_acct *acct =&wqe->acct[index];
++ struct io_wqe_acct *acct = &wqe->acct[index];
+ struct io_worker *worker;
+
+ worker = kzalloc_node(sizeof(*worker), GFP_KERNEL, wqe->node);
+@@ -677,6 +672,7 @@ static bool create_io_worker(struct io_w
+ if (index == IO_WQ_ACCT_UNBOUND)
+ atomic_inc(&wq->user->processes);
+
++ refcount_inc(&wq->refs);
+ wake_up_process(worker->task);
+ return true;
+ }
+@@ -692,28 +688,63 @@ static inline bool io_wqe_need_worker(st
+ return acct->nr_workers < acct->max_workers;
+ }
+
++static bool io_wqe_worker_send_sig(struct io_worker *worker, void *data)
++{
++ send_sig(SIGINT, worker->task, 1);
++ return false;
++}
++
++/*
++ * Iterate the passed in list and call the specific function for each
++ * worker that isn't exiting
++ */
++static bool io_wq_for_each_worker(struct io_wqe *wqe,
++ bool (*func)(struct io_worker *, void *),
++ void *data)
++{
++ struct io_worker *worker;
++ bool ret = false;
++
++ list_for_each_entry_rcu(worker, &wqe->all_list, all_list) {
++ if (io_worker_get(worker)) {
++ /* no task if node is/was offline */
++ if (worker->task)
++ ret = func(worker, data);
++ io_worker_release(worker);
++ if (ret)
++ break;
++ }
++ }
++
++ return ret;
++}
++
++static bool io_wq_worker_wake(struct io_worker *worker, void *data)
++{
++ wake_up_process(worker->task);
++ return false;
++}
++
+ /*
+ * Manager thread. Tasked with creating new workers, if we need them.
+ */
+ static int io_wq_manager(void *data)
+ {
+ struct io_wq *wq = data;
+- int workers_to_create = num_possible_nodes();
+ int node;
+
+ /* create fixed workers */
+- refcount_set(&wq->refs, workers_to_create);
++ refcount_set(&wq->refs, 1);
+ for_each_node(node) {
+ if (!node_online(node))
+ continue;
+- if (!create_io_worker(wq, wq->wqes[node], IO_WQ_ACCT_BOUND))
+- goto err;
+- workers_to_create--;
++ if (create_io_worker(wq, wq->wqes[node], IO_WQ_ACCT_BOUND))
++ continue;
++ set_bit(IO_WQ_BIT_ERROR, &wq->state);
++ set_bit(IO_WQ_BIT_EXIT, &wq->state);
++ goto out;
+ }
+
+- while (workers_to_create--)
+- refcount_dec(&wq->refs);
+-
+ complete(&wq->done);
+
+ while (!kthread_should_stop()) {
+@@ -745,12 +776,18 @@ static int io_wq_manager(void *data)
+ if (current->task_works)
+ task_work_run();
+
+- return 0;
+-err:
+- set_bit(IO_WQ_BIT_ERROR, &wq->state);
+- set_bit(IO_WQ_BIT_EXIT, &wq->state);
+- if (refcount_sub_and_test(workers_to_create, &wq->refs))
++out:
++ if (refcount_dec_and_test(&wq->refs)) {
+ complete(&wq->done);
++ return 0;
++ }
++ /* if ERROR is set and we get here, we have workers to wake */
++ if (test_bit(IO_WQ_BIT_ERROR, &wq->state)) {
++ rcu_read_lock();
++ for_each_node(node)
++ io_wq_for_each_worker(wq->wqes[node], io_wq_worker_wake, NULL);
++ rcu_read_unlock();
++ }
+ return 0;
+ }
+
+@@ -858,37 +895,6 @@ void io_wq_hash_work(struct io_wq_work *
+ work->flags |= (IO_WQ_WORK_HASHED | (bit << IO_WQ_HASH_SHIFT));
+ }
+
+-static bool io_wqe_worker_send_sig(struct io_worker *worker, void *data)
+-{
+- send_sig(SIGINT, worker->task, 1);
+- return false;
+-}
+-
+-/*
+- * Iterate the passed in list and call the specific function for each
+- * worker that isn't exiting
+- */
+-static bool io_wq_for_each_worker(struct io_wqe *wqe,
+- bool (*func)(struct io_worker *, void *),
+- void *data)
+-{
+- struct io_worker *worker;
+- bool ret = false;
+-
+- list_for_each_entry_rcu(worker, &wqe->all_list, all_list) {
+- if (io_worker_get(worker)) {
+- /* no task if node is/was offline */
+- if (worker->task)
+- ret = func(worker, data);
+- io_worker_release(worker);
+- if (ret)
+- break;
+- }
+- }
+-
+- return ret;
+-}
+-
+ void io_wq_cancel_all(struct io_wq *wq)
+ {
+ int node;
+@@ -1121,12 +1127,6 @@ bool io_wq_get(struct io_wq *wq, struct
+ return refcount_inc_not_zero(&wq->use_refs);
+ }
+
+-static bool io_wq_worker_wake(struct io_worker *worker, void *data)
+-{
+- wake_up_process(worker->task);
+- return false;
+-}
+-
+ static void __io_wq_destroy(struct io_wq *wq)
+ {
+ int node;
--- /dev/null
+From foo@baz Thu Oct 29 01:16:54 PM CET 2020
+From: Jens Axboe <axboe@kernel.dk>
+Date: Tue, 22 Sep 2020 08:18:24 -0600
+Subject: io_uring: allow timeout/poll/files killing to take task into account
+
+From: Jens Axboe <axboe@kernel.dk>
+
+commit f3606e3a92ddd36299642c78592fc87609abb1f6 upstream.
+
+We currently cancel these when the ring exits, and we cancel all of
+them. This is in preparation for killing only the ones associated
+with a given task.
+
+Reviewed-by: Pavel Begunkov <asml.silence@gmail.com>
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/io_uring.c | 30 ++++++++++++++++++++++--------
+ 1 file changed, 22 insertions(+), 8 deletions(-)
+
+--- a/fs/io_uring.c
++++ b/fs/io_uring.c
+@@ -1141,13 +1141,25 @@ static void io_kill_timeout(struct io_ki
+ }
+ }
+
+-static void io_kill_timeouts(struct io_ring_ctx *ctx)
++static bool io_task_match(struct io_kiocb *req, struct task_struct *tsk)
++{
++ struct io_ring_ctx *ctx = req->ctx;
++
++ if (!tsk || req->task == tsk)
++ return true;
++ if ((ctx->flags & IORING_SETUP_SQPOLL) && req->task == ctx->sqo_thread)
++ return true;
++ return false;
++}
++
++static void io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk)
+ {
+ struct io_kiocb *req, *tmp;
+
+ spin_lock_irq(&ctx->completion_lock);
+ list_for_each_entry_safe(req, tmp, &ctx->timeout_list, list)
+- io_kill_timeout(req);
++ if (io_task_match(req, tsk))
++ io_kill_timeout(req);
+ spin_unlock_irq(&ctx->completion_lock);
+ }
+
+@@ -4641,7 +4653,7 @@ static bool io_poll_remove_one(struct io
+ return do_complete;
+ }
+
+-static void io_poll_remove_all(struct io_ring_ctx *ctx)
++static void io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk)
+ {
+ struct hlist_node *tmp;
+ struct io_kiocb *req;
+@@ -4652,8 +4664,10 @@ static void io_poll_remove_all(struct io
+ struct hlist_head *list;
+
+ list = &ctx->cancel_hash[i];
+- hlist_for_each_entry_safe(req, tmp, list, hash_node)
+- posted += io_poll_remove_one(req);
++ hlist_for_each_entry_safe(req, tmp, list, hash_node) {
++ if (io_task_match(req, tsk))
++ posted += io_poll_remove_one(req);
++ }
+ }
+ spin_unlock_irq(&ctx->completion_lock);
+
+@@ -7556,8 +7570,8 @@ static void io_ring_ctx_wait_and_kill(st
+ percpu_ref_kill(&ctx->refs);
+ mutex_unlock(&ctx->uring_lock);
+
+- io_kill_timeouts(ctx);
+- io_poll_remove_all(ctx);
++ io_kill_timeouts(ctx, NULL);
++ io_poll_remove_all(ctx, NULL);
+
+ if (ctx->io_wq)
+ io_wq_cancel_all(ctx->io_wq);
+@@ -7809,7 +7823,7 @@ static bool io_cancel_task_cb(struct io_
+ struct io_kiocb *req = container_of(work, struct io_kiocb, work);
+ struct task_struct *task = data;
+
+- return req->task == task;
++ return io_task_match(req, task);
+ }
+
+ static int io_uring_flush(struct file *file, void *data)
--- /dev/null
+From foo@baz Thu Oct 29 01:16:54 PM CET 2020
+From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
+Date: Fri, 9 Oct 2020 13:49:53 +0100
+Subject: io_uring: Convert advanced XArray uses to the normal API
+
+From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
+
+commit 5e2ed8c4f45093698855b1f45cdf43efbf6dd498 upstream.
+
+There are no bugs here that I've spotted; it's just easier to use the
+normal API, and there is no performance advantage to using the more
+verbose advanced API.
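+
+For reference, the removal path in the first hunk is essentially this
+conversion (sketch):
+
+    /* advanced API (before): explicit state plus manual locking */
+    XA_STATE(xas, &tctx->xa, (unsigned long) file);
+    xas_lock(&xas);
+    file = xas_store(&xas, NULL);
+    xas_unlock(&xas);
+
+    /* normal API (after): one call erases the entry under the lock */
+    file = xa_erase(&tctx->xa, (unsigned long) file);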
+
+Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/io_uring.c | 14 ++------------
+ 1 file changed, 2 insertions(+), 12 deletions(-)
+
+--- a/fs/io_uring.c
++++ b/fs/io_uring.c
+@@ -7958,27 +7958,17 @@ static int io_uring_add_task_file(struct
+ static void io_uring_del_task_file(struct file *file)
+ {
+ struct io_uring_task *tctx = current->io_uring;
+- XA_STATE(xas, &tctx->xa, (unsigned long) file);
+
+ if (tctx->last == file)
+ tctx->last = NULL;
+-
+- xas_lock(&xas);
+- file = xas_store(&xas, NULL);
+- xas_unlock(&xas);
+-
++ file = xa_erase(&tctx->xa, (unsigned long)file);
+ if (file)
+ fput(file);
+ }
+
+ static void __io_uring_attempt_task_drop(struct file *file)
+ {
+- XA_STATE(xas, &current->io_uring->xa, (unsigned long) file);
+- struct file *old;
+-
+- rcu_read_lock();
+- old = xas_load(&xas);
+- rcu_read_unlock();
++ struct file *old = xa_load(&current->io_uring->xa, (unsigned long)file);
+
+ if (old == file)
+ io_uring_del_task_file(file);
--- /dev/null
+From foo@baz Thu Oct 29 01:16:54 PM CET 2020
+From: Jens Axboe <axboe@kernel.dk>
+Date: Sun, 13 Sep 2020 13:09:39 -0600
+Subject: io_uring: don't rely on weak ->files references
+
+From: Jens Axboe <axboe@kernel.dk>
+
+commit 0f2122045b946241a9e549c2a76cea54fa58a7ff upstream.
+
+Grab actual references to the files_struct. To avoid circular reference
+issues due to this, we add a per-task note that keeps track of what
+io_uring contexts a task has used. When the task execs or exits its
+assigned files, we cancel requests based on this tracking.
+
+With that, we can grab proper references to the files table, and no
+longer need to rely on stashing away ring_fd and ring_file to check
+if the ring_fd may have been closed.
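+
+The per-task note added here boils down to the following structure and
+hooks (trimmed sketch of include/linux/io_uring.h from the diff):
+
+    struct io_uring_task {
+        struct xarray xa;              /* ring files this task has used */
+        struct wait_queue_head wait;
+        struct file *last;
+        atomic_long_t req_issue;
+        bool in_idle;
+        atomic_long_t req_complete;
+    };
+
+    /* called from the execve and exit_files() paths: */
+    io_uring_task_cancel();            /* cancel everything for this task */
+    io_uring_files_cancel(files);      /* cancel requests pinning ->files */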
+
+Cc: stable@vger.kernel.org # v5.5+
+Reviewed-by: Pavel Begunkov <asml.silence@gmail.com>
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/exec.c | 6
+ fs/file.c | 2
+ fs/io_uring.c | 301 +++++++++++++++++++++++++++++++++++++++++------
+ include/linux/io_uring.h | 53 ++++++++
+ include/linux/sched.h | 5
+ init/init_task.c | 3
+ kernel/fork.c | 6
+ 7 files changed, 340 insertions(+), 36 deletions(-)
+ create mode 100644 include/linux/io_uring.h
+
+--- a/fs/exec.c
++++ b/fs/exec.c
+@@ -62,6 +62,7 @@
+ #include <linux/oom.h>
+ #include <linux/compat.h>
+ #include <linux/vmalloc.h>
++#include <linux/io_uring.h>
+
+ #include <linux/uaccess.h>
+ #include <asm/mmu_context.h>
+@@ -1847,6 +1848,11 @@ static int __do_execve_file(int fd, stru
+ * further execve() calls fail. */
+ current->flags &= ~PF_NPROC_EXCEEDED;
+
++ /*
++ * Cancel any io_uring activity across execve
++ */
++ io_uring_task_cancel();
++
+ retval = unshare_files(&displaced);
+ if (retval)
+ goto out_ret;
+--- a/fs/file.c
++++ b/fs/file.c
+@@ -18,6 +18,7 @@
+ #include <linux/bitops.h>
+ #include <linux/spinlock.h>
+ #include <linux/rcupdate.h>
++#include <linux/io_uring.h>
+
+ unsigned int sysctl_nr_open __read_mostly = 1024*1024;
+ unsigned int sysctl_nr_open_min = BITS_PER_LONG;
+@@ -439,6 +440,7 @@ void exit_files(struct task_struct *tsk)
+ struct files_struct * files = tsk->files;
+
+ if (files) {
++ io_uring_files_cancel(files);
+ task_lock(tsk);
+ tsk->files = NULL;
+ task_unlock(tsk);
+--- a/fs/io_uring.c
++++ b/fs/io_uring.c
+@@ -78,6 +78,7 @@
+ #include <linux/fs_struct.h>
+ #include <linux/splice.h>
+ #include <linux/task_work.h>
++#include <linux/io_uring.h>
+
+ #define CREATE_TRACE_POINTS
+ #include <trace/events/io_uring.h>
+@@ -283,8 +284,6 @@ struct io_ring_ctx {
+ */
+ struct fixed_file_data *file_data;
+ unsigned nr_user_files;
+- int ring_fd;
+- struct file *ring_file;
+
+ /* if used, fixed mapped user buffers */
+ unsigned nr_user_bufs;
+@@ -1335,7 +1334,12 @@ static void __io_cqring_fill_event(struc
+ WRITE_ONCE(cqe->user_data, req->user_data);
+ WRITE_ONCE(cqe->res, res);
+ WRITE_ONCE(cqe->flags, cflags);
+- } else if (ctx->cq_overflow_flushed) {
++ } else if (ctx->cq_overflow_flushed || req->task->io_uring->in_idle) {
++ /*
++ * If we're in ring overflow flush mode, or in task cancel mode,
++ * then we cannot store the request for later flushing, we need
++ * to drop it on the floor.
++ */
+ WRITE_ONCE(ctx->rings->cq_overflow,
+ atomic_inc_return(&ctx->cached_cq_overflow));
+ } else {
+@@ -1451,17 +1455,22 @@ static void io_req_drop_files(struct io_
+ wake_up(&ctx->inflight_wait);
+ spin_unlock_irqrestore(&ctx->inflight_lock, flags);
+ req->flags &= ~REQ_F_INFLIGHT;
++ put_files_struct(req->work.files);
+ req->work.files = NULL;
+ }
+
+ static void __io_req_aux_free(struct io_kiocb *req)
+ {
++ struct io_uring_task *tctx = req->task->io_uring;
+ if (req->flags & REQ_F_NEED_CLEANUP)
+ io_cleanup_req(req);
+
+ kfree(req->io);
+ if (req->file)
+ io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE));
++ atomic_long_inc(&tctx->req_complete);
++ if (tctx->in_idle)
++ wake_up(&tctx->wait);
+ put_task_struct(req->task);
+ io_req_work_drop_env(req);
+ }
+@@ -3532,8 +3541,7 @@ static int io_close_prep(struct io_kiocb
+ return -EBADF;
+
+ req->close.fd = READ_ONCE(sqe->fd);
+- if ((req->file && req->file->f_op == &io_uring_fops) ||
+- req->close.fd == req->ctx->ring_fd)
++ if ((req->file && req->file->f_op == &io_uring_fops))
+ return -EBADF;
+
+ req->close.put_file = NULL;
+@@ -5671,32 +5679,18 @@ static int io_req_set_file(struct io_sub
+
+ static int io_grab_files(struct io_kiocb *req)
+ {
+- int ret = -EBADF;
+ struct io_ring_ctx *ctx = req->ctx;
+
+ if (req->work.files || (req->flags & REQ_F_NO_FILE_TABLE))
+ return 0;
+- if (!ctx->ring_file)
+- return -EBADF;
+
+- rcu_read_lock();
++ req->work.files = get_files_struct(current);
++ req->flags |= REQ_F_INFLIGHT;
++
+ spin_lock_irq(&ctx->inflight_lock);
+- /*
+- * We use the f_ops->flush() handler to ensure that we can flush
+- * out work accessing these files if the fd is closed. Check if
+- * the fd has changed since we started down this path, and disallow
+- * this operation if it has.
+- */
+- if (fcheck(ctx->ring_fd) == ctx->ring_file) {
+- list_add(&req->inflight_entry, &ctx->inflight_list);
+- req->flags |= REQ_F_INFLIGHT;
+- req->work.files = current->files;
+- ret = 0;
+- }
++ list_add(&req->inflight_entry, &ctx->inflight_list);
+ spin_unlock_irq(&ctx->inflight_lock);
+- rcu_read_unlock();
+-
+- return ret;
++ return 0;
+ }
+
+ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
+@@ -6067,6 +6061,7 @@ static int io_init_req(struct io_ring_ct
+ refcount_set(&req->refs, 2);
+ req->task = current;
+ get_task_struct(req->task);
++ atomic_long_inc(&req->task->io_uring->req_issue);
+ req->result = 0;
+
+ if (unlikely(req->opcode >= IORING_OP_LAST))
+@@ -6102,8 +6097,7 @@ static int io_init_req(struct io_ring_ct
+ return io_req_set_file(state, req, READ_ONCE(sqe->fd));
+ }
+
+-static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
+- struct file *ring_file, int ring_fd)
++static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
+ {
+ struct io_submit_state state, *statep = NULL;
+ struct io_kiocb *link = NULL;
+@@ -6127,9 +6121,6 @@ static int io_submit_sqes(struct io_ring
+ statep = &state;
+ }
+
+- ctx->ring_fd = ring_fd;
+- ctx->ring_file = ring_file;
+-
+ for (i = 0; i < nr; i++) {
+ const struct io_uring_sqe *sqe;
+ struct io_kiocb *req;
+@@ -6290,7 +6281,7 @@ static int io_sq_thread(void *data)
+
+ mutex_lock(&ctx->uring_lock);
+ if (likely(!percpu_ref_is_dying(&ctx->refs)))
+- ret = io_submit_sqes(ctx, to_submit, NULL, -1);
++ ret = io_submit_sqes(ctx, to_submit);
+ mutex_unlock(&ctx->uring_lock);
+ timeout = jiffies + ctx->sq_thread_idle;
+ }
+@@ -7119,6 +7110,34 @@ out_fput:
+ return ret;
+ }
+
++static int io_uring_alloc_task_context(struct task_struct *task)
++{
++ struct io_uring_task *tctx;
++
++ tctx = kmalloc(sizeof(*tctx), GFP_KERNEL);
++ if (unlikely(!tctx))
++ return -ENOMEM;
++
++ xa_init(&tctx->xa);
++ init_waitqueue_head(&tctx->wait);
++ tctx->last = NULL;
++ tctx->in_idle = 0;
++ atomic_long_set(&tctx->req_issue, 0);
++ atomic_long_set(&tctx->req_complete, 0);
++ task->io_uring = tctx;
++ return 0;
++}
++
++void __io_uring_free(struct task_struct *tsk)
++{
++ struct io_uring_task *tctx = tsk->io_uring;
++
++ WARN_ON_ONCE(!xa_empty(&tctx->xa));
++ xa_destroy(&tctx->xa);
++ kfree(tctx);
++ tsk->io_uring = NULL;
++}
++
+ static int io_sq_offload_start(struct io_ring_ctx *ctx,
+ struct io_uring_params *p)
+ {
+@@ -7154,6 +7173,9 @@ static int io_sq_offload_start(struct io
+ ctx->sqo_thread = NULL;
+ goto err;
+ }
++ ret = io_uring_alloc_task_context(ctx->sqo_thread);
++ if (ret)
++ goto err;
+ wake_up_process(ctx->sqo_thread);
+ } else if (p->flags & IORING_SETUP_SQ_AFF) {
+ /* Can't have SQ_AFF without SQPOLL */
+@@ -7633,7 +7655,7 @@ static bool io_wq_files_match(struct io_
+ {
+ struct files_struct *files = data;
+
+- return work->files == files;
++ return !files || work->files == files;
+ }
+
+ /*
+@@ -7787,7 +7809,7 @@ static bool io_uring_cancel_files(struct
+
+ spin_lock_irq(&ctx->inflight_lock);
+ list_for_each_entry(req, &ctx->inflight_list, inflight_entry) {
+- if (req->work.files != files)
++ if (files && req->work.files != files)
+ continue;
+ /* req is being completed, ignore */
+ if (!refcount_inc_not_zero(&req->refs))
+@@ -7850,18 +7872,217 @@ static bool io_cancel_task_cb(struct io_
+ return io_task_match(req, task);
+ }
+
++static bool __io_uring_cancel_task_requests(struct io_ring_ctx *ctx,
++ struct task_struct *task,
++ struct files_struct *files)
++{
++ bool ret;
++
++ ret = io_uring_cancel_files(ctx, files);
++ if (!files) {
++ enum io_wq_cancel cret;
++
++ cret = io_wq_cancel_cb(ctx->io_wq, io_cancel_task_cb, task, true);
++ if (cret != IO_WQ_CANCEL_NOTFOUND)
++ ret = true;
++
++ /* SQPOLL thread does its own polling */
++ if (!(ctx->flags & IORING_SETUP_SQPOLL)) {
++ if (!list_empty_careful(&ctx->poll_list)) {
++ io_iopoll_reap_events(ctx);
++ ret = true;
++ }
++ }
++
++ ret |= io_poll_remove_all(ctx, task);
++ ret |= io_kill_timeouts(ctx, task);
++ }
++
++ return ret;
++}
++
++/*
++ * We need to iteratively cancel requests, in case a request has dependent
++ * hard links. These persist even for failure of cancelations, hence keep
++ * looping until none are found.
++ */
++static void io_uring_cancel_task_requests(struct io_ring_ctx *ctx,
++ struct files_struct *files)
++{
++ struct task_struct *task = current;
++
++ if (ctx->flags & IORING_SETUP_SQPOLL)
++ task = ctx->sqo_thread;
++
++ io_cqring_overflow_flush(ctx, true, task, files);
++
++ while (__io_uring_cancel_task_requests(ctx, task, files)) {
++ io_run_task_work();
++ cond_resched();
++ }
++}
++
++/*
++ * Note that this task has used io_uring. We use it for cancelation purposes.
++ */
++static int io_uring_add_task_file(struct file *file)
++{
++ if (unlikely(!current->io_uring)) {
++ int ret;
++
++ ret = io_uring_alloc_task_context(current);
++ if (unlikely(ret))
++ return ret;
++ }
++ if (current->io_uring->last != file) {
++ XA_STATE(xas, &current->io_uring->xa, (unsigned long) file);
++ void *old;
++
++ rcu_read_lock();
++ old = xas_load(&xas);
++ if (old != file) {
++ get_file(file);
++ xas_lock(&xas);
++ xas_store(&xas, file);
++ xas_unlock(&xas);
++ }
++ rcu_read_unlock();
++ current->io_uring->last = file;
++ }
++
++ return 0;
++}
++
++/*
++ * Remove this io_uring_file -> task mapping.
++ */
++static void io_uring_del_task_file(struct file *file)
++{
++ struct io_uring_task *tctx = current->io_uring;
++ XA_STATE(xas, &tctx->xa, (unsigned long) file);
++
++ if (tctx->last == file)
++ tctx->last = NULL;
++
++ xas_lock(&xas);
++ file = xas_store(&xas, NULL);
++ xas_unlock(&xas);
++
++ if (file)
++ fput(file);
++}
++
++static void __io_uring_attempt_task_drop(struct file *file)
++{
++ XA_STATE(xas, &current->io_uring->xa, (unsigned long) file);
++ struct file *old;
++
++ rcu_read_lock();
++ old = xas_load(&xas);
++ rcu_read_unlock();
++
++ if (old == file)
++ io_uring_del_task_file(file);
++}
++
++/*
++ * Drop task note for this file if we're the only ones that hold it after
++ * pending fput()
++ */
++static void io_uring_attempt_task_drop(struct file *file, bool exiting)
++{
++ if (!current->io_uring)
++ return;
++ /*
++ * fput() is pending, will be 2 if the only other ref is our potential
++ * task file note. If the task is exiting, drop regardless of count.
++ */
++ if (!exiting && atomic_long_read(&file->f_count) != 2)
++ return;
++
++ __io_uring_attempt_task_drop(file);
++}
++
++void __io_uring_files_cancel(struct files_struct *files)
++{
++ struct io_uring_task *tctx = current->io_uring;
++ XA_STATE(xas, &tctx->xa, 0);
++
++ /* make sure overflow events are dropped */
++ tctx->in_idle = true;
++
++ do {
++ struct io_ring_ctx *ctx;
++ struct file *file;
++
++ xas_lock(&xas);
++ file = xas_next_entry(&xas, ULONG_MAX);
++ xas_unlock(&xas);
++
++ if (!file)
++ break;
++
++ ctx = file->private_data;
++
++ io_uring_cancel_task_requests(ctx, files);
++ if (files)
++ io_uring_del_task_file(file);
++ } while (1);
++}
++
++static inline bool io_uring_task_idle(struct io_uring_task *tctx)
++{
++ return atomic_long_read(&tctx->req_issue) ==
++ atomic_long_read(&tctx->req_complete);
++}
++
++/*
++ * Find any io_uring fd that this task has registered or done IO on, and cancel
++ * requests.
++ */
++void __io_uring_task_cancel(void)
++{
++ struct io_uring_task *tctx = current->io_uring;
++ DEFINE_WAIT(wait);
++ long completions;
++
++ /* make sure overflow events are dropped */
++ tctx->in_idle = true;
++
++ while (!io_uring_task_idle(tctx)) {
++ /* read completions before cancelations */
++ completions = atomic_long_read(&tctx->req_complete);
++ __io_uring_files_cancel(NULL);
++
++ prepare_to_wait(&tctx->wait, &wait, TASK_UNINTERRUPTIBLE);
++
++ /*
++ * If we've seen completions, retry. This avoids a race where
++ * a completion comes in before we did prepare_to_wait().
++ */
++ if (completions != atomic_long_read(&tctx->req_complete))
++ continue;
++ if (io_uring_task_idle(tctx))
++ break;
++ schedule();
++ }
++
++ finish_wait(&tctx->wait, &wait);
++ tctx->in_idle = false;
++}
++
+ static int io_uring_flush(struct file *file, void *data)
+ {
+ struct io_ring_ctx *ctx = file->private_data;
+
+- io_uring_cancel_files(ctx, data);
+-
+ /*
+ * If the task is going away, cancel work it may have pending
+ */
+ if (fatal_signal_pending(current) || (current->flags & PF_EXITING))
+- io_wq_cancel_cb(ctx->io_wq, io_cancel_task_cb, current, true);
++ data = NULL;
+
++ io_uring_cancel_task_requests(ctx, data);
++ io_uring_attempt_task_drop(file, !data);
+ return 0;
+ }
+
+@@ -7975,8 +8196,11 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned
+ wake_up(&ctx->sqo_wait);
+ submitted = to_submit;
+ } else if (to_submit) {
++ ret = io_uring_add_task_file(f.file);
++ if (unlikely(ret))
++ goto out;
+ mutex_lock(&ctx->uring_lock);
+- submitted = io_submit_sqes(ctx, to_submit, f.file, fd);
++ submitted = io_submit_sqes(ctx, to_submit);
+ mutex_unlock(&ctx->uring_lock);
+
+ if (submitted != to_submit)
+@@ -8188,6 +8412,7 @@ static int io_uring_get_fd(struct io_rin
+ file = anon_inode_getfile("[io_uring]", &io_uring_fops, ctx,
+ O_RDWR | O_CLOEXEC);
+ if (IS_ERR(file)) {
++err_fd:
+ put_unused_fd(ret);
+ ret = PTR_ERR(file);
+ goto err;
+@@ -8196,6 +8421,10 @@ static int io_uring_get_fd(struct io_rin
+ #if defined(CONFIG_UNIX)
+ ctx->ring_sock->file = file;
+ #endif
++ if (unlikely(io_uring_add_task_file(file))) {
++ file = ERR_PTR(-ENOMEM);
++ goto err_fd;
++ }
+ fd_install(ret, file);
+ return ret;
+ err:
+--- /dev/null
++++ b/include/linux/io_uring.h
+@@ -0,0 +1,53 @@
++/* SPDX-License-Identifier: GPL-2.0-or-later */
++#ifndef _LINUX_IO_URING_H
++#define _LINUX_IO_URING_H
++
++#include <linux/sched.h>
++#include <linux/xarray.h>
++#include <linux/percpu-refcount.h>
++
++struct io_uring_task {
++ /* submission side */
++ struct xarray xa;
++ struct wait_queue_head wait;
++ struct file *last;
++ atomic_long_t req_issue;
++
++ /* completion side */
++ bool in_idle ____cacheline_aligned_in_smp;
++ atomic_long_t req_complete;
++};
++
++#if defined(CONFIG_IO_URING)
++void __io_uring_task_cancel(void);
++void __io_uring_files_cancel(struct files_struct *files);
++void __io_uring_free(struct task_struct *tsk);
++
++static inline void io_uring_task_cancel(void)
++{
++ if (current->io_uring && !xa_empty(&current->io_uring->xa))
++ __io_uring_task_cancel();
++}
++static inline void io_uring_files_cancel(struct files_struct *files)
++{
++ if (current->io_uring && !xa_empty(&current->io_uring->xa))
++ __io_uring_files_cancel(files);
++}
++static inline void io_uring_free(struct task_struct *tsk)
++{
++ if (tsk->io_uring)
++ __io_uring_free(tsk);
++}
++#else
++static inline void io_uring_task_cancel(void)
++{
++}
++static inline void io_uring_files_cancel(struct files_struct *files)
++{
++}
++static inline void io_uring_free(struct task_struct *tsk)
++{
++}
++#endif
++
++#endif
+--- a/include/linux/sched.h
++++ b/include/linux/sched.h
+@@ -61,6 +61,7 @@ struct sighand_struct;
+ struct signal_struct;
+ struct task_delay_info;
+ struct task_group;
++struct io_uring_task;
+
+ /*
+ * Task state bitmask. NOTE! These bits are also
+@@ -923,6 +924,10 @@ struct task_struct {
+ /* Open file information: */
+ struct files_struct *files;
+
++#ifdef CONFIG_IO_URING
++ struct io_uring_task *io_uring;
++#endif
++
+ /* Namespaces: */
+ struct nsproxy *nsproxy;
+
+--- a/init/init_task.c
++++ b/init/init_task.c
+@@ -113,6 +113,9 @@ struct task_struct init_task
+ .thread = INIT_THREAD,
+ .fs = &init_fs,
+ .files = &init_files,
++#ifdef CONFIG_IO_URING
++ .io_uring = NULL,
++#endif
+ .signal = &init_signals,
+ .sighand = &init_sighand,
+ .nsproxy = &init_nsproxy,
+--- a/kernel/fork.c
++++ b/kernel/fork.c
+@@ -95,6 +95,7 @@
+ #include <linux/stackleak.h>
+ #include <linux/kasan.h>
+ #include <linux/scs.h>
++#include <linux/io_uring.h>
+
+ #include <asm/pgalloc.h>
+ #include <linux/uaccess.h>
+@@ -745,6 +746,7 @@ void __put_task_struct(struct task_struc
+ WARN_ON(refcount_read(&tsk->usage));
+ WARN_ON(tsk == current);
+
++ io_uring_free(tsk);
+ cgroup_free(tsk);
+ task_numa_free(tsk, true);
+ security_task_free(tsk);
+@@ -2022,6 +2024,10 @@ static __latent_entropy struct task_stru
+ p->vtime.state = VTIME_INACTIVE;
+ #endif
+
++#ifdef CONFIG_IO_URING
++ p->io_uring = NULL;
++#endif
++
+ #if defined(SPLIT_RSS_COUNTING)
+ memset(&p->rss_stat, 0, sizeof(p->rss_stat));
+ #endif
--- /dev/null
+From foo@baz Thu Oct 29 01:16:54 PM CET 2020
+From: Jens Axboe <axboe@kernel.dk>
+Date: Mon, 12 Oct 2020 11:53:29 -0600
+Subject: io_uring: don't run task work on an exiting task
+
+From: Jens Axboe <axboe@kernel.dk>
+
+commit 6200b0ae4ea28a4bfd8eb434e33e6201b7a6a282 upstream.
+
+This isn't safe, and isn't needed either. We are guaranteed that any
+work we queue is on a live task (and will be run), or it goes to
+our backup io-wq threads if the task is exiting.
+
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/io_uring.c | 8 ++++++++
+ 1 file changed, 8 insertions(+)
+
+--- a/fs/io_uring.c
++++ b/fs/io_uring.c
+@@ -1762,6 +1762,12 @@ static int io_put_kbuf(struct io_kiocb *
+
+ static inline bool io_run_task_work(void)
+ {
++ /*
++ * Not safe to run on exiting task, and the task_work handling will
++ * not add work to such a task.
++ */
++ if (unlikely(current->flags & PF_EXITING))
++ return false;
+ if (current->task_works) {
+ __set_current_state(TASK_RUNNING);
+ task_work_run();
+@@ -7791,6 +7797,8 @@ static void io_uring_cancel_files(struct
+ io_put_req(cancel_req);
+ }
+
++ /* cancellations _may_ trigger task work */
++ io_run_task_work();
+ schedule();
+ finish_wait(&ctx->inflight_wait, &wait);
+ }
--- /dev/null
+From foo@baz Thu Oct 29 01:16:54 PM CET 2020
+From: Jens Axboe <axboe@kernel.dk>
+Date: Mon, 28 Sep 2020 13:10:13 -0600
+Subject: io_uring: enable task/files specific overflow flushing
+
+From: Jens Axboe <axboe@kernel.dk>
+
+commit e6c8aa9ac33bd7c968af7816240fc081401fddcd upstream.
+
+This allows us to selectively flush out pending overflows, depending on
+the task and/or files_struct being passed in.
+
+No intended functional changes in this patch.
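+
+The filtering added to the flush path amounts to (sketch of the new
+matching logic):
+
+    list_for_each_entry_safe(req, tmp, &ctx->cq_overflow_list, list) {
+        if (tsk && req->task != tsk)
+            continue;                  /* not the task we're flushing for */
+        if (!io_match_files(req, files))
+            continue;                  /* not pinning these files */
+        /* ... flush this overflowed CQE as before ... */
+    }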
+
+Reviewed-by: Pavel Begunkov <asml.silence@gmail.com>
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/io_uring.c | 41 ++++++++++++++++++++++++++---------------
+ 1 file changed, 26 insertions(+), 15 deletions(-)
+
+--- a/fs/io_uring.c
++++ b/fs/io_uring.c
+@@ -1240,12 +1240,24 @@ static void io_cqring_ev_posted(struct i
+ eventfd_signal(ctx->cq_ev_fd, 1);
+ }
+
++static inline bool io_match_files(struct io_kiocb *req,
++ struct files_struct *files)
++{
++ if (!files)
++ return true;
++ if (req->flags & REQ_F_WORK_INITIALIZED)
++ return req->work.files == files;
++ return false;
++}
++
+ /* Returns true if there are no backlogged entries after the flush */
+-static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
++static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force,
++ struct task_struct *tsk,
++ struct files_struct *files)
+ {
+ struct io_rings *rings = ctx->rings;
++ struct io_kiocb *req, *tmp;
+ struct io_uring_cqe *cqe;
+- struct io_kiocb *req;
+ unsigned long flags;
+ LIST_HEAD(list);
+
+@@ -1264,7 +1276,12 @@ static bool io_cqring_overflow_flush(str
+ ctx->cq_overflow_flushed = 1;
+
+ cqe = NULL;
+- while (!list_empty(&ctx->cq_overflow_list)) {
++ list_for_each_entry_safe(req, tmp, &ctx->cq_overflow_list, list) {
++ if (tsk && req->task != tsk)
++ continue;
++ if (!io_match_files(req, files))
++ continue;
++
+ cqe = io_get_cqring(ctx);
+ if (!cqe && !force)
+ break;
+@@ -1734,7 +1751,7 @@ static unsigned io_cqring_events(struct
+ if (noflush && !list_empty(&ctx->cq_overflow_list))
+ return -1U;
+
+- io_cqring_overflow_flush(ctx, false);
++ io_cqring_overflow_flush(ctx, false, NULL, NULL);
+ }
+
+ /* See comment at the top of this file */
+@@ -6095,7 +6112,7 @@ static int io_submit_sqes(struct io_ring
+ /* if we have a backlog and couldn't flush it all, return BUSY */
+ if (test_bit(0, &ctx->sq_check_overflow)) {
+ if (!list_empty(&ctx->cq_overflow_list) &&
+- !io_cqring_overflow_flush(ctx, false))
++ !io_cqring_overflow_flush(ctx, false, NULL, NULL))
+ return -EBUSY;
+ }
+
+@@ -7556,7 +7573,7 @@ static void io_ring_exit_work(struct wor
+
+ ctx = container_of(work, struct io_ring_ctx, exit_work);
+ if (ctx->rings)
+- io_cqring_overflow_flush(ctx, true);
++ io_cqring_overflow_flush(ctx, true, NULL, NULL);
+
+ /*
+ * If we're doing polled IO and end up having requests being
+@@ -7567,7 +7584,7 @@ static void io_ring_exit_work(struct wor
+ while (!wait_for_completion_timeout(&ctx->ref_comp, HZ/20)) {
+ io_iopoll_reap_events(ctx);
+ if (ctx->rings)
+- io_cqring_overflow_flush(ctx, true);
++ io_cqring_overflow_flush(ctx, true, NULL, NULL);
+ }
+ io_ring_ctx_free(ctx);
+ }
+@@ -7587,7 +7604,7 @@ static void io_ring_ctx_wait_and_kill(st
+ io_iopoll_reap_events(ctx);
+ /* if we failed setting up the ctx, we might not have any rings */
+ if (ctx->rings)
+- io_cqring_overflow_flush(ctx, true);
++ io_cqring_overflow_flush(ctx, true, NULL, NULL);
+ idr_for_each(&ctx->personality_idr, io_remove_personalities, ctx);
+
+ /*
+@@ -7637,12 +7654,6 @@ static bool io_match_link(struct io_kioc
+ return false;
+ }
+
+-static inline bool io_match_files(struct io_kiocb *req,
+- struct files_struct *files)
+-{
+- return (req->flags & REQ_F_WORK_INITIALIZED) && req->work.files == files;
+-}
+-
+ static bool io_match_link_files(struct io_kiocb *req,
+ struct files_struct *files)
+ {
+@@ -7959,7 +7970,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned
+ ret = 0;
+ if (ctx->flags & IORING_SETUP_SQPOLL) {
+ if (!list_empty_careful(&ctx->cq_overflow_list))
+- io_cqring_overflow_flush(ctx, false);
++ io_cqring_overflow_flush(ctx, false, NULL, NULL);
+ if (flags & IORING_ENTER_SQ_WAKEUP)
+ wake_up(&ctx->sqo_wait);
+ submitted = to_submit;
--- /dev/null
+From foo@baz Thu Oct 29 01:16:54 PM CET 2020
+From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
+Date: Fri, 9 Oct 2020 13:49:51 +0100
+Subject: io_uring: Fix use of XArray in __io_uring_files_cancel
+
+From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
+
+commit ce765372bc443573d1d339a2bf4995de385dea3a upstream.
+
+We have to drop the lock during each iteration, so there's no advantage
+to using the advanced API. Convert this to a standard xa_for_each() loop.
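+
+The standard iteration used after this change is simply (sketch):
+
+    struct file *file;
+    unsigned long index;
+
+    xa_for_each(&tctx->xa, index, file) {
+        struct io_ring_ctx *ctx = file->private_data;
+        /* ... cancel task requests for this ring ... */
+    }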
+
+Reported-by: syzbot+27c12725d8ff0bfe1a13@syzkaller.appspotmail.com
+Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/io_uring.c | 19 +++++--------------
+ 1 file changed, 5 insertions(+), 14 deletions(-)
+
+--- a/fs/io_uring.c
++++ b/fs/io_uring.c
+@@ -8008,28 +8008,19 @@ static void io_uring_attempt_task_drop(s
+ void __io_uring_files_cancel(struct files_struct *files)
+ {
+ struct io_uring_task *tctx = current->io_uring;
+- XA_STATE(xas, &tctx->xa, 0);
++ struct file *file;
++ unsigned long index;
+
+ /* make sure overflow events are dropped */
+ tctx->in_idle = true;
+
+- do {
+- struct io_ring_ctx *ctx;
+- struct file *file;
+-
+- xas_lock(&xas);
+- file = xas_next_entry(&xas, ULONG_MAX);
+- xas_unlock(&xas);
+-
+- if (!file)
+- break;
+-
+- ctx = file->private_data;
++ xa_for_each(&tctx->xa, index, file) {
++ struct io_ring_ctx *ctx = file->private_data;
+
+ io_uring_cancel_task_requests(ctx, files);
+ if (files)
+ io_uring_del_task_file(file);
+- } while (1);
++ }
+ }
+
+ static inline bool io_uring_task_idle(struct io_uring_task *tctx)
--- /dev/null
+From foo@baz Thu Oct 29 01:16:54 PM CET 2020
+From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
+Date: Fri, 9 Oct 2020 13:49:52 +0100
+Subject: io_uring: Fix XArray usage in io_uring_add_task_file
+
+From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
+
+commit 236434c3438c4da3dfbd6aeeab807577b85e951a upstream.
+
+The xas_store() wasn't paired with an xas_nomem() loop, so if it couldn't
+allocate memory using GFP_NOWAIT, it would leak the reference to the file
+descriptor. Also the node pointed to by the xas could be freed between
+the call to xas_load() under the rcu_read_lock() and the acquisition of
+the xa_lock.
+
+It's easier to just use the normal xa_load/xa_store interface here.
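+
+For context, a correctly paired advanced-API store would have needed the
+usual xas_nomem() retry loop, which the old code lacked (illustrative
+sketch, not part of this fix):
+
+    do {
+        xas_lock(&xas);
+        xas_store(&xas, file);
+        xas_unlock(&xas);
+    } while (xas_nomem(&xas, GFP_KERNEL));
+
+    /* the normal API allocates with the flags it is given: */
+    xa_store(&tctx->xa, (unsigned long) file, file, GFP_KERNEL);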
+
+Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
+[axboe: fix missing assign after alloc, cur_uring -> tctx rename]
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/io_uring.c | 21 +++++++++------------
+ 1 file changed, 9 insertions(+), 12 deletions(-)
+
+--- a/fs/io_uring.c
++++ b/fs/io_uring.c
+@@ -7929,27 +7929,24 @@ static void io_uring_cancel_task_request
+ */
+ static int io_uring_add_task_file(struct file *file)
+ {
+- if (unlikely(!current->io_uring)) {
++ struct io_uring_task *tctx = current->io_uring;
++
++ if (unlikely(!tctx)) {
+ int ret;
+
+ ret = io_uring_alloc_task_context(current);
+ if (unlikely(ret))
+ return ret;
++ tctx = current->io_uring;
+ }
+- if (current->io_uring->last != file) {
+- XA_STATE(xas, &current->io_uring->xa, (unsigned long) file);
+- void *old;
++ if (tctx->last != file) {
++ void *old = xa_load(&tctx->xa, (unsigned long)file);
+
+- rcu_read_lock();
+- old = xas_load(&xas);
+- if (old != file) {
++ if (!old) {
+ get_file(file);
+- xas_lock(&xas);
+- xas_store(&xas, file);
+- xas_unlock(&xas);
++ xa_store(&tctx->xa, (unsigned long)file, file, GFP_KERNEL);
+ }
+- rcu_read_unlock();
+- current->io_uring->last = file;
++ tctx->last = file;
+ }
+
+ return 0;
--- /dev/null
+From foo@baz Thu Oct 29 01:16:54 PM CET 2020
+From: Jens Axboe <axboe@kernel.dk>
+Date: Mon, 12 Oct 2020 11:03:18 -0600
+Subject: io_uring: move dropping of files into separate helper
+
+From: Jens Axboe <axboe@kernel.dk>
+
+commit f573d384456b3025d3f8e58b3eafaeeb0f510784 upstream.
+
+No functional changes in this patch, prep patch for grabbing references
+to the files_struct.
+
+Reviewed-by: Pavel Begunkov <asml.silence@gmail.com>
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/io_uring.c | 26 ++++++++++++++++----------
+ 1 file changed, 16 insertions(+), 10 deletions(-)
+
+--- a/fs/io_uring.c
++++ b/fs/io_uring.c
+@@ -1424,6 +1424,20 @@ static inline void io_put_file(struct io
+ fput(file);
+ }
+
++static void io_req_drop_files(struct io_kiocb *req)
++{
++ struct io_ring_ctx *ctx = req->ctx;
++ unsigned long flags;
++
++ spin_lock_irqsave(&ctx->inflight_lock, flags);
++ list_del(&req->inflight_entry);
++ if (waitqueue_active(&ctx->inflight_wait))
++ wake_up(&ctx->inflight_wait);
++ spin_unlock_irqrestore(&ctx->inflight_lock, flags);
++ req->flags &= ~REQ_F_INFLIGHT;
++ req->work.files = NULL;
++}
++
+ static void __io_req_aux_free(struct io_kiocb *req)
+ {
+ if (req->flags & REQ_F_NEED_CLEANUP)
+@@ -1440,16 +1454,8 @@ static void __io_free_req(struct io_kioc
+ {
+ __io_req_aux_free(req);
+
+- if (req->flags & REQ_F_INFLIGHT) {
+- struct io_ring_ctx *ctx = req->ctx;
+- unsigned long flags;
+-
+- spin_lock_irqsave(&ctx->inflight_lock, flags);
+- list_del(&req->inflight_entry);
+- if (waitqueue_active(&ctx->inflight_wait))
+- wake_up(&ctx->inflight_wait);
+- spin_unlock_irqrestore(&ctx->inflight_lock, flags);
+- }
++ if (req->flags & REQ_F_INFLIGHT)
++ io_req_drop_files(req);
+
+ percpu_ref_put(&req->ctx->refs);
+ if (likely(!io_is_fallback_req(req)))
--- /dev/null
+From foo@baz Thu Oct 29 01:16:54 PM CET 2020
+From: Jens Axboe <axboe@kernel.dk>
+Date: Thu, 8 Oct 2020 07:46:52 -0600
+Subject: io_uring: no need to call xa_destroy() on empty xarray
+
+From: Jens Axboe <axboe@kernel.dk>
+
+commit ca6484cd308a671811bf39f3119e81966eb476e3 upstream.
+
+The kernel test robot reports this lockdep issue:
+
+[child1:659] mbind (274) returned ENOSYS, marking as inactive.
+[child1:659] mq_timedsend (279) returned ENOSYS, marking as inactive.
+[main] 10175 iterations. [F:7781 S:2344 HI:2397]
+[ 24.610601]
+[ 24.610743] ================================
+[ 24.611083] WARNING: inconsistent lock state
+[ 24.611437] 5.9.0-rc7-00017-g0f2122045b9462 #5 Not tainted
+[ 24.611861] --------------------------------
+[ 24.612193] inconsistent {SOFTIRQ-ON-W} -> {IN-SOFTIRQ-W} usage.
+[ 24.612660] ksoftirqd/0/7 [HC0[0]:SC1[3]:HE0:SE0] takes:
+[ 24.613086] f00ed998 (&xa->xa_lock#4){+.?.}-{2:2}, at: xa_destroy+0x43/0xc1
+[ 24.613642] {SOFTIRQ-ON-W} state was registered at:
+[ 24.614024] lock_acquire+0x20c/0x29b
+[ 24.614341] _raw_spin_lock+0x21/0x30
+[ 24.614636] io_uring_add_task_file+0xe8/0x13a
+[ 24.614987] io_uring_create+0x535/0x6bd
+[ 24.615297] io_uring_setup+0x11d/0x136
+[ 24.615606] __ia32_sys_io_uring_setup+0xd/0xf
+[ 24.615977] do_int80_syscall_32+0x53/0x6c
+[ 24.616306] restore_all_switch_stack+0x0/0xb1
+[ 24.616677] irq event stamp: 939881
+[ 24.616968] hardirqs last enabled at (939880): [<8105592d>] __local_bh_enable_ip+0x13c/0x145
+[ 24.617642] hardirqs last disabled at (939881): [<81b6ace3>] _raw_spin_lock_irqsave+0x1b/0x4e
+[ 24.618321] softirqs last enabled at (939738): [<81b6c7c8>] __do_softirq+0x3f0/0x45a
+[ 24.618924] softirqs last disabled at (939743): [<81055741>] run_ksoftirqd+0x35/0x61
+[ 24.619521]
+[ 24.619521] other info that might help us debug this:
+[ 24.620028] Possible unsafe locking scenario:
+[ 24.620028]
+[ 24.620492] CPU0
+[ 24.620685] ----
+[ 24.620894] lock(&xa->xa_lock#4);
+[ 24.621168] <Interrupt>
+[ 24.621381] lock(&xa->xa_lock#4);
+[ 24.621695]
+[ 24.621695] *** DEADLOCK ***
+[ 24.621695]
+[ 24.622154] 1 lock held by ksoftirqd/0/7:
+[ 24.622468] #0: 823bfb94 (rcu_callback){....}-{0:0}, at: rcu_process_callbacks+0xc0/0x155
+[ 24.623106]
+[ 24.623106] stack backtrace:
+[ 24.623454] CPU: 0 PID: 7 Comm: ksoftirqd/0 Not tainted 5.9.0-rc7-00017-g0f2122045b9462 #5
+[ 24.624090] Call Trace:
+[ 24.624284] ? show_stack+0x40/0x46
+[ 24.624551] dump_stack+0x1b/0x1d
+[ 24.624809] print_usage_bug+0x17a/0x185
+[ 24.625142] mark_lock+0x11d/0x1db
+[ 24.625474] ? print_shortest_lock_dependencies+0x121/0x121
+[ 24.625905] __lock_acquire+0x41e/0x7bf
+[ 24.626206] lock_acquire+0x20c/0x29b
+[ 24.626517] ? xa_destroy+0x43/0xc1
+[ 24.626810] ? lock_acquire+0x20c/0x29b
+[ 24.627110] _raw_spin_lock_irqsave+0x3e/0x4e
+[ 24.627450] ? xa_destroy+0x43/0xc1
+[ 24.627725] xa_destroy+0x43/0xc1
+[ 24.627989] __io_uring_free+0x57/0x71
+[ 24.628286] ? get_pid+0x22/0x22
+[ 24.628544] __put_task_struct+0xf2/0x163
+[ 24.628865] put_task_struct+0x1f/0x2a
+[ 24.629161] delayed_put_task_struct+0xe2/0xe9
+[ 24.629509] rcu_process_callbacks+0x128/0x155
+[ 24.629860] __do_softirq+0x1a3/0x45a
+[ 24.630151] run_ksoftirqd+0x35/0x61
+[ 24.630443] smpboot_thread_fn+0x304/0x31a
+[ 24.630763] kthread+0x124/0x139
+[ 24.631016] ? sort_range+0x18/0x18
+[ 24.631290] ? kthread_create_worker_on_cpu+0x17/0x17
+[ 24.631682] ret_from_fork+0x1c/0x28
+
+which is complaining about xa_destroy() grabbing the xa lock in an
+IRQ-disabling fashion, whereas the io_uring use cases aren't interrupt
+safe. This is really an xarray issue, since it should not assume the
+lock type. But for our use case, since we know the xarray is empty at
+this point, there's no need to actually call xa_destroy(). So just get
+rid of it.
+
+Fixes: 0f2122045b94 ("io_uring: don't rely on weak ->files references")
+Reported-by: kernel test robot <lkp@intel.com>
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/io_uring.c | 1 -
+ 1 file changed, 1 deletion(-)
+
+--- a/fs/io_uring.c
++++ b/fs/io_uring.c
+@@ -7136,7 +7136,6 @@ void __io_uring_free(struct task_struct
+ struct io_uring_task *tctx = tsk->io_uring;
+
+ WARN_ON_ONCE(!xa_empty(&tctx->xa));
+- xa_destroy(&tctx->xa);
+ kfree(tctx);
+ tsk->io_uring = NULL;
+ }
--- /dev/null
+From foo@baz Thu Oct 29 01:16:54 PM CET 2020
+From: Jens Axboe <axboe@kernel.dk>
+Date: Fri, 18 Sep 2020 20:13:06 -0600
+Subject: io_uring: reference ->nsproxy for file table commands
+
+From: Jens Axboe <axboe@kernel.dk>
+
+commit 9b8284921513fc1ea57d87777283a59b05862f03 upstream.
+
+If we don't get and assign the namespace for the async work, then certain
+paths just don't work properly (like /dev/stdin, /proc/mounts, etc).
+Anything that references the current namespace of the given task should
+be assigned for async work on behalf of that task.
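+
+The lifetime handling mirrors what is already done for ->files (sketch
+of the new get/put pairing):
+
+    /* io_grab_files(): pin the submitter's namespaces for async work */
+    get_nsproxy(current->nsproxy);
+    req->work.nsproxy = current->nsproxy;
+
+    /* io_req_drop_files(): dropped together with the files reference */
+    put_nsproxy(req->work.nsproxy);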
+
+Cc: stable@vger.kernel.org # v5.5+
+Reported-by: Al Viro <viro@zeniv.linux.org.uk>
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/io-wq.c | 4 ++++
+ fs/io-wq.h | 1 +
+ fs/io_uring.c | 3 +++
+ 3 files changed, 8 insertions(+)
+
+--- a/fs/io-wq.c
++++ b/fs/io-wq.c
+@@ -60,6 +60,7 @@ struct io_worker {
+ const struct cred *cur_creds;
+ const struct cred *saved_creds;
+ struct files_struct *restore_files;
++ struct nsproxy *restore_nsproxy;
+ struct fs_struct *restore_fs;
+ };
+
+@@ -153,6 +154,7 @@ static bool __io_worker_unuse(struct io_
+
+ task_lock(current);
+ current->files = worker->restore_files;
++ current->nsproxy = worker->restore_nsproxy;
+ task_unlock(current);
+ }
+
+@@ -318,6 +320,7 @@ static void io_worker_start(struct io_wq
+
+ worker->flags |= (IO_WORKER_F_UP | IO_WORKER_F_RUNNING);
+ worker->restore_files = current->files;
++ worker->restore_nsproxy = current->nsproxy;
+ worker->restore_fs = current->fs;
+ io_wqe_inc_running(wqe, worker);
+ }
+@@ -454,6 +457,7 @@ static void io_impersonate_work(struct i
+ if (work->files && current->files != work->files) {
+ task_lock(current);
+ current->files = work->files;
++ current->nsproxy = work->nsproxy;
+ task_unlock(current);
+ }
+ if (work->fs && current->fs != work->fs)
+--- a/fs/io-wq.h
++++ b/fs/io-wq.h
+@@ -88,6 +88,7 @@ struct io_wq_work {
+ struct files_struct *files;
+ struct mm_struct *mm;
+ const struct cred *creds;
++ struct nsproxy *nsproxy;
+ struct fs_struct *fs;
+ unsigned flags;
+ };
+--- a/fs/io_uring.c
++++ b/fs/io_uring.c
+@@ -1456,6 +1456,7 @@ static void io_req_drop_files(struct io_
+ spin_unlock_irqrestore(&ctx->inflight_lock, flags);
+ req->flags &= ~REQ_F_INFLIGHT;
+ put_files_struct(req->work.files);
++ put_nsproxy(req->work.nsproxy);
+ req->work.files = NULL;
+ }
+
+@@ -5685,6 +5686,8 @@ static int io_grab_files(struct io_kiocb
+ return 0;
+
+ req->work.files = get_files_struct(current);
++ get_nsproxy(current->nsproxy);
++ req->work.nsproxy = current->nsproxy;
+ req->flags |= REQ_F_INFLIGHT;
+
+ spin_lock_irq(&ctx->inflight_lock);
--- /dev/null
+From foo@baz Thu Oct 29 01:16:54 PM CET 2020
+From: Jens Axboe <axboe@kernel.dk>
+Date: Sat, 26 Sep 2020 15:05:03 -0600
+Subject: io_uring: return cancelation status from poll/timeout/files handlers
+
+From: Jens Axboe <axboe@kernel.dk>
+
+commit 76e1b6427fd8246376a97e3227049d49188dfb9c upstream.
+
+Return whether we found and canceled requests or not. This is in
+preparation for using this information, no functional changes in this
+patch.
+
+Reviewed-by: Pavel Begunkov <asml.silence@gmail.com>
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/io_uring.c | 30 ++++++++++++++++++++++++------
+ 1 file changed, 24 insertions(+), 6 deletions(-)
+
+--- a/fs/io_uring.c
++++ b/fs/io_uring.c
+@@ -1143,15 +1143,23 @@ static bool io_task_match(struct io_kioc
+ return false;
+ }
+
+-static void io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk)
++/*
++ * Returns true if we found and killed one or more timeouts
++ */
++static bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk)
+ {
+ struct io_kiocb *req, *tmp;
++ int canceled = 0;
+
+ spin_lock_irq(&ctx->completion_lock);
+- list_for_each_entry_safe(req, tmp, &ctx->timeout_list, list)
+- if (io_task_match(req, tsk))
++ list_for_each_entry_safe(req, tmp, &ctx->timeout_list, list) {
++ if (io_task_match(req, tsk)) {
+ io_kill_timeout(req);
++ canceled++;
++ }
++ }
+ spin_unlock_irq(&ctx->completion_lock);
++ return canceled != 0;
+ }
+
+ static void __io_queue_deferred(struct io_ring_ctx *ctx)
+@@ -4650,7 +4658,10 @@ static bool io_poll_remove_one(struct io
+ return do_complete;
+ }
+
+-static void io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk)
++/*
++ * Returns true if we found and killed one or more poll requests
++ */
++static bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk)
+ {
+ struct hlist_node *tmp;
+ struct io_kiocb *req;
+@@ -4670,6 +4681,8 @@ static void io_poll_remove_all(struct io
+
+ if (posted)
+ io_cqring_ev_posted(ctx);
++
++ return posted != 0;
+ }
+
+ static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr)
+@@ -7744,11 +7757,14 @@ static void io_cancel_defer_files(struct
+ }
+ }
+
+-static void io_uring_cancel_files(struct io_ring_ctx *ctx,
++/*
++ * Returns true if we found and killed one or more files pinning requests
++ */
++static bool io_uring_cancel_files(struct io_ring_ctx *ctx,
+ struct files_struct *files)
+ {
+ if (list_empty_careful(&ctx->inflight_list))
+- return;
++ return false;
+
+ io_cancel_defer_files(ctx, files);
+ /* cancel all at once, should be faster than doing it one by one*/
+@@ -7811,6 +7827,8 @@ static void io_uring_cancel_files(struct
+ schedule();
+ finish_wait(&ctx->inflight_wait, &wait);
+ }
++
++ return true;
+ }
+
+ static bool io_cancel_task_cb(struct io_wq_work *work, void *data)
--- /dev/null
+From foo@baz Thu Oct 29 01:16:54 PM CET 2020
+From: Jens Axboe <axboe@kernel.dk>
+Date: Mon, 12 Oct 2020 11:15:07 -0600
+Subject: io_uring: stash ctx task reference for SQPOLL
+
+From: Jens Axboe <axboe@kernel.dk>
+
+commit 2aede0e417db846793c276c7a1bbf7262c8349b0 upstream.
+
+We can grab a reference to the task instead of stashing away the task
+files_struct. This is doable without creating a circular reference
+between the ring fd and the task itself.
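+
+After this change the ctx hangs on to two things instead of ->sqo_mm
+(sketch of the new fields and how the SQPOLL thread uses them):
+
+    struct task_struct *sqo_task;    /* parent task, for access to ->files/->mm */
+    struct mm_struct *mm_account;    /* pinned only to unaccount locked memory */
+
+    /* the SQPOLL thread attaches to the parent's live mm on demand: */
+    if (ctx->sqo_task->mm && mmget_not_zero(ctx->sqo_task->mm))
+        kthread_use_mm(ctx->sqo_task->mm);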
+
+Reviewed-by: Pavel Begunkov <asml.silence@gmail.com>
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/io_uring.c | 39 +++++++++++++++++++++++++++++----------
+ 1 file changed, 29 insertions(+), 10 deletions(-)
+
+--- a/fs/io_uring.c
++++ b/fs/io_uring.c
+@@ -264,7 +264,16 @@ struct io_ring_ctx {
+ /* IO offload */
+ struct io_wq *io_wq;
+ struct task_struct *sqo_thread; /* if using sq thread polling */
+- struct mm_struct *sqo_mm;
++
++ /*
++ * For SQPOLL usage - we hold a reference to the parent task, so we
++ * have access to the ->files
++ */
++ struct task_struct *sqo_task;
++
++ /* Only used for accounting purposes */
++ struct mm_struct *mm_account;
++
+ wait_queue_head_t sqo_wait;
+
+ /*
+@@ -4421,9 +4430,10 @@ static int io_sq_thread_acquire_mm(struc
+ {
+ if (io_op_defs[req->opcode].needs_mm && !current->mm) {
+ if (unlikely(!(ctx->flags & IORING_SETUP_SQPOLL) ||
+- !mmget_not_zero(ctx->sqo_mm)))
++ !ctx->sqo_task->mm ||
++ !mmget_not_zero(ctx->sqo_task->mm)))
+ return -EFAULT;
+- kthread_use_mm(ctx->sqo_mm);
++ kthread_use_mm(ctx->sqo_task->mm);
+ }
+
+ return 0;
+@@ -7104,9 +7114,6 @@ static int io_sq_offload_start(struct io
+ {
+ int ret;
+
+- mmgrab(current->mm);
+- ctx->sqo_mm = current->mm;
+-
+ if (ctx->flags & IORING_SETUP_SQPOLL) {
+ ret = -EPERM;
+ if (!capable(CAP_SYS_ADMIN))
+@@ -7151,8 +7158,6 @@ static int io_sq_offload_start(struct io
+ return 0;
+ err:
+ io_finish_async(ctx);
+- mmdrop(ctx->sqo_mm);
+- ctx->sqo_mm = NULL;
+ return ret;
+ }
+
+@@ -7482,8 +7487,12 @@ static void io_destroy_buffers(struct io
+ static void io_ring_ctx_free(struct io_ring_ctx *ctx)
+ {
+ io_finish_async(ctx);
+- if (ctx->sqo_mm)
+- mmdrop(ctx->sqo_mm);
++ if (ctx->sqo_task) {
++ put_task_struct(ctx->sqo_task);
++ ctx->sqo_task = NULL;
++ mmdrop(ctx->mm_account);
++ ctx->mm_account = NULL;
++ }
+
+ io_iopoll_reap_events(ctx);
+ io_sqe_buffer_unregister(ctx);
+@@ -8256,6 +8265,16 @@ static int io_uring_create(unsigned entr
+ ctx->user = user;
+ ctx->creds = get_current_cred();
+
++ ctx->sqo_task = get_task_struct(current);
++ /*
++ * This is just grabbed for accounting purposes. When a process exits,
++ * the mm is exited and dropped before the files, hence we need to hang
++ * on to this mm purely for the purposes of being able to unaccount
++ * memory (locked/pinned vm). It's not used for anything else.
++ */
++ mmgrab(current->mm);
++ ctx->mm_account = current->mm;
++
+ ret = io_allocate_scq_urings(ctx, p);
+ if (ret)
+ goto err;
--- /dev/null
+From foo@baz Thu Oct 29 01:16:54 PM CET 2020
+From: Jens Axboe <axboe@kernel.dk>
+Date: Mon, 12 Oct 2020 11:25:39 -0600
+Subject: io_uring: unconditionally grab req->task
+
+From: Jens Axboe <axboe@kernel.dk>
+
+commit e3bc8e9dad7f2f83cc807111d4472164c9210153 upstream.
+
+Sometimes we assign a weak reference to it, sometimes we grab a
+reference to it. Clean this up and make it unconditional, and drop the
+flag related to tracking this state.
+
+Reviewed-by: Pavel Begunkov <asml.silence@gmail.com>
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/io_uring.c | 26 +++-----------------------
+ 1 file changed, 3 insertions(+), 23 deletions(-)
+
+--- a/fs/io_uring.c
++++ b/fs/io_uring.c
+@@ -550,7 +550,6 @@ enum {
+ REQ_F_NO_FILE_TABLE_BIT,
+ REQ_F_QUEUE_TIMEOUT_BIT,
+ REQ_F_WORK_INITIALIZED_BIT,
+- REQ_F_TASK_PINNED_BIT,
+
+ /* not a real bit, just to check we're not overflowing the space */
+ __REQ_F_LAST_BIT,
+@@ -608,8 +607,6 @@ enum {
+ REQ_F_QUEUE_TIMEOUT = BIT(REQ_F_QUEUE_TIMEOUT_BIT),
+ /* io_wq_work is initialized */
+ REQ_F_WORK_INITIALIZED = BIT(REQ_F_WORK_INITIALIZED_BIT),
+- /* req->task is refcounted */
+- REQ_F_TASK_PINNED = BIT(REQ_F_TASK_PINNED_BIT),
+ };
+
+ struct async_poll {
+@@ -924,21 +921,6 @@ struct sock *io_uring_get_socket(struct
+ }
+ EXPORT_SYMBOL(io_uring_get_socket);
+
+-static void io_get_req_task(struct io_kiocb *req)
+-{
+- if (req->flags & REQ_F_TASK_PINNED)
+- return;
+- get_task_struct(req->task);
+- req->flags |= REQ_F_TASK_PINNED;
+-}
+-
+-/* not idempotent -- it doesn't clear REQ_F_TASK_PINNED */
+-static void __io_put_req_task(struct io_kiocb *req)
+-{
+- if (req->flags & REQ_F_TASK_PINNED)
+- put_task_struct(req->task);
+-}
+-
+ static void io_file_put_work(struct work_struct *work);
+
+ /*
+@@ -1455,7 +1437,7 @@ static void __io_req_aux_free(struct io_
+ kfree(req->io);
+ if (req->file)
+ io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE));
+- __io_put_req_task(req);
++ put_task_struct(req->task);
+ io_req_work_drop_env(req);
+ }
+
+@@ -1765,7 +1747,7 @@ static inline bool io_req_multi_free(str
+ if ((req->flags & REQ_F_LINK_HEAD) || io_is_fallback_req(req))
+ return false;
+
+- if (req->file || req->io)
++ if (req->file || req->io || req->task)
+ rb->need_iter++;
+
+ rb->reqs[rb->to_free++] = req;
+@@ -4584,7 +4566,6 @@ static bool io_arm_poll_handler(struct i
+ if (req->flags & REQ_F_WORK_INITIALIZED)
+ memcpy(&apoll->work, &req->work, sizeof(req->work));
+
+- io_get_req_task(req);
+ req->apoll = apoll;
+ INIT_HLIST_NODE(&req->hash_node);
+
+@@ -4774,8 +4755,6 @@ static int io_poll_add_prep(struct io_ki
+
+ events = READ_ONCE(sqe->poll_events);
+ poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP;
+-
+- io_get_req_task(req);
+ return 0;
+ }
+
+@@ -6057,6 +6036,7 @@ static int io_init_req(struct io_ring_ct
+ /* one is dropped after submission, the other at completion */
+ refcount_set(&req->refs, 2);
+ req->task = current;
++ get_task_struct(req->task);
+ req->result = 0;
+
+ if (unlikely(req->opcode >= IORING_OP_LAST))
--- /dev/null
+From foo@baz Thu Oct 29 01:16:54 PM CET 2020
+From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+Date: Tue, 1 Sep 2020 10:41:46 +0200
+Subject: io_wq: Make io_wqe::lock a raw_spinlock_t
+
+From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+
+commit 95da84659226d75698a1ab958be0af21d9cc2a9c upstream.
+
+During a context switch the scheduler invokes wq_worker_sleeping() with
+disabled preemption. Disabling preemption is needed because it protects
+access to `worker->sleeping'. As an optimisation it avoids invoking
+schedule() within the schedule path as part of a possible wake up
+(hence the preempt_enable_no_resched() afterwards).
+
+The io-wq has been added to the mix in the same section with disabled
+preemption. This breaks on PREEMPT_RT because io_wq_worker_sleeping()
+acquires a spinlock_t. Also within the schedule() the spinlock_t must be
+acquired after tsk_is_pi_blocked() otherwise it will block on the
+sleeping lock again while scheduling out.
+
+While playing with `io_uring-bench' I didn't notice a significant
+latency spike after converting io_wqe::lock to a raw_spinlock_t. The
+latency was more or less the same.
+
+In order to keep the spinlock_t it would have to be moved after the
+tsk_is_pi_blocked() check which would introduce a branch instruction
+into the hot path.
+
+The lock is used to maintain the `work_list' and to wake up at most
+one task. Should io_wqe_cancel_pending_work() cause latency spikes
+while searching for a specific item, it would need to drop the lock
+during iterations.
+
+revert_creds() is also invoked under the lock. According to debugging,
+cred::non_rcu is 0; otherwise it would have to be moved outside of the
+locked section because put_cred_rcu()->free_uid() acquires a sleeping
+lock.
+
+Convert io_wqe::lock to a raw_spinlock_t.
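+
+The conversion pattern applied throughout the diff can be sketched as
+follows (illustrative only; example_wqe and the helpers are made-up
+stand-ins for struct io_wqe and its users). A raw_spinlock_t never
+becomes a sleeping lock on PREEMPT_RT, so it stays safe to take from
+the preemption-disabled scheduler path:
+
+  #include <linux/spinlock.h>
+
+  /* Hypothetical stand-in for struct io_wqe. */
+  struct example_wqe {
+          raw_spinlock_t lock;            /* was: spinlock_t lock; */
+  };
+
+  static void example_init(struct example_wqe *wqe)
+  {
+          raw_spin_lock_init(&wqe->lock); /* was: spin_lock_init() */
+  }
+
+  static void example_touch_work_list(struct example_wqe *wqe)
+  {
+          raw_spin_lock_irq(&wqe->lock);  /* was: spin_lock_irq() */
+          /* manipulate the work_list under the lock */
+          raw_spin_unlock_irq(&wqe->lock);
+  }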
+
+Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/io-wq.c | 52 ++++++++++++++++++++++++++--------------------------
+ 1 file changed, 26 insertions(+), 26 deletions(-)
+
+--- a/fs/io-wq.c
++++ b/fs/io-wq.c
+@@ -88,7 +88,7 @@ enum {
+ */
+ struct io_wqe {
+ struct {
+- spinlock_t lock;
++ raw_spinlock_t lock;
+ struct io_wq_work_list work_list;
+ unsigned long hash_map;
+ unsigned flags;
+@@ -149,7 +149,7 @@ static bool __io_worker_unuse(struct io_
+
+ if (current->files != worker->restore_files) {
+ __acquire(&wqe->lock);
+- spin_unlock_irq(&wqe->lock);
++ raw_spin_unlock_irq(&wqe->lock);
+ dropped_lock = true;
+
+ task_lock(current);
+@@ -168,7 +168,7 @@ static bool __io_worker_unuse(struct io_
+ if (worker->mm) {
+ if (!dropped_lock) {
+ __acquire(&wqe->lock);
+- spin_unlock_irq(&wqe->lock);
++ raw_spin_unlock_irq(&wqe->lock);
+ dropped_lock = true;
+ }
+ __set_current_state(TASK_RUNNING);
+@@ -222,17 +222,17 @@ static void io_worker_exit(struct io_wor
+ worker->flags = 0;
+ preempt_enable();
+
+- spin_lock_irq(&wqe->lock);
++ raw_spin_lock_irq(&wqe->lock);
+ hlist_nulls_del_rcu(&worker->nulls_node);
+ list_del_rcu(&worker->all_list);
+ if (__io_worker_unuse(wqe, worker)) {
+ __release(&wqe->lock);
+- spin_lock_irq(&wqe->lock);
++ raw_spin_lock_irq(&wqe->lock);
+ }
+ acct->nr_workers--;
+ nr_workers = wqe->acct[IO_WQ_ACCT_BOUND].nr_workers +
+ wqe->acct[IO_WQ_ACCT_UNBOUND].nr_workers;
+- spin_unlock_irq(&wqe->lock);
++ raw_spin_unlock_irq(&wqe->lock);
+
+ /* all workers gone, wq exit can proceed */
+ if (!nr_workers && refcount_dec_and_test(&wqe->wq->refs))
+@@ -508,7 +508,7 @@ get_next:
+ else if (!wq_list_empty(&wqe->work_list))
+ wqe->flags |= IO_WQE_FLAG_STALLED;
+
+- spin_unlock_irq(&wqe->lock);
++ raw_spin_unlock_irq(&wqe->lock);
+ if (!work)
+ break;
+ io_assign_current_work(worker, work);
+@@ -543,7 +543,7 @@ get_next:
+ io_wqe_enqueue(wqe, linked);
+
+ if (hash != -1U && !next_hashed) {
+- spin_lock_irq(&wqe->lock);
++ raw_spin_lock_irq(&wqe->lock);
+ wqe->hash_map &= ~BIT_ULL(hash);
+ wqe->flags &= ~IO_WQE_FLAG_STALLED;
+ /* dependent work is not hashed */
+@@ -551,11 +551,11 @@ get_next:
+ /* skip unnecessary unlock-lock wqe->lock */
+ if (!work)
+ goto get_next;
+- spin_unlock_irq(&wqe->lock);
++ raw_spin_unlock_irq(&wqe->lock);
+ }
+ } while (work);
+
+- spin_lock_irq(&wqe->lock);
++ raw_spin_lock_irq(&wqe->lock);
+ } while (1);
+ }
+
+@@ -570,7 +570,7 @@ static int io_wqe_worker(void *data)
+ while (!test_bit(IO_WQ_BIT_EXIT, &wq->state)) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ loop:
+- spin_lock_irq(&wqe->lock);
++ raw_spin_lock_irq(&wqe->lock);
+ if (io_wqe_run_queue(wqe)) {
+ __set_current_state(TASK_RUNNING);
+ io_worker_handle_work(worker);
+@@ -581,7 +581,7 @@ loop:
+ __release(&wqe->lock);
+ goto loop;
+ }
+- spin_unlock_irq(&wqe->lock);
++ raw_spin_unlock_irq(&wqe->lock);
+ if (signal_pending(current))
+ flush_signals(current);
+ if (schedule_timeout(WORKER_IDLE_TIMEOUT))
+@@ -593,11 +593,11 @@ loop:
+ }
+
+ if (test_bit(IO_WQ_BIT_EXIT, &wq->state)) {
+- spin_lock_irq(&wqe->lock);
++ raw_spin_lock_irq(&wqe->lock);
+ if (!wq_list_empty(&wqe->work_list))
+ io_worker_handle_work(worker);
+ else
+- spin_unlock_irq(&wqe->lock);
++ raw_spin_unlock_irq(&wqe->lock);
+ }
+
+ io_worker_exit(worker);
+@@ -637,9 +637,9 @@ void io_wq_worker_sleeping(struct task_s
+
+ worker->flags &= ~IO_WORKER_F_RUNNING;
+
+- spin_lock_irq(&wqe->lock);
++ raw_spin_lock_irq(&wqe->lock);
+ io_wqe_dec_running(wqe, worker);
+- spin_unlock_irq(&wqe->lock);
++ raw_spin_unlock_irq(&wqe->lock);
+ }
+
+ static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index)
+@@ -663,7 +663,7 @@ static bool create_io_worker(struct io_w
+ return false;
+ }
+
+- spin_lock_irq(&wqe->lock);
++ raw_spin_lock_irq(&wqe->lock);
+ hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list);
+ list_add_tail_rcu(&worker->all_list, &wqe->all_list);
+ worker->flags |= IO_WORKER_F_FREE;
+@@ -672,7 +672,7 @@ static bool create_io_worker(struct io_w
+ if (!acct->nr_workers && (worker->flags & IO_WORKER_F_BOUND))
+ worker->flags |= IO_WORKER_F_FIXED;
+ acct->nr_workers++;
+- spin_unlock_irq(&wqe->lock);
++ raw_spin_unlock_irq(&wqe->lock);
+
+ if (index == IO_WQ_ACCT_UNBOUND)
+ atomic_inc(&wq->user->processes);
+@@ -727,12 +727,12 @@ static int io_wq_manager(void *data)
+ if (!node_online(node))
+ continue;
+
+- spin_lock_irq(&wqe->lock);
++ raw_spin_lock_irq(&wqe->lock);
+ if (io_wqe_need_worker(wqe, IO_WQ_ACCT_BOUND))
+ fork_worker[IO_WQ_ACCT_BOUND] = true;
+ if (io_wqe_need_worker(wqe, IO_WQ_ACCT_UNBOUND))
+ fork_worker[IO_WQ_ACCT_UNBOUND] = true;
+- spin_unlock_irq(&wqe->lock);
++ raw_spin_unlock_irq(&wqe->lock);
+ if (fork_worker[IO_WQ_ACCT_BOUND])
+ create_io_worker(wq, wqe, IO_WQ_ACCT_BOUND);
+ if (fork_worker[IO_WQ_ACCT_UNBOUND])
+@@ -829,10 +829,10 @@ static void io_wqe_enqueue(struct io_wqe
+ }
+
+ work_flags = work->flags;
+- spin_lock_irqsave(&wqe->lock, flags);
++ raw_spin_lock_irqsave(&wqe->lock, flags);
+ io_wqe_insert_work(wqe, work);
+ wqe->flags &= ~IO_WQE_FLAG_STALLED;
+- spin_unlock_irqrestore(&wqe->lock, flags);
++ raw_spin_unlock_irqrestore(&wqe->lock, flags);
+
+ if ((work_flags & IO_WQ_WORK_CONCURRENT) ||
+ !atomic_read(&acct->nr_running))
+@@ -959,13 +959,13 @@ static void io_wqe_cancel_pending_work(s
+ unsigned long flags;
+
+ retry:
+- spin_lock_irqsave(&wqe->lock, flags);
++ raw_spin_lock_irqsave(&wqe->lock, flags);
+ wq_list_for_each(node, prev, &wqe->work_list) {
+ work = container_of(node, struct io_wq_work, list);
+ if (!match->fn(work, match->data))
+ continue;
+ io_wqe_remove_pending(wqe, work, prev);
+- spin_unlock_irqrestore(&wqe->lock, flags);
++ raw_spin_unlock_irqrestore(&wqe->lock, flags);
+ io_run_cancel(work, wqe);
+ match->nr_pending++;
+ if (!match->cancel_all)
+@@ -974,7 +974,7 @@ retry:
+ /* not safe to continue after unlock */
+ goto retry;
+ }
+- spin_unlock_irqrestore(&wqe->lock, flags);
++ raw_spin_unlock_irqrestore(&wqe->lock, flags);
+ }
+
+ static void io_wqe_cancel_running_work(struct io_wqe *wqe,
+@@ -1082,7 +1082,7 @@ struct io_wq *io_wq_create(unsigned boun
+ }
+ atomic_set(&wqe->acct[IO_WQ_ACCT_UNBOUND].nr_running, 0);
+ wqe->wq = wq;
+- spin_lock_init(&wqe->lock);
++ raw_spin_lock_init(&wqe->lock);
+ INIT_WQ_LIST(&wqe->work_list);
+ INIT_HLIST_NULLS_HEAD(&wqe->free_list, 0);
+ INIT_LIST_HEAD(&wqe->all_list);
netfilter-nftables_offload-kasan-slab-out-of-bounds-read-in-nft_flow_rule_create.patch
+io_uring-don-t-run-task-work-on-an-exiting-task.patch
+io_uring-allow-timeout-poll-files-killing-to-take-task-into-account.patch
+io_uring-move-dropping-of-files-into-separate-helper.patch
+io_uring-stash-ctx-task-reference-for-sqpoll.patch
+io_uring-unconditionally-grab-req-task.patch
+io_uring-return-cancelation-status-from-poll-timeout-files-handlers.patch
+io_uring-enable-task-files-specific-overflow-flushing.patch
+io_uring-don-t-rely-on-weak-files-references.patch
+io_uring-reference-nsproxy-for-file-table-commands.patch
+io_wq-make-io_wqe-lock-a-raw_spinlock_t.patch
+io-wq-fix-use-after-free-in-io_wq_worker_running.patch
+io_uring-no-need-to-call-xa_destroy-on-empty-xarray.patch
+io_uring-fix-use-of-xarray-in-__io_uring_files_cancel.patch
+io_uring-fix-xarray-usage-in-io_uring_add_task_file.patch
+io_uring-convert-advanced-xarray-uses-to-the-normal-api.patch