From: Greg Kroah-Hartman
Date: Mon, 29 Jul 2019 18:57:52 +0000 (+0200)
Subject: 5.2-stable patches
X-Git-Tag: v5.2.5~5
X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=5684654eb9fed589229d987a4b65edc76e3bd5b5;p=thirdparty%2Fkernel%2Fstable-queue.git

5.2-stable patches

added patches:
	io_uring-add-a-memory-barrier-before-atomic_read.patch
	io_uring-don-t-use-iov_iter_advance-for-fixed-buffers.patch
	io_uring-ensure-list-is-initialized-for-poll-commands.patch
	io_uring-fix-counter-inc-dec-mismatch-in-async_list.patch
---

diff --git a/queue-5.2/io_uring-add-a-memory-barrier-before-atomic_read.patch b/queue-5.2/io_uring-add-a-memory-barrier-before-atomic_read.patch
new file mode 100644
index 00000000000..ff030829405
--- /dev/null
+++ b/queue-5.2/io_uring-add-a-memory-barrier-before-atomic_read.patch
@@ -0,0 +1,93 @@
+From c0e48f9dea9129aa11bec3ed13803bcc26e96e49 Mon Sep 17 00:00:00 2001
+From: Zhengyuan Liu
+Date: Thu, 18 Jul 2019 20:44:00 +0800
+Subject: io_uring: add a memory barrier before atomic_read
+
+From: Zhengyuan Liu
+
+commit c0e48f9dea9129aa11bec3ed13803bcc26e96e49 upstream.
+
+There is a hang issue when using fio to do some basic testing. The
+issue can be easily reproduced with the script below:
+
+  while true
+  do
+    fio --ioengine=io_uring -rw=write -bs=4k -numjobs=1 \
+        -size=1G -iodepth=64 -name=uring --filename=/dev/zero
+  done
+
+After several minutes (or more), fio blocks at
+io_uring_enter->io_cqring_wait, waiting for previously submitted sqes
+to be completed, and cannot return to userspace until it receives a
+SIGTERM. After receiving SIGTERM, fio hangs at
+io_ring_ctx_wait_and_kill with a backtrace like this:
+
+  [54133.243816] Call Trace:
+  [54133.243842] __schedule+0x3a0/0x790
+  [54133.243868] schedule+0x38/0xa0
+  [54133.243880] schedule_timeout+0x218/0x3b0
+  [54133.243891] ? sched_clock+0x9/0x10
+  [54133.243903] ? wait_for_completion+0xa3/0x130
+  [54133.243916] ? _raw_spin_unlock_irq+0x2c/0x40
+  [54133.243930] ? trace_hardirqs_on+0x3f/0xe0
+  [54133.243951] wait_for_completion+0xab/0x130
+  [54133.243962] ? wake_up_q+0x70/0x70
+  [54133.243984] io_ring_ctx_wait_and_kill+0xa0/0x1d0
+  [54133.243998] io_uring_release+0x20/0x30
+  [54133.244008] __fput+0xcf/0x270
+  [54133.244029] ____fput+0xe/0x10
+  [54133.244040] task_work_run+0x7f/0xa0
+  [54133.244056] do_exit+0x305/0xc40
+  [54133.244067] ? get_signal+0x13b/0xbd0
+  [54133.244088] do_group_exit+0x50/0xd0
+  [54133.244103] get_signal+0x18d/0xbd0
+  [54133.244112] ? _raw_spin_unlock_irqrestore+0x36/0x60
+  [54133.244142] do_signal+0x34/0x720
+  [54133.244171] ? exit_to_usermode_loop+0x7e/0x130
+  [54133.244190] exit_to_usermode_loop+0xc0/0x130
+  [54133.244209] do_syscall_64+0x16b/0x1d0
+  [54133.244221] entry_SYSCALL_64_after_hwframe+0x49/0xbe
+
+The reason is that we added a req to ctx->pending_async at the very
+end, but it never got a chance to be processed. How could this happen?
+
+  fio#cpu0                          wq#cpu1
+
+  io_add_to_prev_work               io_sq_wq_submit_work
+
+    atomic_read() <<< 1
+
+                                      atomic_dec_return() << 1->0
+                                      list_empty();       <<< true;
+
+    list_add_tail()
+    atomic_read() << 0 or 1?
+
+As atomic_ops.rst states, atomic_read does not guarantee that a
+concurrent modification by any other thread is visible yet, so we must
+take care of that with a proper implicit or explicit memory barrier.
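+
+To see the ordering problem outside the kernel, here is a minimal
+userspace C11 sketch of the same store-buffering pattern (illustrative
+only: the names, and the seq_cst fences standing in for smp_mb() and
+for the full barrier implied by atomic_dec_return(), are assumptions of
+the sketch, not kernel code):
+
+    #include <stdatomic.h>
+    #include <stdbool.h>
+
+    static atomic_int cnt = 1;          /* stands in for async_list->cnt */
+    static atomic_bool queued = false;  /* stands in for the list add */
+
+    /* submitter side, mirrors io_add_to_prev_work() */
+    static bool add_to_prev_work(void)
+    {
+        atomic_store_explicit(&queued, true, memory_order_relaxed);
+        /* counterpart of the smp_mb() added below: order the store
+         * above against the load below */
+        atomic_thread_fence(memory_order_seq_cst);
+        /* if the worker is already gone, the caller must run the req */
+        return atomic_load_explicit(&cnt, memory_order_relaxed) != 0;
+    }
+
+    /* worker side, mirrors the exit path of io_sq_wq_submit_work() */
+    static bool may_exit(void)
+    {
+        if (atomic_fetch_sub_explicit(&cnt, 1, memory_order_relaxed) != 1)
+            return false;
+        atomic_thread_fence(memory_order_seq_cst);
+        /* exit only if no request slipped in concurrently; otherwise
+         * the caller must resume processing */
+        return !atomic_load_explicit(&queued, memory_order_relaxed);
+    }
+
+Without the fences, each CPU's store can sit unobserved while its load
+reads the stale value, so both sides conclude "the other will handle
+it", which is exactly the lost request seen above.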
+
+This issue was detected with the help of Jackie's <liuyun01@kylinos.cn>
+
+Fixes: 31b515106428 ("io_uring: allow workqueue item to handle multiple buffered requests")
+Signed-off-by: Zhengyuan Liu
+Signed-off-by: Jens Axboe
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ fs/io_uring.c |    4 ++++
+ 1 file changed, 4 insertions(+)
+
+--- a/fs/io_uring.c
++++ b/fs/io_uring.c
+@@ -1769,6 +1769,10 @@ static bool io_add_to_prev_work(struct a
+ 	ret = true;
+ 	spin_lock(&list->lock);
+ 	list_add_tail(&req->list, &list->list);
++	/*
++	 * Ensure we see a simultaneous modification from io_sq_wq_submit_work()
++	 */
++	smp_mb();
+ 	if (!atomic_read(&list->cnt)) {
+ 		list_del_init(&req->list);
+ 		ret = false;
diff --git a/queue-5.2/io_uring-don-t-use-iov_iter_advance-for-fixed-buffers.patch b/queue-5.2/io_uring-don-t-use-iov_iter_advance-for-fixed-buffers.patch
new file mode 100644
index 00000000000..21f4a7843ef
--- /dev/null
+++ b/queue-5.2/io_uring-don-t-use-iov_iter_advance-for-fixed-buffers.patch
@@ -0,0 +1,92 @@
+From bd11b3a391e3df6fa958facbe4b3f9f4cca9bd49 Mon Sep 17 00:00:00 2001
+From: Jens Axboe
+Date: Sat, 20 Jul 2019 08:37:31 -0600
+Subject: io_uring: don't use iov_iter_advance() for fixed buffers
+
+From: Jens Axboe
+
+commit bd11b3a391e3df6fa958facbe4b3f9f4cca9bd49 upstream.
+
+Hrvoje reports that when a large fixed buffer is registered and IO is
+being done to the latter pages of said buffer, the IO submission time
+is much worse:
+
+  reading to the start of the buffer: 11238 ns
+  reading to the end of the buffer:   1039879 ns
+
+In fact, it's worse by two orders of magnitude. The reason for that is
+how io_uring sets up the iov_iter. We point the iter at the first bvec,
+and then use iov_iter_advance() to fast-forward to the offset we need
+within that buffer.
+
+However, that is abysmally slow, as it entails iterating the bvecs
+that we set up as part of buffer registration. There's really no need
+to use this generic helper, as we know it's a BVEC type iterator, and
+we also know that each bvec is PAGE_SIZE in size, apart from possibly
+the first and last. Hence we can just use a shift on the offset to
+find the right index, and then adjust the iov_iter appropriately.
+After this fix, the timings are:
+
+  reading to the start of the buffer: 10135 ns
+  reading to the end of the buffer:   1377 ns
+
+Or about a 755x improvement for the tail page.
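+
+The index math is easy to check in isolation. Below is a small
+standalone C sketch of the same calculation (assuming 4 KiB pages; the
+function and variable names are illustrative, not the kernel's):
+
+    #include <stdio.h>
+
+    #define PAGE_SHIFT 12
+    #define PAGE_SIZE  (1UL << PAGE_SHIFT)
+    #define PAGE_MASK  (~(PAGE_SIZE - 1))
+
+    /* fb_len is the length of the (possibly short) first bvec */
+    static void locate(unsigned long offset, unsigned long fb_len)
+    {
+        if (offset <= fb_len) {
+            printf("offset %lu: stays in segment 0\n", offset);
+            return;
+        }
+        offset -= fb_len;    /* skip the unaligned first segment */
+        unsigned long seg_skip = 1 + (offset >> PAGE_SHIFT);
+        unsigned long in_seg = offset & ~PAGE_MASK;
+        printf("segment %lu, intra-segment offset %lu\n", seg_skip, in_seg);
+    }
+
+    int main(void)
+    {
+        locate(100, 512);              /* within the first bvec */
+        locate(512 + 4096 + 10, 512);  /* segment 2, offset 10 */
+        return 0;
+    }
+
+Every segment after the first is found with one shift and one mask, so
+the lookup cost no longer grows with the size of the fixed buffer.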
+
+Reported-by: Hrvoje Zeba
+Tested-by: Hrvoje Zeba
+Signed-off-by: Jens Axboe
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ fs/io_uring.c |   39 +++++++++++++++++++++++++++++++++++++--
+ 1 file changed, 37 insertions(+), 2 deletions(-)
+
+--- a/fs/io_uring.c
++++ b/fs/io_uring.c
+@@ -1001,8 +1001,43 @@ static int io_import_fixed(struct io_rin
+ 	 */
+ 	offset = buf_addr - imu->ubuf;
+ 	iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len);
+-	if (offset)
+-		iov_iter_advance(iter, offset);
++
++	if (offset) {
++		/*
++		 * Don't use iov_iter_advance() here, as it's really slow for
++		 * using the latter parts of a big fixed buffer - it iterates
++		 * over each segment manually. We can cheat a bit here, because
++		 * we know that:
++		 *
++		 * 1) it's a BVEC iter, we set it up
++		 * 2) all bvecs are PAGE_SIZE in size, except potentially the
++		 *    first and last bvec
++		 *
++		 * So just find our index, and adjust the iterator afterwards.
++		 * If the offset is within the first bvec (or is the whole
++		 * first bvec), just use iov_iter_advance(). This makes it
++		 * easier since we can just skip the first segment, which may
++		 * not be PAGE_SIZE aligned.
++		 */
++		const struct bio_vec *bvec = imu->bvec;
++
++		if (offset <= bvec->bv_len) {
++			iov_iter_advance(iter, offset);
++		} else {
++			unsigned long seg_skip;
++
++			/* skip first vec */
++			offset -= bvec->bv_len;
++			seg_skip = 1 + (offset >> PAGE_SHIFT);
++
++			iter->bvec = bvec + seg_skip;
++			iter->nr_segs -= seg_skip;
++			iter->count -= (seg_skip << PAGE_SHIFT);
++			iter->iov_offset = offset & ~PAGE_MASK;
++			if (iter->iov_offset)
++				iter->count -= iter->iov_offset;
++		}
++	}
+ 
+ 	/* don't drop a reference to these pages */
+ 	iter->type |= ITER_BVEC_FLAG_NO_REF;
diff --git a/queue-5.2/io_uring-ensure-list-is-initialized-for-poll-commands.patch b/queue-5.2/io_uring-ensure-list-is-initialized-for-poll-commands.patch
new file mode 100644
index 00000000000..086597fb13b
--- /dev/null
+++ b/queue-5.2/io_uring-ensure-list-is-initialized-for-poll-commands.patch
@@ -0,0 +1,34 @@
+From 36703247d5f52a679df9da51192b6950fe81689f Mon Sep 17 00:00:00 2001
+From: Jens Axboe
+Date: Thu, 25 Jul 2019 10:20:18 -0600
+Subject: io_uring: ensure ->list is initialized for poll commands
+
+From: Jens Axboe
+
+commit 36703247d5f52a679df9da51192b6950fe81689f upstream.
+
+Daniel reports that an http server which uses io_uring to poll for
+incoming connections sometimes hard crashes. This is due to an
+uninitialized list member in the io_uring request. Normally this path
+doesn't trigger, which is why none of the test cases caught it.
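+
+The failure mode is easy to demonstrate in plain C. A minimal
+userspace sketch of the list primitives involved (illustrative, not
+the kernel sources) shows why an uninitialized node is fatal on
+unlink:
+
+    #include <stdio.h>
+
+    struct list_head {
+        struct list_head *next, *prev;
+    };
+
+    static void INIT_LIST_HEAD(struct list_head *list)
+    {
+        list->next = list;
+        list->prev = list;
+    }
+
+    static void list_del(struct list_head *entry)
+    {
+        /* chases wild pointers if entry was never initialized */
+        entry->next->prev = entry->prev;
+        entry->prev->next = entry->next;
+    }
+
+    int main(void)
+    {
+        struct list_head req_list;  /* garbage, like the poll req's ->list */
+
+        INIT_LIST_HEAD(&req_list);  /* the fix: a self-linked empty node */
+        list_del(&req_list);        /* now a harmless self-unlink */
+        printf("unlinking an initialized empty node is safe\n");
+        return 0;
+    }
+
+With the INIT_LIST_HEAD() call removed, list_del() dereferences
+whatever happened to be in that memory, matching the hard crash seen
+on the poll path.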
+
+Reported-by: Daniel Kozak
+Tested-by: Daniel Kozak
+Signed-off-by: Jens Axboe
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ fs/io_uring.c |    2 ++
+ 1 file changed, 2 insertions(+)
+
+--- a/fs/io_uring.c
++++ b/fs/io_uring.c
+@@ -1489,6 +1489,8 @@ static int io_poll_add(struct io_kiocb *
+ 	INIT_LIST_HEAD(&poll->wait.entry);
+ 	init_waitqueue_func_entry(&poll->wait, io_poll_wake);
+ 
++	INIT_LIST_HEAD(&req->list);
++
+ 	mask = vfs_poll(poll->file, &ipt.pt) & poll->events;
+ 
+ 	spin_lock_irq(&ctx->completion_lock);
diff --git a/queue-5.2/io_uring-fix-counter-inc-dec-mismatch-in-async_list.patch b/queue-5.2/io_uring-fix-counter-inc-dec-mismatch-in-async_list.patch
new file mode 100644
index 00000000000..161ff8c90dc
--- /dev/null
+++ b/queue-5.2/io_uring-fix-counter-inc-dec-mismatch-in-async_list.patch
@@ -0,0 +1,56 @@
+From f7b76ac9d17e16e44feebb6d2749fec92bfd6dd4 Mon Sep 17 00:00:00 2001
+From: Zhengyuan Liu
+Date: Tue, 16 Jul 2019 23:26:14 +0800
+Subject: io_uring: fix counter inc/dec mismatch in async_list
+
+From: Zhengyuan Liu
+
+commit f7b76ac9d17e16e44feebb6d2749fec92bfd6dd4 upstream.
+
+A work item can be queued for each req on the defer and link lists
+without increasing async_list->cnt, so we shouldn't decrease the
+counter when exiting the workqueue either, if the req wasn't processed
+from the async list.
+
+Thanks to Jens Axboe for his guidance.
+
+Fixes: 31b515106428 ("io_uring: allow workqueue item to handle multiple buffered requests")
+Signed-off-by: Zhengyuan Liu
+Signed-off-by: Jens Axboe
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ fs/io_uring.c |    8 ++++++++
+ 1 file changed, 8 insertions(+)
+
+--- a/fs/io_uring.c
++++ b/fs/io_uring.c
+@@ -331,6 +331,9 @@ struct io_kiocb {
+ #define REQ_F_SEQ_PREV		8	/* sequential with previous */
+ #define REQ_F_IO_DRAIN		16	/* drain existing IO first */
+ #define REQ_F_IO_DRAINED	32	/* drain done */
++#define REQ_F_LINK		64	/* linked sqes */
++#define REQ_F_LINK_DONE		128	/* linked sqes done */
++#define REQ_F_FAIL_LINK		256	/* fail rest of links */
+ 	u64			user_data;
+ 	u32			error;	/* iopoll result from callback */
+ 	u32			sequence;
+@@ -1698,6 +1701,10 @@ restart:
+ 		/* async context always use a copy of the sqe */
+ 		kfree(sqe);
+ 
++		/* req from defer and link list needn't decrease async cnt */
++		if (req->flags & (REQ_F_IO_DRAINED | REQ_F_LINK_DONE))
++			goto out;
++
+ 		if (!async_list)
+ 			break;
+ 		if (!list_empty(&req_list)) {
+@@ -1745,6 +1752,7 @@ restart:
+ 		}
+ 	}
+ 
++out:
+ 	if (cur_mm) {
+ 		set_fs(old_fs);
+ 		unuse_mm(cur_mm);
diff --git a/queue-5.2/series b/queue-5.2/series
index 9d582aae157..d491ba35635 100644
--- a/queue-5.2/series
+++ b/queue-5.2/series
@@ -209,3 +209,7 @@ libnvdimm-bus-stop-holding-nvdimm_bus_list_mutex-over-__nd_ioctl.patch
 structleak-disable-structleak_byref-in-combination-with-kasan_stack.patch
 drm-i915-make-the-semaphore-saturation-mask-global.patch
 access-avoid-the-rcu-grace-period-for-the-temporary-subjective-credentials.patch
+io_uring-add-a-memory-barrier-before-atomic_read.patch
+io_uring-ensure-list-is-initialized-for-poll-commands.patch
+io_uring-fix-counter-inc-dec-mismatch-in-async_list.patch
+io_uring-don-t-use-iov_iter_advance-for-fixed-buffers.patch
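
The counter protocol behind the async_list fix above can be summarized
in a few lines of C. In this userspace sketch the names are
illustrative: REQ_F_VIA_ASYNC_LIST stands in for the
REQ_F_IO_DRAINED | REQ_F_LINK_DONE test in the patch, and only
requests that incremented the counter on the way in may decrement it
on the way out:

    #include <assert.h>
    #include <stdatomic.h>

    #define REQ_F_VIA_ASYNC_LIST 1

    struct req { unsigned flags; };
    static atomic_int cnt;              /* stand-in for async_list->cnt */

    static void queue_via_async_list(struct req *r)
    {
        r->flags |= REQ_F_VIA_ASYNC_LIST;
        atomic_fetch_add(&cnt, 1);
    }

    static void queue_deferred_or_linked(struct req *r)
    {
        /* deferred/linked reqs reach the worker without touching cnt */
        r->flags &= ~REQ_F_VIA_ASYNC_LIST;
    }

    static void worker_retire(struct req *r)
    {
        /* the fix: skip the decrement for reqs that never incremented */
        if (!(r->flags & REQ_F_VIA_ASYNC_LIST))
            return;
        int old = atomic_fetch_sub(&cnt, 1);
        assert(old > 0);        /* would fire on the pre-fix mismatch */
        (void)old;
    }

Pairing every decrement with a matching increment keeps the counter
from going negative and keeps the submitter/worker handoff consistent.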