git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
5.2-stable patches
authorGreg Kroah-Hartman <gregkh@linuxfoundation.org>
Mon, 29 Jul 2019 18:57:52 +0000 (20:57 +0200)
committerGreg Kroah-Hartman <gregkh@linuxfoundation.org>
Mon, 29 Jul 2019 18:57:52 +0000 (20:57 +0200)
added patches:
io_uring-add-a-memory-barrier-before-atomic_read.patch
io_uring-don-t-use-iov_iter_advance-for-fixed-buffers.patch
io_uring-ensure-list-is-initialized-for-poll-commands.patch
io_uring-fix-counter-inc-dec-mismatch-in-async_list.patch

queue-5.2/io_uring-add-a-memory-barrier-before-atomic_read.patch [new file with mode: 0644]
queue-5.2/io_uring-don-t-use-iov_iter_advance-for-fixed-buffers.patch [new file with mode: 0644]
queue-5.2/io_uring-ensure-list-is-initialized-for-poll-commands.patch [new file with mode: 0644]
queue-5.2/io_uring-fix-counter-inc-dec-mismatch-in-async_list.patch [new file with mode: 0644]
queue-5.2/series

diff --git a/queue-5.2/io_uring-add-a-memory-barrier-before-atomic_read.patch b/queue-5.2/io_uring-add-a-memory-barrier-before-atomic_read.patch
new file mode 100644 (file)
index 0000000..ff03082
--- /dev/null
@@ -0,0 +1,93 @@
+From c0e48f9dea9129aa11bec3ed13803bcc26e96e49 Mon Sep 17 00:00:00 2001
+From: Zhengyuan Liu <liuzhengyuan@kylinos.cn>
+Date: Thu, 18 Jul 2019 20:44:00 +0800
+Subject: io_uring: add a memory barrier before atomic_read
+
+From: Zhengyuan Liu <liuzhengyuan@kylinos.cn>
+
+commit c0e48f9dea9129aa11bec3ed13803bcc26e96e49 upstream.
+
+There is a hang issue while using fio to do some basic test. The issue
+can be easily reproduced using the below script:
+
+        while true
+        do
+                fio  --ioengine=io_uring  -rw=write -bs=4k -numjobs=1 \
+                     -size=1G -iodepth=64 -name=uring   --filename=/dev/zero
+        done
+
+After several minutes (or more), fio would block at
+io_uring_enter->io_cqring_wait in order to wait for previously
+committed sqes to be completed and can't return to user anymore until
+we send a SIGTERM to fio. After receiving SIGTERM, fio hangs at
+io_ring_ctx_wait_and_kill with a backtrace like this:
+
+        [54133.243816] Call Trace:
+        [54133.243842]  __schedule+0x3a0/0x790
+        [54133.243868]  schedule+0x38/0xa0
+        [54133.243880]  schedule_timeout+0x218/0x3b0
+        [54133.243891]  ? sched_clock+0x9/0x10
+        [54133.243903]  ? wait_for_completion+0xa3/0x130
+        [54133.243916]  ? _raw_spin_unlock_irq+0x2c/0x40
+        [54133.243930]  ? trace_hardirqs_on+0x3f/0xe0
+        [54133.243951]  wait_for_completion+0xab/0x130
+        [54133.243962]  ? wake_up_q+0x70/0x70
+        [54133.243984]  io_ring_ctx_wait_and_kill+0xa0/0x1d0
+        [54133.243998]  io_uring_release+0x20/0x30
+        [54133.244008]  __fput+0xcf/0x270
+        [54133.244029]  ____fput+0xe/0x10
+        [54133.244040]  task_work_run+0x7f/0xa0
+        [54133.244056]  do_exit+0x305/0xc40
+        [54133.244067]  ? get_signal+0x13b/0xbd0
+        [54133.244088]  do_group_exit+0x50/0xd0
+        [54133.244103]  get_signal+0x18d/0xbd0
+        [54133.244112]  ? _raw_spin_unlock_irqrestore+0x36/0x60
+        [54133.244142]  do_signal+0x34/0x720
+        [54133.244171]  ? exit_to_usermode_loop+0x7e/0x130
+        [54133.244190]  exit_to_usermode_loop+0xc0/0x130
+        [54133.244209]  do_syscall_64+0x16b/0x1d0
+        [54133.244221]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
+
+The reason is that we had added a req to ctx->pending_async at the very
+end, but it didn't get a chance to be processed. How could this happen?
+
+        fio#cpu0                                        wq#cpu1
+
+        io_add_to_prev_work                    io_sq_wq_submit_work
+
+          atomic_read() <<< 1
+
+                                                  atomic_dec_return() << 1->0
+                                                  list_empty();    <<< true;
+
+          list_add_tail()
+          atomic_read() << 0 or 1?
+
+As atomic_ops.rst states, atomic_read does not guarantee that the
+runtime modification by any other thread is visible yet, so we must take
+care of that with a proper implicit or explicit memory barrier.
+
+This issue was detected with the help of Jackie Liu <liuyun01@kylinos.cn>.
+
+Fixes: 31b515106428 ("io_uring: allow workqueue item to handle multiple buffered requests")
+Signed-off-by: Zhengyuan Liu <liuzhengyuan@kylinos.cn>
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/io_uring.c |    4 ++++
+ 1 file changed, 4 insertions(+)
+
+--- a/fs/io_uring.c
++++ b/fs/io_uring.c
+@@ -1769,6 +1769,10 @@ static bool io_add_to_prev_work(struct a
+       ret = true;
+       spin_lock(&list->lock);
+       list_add_tail(&req->list, &list->list);
++      /*
++       * Ensure we see a simultaneous modification from io_sq_wq_submit_work()
++       */
++      smp_mb();
+       if (!atomic_read(&list->cnt)) {
+               list_del_init(&req->list);
+               ret = false;
diff --git a/queue-5.2/io_uring-don-t-use-iov_iter_advance-for-fixed-buffers.patch b/queue-5.2/io_uring-don-t-use-iov_iter_advance-for-fixed-buffers.patch
new file mode 100644 (file)
index 0000000..21f4a78
--- /dev/null
@@ -0,0 +1,92 @@
+From bd11b3a391e3df6fa958facbe4b3f9f4cca9bd49 Mon Sep 17 00:00:00 2001
+From: Jens Axboe <axboe@kernel.dk>
+Date: Sat, 20 Jul 2019 08:37:31 -0600
+Subject: io_uring: don't use iov_iter_advance() for fixed buffers
+
+From: Jens Axboe <axboe@kernel.dk>
+
+commit bd11b3a391e3df6fa958facbe4b3f9f4cca9bd49 upstream.
+
+Hrvoje reports that when a large fixed buffer is registered and IO is
+being done to the latter pages of said buffer, the IO submission time
+is much worse:
+
+reading to the start of the buffer: 11238 ns
+reading to the end of the buffer:   1039879 ns
+
+In fact, it's worse by two orders of magnitude. The reason for that is
+how io_uring figures out how to setup the iov_iter. We point the iter
+at the first bvec, and then use iov_iter_advance() to fast-forward to
+the offset within that buffer we need.
+
+However, that is abysmally slow, as it entails iterating the bvecs
+that we setup as part of buffer registration. There's really no need
+to use this generic helper, as we know it's a BVEC type iterator, and
+we also know that each bvec is PAGE_SIZE in size, apart from possibly
+the first and last. Hence we can just use a shift on the offset to
+find the right index, and then adjust the iov_iter appropriately.
+After this fix, the timings are:
+
+reading to the start of the buffer: 10135 ns
+reading to the end of the buffer:   1377 ns
+
+Or about a 755x improvement for the tail page.
+
+Reported-by: Hrvoje Zeba <zeba.hrvoje@gmail.com>
+Tested-by: Hrvoje Zeba <zeba.hrvoje@gmail.com>
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+
+---
+ fs/io_uring.c |   39 +++++++++++++++++++++++++++++++++++++--
+ 1 file changed, 37 insertions(+), 2 deletions(-)
+
+--- a/fs/io_uring.c
++++ b/fs/io_uring.c
+@@ -1001,8 +1001,43 @@ static int io_import_fixed(struct io_rin
+        */
+       offset = buf_addr - imu->ubuf;
+       iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len);
+-      if (offset)
+-              iov_iter_advance(iter, offset);
++
++      if (offset) {
++              /*
++               * Don't use iov_iter_advance() here, as it's really slow for
++               * using the latter parts of a big fixed buffer - it iterates
++               * over each segment manually. We can cheat a bit here, because
++               * we know that:
++               *
++               * 1) it's a BVEC iter, we set it up
++               * 2) all bvecs are PAGE_SIZE in size, except potentially the
++               *    first and last bvec
++               *
++               * So just find our index, and adjust the iterator afterwards.
++               * If the offset is within the first bvec (or the whole first
++               * bvec), just use iov_iter_advance(). This makes it easier
++               * since we can just skip the first segment, which may not
++               * be PAGE_SIZE aligned.
++               */
++              const struct bio_vec *bvec = imu->bvec;
++
++              if (offset <= bvec->bv_len) {
++                      iov_iter_advance(iter, offset);
++              } else {
++                      unsigned long seg_skip;
++
++                      /* skip first vec */
++                      offset -= bvec->bv_len;
++                      seg_skip = 1 + (offset >> PAGE_SHIFT);
++
++                      iter->bvec = bvec + seg_skip;
++                      iter->nr_segs -= seg_skip;
++                      iter->count -= (seg_skip << PAGE_SHIFT);
++                      iter->iov_offset = offset & ~PAGE_MASK;
++                      if (iter->iov_offset)
++                              iter->count -= iter->iov_offset;
++              }
++      }
+       /* don't drop a reference to these pages */
+       iter->type |= ITER_BVEC_FLAG_NO_REF;
diff --git a/queue-5.2/io_uring-ensure-list-is-initialized-for-poll-commands.patch b/queue-5.2/io_uring-ensure-list-is-initialized-for-poll-commands.patch
new file mode 100644 (file)
index 0000000..086597f
--- /dev/null
@@ -0,0 +1,34 @@
+From 36703247d5f52a679df9da51192b6950fe81689f Mon Sep 17 00:00:00 2001
+From: Jens Axboe <axboe@kernel.dk>
+Date: Thu, 25 Jul 2019 10:20:18 -0600
+Subject: io_uring: ensure ->list is initialized for poll commands
+
+From: Jens Axboe <axboe@kernel.dk>
+
+commit 36703247d5f52a679df9da51192b6950fe81689f upstream.
+
+Daniel reports that when testing an http server that uses io_uring
+to poll for incoming connections, sometimes it hard crashes. This is
+due to an uninitialized list member for the io_uring request. Normally
+this doesn't trigger and none of the test cases caught it.
+
+Reported-by: Daniel Kozak <kozzi11@gmail.com>
+Tested-by: Daniel Kozak <kozzi11@gmail.com>
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/io_uring.c |    2 ++
+ 1 file changed, 2 insertions(+)
+
+--- a/fs/io_uring.c
++++ b/fs/io_uring.c
+@@ -1489,6 +1489,8 @@ static int io_poll_add(struct io_kiocb *
+       INIT_LIST_HEAD(&poll->wait.entry);
+       init_waitqueue_func_entry(&poll->wait, io_poll_wake);
++      INIT_LIST_HEAD(&req->list);
++
+       mask = vfs_poll(poll->file, &ipt.pt) & poll->events;
+       spin_lock_irq(&ctx->completion_lock);
diff --git a/queue-5.2/io_uring-fix-counter-inc-dec-mismatch-in-async_list.patch b/queue-5.2/io_uring-fix-counter-inc-dec-mismatch-in-async_list.patch
new file mode 100644 (file)
index 0000000..161ff8c
--- /dev/null
@@ -0,0 +1,56 @@
+From f7b76ac9d17e16e44feebb6d2749fec92bfd6dd4 Mon Sep 17 00:00:00 2001
+From: Zhengyuan Liu <liuzhengyuan@kylinos.cn>
+Date: Tue, 16 Jul 2019 23:26:14 +0800
+Subject: io_uring: fix counter inc/dec mismatch in async_list
+
+From: Zhengyuan Liu <liuzhengyuan@kylinos.cn>
+
+commit f7b76ac9d17e16e44feebb6d2749fec92bfd6dd4 upstream.
+
+We could queue a work for each req in defer and link list without
+increasing async_list->cnt, so we shouldn't decrease it while exiting
+from workqueue as well if we didn't process the req in async list.
+
+Thanks to Jens Axboe <axboe@kernel.dk> for his guidance.
+
+Fixes: 31b515106428 ("io_uring: allow workqueue item to handle multiple buffered requests")
+Signed-off-by: Zhengyuan Liu <liuzhengyuan@kylinos.cn>
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+
+---
+ fs/io_uring.c |    8 ++++++++
+ 1 file changed, 8 insertions(+)
+
+--- a/fs/io_uring.c
++++ b/fs/io_uring.c
+@@ -331,6 +331,9 @@ struct io_kiocb {
+ #define REQ_F_SEQ_PREV                8       /* sequential with previous */
+ #define REQ_F_IO_DRAIN                16      /* drain existing IO first */
+ #define REQ_F_IO_DRAINED      32      /* drain done */
++#define REQ_F_LINK            64      /* linked sqes */
++#define REQ_F_LINK_DONE               128     /* linked sqes done */
++#define REQ_F_FAIL_LINK               256     /* fail rest of links */
+       u64                     user_data;
+       u32                     error;  /* iopoll result from callback */
+       u32                     sequence;
+@@ -1698,6 +1701,10 @@ restart:
+               /* async context always use a copy of the sqe */
+               kfree(sqe);
++              /* req from defer and link list needn't decrease async cnt */
++              if (req->flags & (REQ_F_IO_DRAINED | REQ_F_LINK_DONE))
++                      goto out;
++
+               if (!async_list)
+                       break;
+               if (!list_empty(&req_list)) {
+@@ -1745,6 +1752,7 @@ restart:
+               }
+       }
++out:
+       if (cur_mm) {
+               set_fs(old_fs);
+               unuse_mm(cur_mm);
index 9d582aae1573a4259912ad76fc24211144d08d87..d491ba35635e1723c550e971603dcccea2c0024f 100644 (file)
@@ -209,3 +209,7 @@ libnvdimm-bus-stop-holding-nvdimm_bus_list_mutex-over-__nd_ioctl.patch
 structleak-disable-structleak_byref-in-combination-with-kasan_stack.patch
 drm-i915-make-the-semaphore-saturation-mask-global.patch
 access-avoid-the-rcu-grace-period-for-the-temporary-subjective-credentials.patch
+io_uring-add-a-memory-barrier-before-atomic_read.patch
+io_uring-ensure-list-is-initialized-for-poll-commands.patch
+io_uring-fix-counter-inc-dec-mismatch-in-async_list.patch
+io_uring-don-t-use-iov_iter_advance-for-fixed-buffers.patch