--- /dev/null
+From 875736bb3f3ded168469f6a14df7a938416a99d5 Mon Sep 17 00:00:00 2001
+From: Jens Axboe <axboe@kernel.dk>
+Date: Tue, 20 Nov 2018 20:06:23 -0700
+Subject: aio: abstract out io_event filler helper
+
+From: Jens Axboe <axboe@kernel.dk>
+
+commit 875736bb3f3ded168469f6a14df7a938416a99d5 upstream.
+
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Cc: Guenter Roeck <linux@roeck-us.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/aio.c | 14 ++++++++++----
+ 1 file changed, 10 insertions(+), 4 deletions(-)
+
+--- a/fs/aio.c
++++ b/fs/aio.c
+@@ -1059,6 +1059,15 @@ static inline void iocb_put(struct aio_k
+ }
+ }
+
++static void aio_fill_event(struct io_event *ev, struct aio_kiocb *iocb,
++ long res, long res2)
++{
++ ev->obj = (u64)(unsigned long)iocb->ki_user_iocb;
++ ev->data = iocb->ki_user_data;
++ ev->res = res;
++ ev->res2 = res2;
++}
++
+ /* aio_complete
+ * Called when the io request on the given iocb is complete.
+ */
+@@ -1086,10 +1095,7 @@ static void aio_complete(struct aio_kioc
+ ev_page = kmap_atomic(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]);
+ event = ev_page + pos % AIO_EVENTS_PER_PAGE;
+
+- event->obj = (u64)(unsigned long)iocb->ki_user_iocb;
+- event->data = iocb->ki_user_data;
+- event->res = res;
+- event->res2 = res2;
++ aio_fill_event(event, iocb, res, res2);
+
+ kunmap_atomic(ev_page);
+ flush_dcache_page(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]);
--- /dev/null
+From 154989e45fd8de9bfb52bbd6e5ea763e437e54c5 Mon Sep 17 00:00:00 2001
+From: Christoph Hellwig <hch@lst.de>
+Date: Thu, 22 Nov 2018 16:44:07 +0100
+Subject: aio: clear IOCB_HIPRI
+
+From: Christoph Hellwig <hch@lst.de>
+
+commit 154989e45fd8de9bfb52bbd6e5ea763e437e54c5 upstream.
+
+No one is going to poll for aio (yet), so we must clear the HIPRI
+flag, as we would otherwise send it down the poll queues, where no
+one will be polling for completions.
+
+Signed-off-by: Christoph Hellwig <hch@lst.de>
+
+IOCB_HIPRI, not RWF_HIPRI.
+
+Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Cc: Guenter Roeck <linux@roeck-us.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/aio.c | 11 ++++++++---
+ 1 file changed, 8 insertions(+), 3 deletions(-)
+
+--- a/fs/aio.c
++++ b/fs/aio.c
+@@ -1438,8 +1438,7 @@ static int aio_prep_rw(struct kiocb *req
+ ret = ioprio_check_cap(iocb->aio_reqprio);
+ if (ret) {
+ pr_debug("aio ioprio check cap error: %d\n", ret);
+- fput(req->ki_filp);
+- return ret;
++ goto out_fput;
+ }
+
+ req->ki_ioprio = iocb->aio_reqprio;
+@@ -1448,7 +1447,13 @@ static int aio_prep_rw(struct kiocb *req
+
+ ret = kiocb_set_rw_flags(req, iocb->aio_rw_flags);
+ if (unlikely(ret))
+- fput(req->ki_filp);
++ goto out_fput;
++
++ req->ki_flags &= ~IOCB_HIPRI; /* no one is going to poll for this I/O */
++ return 0;
++
++out_fput:
++ fput(req->ki_filp);
+ return ret;
+ }
+
--- /dev/null
+From 2bc4ca9bb600cbe36941da2b2a67189fc4302a04 Mon Sep 17 00:00:00 2001
+From: Jens Axboe <axboe@kernel.dk>
+Date: Tue, 4 Dec 2018 09:44:49 -0700
+Subject: aio: don't zero entire aio_kiocb aio_get_req()
+
+From: Jens Axboe <axboe@kernel.dk>
+
+commit 2bc4ca9bb600cbe36941da2b2a67189fc4302a04 upstream.
+
+It's 192 bytes, fairly substantial. Most items don't need to be cleared,
+especially not upfront. Clear the ones we do need to clear, and leave
+the other ones for setup when the iocb is prepared and submitted.
+
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Cc: Guenter Roeck <linux@roeck-us.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/aio.c | 9 +++++++--
+ 1 file changed, 7 insertions(+), 2 deletions(-)
+
+--- a/fs/aio.c
++++ b/fs/aio.c
+@@ -1010,14 +1010,15 @@ static inline struct aio_kiocb *aio_get_
+ {
+ struct aio_kiocb *req;
+
+- req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL|__GFP_ZERO);
++ req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL);
+ if (unlikely(!req))
+ return NULL;
+
+ percpu_ref_get(&ctx->reqs);
++ req->ki_ctx = ctx;
+ INIT_LIST_HEAD(&req->ki_list);
+ refcount_set(&req->ki_refcnt, 0);
+- req->ki_ctx = ctx;
++ req->ki_eventfd = NULL;
+ return req;
+ }
+
+@@ -1738,6 +1739,10 @@ static ssize_t aio_poll(struct aio_kiocb
+ if (unlikely(!req->file))
+ return -EBADF;
+
++ req->head = NULL;
++ req->woken = false;
++ req->cancelled = false;
++
+ apt.pt._qproc = aio_poll_queue_proc;
+ apt.pt._key = req->events;
+ apt.iocb = aiocb;
--- /dev/null
+From 833f4154ed560232120bc475935ee1d6a20e159f Mon Sep 17 00:00:00 2001
+From: Al Viro <viro@zeniv.linux.org.uk>
+Date: Mon, 11 Mar 2019 19:00:36 -0400
+Subject: aio: fold lookup_kiocb() into its sole caller
+
+From: Al Viro <viro@zeniv.linux.org.uk>
+
+commit 833f4154ed560232120bc475935ee1d6a20e159f upstream.
+
+Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
+Cc: Guenter Roeck <linux@roeck-us.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/aio.c | 29 +++++++----------------------
+ 1 file changed, 7 insertions(+), 22 deletions(-)
+
+--- a/fs/aio.c
++++ b/fs/aio.c
+@@ -1992,24 +1992,6 @@ COMPAT_SYSCALL_DEFINE3(io_submit, compat
+ }
+ #endif
+
+-/* lookup_kiocb
+- * Finds a given iocb for cancellation.
+- */
+-static struct aio_kiocb *
+-lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb)
+-{
+- struct aio_kiocb *kiocb;
+-
+- assert_spin_locked(&ctx->ctx_lock);
+-
+- /* TODO: use a hash or array, this sucks. */
+- list_for_each_entry(kiocb, &ctx->active_reqs, ki_list) {
+- if (kiocb->ki_user_iocb == iocb)
+- return kiocb;
+- }
+- return NULL;
+-}
+-
+ /* sys_io_cancel:
+ * Attempts to cancel an iocb previously passed to io_submit. If
+ * the operation is successfully cancelled, the resulting event is
+@@ -2038,10 +2020,13 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t
+ return -EINVAL;
+
+ spin_lock_irq(&ctx->ctx_lock);
+- kiocb = lookup_kiocb(ctx, iocb);
+- if (kiocb) {
+- ret = kiocb->ki_cancel(&kiocb->rw);
+- list_del_init(&kiocb->ki_list);
++ /* TODO: use a hash or array, this sucks. */
++ list_for_each_entry(kiocb, &ctx->active_reqs, ki_list) {
++ if (kiocb->ki_user_iocb == iocb) {
++ ret = kiocb->ki_cancel(&kiocb->rw);
++ list_del_init(&kiocb->ki_list);
++ break;
++ }
+ }
+ spin_unlock_irq(&ctx->ctx_lock);
+
--- /dev/null
+From ec51f8ee1e63498e9f521ec0e5a6d04622bb2c67 Mon Sep 17 00:00:00 2001
+From: Mike Marshall <hubcap@omnibond.com>
+Date: Tue, 5 Feb 2019 14:13:35 -0500
+Subject: aio: initialize kiocb private in case any filesystems expect it.
+
+From: Mike Marshall <hubcap@omnibond.com>
+
+commit ec51f8ee1e63498e9f521ec0e5a6d04622bb2c67 upstream.
+
+A recent optimization had left private uninitialized.
+
+Fixes: 2bc4ca9bb600 ("aio: don't zero entire aio_kiocb aio_get_req()")
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Signed-off-by: Mike Marshall <hubcap@omnibond.com>
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Cc: Guenter Roeck <linux@roeck-us.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/aio.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/fs/aio.c
++++ b/fs/aio.c
+@@ -1430,6 +1430,7 @@ static int aio_prep_rw(struct kiocb *req
+ if (unlikely(!req->ki_filp))
+ return -EBADF;
+ req->ki_complete = aio_complete_rw;
++ req->private = NULL;
+ req->ki_pos = iocb->aio_offset;
+ req->ki_flags = iocb_flags(req->ki_filp);
+ if (iocb->aio_flags & IOCB_FLAG_RESFD)
--- /dev/null
+From a9339b7855094ba11a97e8822ae038135e879e79 Mon Sep 17 00:00:00 2001
+From: Al Viro <viro@zeniv.linux.org.uk>
+Date: Thu, 7 Mar 2019 19:43:45 -0500
+Subject: aio: keep io_event in aio_kiocb
+
+From: Al Viro <viro@zeniv.linux.org.uk>
+
+commit a9339b7855094ba11a97e8822ae038135e879e79 upstream.
+
+We want to separate forming the resulting io_event from putting it
+into the ring buffer.
+
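+For reference, ki_res holds the user-visible completion record; the
+io_event layout from include/uapi/linux/aio_abi.h is roughly:
+
+	struct io_event {
+		__u64	data;	/* the data field from the iocb */
+		__u64	obj;	/* what iocb this event came from */
+		__s64	res;	/* result code for this event */
+		__s64	res2;	/* secondary result */
+	};
+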
+Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
+Cc: Guenter Roeck <linux@roeck-us.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/aio.c | 31 +++++++++++++------------------
+ 1 file changed, 13 insertions(+), 18 deletions(-)
+
+--- a/fs/aio.c
++++ b/fs/aio.c
+@@ -198,8 +198,7 @@ struct aio_kiocb {
+ struct kioctx *ki_ctx;
+ kiocb_cancel_fn *ki_cancel;
+
+- struct iocb __user *ki_user_iocb; /* user's aiocb */
+- __u64 ki_user_data; /* user's data for completion */
++ struct io_event ki_res;
+
+ struct list_head ki_list; /* the aio core uses this
+ * for cancellation */
+@@ -1078,15 +1077,6 @@ static inline void iocb_put(struct aio_k
+ iocb_destroy(iocb);
+ }
+
+-static void aio_fill_event(struct io_event *ev, struct aio_kiocb *iocb,
+- long res, long res2)
+-{
+- ev->obj = (u64)(unsigned long)iocb->ki_user_iocb;
+- ev->data = iocb->ki_user_data;
+- ev->res = res;
+- ev->res2 = res2;
+-}
+-
+ /* aio_complete
+ * Called when the io request on the given iocb is complete.
+ */
+@@ -1098,6 +1088,8 @@ static void aio_complete(struct aio_kioc
+ unsigned tail, pos, head;
+ unsigned long flags;
+
++ iocb->ki_res.res = res;
++ iocb->ki_res.res2 = res2;
+ /*
+ * Add a completion event to the ring buffer. Must be done holding
+ * ctx->completion_lock to prevent other code from messing with the tail
+@@ -1114,14 +1106,14 @@ static void aio_complete(struct aio_kioc
+ ev_page = kmap_atomic(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]);
+ event = ev_page + pos % AIO_EVENTS_PER_PAGE;
+
+- aio_fill_event(event, iocb, res, res2);
++ *event = iocb->ki_res;
+
+ kunmap_atomic(ev_page);
+ flush_dcache_page(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]);
+
+- pr_debug("%p[%u]: %p: %p %Lx %lx %lx\n",
+- ctx, tail, iocb, iocb->ki_user_iocb, iocb->ki_user_data,
+- res, res2);
++ pr_debug("%p[%u]: %p: %p %Lx %Lx %Lx\n", ctx, tail, iocb,
++ (void __user *)(unsigned long)iocb->ki_res.obj,
++ iocb->ki_res.data, iocb->ki_res.res, iocb->ki_res.res2);
+
+ /* after flagging the request as done, we
+ * must never even look at it again
+@@ -1838,8 +1830,10 @@ static int __io_submit_one(struct kioctx
+ goto out_put_req;
+ }
+
+- req->ki_user_iocb = user_iocb;
+- req->ki_user_data = iocb->aio_data;
++ req->ki_res.obj = (u64)(unsigned long)user_iocb;
++ req->ki_res.data = iocb->aio_data;
++ req->ki_res.res = 0;
++ req->ki_res.res2 = 0;
+
+ switch (iocb->aio_lio_opcode) {
+ case IOCB_CMD_PREAD:
+@@ -2009,6 +2003,7 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t
+ struct aio_kiocb *kiocb;
+ int ret = -EINVAL;
+ u32 key;
++ u64 obj = (u64)(unsigned long)iocb;
+
+ if (unlikely(get_user(key, &iocb->aio_key)))
+ return -EFAULT;
+@@ -2022,7 +2017,7 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t
+ spin_lock_irq(&ctx->ctx_lock);
+ /* TODO: use a hash or array, this sucks. */
+ list_for_each_entry(kiocb, &ctx->active_reqs, ki_list) {
+- if (kiocb->ki_user_iocb == iocb) {
++ if (kiocb->ki_res.obj == obj) {
+ ret = kiocb->ki_cancel(&kiocb->rw);
+ list_del_init(&kiocb->ki_list);
+ break;
--- /dev/null
+From 432c79978c33ecef91b1b04cea6936c20810da29 Mon Sep 17 00:00:00 2001
+From: Christoph Hellwig <hch@lst.de>
+Date: Mon, 19 Nov 2018 15:57:42 -0700
+Subject: aio: separate out ring reservation from req allocation
+
+From: Christoph Hellwig <hch@lst.de>
+
+commit 432c79978c33ecef91b1b04cea6936c20810da29 upstream.
+
+This is in preparation for certain types of IO not needing a ring
+reservation.
+
+Signed-off-by: Christoph Hellwig <hch@lst.de>
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Cc: Guenter Roeck <linux@roeck-us.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/aio.c | 30 +++++++++++++++++-------------
+ 1 file changed, 17 insertions(+), 13 deletions(-)
+
+--- a/fs/aio.c
++++ b/fs/aio.c
+@@ -902,7 +902,7 @@ static void put_reqs_available(struct ki
+ local_irq_restore(flags);
+ }
+
+-static bool get_reqs_available(struct kioctx *ctx)
++static bool __get_reqs_available(struct kioctx *ctx)
+ {
+ struct kioctx_cpu *kcpu;
+ bool ret = false;
+@@ -994,6 +994,14 @@ static void user_refill_reqs_available(s
+ spin_unlock_irq(&ctx->completion_lock);
+ }
+
++static bool get_reqs_available(struct kioctx *ctx)
++{
++ if (__get_reqs_available(ctx))
++ return true;
++ user_refill_reqs_available(ctx);
++ return __get_reqs_available(ctx);
++}
++
+ /* aio_get_req
+ * Allocate a slot for an aio request.
+ * Returns NULL if no requests are free.
+@@ -1002,24 +1010,15 @@ static inline struct aio_kiocb *aio_get_
+ {
+ struct aio_kiocb *req;
+
+- if (!get_reqs_available(ctx)) {
+- user_refill_reqs_available(ctx);
+- if (!get_reqs_available(ctx))
+- return NULL;
+- }
+-
+ req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL|__GFP_ZERO);
+ if (unlikely(!req))
+- goto out_put;
++ return NULL;
+
+ percpu_ref_get(&ctx->reqs);
+ INIT_LIST_HEAD(&req->ki_list);
+ refcount_set(&req->ki_refcnt, 0);
+ req->ki_ctx = ctx;
+ return req;
+-out_put:
+- put_reqs_available(ctx, 1);
+- return NULL;
+ }
+
+ static struct kioctx *lookup_ioctx(unsigned long ctx_id)
+@@ -1813,9 +1812,13 @@ static int io_submit_one(struct kioctx *
+ return -EINVAL;
+ }
+
++ if (!get_reqs_available(ctx))
++ return -EAGAIN;
++
++ ret = -EAGAIN;
+ req = aio_get_req(ctx);
+ if (unlikely(!req))
+- return -EAGAIN;
++ goto out_put_reqs_available;
+
+ if (iocb.aio_flags & IOCB_FLAG_RESFD) {
+ /*
+@@ -1878,11 +1881,12 @@ static int io_submit_one(struct kioctx *
+ goto out_put_req;
+ return 0;
+ out_put_req:
+- put_reqs_available(ctx, 1);
+ percpu_ref_put(&ctx->reqs);
+ if (req->ki_eventfd)
+ eventfd_ctx_put(req->ki_eventfd);
+ kmem_cache_free(kiocb_cachep, req);
++out_put_reqs_available:
++ put_reqs_available(ctx, 1);
+ return ret;
+ }
+
--- /dev/null
+From 84c4e1f89fefe70554da0ab33be72c9be7994379 Mon Sep 17 00:00:00 2001
+From: Linus Torvalds <torvalds@linux-foundation.org>
+Date: Sun, 3 Mar 2019 14:23:33 -0800
+Subject: aio: simplify - and fix - fget/fput for io_submit()
+
+From: Linus Torvalds <torvalds@linux-foundation.org>
+
+commit 84c4e1f89fefe70554da0ab33be72c9be7994379 upstream.
+
+Al Viro root-caused a race where the IOCB_CMD_POLL handling of
+fget/fput() could cause us to access the file pointer after it had
+already been freed:
+
+ "In more details - normally IOCB_CMD_POLL handling looks so:
+
+ 1) io_submit(2) allocates aio_kiocb instance and passes it to
+ aio_poll()
+
+ 2) aio_poll() resolves the descriptor to struct file by req->file =
+ fget(iocb->aio_fildes)
+
+ 3) aio_poll() sets ->woken to false and raises ->ki_refcnt of that
+ aio_kiocb to 2 (bumps by 1, that is).
+
+ 4) aio_poll() calls vfs_poll(). After sanity checks (basically,
+ "poll_wait() had been called and only once") it locks the queue.
+ That's what the extra reference to iocb had been for - we know we
+ can safely access it.
+
+ 5) With queue locked, we check if ->woken has already been set to
+ true (by aio_poll_wake()) and, if it had been, we unlock the
+ queue, drop a reference to aio_kiocb and bugger off - at that
+ point it's a responsibility to aio_poll_wake() and the stuff
+ called/scheduled by it. That code will drop the reference to file
+ in req->file, along with the other reference to our aio_kiocb.
+
+ 6) otherwise, we see whether we need to wait. If we do, we unlock the
+ queue, drop one reference to aio_kiocb and go away - eventual
+ wakeup (or cancel) will deal with the reference to file and with
+ the other reference to aio_kiocb
+
+ 7) otherwise we remove ourselves from waitqueue (still under the
+ queue lock), so that wakeup won't get us. No async activity will
+ be happening, so we can safely drop req->file and iocb ourselves.
+
+ If wakeup happens while we are in vfs_poll(), we are fine - aio_kiocb
+ won't get freed under us, so we can do all the checks and locking
+ safely. And we don't touch ->file if we detect that case.
+
+ However, vfs_poll() most certainly *does* touch the file it had been
+ given. So wakeup coming while we are still in ->poll() might end up
+ doing fput() on that file. That case is not too rare, and usually we
+ are saved by the still present reference from descriptor table - that
+ fput() is not the final one.
+
+ But if another thread closes that descriptor right after our fget()
+ and wakeup does happen before ->poll() returns, we are in trouble -
+ final fput() done while we are in the middle of a method:
+
+Al also wrote a patch to take an extra reference to the file descriptor
+to fix this, but I instead suggested we just streamline the whole file
+pointer handling in io_submit() so that the generic aio submission code
+simply keeps the file pointer around until the aio has completed.
+
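+A rough sketch of a reproducer for this kind of race (illustrative only;
+the window is narrow, so this shows the sequence rather than a reliable
+trigger):
+
+  #include <linux/aio_abi.h>
+  #include <poll.h>
+  #include <pthread.h>
+  #include <string.h>
+  #include <sys/syscall.h>
+  #include <unistd.h>
+
+  static int fds[2];
+
+  static void *closer(void *arg)
+  {
+          (void)arg;
+          close(fds[0]);          /* drop the descriptor-table reference */
+          write(fds[1], "x", 1);  /* wake the poll waitqueue */
+          return NULL;
+  }
+
+  int main(void)
+  {
+          aio_context_t ctx = 0;
+          struct iocb cb;
+          struct iocb *cbs[1] = { &cb };
+          pthread_t t;
+
+          pipe(fds);
+          syscall(__NR_io_setup, 1, &ctx);
+
+          memset(&cb, 0, sizeof(cb));
+          cb.aio_lio_opcode = IOCB_CMD_POLL;
+          cb.aio_fildes = fds[0];
+          cb.aio_buf = POLLIN;
+
+          pthread_create(&t, NULL, closer, NULL);
+          /* io_submit() races against close() + wakeup in the other thread */
+          syscall(__NR_io_submit, ctx, 1, cbs);
+          pthread_join(t, NULL);
+          return 0;
+  }
+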
+Fixes: bfe4037e722e ("aio: implement IOCB_CMD_POLL")
+Acked-by: Al Viro <viro@zeniv.linux.org.uk>
+Reported-by: syzbot+503d4cc169fcec1cb18c@syzkaller.appspotmail.com
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Guenter Roeck <linux@roeck-us.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/aio.c | 72 +++++++++++++++++++++--------------------------------
+ include/linux/fs.h | 8 +++++
+ 2 files changed, 36 insertions(+), 44 deletions(-)
+
+--- a/fs/aio.c
++++ b/fs/aio.c
+@@ -161,9 +161,13 @@ struct kioctx {
+ unsigned id;
+ };
+
++/*
++ * First field must be the file pointer in all the
++ * iocb unions! See also 'struct kiocb' in <linux/fs.h>
++ */
+ struct fsync_iocb {
+- struct work_struct work;
+ struct file *file;
++ struct work_struct work;
+ bool datasync;
+ };
+
+@@ -177,8 +181,15 @@ struct poll_iocb {
+ struct work_struct work;
+ };
+
++/*
++ * NOTE! Each of the iocb union members has the file pointer
++ * as the first entry in their struct definition. So you can
++ * access the file pointer through any of the sub-structs,
++ * or directly as just 'ki_filp' in this struct.
++ */
+ struct aio_kiocb {
+ union {
++ struct file *ki_filp;
+ struct kiocb rw;
+ struct fsync_iocb fsync;
+ struct poll_iocb poll;
+@@ -1054,6 +1065,8 @@ static inline void iocb_put(struct aio_k
+ {
+ if (refcount_read(&iocb->ki_refcnt) == 0 ||
+ refcount_dec_and_test(&iocb->ki_refcnt)) {
++ if (iocb->ki_filp)
++ fput(iocb->ki_filp);
+ percpu_ref_put(&iocb->ki_ctx->reqs);
+ kmem_cache_free(kiocb_cachep, iocb);
+ }
+@@ -1418,7 +1431,6 @@ static void aio_complete_rw(struct kiocb
+ file_end_write(kiocb->ki_filp);
+ }
+
+- fput(kiocb->ki_filp);
+ aio_complete(iocb, res, res2);
+ }
+
+@@ -1426,9 +1438,6 @@ static int aio_prep_rw(struct kiocb *req
+ {
+ int ret;
+
+- req->ki_filp = fget(iocb->aio_fildes);
+- if (unlikely(!req->ki_filp))
+- return -EBADF;
+ req->ki_complete = aio_complete_rw;
+ req->private = NULL;
+ req->ki_pos = iocb->aio_offset;
+@@ -1445,7 +1454,7 @@ static int aio_prep_rw(struct kiocb *req
+ ret = ioprio_check_cap(iocb->aio_reqprio);
+ if (ret) {
+ pr_debug("aio ioprio check cap error: %d\n", ret);
+- goto out_fput;
++ return ret;
+ }
+
+ req->ki_ioprio = iocb->aio_reqprio;
+@@ -1454,14 +1463,10 @@ static int aio_prep_rw(struct kiocb *req
+
+ ret = kiocb_set_rw_flags(req, iocb->aio_rw_flags);
+ if (unlikely(ret))
+- goto out_fput;
++ return ret;
+
+ req->ki_flags &= ~IOCB_HIPRI; /* no one is going to poll for this I/O */
+ return 0;
+-
+-out_fput:
+- fput(req->ki_filp);
+- return ret;
+ }
+
+ static int aio_setup_rw(int rw, const struct iocb *iocb, struct iovec **iovec,
+@@ -1515,24 +1520,19 @@ static ssize_t aio_read(struct kiocb *re
+ if (ret)
+ return ret;
+ file = req->ki_filp;
+-
+- ret = -EBADF;
+ if (unlikely(!(file->f_mode & FMODE_READ)))
+- goto out_fput;
++ return -EBADF;
+ ret = -EINVAL;
+ if (unlikely(!file->f_op->read_iter))
+- goto out_fput;
++ return -EINVAL;
+
+ ret = aio_setup_rw(READ, iocb, &iovec, vectored, compat, &iter);
+ if (ret)
+- goto out_fput;
++ return ret;
+ ret = rw_verify_area(READ, file, &req->ki_pos, iov_iter_count(&iter));
+ if (!ret)
+ aio_rw_done(req, call_read_iter(file, req, &iter));
+ kfree(iovec);
+-out_fput:
+- if (unlikely(ret))
+- fput(file);
+ return ret;
+ }
+
+@@ -1549,16 +1549,14 @@ static ssize_t aio_write(struct kiocb *r
+ return ret;
+ file = req->ki_filp;
+
+- ret = -EBADF;
+ if (unlikely(!(file->f_mode & FMODE_WRITE)))
+- goto out_fput;
+- ret = -EINVAL;
++ return -EBADF;
+ if (unlikely(!file->f_op->write_iter))
+- goto out_fput;
++ return -EINVAL;
+
+ ret = aio_setup_rw(WRITE, iocb, &iovec, vectored, compat, &iter);
+ if (ret)
+- goto out_fput;
++ return ret;
+ ret = rw_verify_area(WRITE, file, &req->ki_pos, iov_iter_count(&iter));
+ if (!ret) {
+ /*
+@@ -1576,9 +1574,6 @@ static ssize_t aio_write(struct kiocb *r
+ aio_rw_done(req, call_write_iter(file, req, &iter));
+ }
+ kfree(iovec);
+-out_fput:
+- if (unlikely(ret))
+- fput(file);
+ return ret;
+ }
+
+@@ -1588,7 +1583,6 @@ static void aio_fsync_work(struct work_s
+ int ret;
+
+ ret = vfs_fsync(req->file, req->datasync);
+- fput(req->file);
+ aio_complete(container_of(req, struct aio_kiocb, fsync), ret, 0);
+ }
+
+@@ -1599,13 +1593,8 @@ static int aio_fsync(struct fsync_iocb *
+ iocb->aio_rw_flags))
+ return -EINVAL;
+
+- req->file = fget(iocb->aio_fildes);
+- if (unlikely(!req->file))
+- return -EBADF;
+- if (unlikely(!req->file->f_op->fsync)) {
+- fput(req->file);
++ if (unlikely(!req->file->f_op->fsync))
+ return -EINVAL;
+- }
+
+ req->datasync = datasync;
+ INIT_WORK(&req->work, aio_fsync_work);
+@@ -1615,10 +1604,7 @@ static int aio_fsync(struct fsync_iocb *
+
+ static inline void aio_poll_complete(struct aio_kiocb *iocb, __poll_t mask)
+ {
+- struct file *file = iocb->poll.file;
+-
+ aio_complete(iocb, mangle_poll(mask), 0);
+- fput(file);
+ }
+
+ static void aio_poll_complete_work(struct work_struct *work)
+@@ -1743,9 +1729,6 @@ static ssize_t aio_poll(struct aio_kiocb
+
+ INIT_WORK(&req->work, aio_poll_complete_work);
+ req->events = demangle_poll(iocb->aio_buf) | EPOLLERR | EPOLLHUP;
+- req->file = fget(iocb->aio_fildes);
+- if (unlikely(!req->file))
+- return -EBADF;
+
+ req->head = NULL;
+ req->woken = false;
+@@ -1788,10 +1771,8 @@ static ssize_t aio_poll(struct aio_kiocb
+ spin_unlock_irq(&ctx->ctx_lock);
+
+ out:
+- if (unlikely(apt.error)) {
+- fput(req->file);
++ if (unlikely(apt.error))
+ return apt.error;
+- }
+
+ if (mask)
+ aio_poll_complete(aiocb, mask);
+@@ -1829,6 +1810,11 @@ static int __io_submit_one(struct kioctx
+ if (unlikely(!req))
+ goto out_put_reqs_available;
+
++ req->ki_filp = fget(iocb->aio_fildes);
++ ret = -EBADF;
++ if (unlikely(!req->ki_filp))
++ goto out_put_req;
++
+ if (iocb->aio_flags & IOCB_FLAG_RESFD) {
+ /*
+ * If the IOCB_FLAG_RESFD flag of aio_flags is set, get an
+--- a/include/linux/fs.h
++++ b/include/linux/fs.h
+@@ -304,13 +304,19 @@ enum rw_hint {
+
+ struct kiocb {
+ struct file *ki_filp;
++
++ /* The 'ki_filp' pointer is shared in a union for aio */
++ randomized_struct_fields_start
++
+ loff_t ki_pos;
+ void (*ki_complete)(struct kiocb *iocb, long ret, long ret2);
+ void *private;
+ int ki_flags;
+ u16 ki_hint;
+ u16 ki_ioprio; /* See linux/ioprio.h */
+-} __randomize_layout;
++
++ randomized_struct_fields_end
++};
+
+ static inline bool is_sync_kiocb(struct kiocb *kiocb)
+ {
--- /dev/null
+From 88a6f18b950e2e4dce57d31daa151105f4f3dcff Mon Sep 17 00:00:00 2001
+From: Jens Axboe <axboe@kernel.dk>
+Date: Sat, 24 Nov 2018 14:46:14 -0700
+Subject: aio: split out iocb copy from io_submit_one()
+
+From: Jens Axboe <axboe@kernel.dk>
+
+commit 88a6f18b950e2e4dce57d31daa151105f4f3dcff upstream.
+
+In preparation of handing in iocbs in a different fashion as well. Also
+make it clear that the iocb being passed in isn't modified, by marking
+it const throughout.
+
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Cc: Guenter Roeck <linux@roeck-us.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/aio.c | 68 +++++++++++++++++++++++++++++++++++----------------------------
+ 1 file changed, 38 insertions(+), 30 deletions(-)
+
+--- a/fs/aio.c
++++ b/fs/aio.c
+@@ -1416,7 +1416,7 @@ static void aio_complete_rw(struct kiocb
+ aio_complete(iocb, res, res2);
+ }
+
+-static int aio_prep_rw(struct kiocb *req, struct iocb *iocb)
++static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb)
+ {
+ int ret;
+
+@@ -1457,7 +1457,7 @@ out_fput:
+ return ret;
+ }
+
+-static int aio_setup_rw(int rw, struct iocb *iocb, struct iovec **iovec,
++static int aio_setup_rw(int rw, const struct iocb *iocb, struct iovec **iovec,
+ bool vectored, bool compat, struct iov_iter *iter)
+ {
+ void __user *buf = (void __user *)(uintptr_t)iocb->aio_buf;
+@@ -1496,8 +1496,8 @@ static inline void aio_rw_done(struct ki
+ }
+ }
+
+-static ssize_t aio_read(struct kiocb *req, struct iocb *iocb, bool vectored,
+- bool compat)
++static ssize_t aio_read(struct kiocb *req, const struct iocb *iocb,
++ bool vectored, bool compat)
+ {
+ struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
+ struct iov_iter iter;
+@@ -1529,8 +1529,8 @@ out_fput:
+ return ret;
+ }
+
+-static ssize_t aio_write(struct kiocb *req, struct iocb *iocb, bool vectored,
+- bool compat)
++static ssize_t aio_write(struct kiocb *req, const struct iocb *iocb,
++ bool vectored, bool compat)
+ {
+ struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
+ struct iov_iter iter;
+@@ -1585,7 +1585,8 @@ static void aio_fsync_work(struct work_s
+ aio_complete(container_of(req, struct aio_kiocb, fsync), ret, 0);
+ }
+
+-static int aio_fsync(struct fsync_iocb *req, struct iocb *iocb, bool datasync)
++static int aio_fsync(struct fsync_iocb *req, const struct iocb *iocb,
++ bool datasync)
+ {
+ if (unlikely(iocb->aio_buf || iocb->aio_offset || iocb->aio_nbytes ||
+ iocb->aio_rw_flags))
+@@ -1719,7 +1720,7 @@ aio_poll_queue_proc(struct file *file, s
+ add_wait_queue(head, &pt->iocb->poll.wait);
+ }
+
+-static ssize_t aio_poll(struct aio_kiocb *aiocb, struct iocb *iocb)
++static ssize_t aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb)
+ {
+ struct kioctx *ctx = aiocb->ki_ctx;
+ struct poll_iocb *req = &aiocb->poll;
+@@ -1791,27 +1792,23 @@ out:
+ return 0;
+ }
+
+-static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
+- bool compat)
++static int __io_submit_one(struct kioctx *ctx, const struct iocb *iocb,
++ struct iocb __user *user_iocb, bool compat)
+ {
+ struct aio_kiocb *req;
+- struct iocb iocb;
+ ssize_t ret;
+
+- if (unlikely(copy_from_user(&iocb, user_iocb, sizeof(iocb))))
+- return -EFAULT;
+-
+ /* enforce forwards compatibility on users */
+- if (unlikely(iocb.aio_reserved2)) {
++ if (unlikely(iocb->aio_reserved2)) {
+ pr_debug("EINVAL: reserve field set\n");
+ return -EINVAL;
+ }
+
+ /* prevent overflows */
+ if (unlikely(
+- (iocb.aio_buf != (unsigned long)iocb.aio_buf) ||
+- (iocb.aio_nbytes != (size_t)iocb.aio_nbytes) ||
+- ((ssize_t)iocb.aio_nbytes < 0)
++ (iocb->aio_buf != (unsigned long)iocb->aio_buf) ||
++ (iocb->aio_nbytes != (size_t)iocb->aio_nbytes) ||
++ ((ssize_t)iocb->aio_nbytes < 0)
+ )) {
+ pr_debug("EINVAL: overflow check\n");
+ return -EINVAL;
+@@ -1825,14 +1822,14 @@ static int io_submit_one(struct kioctx *
+ if (unlikely(!req))
+ goto out_put_reqs_available;
+
+- if (iocb.aio_flags & IOCB_FLAG_RESFD) {
++ if (iocb->aio_flags & IOCB_FLAG_RESFD) {
+ /*
+ * If the IOCB_FLAG_RESFD flag of aio_flags is set, get an
+ * instance of the file* now. The file descriptor must be
+ * an eventfd() fd, and will be signaled for each completed
+ * event using the eventfd_signal() function.
+ */
+- req->ki_eventfd = eventfd_ctx_fdget((int) iocb.aio_resfd);
++ req->ki_eventfd = eventfd_ctx_fdget((int) iocb->aio_resfd);
+ if (IS_ERR(req->ki_eventfd)) {
+ ret = PTR_ERR(req->ki_eventfd);
+ req->ki_eventfd = NULL;
+@@ -1847,32 +1844,32 @@ static int io_submit_one(struct kioctx *
+ }
+
+ req->ki_user_iocb = user_iocb;
+- req->ki_user_data = iocb.aio_data;
++ req->ki_user_data = iocb->aio_data;
+
+- switch (iocb.aio_lio_opcode) {
++ switch (iocb->aio_lio_opcode) {
+ case IOCB_CMD_PREAD:
+- ret = aio_read(&req->rw, &iocb, false, compat);
++ ret = aio_read(&req->rw, iocb, false, compat);
+ break;
+ case IOCB_CMD_PWRITE:
+- ret = aio_write(&req->rw, &iocb, false, compat);
++ ret = aio_write(&req->rw, iocb, false, compat);
+ break;
+ case IOCB_CMD_PREADV:
+- ret = aio_read(&req->rw, &iocb, true, compat);
++ ret = aio_read(&req->rw, iocb, true, compat);
+ break;
+ case IOCB_CMD_PWRITEV:
+- ret = aio_write(&req->rw, &iocb, true, compat);
++ ret = aio_write(&req->rw, iocb, true, compat);
+ break;
+ case IOCB_CMD_FSYNC:
+- ret = aio_fsync(&req->fsync, &iocb, false);
++ ret = aio_fsync(&req->fsync, iocb, false);
+ break;
+ case IOCB_CMD_FDSYNC:
+- ret = aio_fsync(&req->fsync, &iocb, true);
++ ret = aio_fsync(&req->fsync, iocb, true);
+ break;
+ case IOCB_CMD_POLL:
+- ret = aio_poll(req, &iocb);
++ ret = aio_poll(req, iocb);
+ break;
+ default:
+- pr_debug("invalid aio operation %d\n", iocb.aio_lio_opcode);
++ pr_debug("invalid aio operation %d\n", iocb->aio_lio_opcode);
+ ret = -EINVAL;
+ break;
+ }
+@@ -1894,6 +1891,17 @@ out_put_reqs_available:
+ return ret;
+ }
+
++static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
++ bool compat)
++{
++ struct iocb iocb;
++
++ if (unlikely(copy_from_user(&iocb, user_iocb, sizeof(iocb))))
++ return -EFAULT;
++
++ return __io_submit_one(ctx, &iocb, user_iocb, compat);
++}
++
+ /* sys_io_submit:
+ * Queue the nr iocbs pointed to by iocbpp for processing. Returns
+ * the number of iocbs queued. May return -EINVAL if the aio_context
--- /dev/null
+From 2bb874c0d873d13bd9b9b9c6d7b7c4edab18c8b4 Mon Sep 17 00:00:00 2001
+From: Al Viro <viro@zeniv.linux.org.uk>
+Date: Thu, 7 Mar 2019 19:49:55 -0500
+Subject: aio: store event at final iocb_put()
+
+From: Al Viro <viro@zeniv.linux.org.uk>
+
+commit 2bb874c0d873d13bd9b9b9c6d7b7c4edab18c8b4 upstream.
+
+Instead of having aio_complete() set ->ki_res.{res,res2}, do that
+explicitly in its callers, drop the reference (as aio_complete()
+used to do) and delay the rest until the final iocb_put().
+
+Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
+Cc: Guenter Roeck <linux@roeck-us.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/aio.c | 33 +++++++++++++++++----------------
+ 1 file changed, 17 insertions(+), 16 deletions(-)
+
+--- a/fs/aio.c
++++ b/fs/aio.c
+@@ -1071,16 +1071,10 @@ static inline void iocb_destroy(struct a
+ kmem_cache_free(kiocb_cachep, iocb);
+ }
+
+-static inline void iocb_put(struct aio_kiocb *iocb)
+-{
+- if (refcount_dec_and_test(&iocb->ki_refcnt))
+- iocb_destroy(iocb);
+-}
+-
+ /* aio_complete
+ * Called when the io request on the given iocb is complete.
+ */
+-static void aio_complete(struct aio_kiocb *iocb, long res, long res2)
++static void aio_complete(struct aio_kiocb *iocb)
+ {
+ struct kioctx *ctx = iocb->ki_ctx;
+ struct aio_ring *ring;
+@@ -1088,8 +1082,6 @@ static void aio_complete(struct aio_kioc
+ unsigned tail, pos, head;
+ unsigned long flags;
+
+- iocb->ki_res.res = res;
+- iocb->ki_res.res2 = res2;
+ /*
+ * Add a completion event to the ring buffer. Must be done holding
+ * ctx->completion_lock to prevent other code from messing with the tail
+@@ -1155,7 +1147,14 @@ static void aio_complete(struct aio_kioc
+
+ if (waitqueue_active(&ctx->wait))
+ wake_up(&ctx->wait);
+- iocb_put(iocb);
++}
++
++static inline void iocb_put(struct aio_kiocb *iocb)
++{
++ if (refcount_dec_and_test(&iocb->ki_refcnt)) {
++ aio_complete(iocb);
++ iocb_destroy(iocb);
++ }
+ }
+
+ /* aio_read_events_ring
+@@ -1429,7 +1428,9 @@ static void aio_complete_rw(struct kiocb
+ file_end_write(kiocb->ki_filp);
+ }
+
+- aio_complete(iocb, res, res2);
++ iocb->ki_res.res = res;
++ iocb->ki_res.res2 = res2;
++ iocb_put(iocb);
+ }
+
+ static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb)
+@@ -1577,11 +1578,10 @@ static ssize_t aio_write(struct kiocb *r
+
+ static void aio_fsync_work(struct work_struct *work)
+ {
+- struct fsync_iocb *req = container_of(work, struct fsync_iocb, work);
+- int ret;
++ struct aio_kiocb *iocb = container_of(work, struct aio_kiocb, fsync.work);
+
+- ret = vfs_fsync(req->file, req->datasync);
+- aio_complete(container_of(req, struct aio_kiocb, fsync), ret, 0);
++ iocb->ki_res.res = vfs_fsync(iocb->fsync.file, iocb->fsync.datasync);
++ iocb_put(iocb);
+ }
+
+ static int aio_fsync(struct fsync_iocb *req, const struct iocb *iocb,
+@@ -1602,7 +1602,8 @@ static int aio_fsync(struct fsync_iocb *
+
+ static inline void aio_poll_complete(struct aio_kiocb *iocb, __poll_t mask)
+ {
+- aio_complete(iocb, mangle_poll(mask), 0);
++ iocb->ki_res.res = mangle_poll(mask);
++ iocb_put(iocb);
+ }
+
+ static void aio_poll_complete_work(struct work_struct *work)
--- /dev/null
+From bc9bff61624ac33b7c95861abea1af24ee7a94fc Mon Sep 17 00:00:00 2001
+From: Jens Axboe <axboe@kernel.dk>
+Date: Tue, 6 Nov 2018 14:27:13 -0700
+Subject: aio: use assigned completion handler
+
+From: Jens Axboe <axboe@kernel.dk>
+
+commit bc9bff61624ac33b7c95861abea1af24ee7a94fc upstream.
+
+We know this is a read/write request, but in preparation for
+having different kinds of those, ensure that we call the assigned
+handler instead of assuming it's aio_complete_rw().
+
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Cc: Guenter Roeck <linux@roeck-us.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/aio.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/fs/aio.c
++++ b/fs/aio.c
+@@ -1492,7 +1492,7 @@ static inline void aio_rw_done(struct ki
+ ret = -EINTR;
+ /*FALLTHRU*/
+ default:
+- aio_complete_rw(req, ret, 0);
++ req->ki_complete(req, ret, 0);
+ }
+ }
+
--- /dev/null
+From 71ebc6fef0f53459f37fb39e1466792232fa52ee Mon Sep 17 00:00:00 2001
+From: Jens Axboe <axboe@kernel.dk>
+Date: Sat, 24 Nov 2018 21:33:09 -0700
+Subject: aio: use iocb_put() instead of open coding it
+
+From: Jens Axboe <axboe@kernel.dk>
+
+commit 71ebc6fef0f53459f37fb39e1466792232fa52ee upstream.
+
+Replace the percpu_ref_put() + kmem_cache_free() with a call to
+iocb_put() instead.
+
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Cc: Guenter Roeck <linux@roeck-us.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/aio.c | 3 +--
+ 1 file changed, 1 insertion(+), 2 deletions(-)
+
+--- a/fs/aio.c
++++ b/fs/aio.c
+@@ -1886,10 +1886,9 @@ static int io_submit_one(struct kioctx *
+ goto out_put_req;
+ return 0;
+ out_put_req:
+- percpu_ref_put(&ctx->reqs);
+ if (req->ki_eventfd)
+ eventfd_ctx_put(req->ki_eventfd);
+- kmem_cache_free(kiocb_cachep, req);
++ iocb_put(req);
+ out_put_reqs_available:
+ put_reqs_available(ctx, 1);
+ return ret;
--- /dev/null
+From af5c72b1fc7a00aa484e90b0c4e0eeb582545634 Mon Sep 17 00:00:00 2001
+From: Al Viro <viro@zeniv.linux.org.uk>
+Date: Thu, 7 Mar 2019 21:45:41 -0500
+Subject: Fix aio_poll() races
+
+From: Al Viro <viro@zeniv.linux.org.uk>
+
+commit af5c72b1fc7a00aa484e90b0c4e0eeb582545634 upstream.
+
+aio_poll() has to cope with several unpleasant problems:
+ * requests that might stay around indefinitely need to
+be made visible for io_cancel(2); that must not be done to
+a request already completed, though.
+ * in cases when ->poll() has placed us on a waitqueue,
+wakeup might have happened (and request completed) before ->poll()
+returns.
+ * worse, in some early wakeup cases request might end
+up re-added into the queue later - we can't treat "woken up and
+currently not in the queue" as "it's not going to stick around
+indefinitely"
+ * ... moreover, ->poll() might have decided not to
+put it on any queues to start with, and that needs to be distinguished
+from the previous case
+ * ->poll() might have tried to put us on more than one queue.
+Only the first will succeed for aio poll, so we might end up missing
+wakeups. OTOH, we might very well notice that only after the
+wakeup hits and request gets completed (all before ->poll() gets
+around to the second poll_wait()). In that case it's too late to
+decide that we have an error.
+
+req->woken was an attempt to deal with that. Unfortunately, it was
+broken. What we need to keep track of is not that wakeup has happened -
+the thing might come back after that. It's that the async reference is
+already gone and won't come back, so we can't (and needn't) put the
+request on the list of cancellables.
+
+The easiest case is "request hadn't been put on any waitqueues"; we
+can tell by seeing NULL apt.head, and in that case there won't be
+anything async. We should either complete the request ourselves
+(if vfs_poll() reports anything of interest) or return an error.
+
+In all other cases we get exclusion with wakeups by grabbing the
+queue lock.
+
+If request is currently on queue and we have something interesting
+from vfs_poll(), we can steal it and complete the request ourselves.
+
+If it's on queue and vfs_poll() has not reported anything interesting,
+we either put it on the cancellable list, or, if we know that it
+hadn't been put on all queues ->poll() wanted it on, we steal it and
+return an error.
+
+If it's _not_ on queue, it's either been already dealt with (in which
+case we do nothing), or there's aio_poll_complete_work() about to be
+executed. In that case we either put it on the cancellable list,
+or, if we know it hadn't been put on all queues ->poll() wanted it on,
+simulate what cancel would've done.
+
+It's a lot more convoluted than I'd like it to be. Single-consumer APIs
+suck, and unfortunately aio is not an exception...
+
+Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
+Cc: Guenter Roeck <linux@roeck-us.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/aio.c | 90 ++++++++++++++++++++++++++++-----------------------------------
+ 1 file changed, 40 insertions(+), 50 deletions(-)
+
+--- a/fs/aio.c
++++ b/fs/aio.c
+@@ -175,7 +175,7 @@ struct poll_iocb {
+ struct file *file;
+ struct wait_queue_head *head;
+ __poll_t events;
+- bool woken;
++ bool done;
+ bool cancelled;
+ struct wait_queue_entry wait;
+ struct work_struct work;
+@@ -1600,12 +1600,6 @@ static int aio_fsync(struct fsync_iocb *
+ return 0;
+ }
+
+-static inline void aio_poll_complete(struct aio_kiocb *iocb, __poll_t mask)
+-{
+- iocb->ki_res.res = mangle_poll(mask);
+- iocb_put(iocb);
+-}
+-
+ static void aio_poll_complete_work(struct work_struct *work)
+ {
+ struct poll_iocb *req = container_of(work, struct poll_iocb, work);
+@@ -1631,9 +1625,11 @@ static void aio_poll_complete_work(struc
+ return;
+ }
+ list_del_init(&iocb->ki_list);
++ iocb->ki_res.res = mangle_poll(mask);
++ req->done = true;
+ spin_unlock_irq(&ctx->ctx_lock);
+
+- aio_poll_complete(iocb, mask);
++ iocb_put(iocb);
+ }
+
+ /* assumes we are called with irqs disabled */
+@@ -1661,31 +1657,27 @@ static int aio_poll_wake(struct wait_que
+ __poll_t mask = key_to_poll(key);
+ unsigned long flags;
+
+- req->woken = true;
+-
+ /* for instances that support it check for an event match first: */
+- if (mask) {
+- if (!(mask & req->events))
+- return 0;
++ if (mask && !(mask & req->events))
++ return 0;
+
++ list_del_init(&req->wait.entry);
++
++ if (mask && spin_trylock_irqsave(&iocb->ki_ctx->ctx_lock, flags)) {
+ /*
+ * Try to complete the iocb inline if we can. Use
+ * irqsave/irqrestore because not all filesystems (e.g. fuse)
+ * call this function with IRQs disabled and because IRQs
+ * have to be disabled before ctx_lock is obtained.
+ */
+- if (spin_trylock_irqsave(&iocb->ki_ctx->ctx_lock, flags)) {
+- list_del(&iocb->ki_list);
+- spin_unlock_irqrestore(&iocb->ki_ctx->ctx_lock, flags);
+-
+- list_del_init(&req->wait.entry);
+- aio_poll_complete(iocb, mask);
+- return 1;
+- }
++ list_del(&iocb->ki_list);
++ iocb->ki_res.res = mangle_poll(mask);
++ req->done = true;
++ spin_unlock_irqrestore(&iocb->ki_ctx->ctx_lock, flags);
++ iocb_put(iocb);
++ } else {
++ schedule_work(&req->work);
+ }
+-
+- list_del_init(&req->wait.entry);
+- schedule_work(&req->work);
+ return 1;
+ }
+
+@@ -1717,6 +1709,7 @@ static ssize_t aio_poll(struct aio_kiocb
+ struct kioctx *ctx = aiocb->ki_ctx;
+ struct poll_iocb *req = &aiocb->poll;
+ struct aio_poll_table apt;
++ bool cancel = false;
+ __poll_t mask;
+
+ /* reject any unknown events outside the normal event mask. */
+@@ -1730,7 +1723,7 @@ static ssize_t aio_poll(struct aio_kiocb
+ req->events = demangle_poll(iocb->aio_buf) | EPOLLERR | EPOLLHUP;
+
+ req->head = NULL;
+- req->woken = false;
++ req->done = false;
+ req->cancelled = false;
+
+ apt.pt._qproc = aio_poll_queue_proc;
+@@ -1743,36 +1736,33 @@ static ssize_t aio_poll(struct aio_kiocb
+ init_waitqueue_func_entry(&req->wait, aio_poll_wake);
+
+ mask = vfs_poll(req->file, &apt.pt) & req->events;
+- if (unlikely(!req->head)) {
+- /* we did not manage to set up a waitqueue, done */
+- goto out;
+- }
+-
+ spin_lock_irq(&ctx->ctx_lock);
+- spin_lock(&req->head->lock);
+- if (req->woken) {
+- /* wake_up context handles the rest */
+- mask = 0;
++ if (likely(req->head)) {
++ spin_lock(&req->head->lock);
++ if (unlikely(list_empty(&req->wait.entry))) {
++ if (apt.error)
++ cancel = true;
++ apt.error = 0;
++ mask = 0;
++ }
++ if (mask || apt.error) {
++ list_del_init(&req->wait.entry);
++ } else if (cancel) {
++ WRITE_ONCE(req->cancelled, true);
++ } else if (!req->done) { /* actually waiting for an event */
++ list_add_tail(&aiocb->ki_list, &ctx->active_reqs);
++ aiocb->ki_cancel = aio_poll_cancel;
++ }
++ spin_unlock(&req->head->lock);
++ }
++ if (mask) { /* no async, we'd stolen it */
++ aiocb->ki_res.res = mangle_poll(mask);
+ apt.error = 0;
+- } else if (mask || apt.error) {
+- /* if we get an error or a mask we are done */
+- WARN_ON_ONCE(list_empty(&req->wait.entry));
+- list_del_init(&req->wait.entry);
+- } else {
+- /* actually waiting for an event */
+- list_add_tail(&aiocb->ki_list, &ctx->active_reqs);
+- aiocb->ki_cancel = aio_poll_cancel;
+ }
+- spin_unlock(&req->head->lock);
+ spin_unlock_irq(&ctx->ctx_lock);
+-
+-out:
+- if (unlikely(apt.error))
+- return apt.error;
+-
+ if (mask)
+- aio_poll_complete(aiocb, mask);
+- return 0;
++ iocb_put(aiocb);
++ return apt.error;
+ }
+
+ static int __io_submit_one(struct kioctx *ctx, const struct iocb *iocb,
--- /dev/null
+From f2c57d91b0d96aa13ccff4e3b178038f17b00658 Mon Sep 17 00:00:00 2001
+From: Jan Kara <jack@suse.cz>
+Date: Tue, 30 Oct 2018 15:10:47 -0700
+Subject: mm: Fix warning in insert_pfn()
+
+From: Jan Kara <jack@suse.cz>
+
+commit f2c57d91b0d96aa13ccff4e3b178038f17b00658 upstream.
+
+In DAX mode a write pagefault can race with write(2) in the following
+way:
+
+CPU0                                    CPU1
+                                        write fault for mapped zero page (hole)
+dax_iomap_rw()
+  iomap_apply()
+    xfs_file_iomap_begin()
+      - allocates blocks
+    dax_iomap_actor()
+      invalidate_inode_pages2_range()
+        - invalidates radix tree entries in given range
+                                        dax_iomap_pte_fault()
+                                          grab_mapping_entry()
+                                            - no entry found, creates empty
+                                          ...
+                                          xfs_file_iomap_begin()
+                                            - finds already allocated block
+                                          ...
+                                          vmf_insert_mixed_mkwrite()
+                                            - WARNs and does nothing because there
+                                              is still zero page mapped in PTE
+      unmap_mapping_pages()
+
+This race results in WARN_ON from insert_pfn() and is occasionally
+triggered by fstest generic/344. Note that the race is otherwise
+harmless as before write(2) on CPU0 is finished, we will invalidate page
+tables properly and thus user of mmap will see modified data from
+write(2) from that point on. So just restrict the warning only to the
+case when the PFN in PTE is not zero page.
+
+Link: http://lkml.kernel.org/r/20180824154542.26872-1-jack@suse.cz
+Signed-off-by: Jan Kara <jack@suse.cz>
+Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
+Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
+Cc: Dan Williams <dan.j.williams@intel.com>
+Cc: Dave Jiang <dave.jiang@intel.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/memory.c | 9 +++++++--
+ 1 file changed, 7 insertions(+), 2 deletions(-)
+
+--- a/mm/memory.c
++++ b/mm/memory.c
+@@ -1787,10 +1787,15 @@ static int insert_pfn(struct vm_area_str
+ * in may not match the PFN we have mapped if the
+ * mapped PFN is a writeable COW page. In the mkwrite
+ * case we are creating a writable PTE for a shared
+- * mapping and we expect the PFNs to match.
++ * mapping and we expect the PFNs to match. If they
++ * don't match, we are likely racing with block
++ * allocation and mapping invalidation so just skip the
++ * update.
+ */
+- if (WARN_ON_ONCE(pte_pfn(*pte) != pfn_t_to_pfn(pfn)))
++ if (pte_pfn(*pte) != pfn_t_to_pfn(pfn)) {
++ WARN_ON_ONCE(!is_zero_pfn(pte_pfn(*pte)));
+ goto out_unlock;
++ }
+ entry = *pte;
+ goto out_mkwrite;
+ } else
--- /dev/null
+From b53119f13a04879c3bf502828d99d13726639ead Mon Sep 17 00:00:00 2001
+From: Linus Torvalds <torvalds@linux-foundation.org>
+Date: Wed, 6 Mar 2019 20:22:54 -0500
+Subject: pin iocb through aio.
+
+From: Linus Torvalds <torvalds@linux-foundation.org>
+
+commit b53119f13a04879c3bf502828d99d13726639ead upstream.
+
+aio_poll() is not the only case that needs file pinned; worse, while
+aio_read()/aio_write() can live without pinning iocb itself, the
+proof is rather brittle and can easily break on later changes.
+
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
+Cc: Guenter Roeck <linux@roeck-us.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/aio.c | 37 +++++++++++++++++++++----------------
+ 1 file changed, 21 insertions(+), 16 deletions(-)
+
+--- a/fs/aio.c
++++ b/fs/aio.c
+@@ -1016,6 +1016,9 @@ static bool get_reqs_available(struct ki
+ /* aio_get_req
+ * Allocate a slot for an aio request.
+ * Returns NULL if no requests are free.
++ *
++ * The refcount is initialized to 2 - one for the async op completion,
++ * one for the synchronous code that does this.
+ */
+ static inline struct aio_kiocb *aio_get_req(struct kioctx *ctx)
+ {
+@@ -1028,7 +1031,7 @@ static inline struct aio_kiocb *aio_get_
+ percpu_ref_get(&ctx->reqs);
+ req->ki_ctx = ctx;
+ INIT_LIST_HEAD(&req->ki_list);
+- refcount_set(&req->ki_refcnt, 0);
++ refcount_set(&req->ki_refcnt, 2);
+ req->ki_eventfd = NULL;
+ return req;
+ }
+@@ -1061,15 +1064,18 @@ out:
+ return ret;
+ }
+
++static inline void iocb_destroy(struct aio_kiocb *iocb)
++{
++ if (iocb->ki_filp)
++ fput(iocb->ki_filp);
++ percpu_ref_put(&iocb->ki_ctx->reqs);
++ kmem_cache_free(kiocb_cachep, iocb);
++}
++
+ static inline void iocb_put(struct aio_kiocb *iocb)
+ {
+- if (refcount_read(&iocb->ki_refcnt) == 0 ||
+- refcount_dec_and_test(&iocb->ki_refcnt)) {
+- if (iocb->ki_filp)
+- fput(iocb->ki_filp);
+- percpu_ref_put(&iocb->ki_ctx->reqs);
+- kmem_cache_free(kiocb_cachep, iocb);
+- }
++ if (refcount_dec_and_test(&iocb->ki_refcnt))
++ iocb_destroy(iocb);
+ }
+
+ static void aio_fill_event(struct io_event *ev, struct aio_kiocb *iocb,
+@@ -1743,9 +1749,6 @@ static ssize_t aio_poll(struct aio_kiocb
+ INIT_LIST_HEAD(&req->wait.entry);
+ init_waitqueue_func_entry(&req->wait, aio_poll_wake);
+
+- /* one for removal from waitqueue, one for this function */
+- refcount_set(&aiocb->ki_refcnt, 2);
+-
+ mask = vfs_poll(req->file, &apt.pt) & req->events;
+ if (unlikely(!req->head)) {
+ /* we did not manage to set up a waitqueue, done */
+@@ -1776,7 +1779,6 @@ out:
+
+ if (mask)
+ aio_poll_complete(aiocb, mask);
+- iocb_put(aiocb);
+ return 0;
+ }
+
+@@ -1867,18 +1869,21 @@ static int __io_submit_one(struct kioctx
+ break;
+ }
+
++ /* Done with the synchronous reference */
++ iocb_put(req);
++
+ /*
+ * If ret is 0, we'd either done aio_complete() ourselves or have
+ * arranged for that to be done asynchronously. Anything non-zero
+ * means that we need to destroy req ourselves.
+ */
+- if (ret)
+- goto out_put_req;
+- return 0;
++ if (!ret)
++ return 0;
++
+ out_put_req:
+ if (req->ki_eventfd)
+ eventfd_ctx_put(req->ki_eventfd);
+- iocb_put(req);
++ iocb_destroy(req);
+ out_put_reqs_available:
+ put_reqs_available(ctx, 1);
+ return ret;
net-netrom-fix-error-cleanup-path-of-nr_proto_init.patch
net-rds-check-address-length-before-reading-address-family.patch
rxrpc-fix-race-condition-in-rxrpc_input_packet.patch
+aio-clear-iocb_hipri.patch
+aio-use-assigned-completion-handler.patch
+aio-separate-out-ring-reservation-from-req-allocation.patch
+aio-don-t-zero-entire-aio_kiocb-aio_get_req.patch
+aio-use-iocb_put-instead-of-open-coding-it.patch
+aio-split-out-iocb-copy-from-io_submit_one.patch
+aio-abstract-out-io_event-filler-helper.patch
+aio-initialize-kiocb-private-in-case-any-filesystems-expect-it.patch
+aio-simplify-and-fix-fget-fput-for-io_submit.patch
+pin-iocb-through-aio.patch
+aio-fold-lookup_kiocb-into-its-sole-caller.patch
+aio-keep-io_event-in-aio_kiocb.patch
+aio-store-event-at-final-iocb_put.patch
+fix-aio_poll-races.patch
+x86-retpolines-raise-limit-for-generating-indirect-calls-from-switch-case.patch
+x86-retpolines-disable-switch-jump-tables-when-retpolines-are-enabled.patch
+mm-fix-warning-in-insert_pfn.patch
--- /dev/null
+From a9d57ef15cbe327fe54416dd194ee0ea66ae53a4 Mon Sep 17 00:00:00 2001
+From: Daniel Borkmann <daniel@iogearbox.net>
+Date: Mon, 25 Mar 2019 14:56:20 +0100
+Subject: x86/retpolines: Disable switch jump tables when retpolines are enabled
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Daniel Borkmann <daniel@iogearbox.net>
+
+commit a9d57ef15cbe327fe54416dd194ee0ea66ae53a4 upstream.
+
+Commit ce02ef06fcf7 ("x86, retpolines: Raise limit for generating indirect
+calls from switch-case") raised the limit under retpolines to 20 switch
+cases before gcc starts to emit jump tables, thereby effectively disabling
+the emission of slow indirect calls in this area.
+
+After this was brought to the attention of the gcc folks [0], Martin Liska
+fixed gcc to align with clang by avoiding the generation of switch jump
+tables entirely under retpolines. This takes effect in gcc starting from
+stable version 8.4.0. Given that the kernel supports compilation with older
+versions of gcc where the fix is not available or backported, we need to
+keep the extra KBUILD_CFLAGS around for some time and generally set
+-fno-jump-tables to align with what more recent gcc does automatically
+today.
+
+More than 20 switch cases are not expected to be fast-path critical, but
+it would still be good to align with gcc behavior for versions < 8.4.0 in
+order to have consistency across supported gcc versions. vmlinux size grows
+slightly, by 0.27%, for older gcc. This flag is only set to work around
+affected gcc versions; there is no change for clang.
+
+ [0] https://gcc.gnu.org/bugzilla/show_bug.cgi?id=86952
+
+Suggested-by: Martin Liska <mliska@suse.cz>
+Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Cc: David Woodhouse <dwmw2@infradead.org>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Jesper Dangaard Brouer <brouer@redhat.com>
+Cc: Björn Töpel <bjorn.topel@intel.com>
+Cc: Magnus Karlsson <magnus.karlsson@intel.com>
+Cc: Alexei Starovoitov <ast@kernel.org>
+Cc: H.J. Lu <hjl.tools@gmail.com>
+Cc: Alexei Starovoitov <ast@kernel.org>
+Cc: David S. Miller <davem@davemloft.net>
+Link: https://lkml.kernel.org/r/20190325135620.14882-1-daniel@iogearbox.net
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/Makefile | 8 ++++++--
+ 1 file changed, 6 insertions(+), 2 deletions(-)
+
+--- a/arch/x86/Makefile
++++ b/arch/x86/Makefile
+@@ -227,8 +227,12 @@ ifdef CONFIG_RETPOLINE
+ # Additionally, avoid generating expensive indirect jumps which
+ # are subject to retpolines for small number of switch cases.
+ # clang turns off jump table generation by default when under
+- # retpoline builds, however, gcc does not for x86.
+- KBUILD_CFLAGS += $(call cc-option,--param=case-values-threshold=20)
++ # retpoline builds, however, gcc does not for x86. This has
++ # only been fixed starting from gcc stable version 8.4.0 and
++ # onwards, but not for older ones. See gcc bug #86952.
++ ifndef CONFIG_CC_IS_CLANG
++ KBUILD_CFLAGS += $(call cc-option,-fno-jump-tables)
++ endif
+ endif
+
+ archscripts: scripts_basic
--- /dev/null
+From ce02ef06fcf7a399a6276adb83f37373d10cbbe1 Mon Sep 17 00:00:00 2001
+From: Daniel Borkmann <daniel@iogearbox.net>
+Date: Thu, 21 Feb 2019 23:19:41 +0100
+Subject: x86, retpolines: Raise limit for generating indirect calls from switch-case
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Daniel Borkmann <daniel@iogearbox.net>
+
+commit ce02ef06fcf7a399a6276adb83f37373d10cbbe1 upstream.
+
+On the networking side, there are numerous attempts to get rid of indirect
+calls in the fast path wherever feasible in order to avoid the cost of
+retpolines, for example, just to name a few:
+
+ * 283c16a2dfd3 ("indirect call wrappers: helpers to speed-up indirect calls of builtin")
+ * aaa5d90b395a ("net: use indirect call wrappers at GRO network layer")
+ * 028e0a476684 ("net: use indirect call wrappers at GRO transport layer")
+ * 356da6d0cde3 ("dma-mapping: bypass indirect calls for dma-direct")
+ * 09772d92cd5a ("bpf: avoid retpoline for lookup/update/delete calls on maps")
+ * 10870dd89e95 ("netfilter: nf_tables: add direct calls for all builtin expressions")
+ [...]
+
+Recent work on XDP from Björn and Magnus additionally found that manually
+transforming the XDP return code switch statement with more than 5 cases
+into an if-else combination would result in a considerable speedup in the
+XDP layer due to avoidance of indirect calls in CONFIG_RETPOLINE enabled
+builds. On the i40e driver with an XDP prog attached, a 20-26% speedup has
+been observed [0]. Aside from XDP, there are many other places later in the
+networking stack's critical path with similar switch-case processing. Rather
+than fixing every XDP-enabled driver and every location in the stack by
+hand, it would be good to instead raise the limit at which gcc would emit
+expensive indirect calls from the switch under retpolines, and stick with
+the default as-is in the case of !retpoline configured kernels. This would
+also have the advantage that for archs where this is not necessary, we let
+the compiler select the underlying target optimization for these constructs
+and avoid potential slow-downs from hand-rewritten if-else chains.
+
+In case of gcc, this setting is controlled by case-values-threshold which
+has an architecture global default that selects 4 or 5 (latter if target
+does not have a case insn that compares the bounds) where some arch back
+ends like arm64 or s390 override it with their own target hooks, for
+example, in gcc commit db7a90aa0de5 ("S/390: Disable prediction of indirect
+branches") the threshold pretty much disables jump tables by limit of 20
+under retpoline builds. Comparing gcc's and clang's default code
+generation on x86-64 under O2 level with retpoline build results in the
+following outcome for 5 switch cases:
+
+* gcc with -mindirect-branch=thunk-inline -mindirect-branch-register:
+
+ # gdb -batch -ex 'disassemble dispatch' ./c-switch
+ Dump of assembler code for function dispatch:
+ 0x0000000000400be0 <+0>: cmp $0x4,%edi
+ 0x0000000000400be3 <+3>: ja 0x400c35 <dispatch+85>
+ 0x0000000000400be5 <+5>: lea 0x915f8(%rip),%rdx # 0x4921e4
+ 0x0000000000400bec <+12>: mov %edi,%edi
+ 0x0000000000400bee <+14>: movslq (%rdx,%rdi,4),%rax
+ 0x0000000000400bf2 <+18>: add %rdx,%rax
+ 0x0000000000400bf5 <+21>: callq 0x400c01 <dispatch+33>
+ 0x0000000000400bfa <+26>: pause
+ 0x0000000000400bfc <+28>: lfence
+ 0x0000000000400bff <+31>: jmp 0x400bfa <dispatch+26>
+ 0x0000000000400c01 <+33>: mov %rax,(%rsp)
+ 0x0000000000400c05 <+37>: retq
+ 0x0000000000400c06 <+38>: nopw %cs:0x0(%rax,%rax,1)
+ 0x0000000000400c10 <+48>: jmpq 0x400c90 <fn_3>
+ 0x0000000000400c15 <+53>: nopl (%rax)
+ 0x0000000000400c18 <+56>: jmpq 0x400c70 <fn_2>
+ 0x0000000000400c1d <+61>: nopl (%rax)
+ 0x0000000000400c20 <+64>: jmpq 0x400c50 <fn_1>
+ 0x0000000000400c25 <+69>: nopl (%rax)
+ 0x0000000000400c28 <+72>: jmpq 0x400c40 <fn_0>
+ 0x0000000000400c2d <+77>: nopl (%rax)
+ 0x0000000000400c30 <+80>: jmpq 0x400cb0 <fn_4>
+ 0x0000000000400c35 <+85>: push %rax
+ 0x0000000000400c36 <+86>: callq 0x40dd80 <abort>
+ End of assembler dump.
+
+* clang with -mretpoline emitting search tree:
+
+ # gdb -batch -ex 'disassemble dispatch' ./c-switch
+ Dump of assembler code for function dispatch:
+ 0x0000000000400b30 <+0>: cmp $0x1,%edi
+ 0x0000000000400b33 <+3>: jle 0x400b44 <dispatch+20>
+ 0x0000000000400b35 <+5>: cmp $0x2,%edi
+ 0x0000000000400b38 <+8>: je 0x400b4d <dispatch+29>
+ 0x0000000000400b3a <+10>: cmp $0x3,%edi
+ 0x0000000000400b3d <+13>: jne 0x400b52 <dispatch+34>
+ 0x0000000000400b3f <+15>: jmpq 0x400c50 <fn_3>
+ 0x0000000000400b44 <+20>: test %edi,%edi
+ 0x0000000000400b46 <+22>: jne 0x400b5c <dispatch+44>
+ 0x0000000000400b48 <+24>: jmpq 0x400c20 <fn_0>
+ 0x0000000000400b4d <+29>: jmpq 0x400c40 <fn_2>
+ 0x0000000000400b52 <+34>: cmp $0x4,%edi
+ 0x0000000000400b55 <+37>: jne 0x400b66 <dispatch+54>
+ 0x0000000000400b57 <+39>: jmpq 0x400c60 <fn_4>
+ 0x0000000000400b5c <+44>: cmp $0x1,%edi
+ 0x0000000000400b5f <+47>: jne 0x400b66 <dispatch+54>
+ 0x0000000000400b61 <+49>: jmpq 0x400c30 <fn_1>
+ 0x0000000000400b66 <+54>: push %rax
+ 0x0000000000400b67 <+55>: callq 0x40dd20 <abort>
+ End of assembler dump.
+
+ For sake of comparison, clang without -mretpoline:
+
+ # gdb -batch -ex 'disassemble dispatch' ./c-switch
+ Dump of assembler code for function dispatch:
+ 0x0000000000400b30 <+0>: cmp $0x4,%edi
+ 0x0000000000400b33 <+3>: ja 0x400b57 <dispatch+39>
+ 0x0000000000400b35 <+5>: mov %edi,%eax
+ 0x0000000000400b37 <+7>: jmpq *0x492148(,%rax,8)
+ 0x0000000000400b3e <+14>: jmpq 0x400bf0 <fn_0>
+ 0x0000000000400b43 <+19>: jmpq 0x400c30 <fn_4>
+ 0x0000000000400b48 <+24>: jmpq 0x400c10 <fn_2>
+ 0x0000000000400b4d <+29>: jmpq 0x400c20 <fn_3>
+ 0x0000000000400b52 <+34>: jmpq 0x400c00 <fn_1>
+ 0x0000000000400b57 <+39>: push %rax
+ 0x0000000000400b58 <+40>: callq 0x40dcf0 <abort>
+ End of assembler dump.
+
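+The C source of the dispatch example being disassembled is not shown here;
+a minimal test case along these lines (fn_0..fn_4 and the abort() fallback
+are taken from the listings above, the rest is assumed) would be roughly:
+
+ #include <stdlib.h>
+
+ extern void fn_0(void);
+ extern void fn_1(void);
+ extern void fn_2(void);
+ extern void fn_3(void);
+ extern void fn_4(void);
+
+ void dispatch(int op)
+ {
+         switch (op) {
+         case 0: fn_0(); break;
+         case 1: fn_1(); break;
+         case 2: fn_2(); break;
+         case 3: fn_3(); break;
+         case 4: fn_4(); break;
+         default: abort();
+         }
+ }
+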
+Raising the number of cases to a high value (e.g. 100) will still result in
+a similar code generation pattern with clang and gcc as above; in other
+words, clang generally turns off jump table emission by running an extra
+expansion pass under retpoline builds that turns indirectbr instructions
+from its IR back into switch instructions, as a built-in -mno-jump-table
+lowering of a switch (in this case, even if the IR input already contained
+an indirect branch).
+
+For gcc, adding --param=case-values-threshold=20, in a similar fashion to
+s390, in order to raise the limit for x86 retpoline-enabled builds results
+in a small vmlinux size increase of only 0.13% (before=18,027,528
+after=18,051,192). For clang this option is ignored because i) it is not
+needed as mentioned and ii) clang does not have the above cmdline
+parameter. Non-retpoline-enabled builds with gcc continue to use the
+default case-values-threshold setting, so nothing changes here.
+
+[0] https://lore.kernel.org/netdev/20190129095754.9390-1-bjorn.topel@gmail.com/
+ and "The Path to DPDK Speeds for AF_XDP", LPC 2018, networking track:
+ - http://vger.kernel.org/lpc_net2018_talks/lpc18_pres_af_xdp_perf-v3.pdf
+ - http://vger.kernel.org/lpc_net2018_talks/lpc18_paper_af_xdp_perf-v2.pdf
+
+Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
+Acked-by: Björn Töpel <bjorn.topel@intel.com>
+Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: netdev@vger.kernel.org
+Cc: David S. Miller <davem@davemloft.net>
+Cc: Magnus Karlsson <magnus.karlsson@intel.com>
+Cc: Alexei Starovoitov <ast@kernel.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: David Woodhouse <dwmw2@infradead.org>
+Cc: Andy Lutomirski <luto@kernel.org>
+Cc: Borislav Petkov <bp@alien8.de>
+Link: https://lkml.kernel.org/r/20190221221941.29358-1-daniel@iogearbox.net
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/Makefile | 5 +++++
+ 1 file changed, 5 insertions(+)
+
+--- a/arch/x86/Makefile
++++ b/arch/x86/Makefile
+@@ -224,6 +224,11 @@ KBUILD_CFLAGS += -fno-asynchronous-unwin
+ # Avoid indirect branches in kernel to deal with Spectre
+ ifdef CONFIG_RETPOLINE
+ KBUILD_CFLAGS += $(RETPOLINE_CFLAGS)
++ # Additionally, avoid generating expensive indirect jumps which
++ # are subject to retpolines for small number of switch cases.
++ # clang turns off jump table generation by default when under
++ # retpoline builds, however, gcc does not for x86.
++ KBUILD_CFLAGS += $(call cc-option,--param=case-values-threshold=20)
+ endif
+
+ archscripts: scripts_basic