From: Greg Kroah-Hartman Date: Mon, 29 Apr 2019 13:15:14 +0000 (+0200) Subject: 4.19-stable patches X-Git-Tag: v4.9.172~22 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=3c37b6bbc085c819f12a703a7b3e6b707e70dfb7;p=thirdparty%2Fkernel%2Fstable-queue.git 4.19-stable patches added patches: aio-abstract-out-io_event-filler-helper.patch aio-clear-iocb_hipri.patch aio-don-t-zero-entire-aio_kiocb-aio_get_req.patch aio-fold-lookup_kiocb-into-its-sole-caller.patch aio-initialize-kiocb-private-in-case-any-filesystems-expect-it.patch aio-keep-io_event-in-aio_kiocb.patch aio-separate-out-ring-reservation-from-req-allocation.patch aio-simplify-and-fix-fget-fput-for-io_submit.patch aio-split-out-iocb-copy-from-io_submit_one.patch aio-store-event-at-final-iocb_put.patch aio-use-assigned-completion-handler.patch aio-use-iocb_put-instead-of-open-coding-it.patch fix-aio_poll-races.patch mm-fix-warning-in-insert_pfn.patch pin-iocb-through-aio.patch x86-retpolines-disable-switch-jump-tables-when-retpolines-are-enabled.patch x86-retpolines-raise-limit-for-generating-indirect-calls-from-switch-case.patch --- diff --git a/queue-4.19/aio-abstract-out-io_event-filler-helper.patch b/queue-4.19/aio-abstract-out-io_event-filler-helper.patch new file mode 100644 index 00000000000..c33ef21d7b4 --- /dev/null +++ b/queue-4.19/aio-abstract-out-io_event-filler-helper.patch @@ -0,0 +1,48 @@ +From 875736bb3f3ded168469f6a14df7a938416a99d5 Mon Sep 17 00:00:00 2001 +From: Jens Axboe +Date: Tue, 20 Nov 2018 20:06:23 -0700 +Subject: aio: abstract out io_event filler helper + +From: Jens Axboe + +commit 875736bb3f3ded168469f6a14df7a938416a99d5 upstream. + +Reviewed-by: Christoph Hellwig +Signed-off-by: Jens Axboe +Cc: Guenter Roeck +Signed-off-by: Greg Kroah-Hartman + +--- + fs/aio.c | 14 ++++++++++---- + 1 file changed, 10 insertions(+), 4 deletions(-) + +--- a/fs/aio.c ++++ b/fs/aio.c +@@ -1059,6 +1059,15 @@ static inline void iocb_put(struct aio_k + } + } + ++static void aio_fill_event(struct io_event *ev, struct aio_kiocb *iocb, ++ long res, long res2) ++{ ++ ev->obj = (u64)(unsigned long)iocb->ki_user_iocb; ++ ev->data = iocb->ki_user_data; ++ ev->res = res; ++ ev->res2 = res2; ++} ++ + /* aio_complete + * Called when the io request on the given iocb is complete. + */ +@@ -1086,10 +1095,7 @@ static void aio_complete(struct aio_kioc + ev_page = kmap_atomic(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]); + event = ev_page + pos % AIO_EVENTS_PER_PAGE; + +- event->obj = (u64)(unsigned long)iocb->ki_user_iocb; +- event->data = iocb->ki_user_data; +- event->res = res; +- event->res2 = res2; ++ aio_fill_event(event, iocb, res, res2); + + kunmap_atomic(ev_page); + flush_dcache_page(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]); diff --git a/queue-4.19/aio-clear-iocb_hipri.patch b/queue-4.19/aio-clear-iocb_hipri.patch new file mode 100644 index 00000000000..2c45f8798fd --- /dev/null +++ b/queue-4.19/aio-clear-iocb_hipri.patch @@ -0,0 +1,53 @@ +From 154989e45fd8de9bfb52bbd6e5ea763e437e54c5 Mon Sep 17 00:00:00 2001 +From: Christoph Hellwig +Date: Thu, 22 Nov 2018 16:44:07 +0100 +Subject: aio: clear IOCB_HIPRI + +From: Christoph Hellwig + +commit 154989e45fd8de9bfb52bbd6e5ea763e437e54c5 upstream. + +No one is going to poll for aio (yet), so we must clear the HIPRI +flag, as we would otherwise send it down the poll queues, where no +one will be polling for completions. + +Signed-off-by: Christoph Hellwig + +IOCB_HIPRI, not RWF_HIPRI. 
+ +Reviewed-by: Johannes Thumshirn +Signed-off-by: Jens Axboe +Cc: Guenter Roeck +Signed-off-by: Greg Kroah-Hartman + +--- + fs/aio.c | 11 ++++++++--- + 1 file changed, 8 insertions(+), 3 deletions(-) + +--- a/fs/aio.c ++++ b/fs/aio.c +@@ -1438,8 +1438,7 @@ static int aio_prep_rw(struct kiocb *req + ret = ioprio_check_cap(iocb->aio_reqprio); + if (ret) { + pr_debug("aio ioprio check cap error: %d\n", ret); +- fput(req->ki_filp); +- return ret; ++ goto out_fput; + } + + req->ki_ioprio = iocb->aio_reqprio; +@@ -1448,7 +1447,13 @@ static int aio_prep_rw(struct kiocb *req + + ret = kiocb_set_rw_flags(req, iocb->aio_rw_flags); + if (unlikely(ret)) +- fput(req->ki_filp); ++ goto out_fput; ++ ++ req->ki_flags &= ~IOCB_HIPRI; /* no one is going to poll for this I/O */ ++ return 0; ++ ++out_fput: ++ fput(req->ki_filp); + return ret; + } + diff --git a/queue-4.19/aio-don-t-zero-entire-aio_kiocb-aio_get_req.patch b/queue-4.19/aio-don-t-zero-entire-aio_kiocb-aio_get_req.patch new file mode 100644 index 00000000000..79dad84010c --- /dev/null +++ b/queue-4.19/aio-don-t-zero-entire-aio_kiocb-aio_get_req.patch @@ -0,0 +1,53 @@ +From 2bc4ca9bb600cbe36941da2b2a67189fc4302a04 Mon Sep 17 00:00:00 2001 +From: Jens Axboe +Date: Tue, 4 Dec 2018 09:44:49 -0700 +Subject: aio: don't zero entire aio_kiocb aio_get_req() + +From: Jens Axboe + +commit 2bc4ca9bb600cbe36941da2b2a67189fc4302a04 upstream. + +It's 192 bytes, fairly substantial. Most items don't need to be cleared, +especially not upfront. Clear the ones we do need to clear, and leave +the other ones for setup when the iocb is prepared and submitted. + +Reviewed-by: Christoph Hellwig +Signed-off-by: Jens Axboe +Cc: Guenter Roeck +Signed-off-by: Greg Kroah-Hartman + +--- + fs/aio.c | 9 +++++++-- + 1 file changed, 7 insertions(+), 2 deletions(-) + +--- a/fs/aio.c ++++ b/fs/aio.c +@@ -1010,14 +1010,15 @@ static inline struct aio_kiocb *aio_get_ + { + struct aio_kiocb *req; + +- req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL|__GFP_ZERO); ++ req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL); + if (unlikely(!req)) + return NULL; + + percpu_ref_get(&ctx->reqs); ++ req->ki_ctx = ctx; + INIT_LIST_HEAD(&req->ki_list); + refcount_set(&req->ki_refcnt, 0); +- req->ki_ctx = ctx; ++ req->ki_eventfd = NULL; + return req; + } + +@@ -1738,6 +1739,10 @@ static ssize_t aio_poll(struct aio_kiocb + if (unlikely(!req->file)) + return -EBADF; + ++ req->head = NULL; ++ req->woken = false; ++ req->cancelled = false; ++ + apt.pt._qproc = aio_poll_queue_proc; + apt.pt._key = req->events; + apt.iocb = aiocb; diff --git a/queue-4.19/aio-fold-lookup_kiocb-into-its-sole-caller.patch b/queue-4.19/aio-fold-lookup_kiocb-into-its-sole-caller.patch new file mode 100644 index 00000000000..3f3fc914fd8 --- /dev/null +++ b/queue-4.19/aio-fold-lookup_kiocb-into-its-sole-caller.patch @@ -0,0 +1,62 @@ +From 833f4154ed560232120bc475935ee1d6a20e159f Mon Sep 17 00:00:00 2001 +From: Al Viro +Date: Mon, 11 Mar 2019 19:00:36 -0400 +Subject: aio: fold lookup_kiocb() into its sole caller + +From: Al Viro + +commit 833f4154ed560232120bc475935ee1d6a20e159f upstream. + +Signed-off-by: Al Viro +Cc: Guenter Roeck +Signed-off-by: Greg Kroah-Hartman + +--- + fs/aio.c | 29 +++++++---------------------- + 1 file changed, 7 insertions(+), 22 deletions(-) + +--- a/fs/aio.c ++++ b/fs/aio.c +@@ -1992,24 +1992,6 @@ COMPAT_SYSCALL_DEFINE3(io_submit, compat + } + #endif + +-/* lookup_kiocb +- * Finds a given iocb for cancellation. 
+- */ +-static struct aio_kiocb * +-lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb) +-{ +- struct aio_kiocb *kiocb; +- +- assert_spin_locked(&ctx->ctx_lock); +- +- /* TODO: use a hash or array, this sucks. */ +- list_for_each_entry(kiocb, &ctx->active_reqs, ki_list) { +- if (kiocb->ki_user_iocb == iocb) +- return kiocb; +- } +- return NULL; +-} +- + /* sys_io_cancel: + * Attempts to cancel an iocb previously passed to io_submit. If + * the operation is successfully cancelled, the resulting event is +@@ -2038,10 +2020,13 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t + return -EINVAL; + + spin_lock_irq(&ctx->ctx_lock); +- kiocb = lookup_kiocb(ctx, iocb); +- if (kiocb) { +- ret = kiocb->ki_cancel(&kiocb->rw); +- list_del_init(&kiocb->ki_list); ++ /* TODO: use a hash or array, this sucks. */ ++ list_for_each_entry(kiocb, &ctx->active_reqs, ki_list) { ++ if (kiocb->ki_user_iocb == iocb) { ++ ret = kiocb->ki_cancel(&kiocb->rw); ++ list_del_init(&kiocb->ki_list); ++ break; ++ } + } + spin_unlock_irq(&ctx->ctx_lock); + diff --git a/queue-4.19/aio-initialize-kiocb-private-in-case-any-filesystems-expect-it.patch b/queue-4.19/aio-initialize-kiocb-private-in-case-any-filesystems-expect-it.patch new file mode 100644 index 00000000000..bfc8645c2e9 --- /dev/null +++ b/queue-4.19/aio-initialize-kiocb-private-in-case-any-filesystems-expect-it.patch @@ -0,0 +1,32 @@ +From ec51f8ee1e63498e9f521ec0e5a6d04622bb2c67 Mon Sep 17 00:00:00 2001 +From: Mike Marshall +Date: Tue, 5 Feb 2019 14:13:35 -0500 +Subject: aio: initialize kiocb private in case any filesystems expect it. + +From: Mike Marshall + +commit ec51f8ee1e63498e9f521ec0e5a6d04622bb2c67 upstream. + +A recent optimization had left private uninitialized. + +Fixes: 2bc4ca9bb600 ("aio: don't zero entire aio_kiocb aio_get_req()") +Reviewed-by: Christoph Hellwig +Signed-off-by: Mike Marshall +Signed-off-by: Jens Axboe +Cc: Guenter Roeck +Signed-off-by: Greg Kroah-Hartman + +--- + fs/aio.c | 1 + + 1 file changed, 1 insertion(+) + +--- a/fs/aio.c ++++ b/fs/aio.c +@@ -1430,6 +1430,7 @@ static int aio_prep_rw(struct kiocb *req + if (unlikely(!req->ki_filp)) + return -EBADF; + req->ki_complete = aio_complete_rw; ++ req->private = NULL; + req->ki_pos = iocb->aio_offset; + req->ki_flags = iocb_flags(req->ki_filp); + if (iocb->aio_flags & IOCB_FLAG_RESFD) diff --git a/queue-4.19/aio-keep-io_event-in-aio_kiocb.patch b/queue-4.19/aio-keep-io_event-in-aio_kiocb.patch new file mode 100644 index 00000000000..b885ee9c9d4 --- /dev/null +++ b/queue-4.19/aio-keep-io_event-in-aio_kiocb.patch @@ -0,0 +1,106 @@ +From a9339b7855094ba11a97e8822ae038135e879e79 Mon Sep 17 00:00:00 2001 +From: Al Viro +Date: Thu, 7 Mar 2019 19:43:45 -0500 +Subject: aio: keep io_event in aio_kiocb + +From: Al Viro + +commit a9339b7855094ba11a97e8822ae038135e879e79 upstream. + +We want to separate forming the resulting io_event from putting it +into the ring buffer. 
+ +Signed-off-by: Al Viro +Cc: Guenter Roeck +Signed-off-by: Greg Kroah-Hartman + +--- + fs/aio.c | 31 +++++++++++++------------------ + 1 file changed, 13 insertions(+), 18 deletions(-) + +--- a/fs/aio.c ++++ b/fs/aio.c +@@ -198,8 +198,7 @@ struct aio_kiocb { + struct kioctx *ki_ctx; + kiocb_cancel_fn *ki_cancel; + +- struct iocb __user *ki_user_iocb; /* user's aiocb */ +- __u64 ki_user_data; /* user's data for completion */ ++ struct io_event ki_res; + + struct list_head ki_list; /* the aio core uses this + * for cancellation */ +@@ -1078,15 +1077,6 @@ static inline void iocb_put(struct aio_k + iocb_destroy(iocb); + } + +-static void aio_fill_event(struct io_event *ev, struct aio_kiocb *iocb, +- long res, long res2) +-{ +- ev->obj = (u64)(unsigned long)iocb->ki_user_iocb; +- ev->data = iocb->ki_user_data; +- ev->res = res; +- ev->res2 = res2; +-} +- + /* aio_complete + * Called when the io request on the given iocb is complete. + */ +@@ -1098,6 +1088,8 @@ static void aio_complete(struct aio_kioc + unsigned tail, pos, head; + unsigned long flags; + ++ iocb->ki_res.res = res; ++ iocb->ki_res.res2 = res2; + /* + * Add a completion event to the ring buffer. Must be done holding + * ctx->completion_lock to prevent other code from messing with the tail +@@ -1114,14 +1106,14 @@ static void aio_complete(struct aio_kioc + ev_page = kmap_atomic(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]); + event = ev_page + pos % AIO_EVENTS_PER_PAGE; + +- aio_fill_event(event, iocb, res, res2); ++ *event = iocb->ki_res; + + kunmap_atomic(ev_page); + flush_dcache_page(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]); + +- pr_debug("%p[%u]: %p: %p %Lx %lx %lx\n", +- ctx, tail, iocb, iocb->ki_user_iocb, iocb->ki_user_data, +- res, res2); ++ pr_debug("%p[%u]: %p: %p %Lx %Lx %Lx\n", ctx, tail, iocb, ++ (void __user *)(unsigned long)iocb->ki_res.obj, ++ iocb->ki_res.data, iocb->ki_res.res, iocb->ki_res.res2); + + /* after flagging the request as done, we + * must never even look at it again +@@ -1838,8 +1830,10 @@ static int __io_submit_one(struct kioctx + goto out_put_req; + } + +- req->ki_user_iocb = user_iocb; +- req->ki_user_data = iocb->aio_data; ++ req->ki_res.obj = (u64)(unsigned long)user_iocb; ++ req->ki_res.data = iocb->aio_data; ++ req->ki_res.res = 0; ++ req->ki_res.res2 = 0; + + switch (iocb->aio_lio_opcode) { + case IOCB_CMD_PREAD: +@@ -2009,6 +2003,7 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t + struct aio_kiocb *kiocb; + int ret = -EINVAL; + u32 key; ++ u64 obj = (u64)(unsigned long)iocb; + + if (unlikely(get_user(key, &iocb->aio_key))) + return -EFAULT; +@@ -2022,7 +2017,7 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t + spin_lock_irq(&ctx->ctx_lock); + /* TODO: use a hash or array, this sucks. */ + list_for_each_entry(kiocb, &ctx->active_reqs, ki_list) { +- if (kiocb->ki_user_iocb == iocb) { ++ if (kiocb->ki_res.obj == obj) { + ret = kiocb->ki_cancel(&kiocb->rw); + list_del_init(&kiocb->ki_list); + break; diff --git a/queue-4.19/aio-separate-out-ring-reservation-from-req-allocation.patch b/queue-4.19/aio-separate-out-ring-reservation-from-req-allocation.patch new file mode 100644 index 00000000000..13b699adcfe --- /dev/null +++ b/queue-4.19/aio-separate-out-ring-reservation-from-req-allocation.patch @@ -0,0 +1,102 @@ +From 432c79978c33ecef91b1b04cea6936c20810da29 Mon Sep 17 00:00:00 2001 +From: Christoph Hellwig +Date: Mon, 19 Nov 2018 15:57:42 -0700 +Subject: aio: separate out ring reservation from req allocation + +From: Christoph Hellwig + +commit 432c79978c33ecef91b1b04cea6936c20810da29 upstream. 
+ +This is in preparation for certain types of IO not needing a ring +reserveration. + +Signed-off-by: Christoph Hellwig +Signed-off-by: Jens Axboe +Cc: Guenter Roeck +Signed-off-by: Greg Kroah-Hartman + +--- + fs/aio.c | 30 +++++++++++++++++------------- + 1 file changed, 17 insertions(+), 13 deletions(-) + +--- a/fs/aio.c ++++ b/fs/aio.c +@@ -902,7 +902,7 @@ static void put_reqs_available(struct ki + local_irq_restore(flags); + } + +-static bool get_reqs_available(struct kioctx *ctx) ++static bool __get_reqs_available(struct kioctx *ctx) + { + struct kioctx_cpu *kcpu; + bool ret = false; +@@ -994,6 +994,14 @@ static void user_refill_reqs_available(s + spin_unlock_irq(&ctx->completion_lock); + } + ++static bool get_reqs_available(struct kioctx *ctx) ++{ ++ if (__get_reqs_available(ctx)) ++ return true; ++ user_refill_reqs_available(ctx); ++ return __get_reqs_available(ctx); ++} ++ + /* aio_get_req + * Allocate a slot for an aio request. + * Returns NULL if no requests are free. +@@ -1002,24 +1010,15 @@ static inline struct aio_kiocb *aio_get_ + { + struct aio_kiocb *req; + +- if (!get_reqs_available(ctx)) { +- user_refill_reqs_available(ctx); +- if (!get_reqs_available(ctx)) +- return NULL; +- } +- + req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL|__GFP_ZERO); + if (unlikely(!req)) +- goto out_put; ++ return NULL; + + percpu_ref_get(&ctx->reqs); + INIT_LIST_HEAD(&req->ki_list); + refcount_set(&req->ki_refcnt, 0); + req->ki_ctx = ctx; + return req; +-out_put: +- put_reqs_available(ctx, 1); +- return NULL; + } + + static struct kioctx *lookup_ioctx(unsigned long ctx_id) +@@ -1813,9 +1812,13 @@ static int io_submit_one(struct kioctx * + return -EINVAL; + } + ++ if (!get_reqs_available(ctx)) ++ return -EAGAIN; ++ ++ ret = -EAGAIN; + req = aio_get_req(ctx); + if (unlikely(!req)) +- return -EAGAIN; ++ goto out_put_reqs_available; + + if (iocb.aio_flags & IOCB_FLAG_RESFD) { + /* +@@ -1878,11 +1881,12 @@ static int io_submit_one(struct kioctx * + goto out_put_req; + return 0; + out_put_req: +- put_reqs_available(ctx, 1); + percpu_ref_put(&ctx->reqs); + if (req->ki_eventfd) + eventfd_ctx_put(req->ki_eventfd); + kmem_cache_free(kiocb_cachep, req); ++out_put_reqs_available: ++ put_reqs_available(ctx, 1); + return ret; + } + diff --git a/queue-4.19/aio-simplify-and-fix-fget-fput-for-io_submit.patch b/queue-4.19/aio-simplify-and-fix-fget-fput-for-io_submit.patch new file mode 100644 index 00000000000..f9eb5001891 --- /dev/null +++ b/queue-4.19/aio-simplify-and-fix-fget-fput-for-io_submit.patch @@ -0,0 +1,310 @@ +From 84c4e1f89fefe70554da0ab33be72c9be7994379 Mon Sep 17 00:00:00 2001 +From: Linus Torvalds +Date: Sun, 3 Mar 2019 14:23:33 -0800 +Subject: aio: simplify - and fix - fget/fput for io_submit() + +From: Linus Torvalds + +commit 84c4e1f89fefe70554da0ab33be72c9be7994379 upstream. + +Al Viro root-caused a race where the IOCB_CMD_POLL handling of +fget/fput() could cause us to access the file pointer after it had +already been freed: + + "In more details - normally IOCB_CMD_POLL handling looks so: + + 1) io_submit(2) allocates aio_kiocb instance and passes it to + aio_poll() + + 2) aio_poll() resolves the descriptor to struct file by req->file = + fget(iocb->aio_fildes) + + 3) aio_poll() sets ->woken to false and raises ->ki_refcnt of that + aio_kiocb to 2 (bumps by 1, that is). + + 4) aio_poll() calls vfs_poll(). After sanity checks (basically, + "poll_wait() had been called and only once") it locks the queue. 
+ That's what the extra reference to iocb had been for - we know we + can safely access it. + + 5) With queue locked, we check if ->woken has already been set to + true (by aio_poll_wake()) and, if it had been, we unlock the + queue, drop a reference to aio_kiocb and bugger off - at that + point it's a responsibility to aio_poll_wake() and the stuff + called/scheduled by it. That code will drop the reference to file + in req->file, along with the other reference to our aio_kiocb. + + 6) otherwise, we see whether we need to wait. If we do, we unlock the + queue, drop one reference to aio_kiocb and go away - eventual + wakeup (or cancel) will deal with the reference to file and with + the other reference to aio_kiocb + + 7) otherwise we remove ourselves from waitqueue (still under the + queue lock), so that wakeup won't get us. No async activity will + be happening, so we can safely drop req->file and iocb ourselves. + + If wakeup happens while we are in vfs_poll(), we are fine - aio_kiocb + won't get freed under us, so we can do all the checks and locking + safely. And we don't touch ->file if we detect that case. + + However, vfs_poll() most certainly *does* touch the file it had been + given. So wakeup coming while we are still in ->poll() might end up + doing fput() on that file. That case is not too rare, and usually we + are saved by the still present reference from descriptor table - that + fput() is not the final one. + + But if another thread closes that descriptor right after our fget() + and wakeup does happen before ->poll() returns, we are in trouble - + final fput() done while we are in the middle of a method: + +Al also wrote a patch to take an extra reference to the file descriptor +to fix this, but I instead suggested we just streamline the whole file +pointer handling by submit_io() so that the generic aio submission code +simply keeps the file pointer around until the aio has completed. + +Fixes: bfe4037e722e ("aio: implement IOCB_CMD_POLL") +Acked-by: Al Viro +Reported-by: syzbot+503d4cc169fcec1cb18c@syzkaller.appspotmail.com +Signed-off-by: Linus Torvalds +Cc: Guenter Roeck +Signed-off-by: Greg Kroah-Hartman + +--- + fs/aio.c | 72 +++++++++++++++++++++-------------------------------- + include/linux/fs.h | 8 +++++ + 2 files changed, 36 insertions(+), 44 deletions(-) + +--- a/fs/aio.c ++++ b/fs/aio.c +@@ -161,9 +161,13 @@ struct kioctx { + unsigned id; + }; + ++/* ++ * First field must be the file pointer in all the ++ * iocb unions! See also 'struct kiocb' in ++ */ + struct fsync_iocb { +- struct work_struct work; + struct file *file; ++ struct work_struct work; + bool datasync; + }; + +@@ -177,8 +181,15 @@ struct poll_iocb { + struct work_struct work; + }; + ++/* ++ * NOTE! Each of the iocb union members has the file pointer ++ * as the first entry in their struct definition. So you can ++ * access the file pointer through any of the sub-structs, ++ * or directly as just 'ki_filp' in this struct. 
++ */ + struct aio_kiocb { + union { ++ struct file *ki_filp; + struct kiocb rw; + struct fsync_iocb fsync; + struct poll_iocb poll; +@@ -1054,6 +1065,8 @@ static inline void iocb_put(struct aio_k + { + if (refcount_read(&iocb->ki_refcnt) == 0 || + refcount_dec_and_test(&iocb->ki_refcnt)) { ++ if (iocb->ki_filp) ++ fput(iocb->ki_filp); + percpu_ref_put(&iocb->ki_ctx->reqs); + kmem_cache_free(kiocb_cachep, iocb); + } +@@ -1418,7 +1431,6 @@ static void aio_complete_rw(struct kiocb + file_end_write(kiocb->ki_filp); + } + +- fput(kiocb->ki_filp); + aio_complete(iocb, res, res2); + } + +@@ -1426,9 +1438,6 @@ static int aio_prep_rw(struct kiocb *req + { + int ret; + +- req->ki_filp = fget(iocb->aio_fildes); +- if (unlikely(!req->ki_filp)) +- return -EBADF; + req->ki_complete = aio_complete_rw; + req->private = NULL; + req->ki_pos = iocb->aio_offset; +@@ -1445,7 +1454,7 @@ static int aio_prep_rw(struct kiocb *req + ret = ioprio_check_cap(iocb->aio_reqprio); + if (ret) { + pr_debug("aio ioprio check cap error: %d\n", ret); +- goto out_fput; ++ return ret; + } + + req->ki_ioprio = iocb->aio_reqprio; +@@ -1454,14 +1463,10 @@ static int aio_prep_rw(struct kiocb *req + + ret = kiocb_set_rw_flags(req, iocb->aio_rw_flags); + if (unlikely(ret)) +- goto out_fput; ++ return ret; + + req->ki_flags &= ~IOCB_HIPRI; /* no one is going to poll for this I/O */ + return 0; +- +-out_fput: +- fput(req->ki_filp); +- return ret; + } + + static int aio_setup_rw(int rw, const struct iocb *iocb, struct iovec **iovec, +@@ -1515,24 +1520,19 @@ static ssize_t aio_read(struct kiocb *re + if (ret) + return ret; + file = req->ki_filp; +- +- ret = -EBADF; + if (unlikely(!(file->f_mode & FMODE_READ))) +- goto out_fput; ++ return -EBADF; + ret = -EINVAL; + if (unlikely(!file->f_op->read_iter)) +- goto out_fput; ++ return -EINVAL; + + ret = aio_setup_rw(READ, iocb, &iovec, vectored, compat, &iter); + if (ret) +- goto out_fput; ++ return ret; + ret = rw_verify_area(READ, file, &req->ki_pos, iov_iter_count(&iter)); + if (!ret) + aio_rw_done(req, call_read_iter(file, req, &iter)); + kfree(iovec); +-out_fput: +- if (unlikely(ret)) +- fput(file); + return ret; + } + +@@ -1549,16 +1549,14 @@ static ssize_t aio_write(struct kiocb *r + return ret; + file = req->ki_filp; + +- ret = -EBADF; + if (unlikely(!(file->f_mode & FMODE_WRITE))) +- goto out_fput; +- ret = -EINVAL; ++ return -EBADF; + if (unlikely(!file->f_op->write_iter)) +- goto out_fput; ++ return -EINVAL; + + ret = aio_setup_rw(WRITE, iocb, &iovec, vectored, compat, &iter); + if (ret) +- goto out_fput; ++ return ret; + ret = rw_verify_area(WRITE, file, &req->ki_pos, iov_iter_count(&iter)); + if (!ret) { + /* +@@ -1576,9 +1574,6 @@ static ssize_t aio_write(struct kiocb *r + aio_rw_done(req, call_write_iter(file, req, &iter)); + } + kfree(iovec); +-out_fput: +- if (unlikely(ret)) +- fput(file); + return ret; + } + +@@ -1588,7 +1583,6 @@ static void aio_fsync_work(struct work_s + int ret; + + ret = vfs_fsync(req->file, req->datasync); +- fput(req->file); + aio_complete(container_of(req, struct aio_kiocb, fsync), ret, 0); + } + +@@ -1599,13 +1593,8 @@ static int aio_fsync(struct fsync_iocb * + iocb->aio_rw_flags)) + return -EINVAL; + +- req->file = fget(iocb->aio_fildes); +- if (unlikely(!req->file)) +- return -EBADF; +- if (unlikely(!req->file->f_op->fsync)) { +- fput(req->file); ++ if (unlikely(!req->file->f_op->fsync)) + return -EINVAL; +- } + + req->datasync = datasync; + INIT_WORK(&req->work, aio_fsync_work); +@@ -1615,10 +1604,7 @@ static int aio_fsync(struct fsync_iocb * + + 
static inline void aio_poll_complete(struct aio_kiocb *iocb, __poll_t mask) + { +- struct file *file = iocb->poll.file; +- + aio_complete(iocb, mangle_poll(mask), 0); +- fput(file); + } + + static void aio_poll_complete_work(struct work_struct *work) +@@ -1743,9 +1729,6 @@ static ssize_t aio_poll(struct aio_kiocb + + INIT_WORK(&req->work, aio_poll_complete_work); + req->events = demangle_poll(iocb->aio_buf) | EPOLLERR | EPOLLHUP; +- req->file = fget(iocb->aio_fildes); +- if (unlikely(!req->file)) +- return -EBADF; + + req->head = NULL; + req->woken = false; +@@ -1788,10 +1771,8 @@ static ssize_t aio_poll(struct aio_kiocb + spin_unlock_irq(&ctx->ctx_lock); + + out: +- if (unlikely(apt.error)) { +- fput(req->file); ++ if (unlikely(apt.error)) + return apt.error; +- } + + if (mask) + aio_poll_complete(aiocb, mask); +@@ -1829,6 +1810,11 @@ static int __io_submit_one(struct kioctx + if (unlikely(!req)) + goto out_put_reqs_available; + ++ req->ki_filp = fget(iocb->aio_fildes); ++ ret = -EBADF; ++ if (unlikely(!req->ki_filp)) ++ goto out_put_req; ++ + if (iocb->aio_flags & IOCB_FLAG_RESFD) { + /* + * If the IOCB_FLAG_RESFD flag of aio_flags is set, get an +--- a/include/linux/fs.h ++++ b/include/linux/fs.h +@@ -304,13 +304,19 @@ enum rw_hint { + + struct kiocb { + struct file *ki_filp; ++ ++ /* The 'ki_filp' pointer is shared in a union for aio */ ++ randomized_struct_fields_start ++ + loff_t ki_pos; + void (*ki_complete)(struct kiocb *iocb, long ret, long ret2); + void *private; + int ki_flags; + u16 ki_hint; + u16 ki_ioprio; /* See linux/ioprio.h */ +-} __randomize_layout; ++ ++ randomized_struct_fields_end ++}; + + static inline bool is_sync_kiocb(struct kiocb *kiocb) + { diff --git a/queue-4.19/aio-split-out-iocb-copy-from-io_submit_one.patch b/queue-4.19/aio-split-out-iocb-copy-from-io_submit_one.patch new file mode 100644 index 00000000000..f8f3c24832c --- /dev/null +++ b/queue-4.19/aio-split-out-iocb-copy-from-io_submit_one.patch @@ -0,0 +1,195 @@ +From 88a6f18b950e2e4dce57d31daa151105f4f3dcff Mon Sep 17 00:00:00 2001 +From: Jens Axboe +Date: Sat, 24 Nov 2018 14:46:14 -0700 +Subject: aio: split out iocb copy from io_submit_one() + +From: Jens Axboe + +commit 88a6f18b950e2e4dce57d31daa151105f4f3dcff upstream. + +In preparation of handing in iocbs in a different fashion as well. Also +make it clear that the iocb being passed in isn't modified, by marking +it const throughout. 
+ +Reviewed-by: Christoph Hellwig +Signed-off-by: Jens Axboe +Cc: Guenter Roeck +Signed-off-by: Greg Kroah-Hartman + +--- + fs/aio.c | 68 +++++++++++++++++++++++++++++++++++---------------------------- + 1 file changed, 38 insertions(+), 30 deletions(-) + +--- a/fs/aio.c ++++ b/fs/aio.c +@@ -1416,7 +1416,7 @@ static void aio_complete_rw(struct kiocb + aio_complete(iocb, res, res2); + } + +-static int aio_prep_rw(struct kiocb *req, struct iocb *iocb) ++static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb) + { + int ret; + +@@ -1457,7 +1457,7 @@ out_fput: + return ret; + } + +-static int aio_setup_rw(int rw, struct iocb *iocb, struct iovec **iovec, ++static int aio_setup_rw(int rw, const struct iocb *iocb, struct iovec **iovec, + bool vectored, bool compat, struct iov_iter *iter) + { + void __user *buf = (void __user *)(uintptr_t)iocb->aio_buf; +@@ -1496,8 +1496,8 @@ static inline void aio_rw_done(struct ki + } + } + +-static ssize_t aio_read(struct kiocb *req, struct iocb *iocb, bool vectored, +- bool compat) ++static ssize_t aio_read(struct kiocb *req, const struct iocb *iocb, ++ bool vectored, bool compat) + { + struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; + struct iov_iter iter; +@@ -1529,8 +1529,8 @@ out_fput: + return ret; + } + +-static ssize_t aio_write(struct kiocb *req, struct iocb *iocb, bool vectored, +- bool compat) ++static ssize_t aio_write(struct kiocb *req, const struct iocb *iocb, ++ bool vectored, bool compat) + { + struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; + struct iov_iter iter; +@@ -1585,7 +1585,8 @@ static void aio_fsync_work(struct work_s + aio_complete(container_of(req, struct aio_kiocb, fsync), ret, 0); + } + +-static int aio_fsync(struct fsync_iocb *req, struct iocb *iocb, bool datasync) ++static int aio_fsync(struct fsync_iocb *req, const struct iocb *iocb, ++ bool datasync) + { + if (unlikely(iocb->aio_buf || iocb->aio_offset || iocb->aio_nbytes || + iocb->aio_rw_flags)) +@@ -1719,7 +1720,7 @@ aio_poll_queue_proc(struct file *file, s + add_wait_queue(head, &pt->iocb->poll.wait); + } + +-static ssize_t aio_poll(struct aio_kiocb *aiocb, struct iocb *iocb) ++static ssize_t aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb) + { + struct kioctx *ctx = aiocb->ki_ctx; + struct poll_iocb *req = &aiocb->poll; +@@ -1791,27 +1792,23 @@ out: + return 0; + } + +-static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, +- bool compat) ++static int __io_submit_one(struct kioctx *ctx, const struct iocb *iocb, ++ struct iocb __user *user_iocb, bool compat) + { + struct aio_kiocb *req; +- struct iocb iocb; + ssize_t ret; + +- if (unlikely(copy_from_user(&iocb, user_iocb, sizeof(iocb)))) +- return -EFAULT; +- + /* enforce forwards compatibility on users */ +- if (unlikely(iocb.aio_reserved2)) { ++ if (unlikely(iocb->aio_reserved2)) { + pr_debug("EINVAL: reserve field set\n"); + return -EINVAL; + } + + /* prevent overflows */ + if (unlikely( +- (iocb.aio_buf != (unsigned long)iocb.aio_buf) || +- (iocb.aio_nbytes != (size_t)iocb.aio_nbytes) || +- ((ssize_t)iocb.aio_nbytes < 0) ++ (iocb->aio_buf != (unsigned long)iocb->aio_buf) || ++ (iocb->aio_nbytes != (size_t)iocb->aio_nbytes) || ++ ((ssize_t)iocb->aio_nbytes < 0) + )) { + pr_debug("EINVAL: overflow check\n"); + return -EINVAL; +@@ -1825,14 +1822,14 @@ static int io_submit_one(struct kioctx * + if (unlikely(!req)) + goto out_put_reqs_available; + +- if (iocb.aio_flags & IOCB_FLAG_RESFD) { ++ if (iocb->aio_flags & IOCB_FLAG_RESFD) { + /* + * If the 
IOCB_FLAG_RESFD flag of aio_flags is set, get an + * instance of the file* now. The file descriptor must be + * an eventfd() fd, and will be signaled for each completed + * event using the eventfd_signal() function. + */ +- req->ki_eventfd = eventfd_ctx_fdget((int) iocb.aio_resfd); ++ req->ki_eventfd = eventfd_ctx_fdget((int) iocb->aio_resfd); + if (IS_ERR(req->ki_eventfd)) { + ret = PTR_ERR(req->ki_eventfd); + req->ki_eventfd = NULL; +@@ -1847,32 +1844,32 @@ static int io_submit_one(struct kioctx * + } + + req->ki_user_iocb = user_iocb; +- req->ki_user_data = iocb.aio_data; ++ req->ki_user_data = iocb->aio_data; + +- switch (iocb.aio_lio_opcode) { ++ switch (iocb->aio_lio_opcode) { + case IOCB_CMD_PREAD: +- ret = aio_read(&req->rw, &iocb, false, compat); ++ ret = aio_read(&req->rw, iocb, false, compat); + break; + case IOCB_CMD_PWRITE: +- ret = aio_write(&req->rw, &iocb, false, compat); ++ ret = aio_write(&req->rw, iocb, false, compat); + break; + case IOCB_CMD_PREADV: +- ret = aio_read(&req->rw, &iocb, true, compat); ++ ret = aio_read(&req->rw, iocb, true, compat); + break; + case IOCB_CMD_PWRITEV: +- ret = aio_write(&req->rw, &iocb, true, compat); ++ ret = aio_write(&req->rw, iocb, true, compat); + break; + case IOCB_CMD_FSYNC: +- ret = aio_fsync(&req->fsync, &iocb, false); ++ ret = aio_fsync(&req->fsync, iocb, false); + break; + case IOCB_CMD_FDSYNC: +- ret = aio_fsync(&req->fsync, &iocb, true); ++ ret = aio_fsync(&req->fsync, iocb, true); + break; + case IOCB_CMD_POLL: +- ret = aio_poll(req, &iocb); ++ ret = aio_poll(req, iocb); + break; + default: +- pr_debug("invalid aio operation %d\n", iocb.aio_lio_opcode); ++ pr_debug("invalid aio operation %d\n", iocb->aio_lio_opcode); + ret = -EINVAL; + break; + } +@@ -1894,6 +1891,17 @@ out_put_reqs_available: + return ret; + } + ++static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, ++ bool compat) ++{ ++ struct iocb iocb; ++ ++ if (unlikely(copy_from_user(&iocb, user_iocb, sizeof(iocb)))) ++ return -EFAULT; ++ ++ return __io_submit_one(ctx, &iocb, user_iocb, compat); ++} ++ + /* sys_io_submit: + * Queue the nr iocbs pointed to by iocbpp for processing. Returns + * the number of iocbs queued. May return -EINVAL if the aio_context diff --git a/queue-4.19/aio-store-event-at-final-iocb_put.patch b/queue-4.19/aio-store-event-at-final-iocb_put.patch new file mode 100644 index 00000000000..7960f262a43 --- /dev/null +++ b/queue-4.19/aio-store-event-at-final-iocb_put.patch @@ -0,0 +1,102 @@ +From 2bb874c0d873d13bd9b9b9c6d7b7c4edab18c8b4 Mon Sep 17 00:00:00 2001 +From: Al Viro +Date: Thu, 7 Mar 2019 19:49:55 -0500 +Subject: aio: store event at final iocb_put() + +From: Al Viro + +commit 2bb874c0d873d13bd9b9b9c6d7b7c4edab18c8b4 upstream. + +Instead of having aio_complete() set ->ki_res.{res,res2}, do that +explicitly in its callers, drop the reference (as aio_complete() +used to do) and delay the rest until the final iocb_put(). + +Signed-off-by: Al Viro +Cc: Guenter Roeck +Signed-off-by: Greg Kroah-Hartman + +--- + fs/aio.c | 33 +++++++++++++++++---------------- + 1 file changed, 17 insertions(+), 16 deletions(-) + +--- a/fs/aio.c ++++ b/fs/aio.c +@@ -1071,16 +1071,10 @@ static inline void iocb_destroy(struct a + kmem_cache_free(kiocb_cachep, iocb); + } + +-static inline void iocb_put(struct aio_kiocb *iocb) +-{ +- if (refcount_dec_and_test(&iocb->ki_refcnt)) +- iocb_destroy(iocb); +-} +- + /* aio_complete + * Called when the io request on the given iocb is complete. 
+ */ +-static void aio_complete(struct aio_kiocb *iocb, long res, long res2) ++static void aio_complete(struct aio_kiocb *iocb) + { + struct kioctx *ctx = iocb->ki_ctx; + struct aio_ring *ring; +@@ -1088,8 +1082,6 @@ static void aio_complete(struct aio_kioc + unsigned tail, pos, head; + unsigned long flags; + +- iocb->ki_res.res = res; +- iocb->ki_res.res2 = res2; + /* + * Add a completion event to the ring buffer. Must be done holding + * ctx->completion_lock to prevent other code from messing with the tail +@@ -1155,7 +1147,14 @@ static void aio_complete(struct aio_kioc + + if (waitqueue_active(&ctx->wait)) + wake_up(&ctx->wait); +- iocb_put(iocb); ++} ++ ++static inline void iocb_put(struct aio_kiocb *iocb) ++{ ++ if (refcount_dec_and_test(&iocb->ki_refcnt)) { ++ aio_complete(iocb); ++ iocb_destroy(iocb); ++ } + } + + /* aio_read_events_ring +@@ -1429,7 +1428,9 @@ static void aio_complete_rw(struct kiocb + file_end_write(kiocb->ki_filp); + } + +- aio_complete(iocb, res, res2); ++ iocb->ki_res.res = res; ++ iocb->ki_res.res2 = res2; ++ iocb_put(iocb); + } + + static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb) +@@ -1577,11 +1578,10 @@ static ssize_t aio_write(struct kiocb *r + + static void aio_fsync_work(struct work_struct *work) + { +- struct fsync_iocb *req = container_of(work, struct fsync_iocb, work); +- int ret; ++ struct aio_kiocb *iocb = container_of(work, struct aio_kiocb, fsync.work); + +- ret = vfs_fsync(req->file, req->datasync); +- aio_complete(container_of(req, struct aio_kiocb, fsync), ret, 0); ++ iocb->ki_res.res = vfs_fsync(iocb->fsync.file, iocb->fsync.datasync); ++ iocb_put(iocb); + } + + static int aio_fsync(struct fsync_iocb *req, const struct iocb *iocb, +@@ -1602,7 +1602,8 @@ static int aio_fsync(struct fsync_iocb * + + static inline void aio_poll_complete(struct aio_kiocb *iocb, __poll_t mask) + { +- aio_complete(iocb, mangle_poll(mask), 0); ++ iocb->ki_res.res = mangle_poll(mask); ++ iocb_put(iocb); + } + + static void aio_poll_complete_work(struct work_struct *work) diff --git a/queue-4.19/aio-use-assigned-completion-handler.patch b/queue-4.19/aio-use-assigned-completion-handler.patch new file mode 100644 index 00000000000..e4d4a453c48 --- /dev/null +++ b/queue-4.19/aio-use-assigned-completion-handler.patch @@ -0,0 +1,33 @@ +From bc9bff61624ac33b7c95861abea1af24ee7a94fc Mon Sep 17 00:00:00 2001 +From: Jens Axboe +Date: Tue, 6 Nov 2018 14:27:13 -0700 +Subject: aio: use assigned completion handler + +From: Jens Axboe + +commit bc9bff61624ac33b7c95861abea1af24ee7a94fc upstream. + +We know this is a read/write request, but in preparation for +having different kinds of those, ensure that we call the assigned +handler instead of assuming it's aio_complete_rq(). 
+ +Reviewed-by: Christoph Hellwig +Signed-off-by: Jens Axboe +Cc: Guenter Roeck +Signed-off-by: Greg Kroah-Hartman + +--- + fs/aio.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/fs/aio.c ++++ b/fs/aio.c +@@ -1492,7 +1492,7 @@ static inline void aio_rw_done(struct ki + ret = -EINTR; + /*FALLTHRU*/ + default: +- aio_complete_rw(req, ret, 0); ++ req->ki_complete(req, ret, 0); + } + } + diff --git a/queue-4.19/aio-use-iocb_put-instead-of-open-coding-it.patch b/queue-4.19/aio-use-iocb_put-instead-of-open-coding-it.patch new file mode 100644 index 00000000000..656cc8acea7 --- /dev/null +++ b/queue-4.19/aio-use-iocb_put-instead-of-open-coding-it.patch @@ -0,0 +1,35 @@ +From 71ebc6fef0f53459f37fb39e1466792232fa52ee Mon Sep 17 00:00:00 2001 +From: Jens Axboe +Date: Sat, 24 Nov 2018 21:33:09 -0700 +Subject: aio: use iocb_put() instead of open coding it + +From: Jens Axboe + +commit 71ebc6fef0f53459f37fb39e1466792232fa52ee upstream. + +Replace the percpu_ref_put() + kmem_cache_free() with a call to +iocb_put() instead. + +Reviewed-by: Christoph Hellwig +Signed-off-by: Jens Axboe +Cc: Guenter Roeck +Signed-off-by: Greg Kroah-Hartman + +--- + fs/aio.c | 3 +-- + 1 file changed, 1 insertion(+), 2 deletions(-) + +--- a/fs/aio.c ++++ b/fs/aio.c +@@ -1886,10 +1886,9 @@ static int io_submit_one(struct kioctx * + goto out_put_req; + return 0; + out_put_req: +- percpu_ref_put(&ctx->reqs); + if (req->ki_eventfd) + eventfd_ctx_put(req->ki_eventfd); +- kmem_cache_free(kiocb_cachep, req); ++ iocb_put(req); + out_put_reqs_available: + put_reqs_available(ctx, 1); + return ret; diff --git a/queue-4.19/fix-aio_poll-races.patch b/queue-4.19/fix-aio_poll-races.patch new file mode 100644 index 00000000000..19845b2c2db --- /dev/null +++ b/queue-4.19/fix-aio_poll-races.patch @@ -0,0 +1,226 @@ +From af5c72b1fc7a00aa484e90b0c4e0eeb582545634 Mon Sep 17 00:00:00 2001 +From: Al Viro +Date: Thu, 7 Mar 2019 21:45:41 -0500 +Subject: Fix aio_poll() races + +From: Al Viro + +commit af5c72b1fc7a00aa484e90b0c4e0eeb582545634 upstream. + +aio_poll() has to cope with several unpleasant problems: + * requests that might stay around indefinitely need to +be made visible for io_cancel(2); that must not be done to +a request already completed, though. + * in cases when ->poll() has placed us on a waitqueue, +wakeup might have happened (and request completed) before ->poll() +returns. + * worse, in some early wakeup cases request might end +up re-added into the queue later - we can't treat "woken up and +currently not in the queue" as "it's not going to stick around +indefinitely" + * ... moreover, ->poll() might have decided not to +put it on any queues to start with, and that needs to be distinguished +from the previous case + * ->poll() might have tried to put us on more than one queue. +Only the first will succeed for aio poll, so we might end up missing +wakeups. OTOH, we might very well notice that only after the +wakeup hits and request gets completed (all before ->poll() gets +around to the second poll_wait()). In that case it's too late to +decide that we have an error. + +req->woken was an attempt to deal with that. Unfortunately, it was +broken. What we need to keep track of is not that wakeup has happened - +the thing might come back after that. It's that async reference is +already gone and won't come back, so we can't (and needn't) put the +request on the list of cancellables. 
+ +The easiest case is "request hadn't been put on any waitqueues"; we +can tell by seeing NULL apt.head, and in that case there won't be +anything async. We should either complete the request ourselves +(if vfs_poll() reports anything of interest) or return an error. + +In all other cases we get exclusion with wakeups by grabbing the +queue lock. + +If request is currently on queue and we have something interesting +from vfs_poll(), we can steal it and complete the request ourselves. + +If it's on queue and vfs_poll() has not reported anything interesting, +we either put it on the cancellable list, or, if we know that it +hadn't been put on all queues ->poll() wanted it on, we steal it and +return an error. + +If it's _not_ on queue, it's either been already dealt with (in which +case we do nothing), or there's aio_poll_complete_work() about to be +executed. In that case we either put it on the cancellable list, +or, if we know it hadn't been put on all queues ->poll() wanted it on, +simulate what cancel would've done. + +It's a lot more convoluted than I'd like it to be. Single-consumer APIs +suck, and unfortunately aio is not an exception... + +Signed-off-by: Al Viro +Cc: Guenter Roeck +Signed-off-by: Greg Kroah-Hartman + +--- + fs/aio.c | 90 ++++++++++++++++++++++++++++----------------------------------- + 1 file changed, 40 insertions(+), 50 deletions(-) + +--- a/fs/aio.c ++++ b/fs/aio.c +@@ -175,7 +175,7 @@ struct poll_iocb { + struct file *file; + struct wait_queue_head *head; + __poll_t events; +- bool woken; ++ bool done; + bool cancelled; + struct wait_queue_entry wait; + struct work_struct work; +@@ -1600,12 +1600,6 @@ static int aio_fsync(struct fsync_iocb * + return 0; + } + +-static inline void aio_poll_complete(struct aio_kiocb *iocb, __poll_t mask) +-{ +- iocb->ki_res.res = mangle_poll(mask); +- iocb_put(iocb); +-} +- + static void aio_poll_complete_work(struct work_struct *work) + { + struct poll_iocb *req = container_of(work, struct poll_iocb, work); +@@ -1631,9 +1625,11 @@ static void aio_poll_complete_work(struc + return; + } + list_del_init(&iocb->ki_list); ++ iocb->ki_res.res = mangle_poll(mask); ++ req->done = true; + spin_unlock_irq(&ctx->ctx_lock); + +- aio_poll_complete(iocb, mask); ++ iocb_put(iocb); + } + + /* assumes we are called with irqs disabled */ +@@ -1661,31 +1657,27 @@ static int aio_poll_wake(struct wait_que + __poll_t mask = key_to_poll(key); + unsigned long flags; + +- req->woken = true; +- + /* for instances that support it check for an event match first: */ +- if (mask) { +- if (!(mask & req->events)) +- return 0; ++ if (mask && !(mask & req->events)) ++ return 0; + ++ list_del_init(&req->wait.entry); ++ ++ if (mask && spin_trylock_irqsave(&iocb->ki_ctx->ctx_lock, flags)) { + /* + * Try to complete the iocb inline if we can. Use + * irqsave/irqrestore because not all filesystems (e.g. fuse) + * call this function with IRQs disabled and because IRQs + * have to be disabled before ctx_lock is obtained. 
+ */ +- if (spin_trylock_irqsave(&iocb->ki_ctx->ctx_lock, flags)) { +- list_del(&iocb->ki_list); +- spin_unlock_irqrestore(&iocb->ki_ctx->ctx_lock, flags); +- +- list_del_init(&req->wait.entry); +- aio_poll_complete(iocb, mask); +- return 1; +- } ++ list_del(&iocb->ki_list); ++ iocb->ki_res.res = mangle_poll(mask); ++ req->done = true; ++ spin_unlock_irqrestore(&iocb->ki_ctx->ctx_lock, flags); ++ iocb_put(iocb); ++ } else { ++ schedule_work(&req->work); + } +- +- list_del_init(&req->wait.entry); +- schedule_work(&req->work); + return 1; + } + +@@ -1717,6 +1709,7 @@ static ssize_t aio_poll(struct aio_kiocb + struct kioctx *ctx = aiocb->ki_ctx; + struct poll_iocb *req = &aiocb->poll; + struct aio_poll_table apt; ++ bool cancel = false; + __poll_t mask; + + /* reject any unknown events outside the normal event mask. */ +@@ -1730,7 +1723,7 @@ static ssize_t aio_poll(struct aio_kiocb + req->events = demangle_poll(iocb->aio_buf) | EPOLLERR | EPOLLHUP; + + req->head = NULL; +- req->woken = false; ++ req->done = false; + req->cancelled = false; + + apt.pt._qproc = aio_poll_queue_proc; +@@ -1743,36 +1736,33 @@ static ssize_t aio_poll(struct aio_kiocb + init_waitqueue_func_entry(&req->wait, aio_poll_wake); + + mask = vfs_poll(req->file, &apt.pt) & req->events; +- if (unlikely(!req->head)) { +- /* we did not manage to set up a waitqueue, done */ +- goto out; +- } +- + spin_lock_irq(&ctx->ctx_lock); +- spin_lock(&req->head->lock); +- if (req->woken) { +- /* wake_up context handles the rest */ +- mask = 0; ++ if (likely(req->head)) { ++ spin_lock(&req->head->lock); ++ if (unlikely(list_empty(&req->wait.entry))) { ++ if (apt.error) ++ cancel = true; ++ apt.error = 0; ++ mask = 0; ++ } ++ if (mask || apt.error) { ++ list_del_init(&req->wait.entry); ++ } else if (cancel) { ++ WRITE_ONCE(req->cancelled, true); ++ } else if (!req->done) { /* actually waiting for an event */ ++ list_add_tail(&aiocb->ki_list, &ctx->active_reqs); ++ aiocb->ki_cancel = aio_poll_cancel; ++ } ++ spin_unlock(&req->head->lock); ++ } ++ if (mask) { /* no async, we'd stolen it */ ++ aiocb->ki_res.res = mangle_poll(mask); + apt.error = 0; +- } else if (mask || apt.error) { +- /* if we get an error or a mask we are done */ +- WARN_ON_ONCE(list_empty(&req->wait.entry)); +- list_del_init(&req->wait.entry); +- } else { +- /* actually waiting for an event */ +- list_add_tail(&aiocb->ki_list, &ctx->active_reqs); +- aiocb->ki_cancel = aio_poll_cancel; + } +- spin_unlock(&req->head->lock); + spin_unlock_irq(&ctx->ctx_lock); +- +-out: +- if (unlikely(apt.error)) +- return apt.error; +- + if (mask) +- aio_poll_complete(aiocb, mask); +- return 0; ++ iocb_put(aiocb); ++ return apt.error; + } + + static int __io_submit_one(struct kioctx *ctx, const struct iocb *iocb, diff --git a/queue-4.19/mm-fix-warning-in-insert_pfn.patch b/queue-4.19/mm-fix-warning-in-insert_pfn.patch new file mode 100644 index 00000000000..7ebd3439b9b --- /dev/null +++ b/queue-4.19/mm-fix-warning-in-insert_pfn.patch @@ -0,0 +1,74 @@ +From f2c57d91b0d96aa13ccff4e3b178038f17b00658 Mon Sep 17 00:00:00 2001 +From: Jan Kara +Date: Tue, 30 Oct 2018 15:10:47 -0700 +Subject: mm: Fix warning in insert_pfn() + +From: Jan Kara + +commit f2c57d91b0d96aa13ccff4e3b178038f17b00658 upstream. 
+ +In DAX mode a write pagefault can race with write(2) in the following +way: + +CPU0 CPU1 + write fault for mapped zero page (hole) +dax_iomap_rw() + iomap_apply() + xfs_file_iomap_begin() + - allocates blocks + dax_iomap_actor() + invalidate_inode_pages2_range() + - invalidates radix tree entries in given range + dax_iomap_pte_fault() + grab_mapping_entry() + - no entry found, creates empty + ... + xfs_file_iomap_begin() + - finds already allocated block + ... + vmf_insert_mixed_mkwrite() + - WARNs and does nothing because there + is still zero page mapped in PTE + unmap_mapping_pages() + +This race results in WARN_ON from insert_pfn() and is occasionally +triggered by fstest generic/344. Note that the race is otherwise +harmless as before write(2) on CPU0 is finished, we will invalidate page +tables properly and thus user of mmap will see modified data from +write(2) from that point on. So just restrict the warning only to the +case when the PFN in PTE is not zero page. + +Link: http://lkml.kernel.org/r/20180824154542.26872-1-jack@suse.cz +Signed-off-by: Jan Kara +Reviewed-by: Andrew Morton +Cc: Ross Zwisler +Cc: Dan Williams +Cc: Dave Jiang +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + mm/memory.c | 9 +++++++-- + 1 file changed, 7 insertions(+), 2 deletions(-) + +--- a/mm/memory.c ++++ b/mm/memory.c +@@ -1787,10 +1787,15 @@ static int insert_pfn(struct vm_area_str + * in may not match the PFN we have mapped if the + * mapped PFN is a writeable COW page. In the mkwrite + * case we are creating a writable PTE for a shared +- * mapping and we expect the PFNs to match. ++ * mapping and we expect the PFNs to match. If they ++ * don't match, we are likely racing with block ++ * allocation and mapping invalidation so just skip the ++ * update. + */ +- if (WARN_ON_ONCE(pte_pfn(*pte) != pfn_t_to_pfn(pfn))) ++ if (pte_pfn(*pte) != pfn_t_to_pfn(pfn)) { ++ WARN_ON_ONCE(!is_zero_pfn(pte_pfn(*pte))); + goto out_unlock; ++ } + entry = *pte; + goto out_mkwrite; + } else diff --git a/queue-4.19/pin-iocb-through-aio.patch b/queue-4.19/pin-iocb-through-aio.patch new file mode 100644 index 00000000000..b133ac5d287 --- /dev/null +++ b/queue-4.19/pin-iocb-through-aio.patch @@ -0,0 +1,113 @@ +From b53119f13a04879c3bf502828d99d13726639ead Mon Sep 17 00:00:00 2001 +From: Linus Torvalds +Date: Wed, 6 Mar 2019 20:22:54 -0500 +Subject: pin iocb through aio. + +From: Linus Torvalds + +commit b53119f13a04879c3bf502828d99d13726639ead upstream. + +aio_poll() is not the only case that needs file pinned; worse, while +aio_read()/aio_write() can live without pinning iocb itself, the +proof is rather brittle and can easily break on later changes. + +Signed-off-by: Linus Torvalds +Signed-off-by: Al Viro +Cc: Guenter Roeck +Signed-off-by: Greg Kroah-Hartman + +--- + fs/aio.c | 37 +++++++++++++++++++++---------------- + 1 file changed, 21 insertions(+), 16 deletions(-) + +--- a/fs/aio.c ++++ b/fs/aio.c +@@ -1016,6 +1016,9 @@ static bool get_reqs_available(struct ki + /* aio_get_req + * Allocate a slot for an aio request. + * Returns NULL if no requests are free. ++ * ++ * The refcount is initialized to 2 - one for the async op completion, ++ * one for the synchronous code that does this. 
+ */ + static inline struct aio_kiocb *aio_get_req(struct kioctx *ctx) + { +@@ -1028,7 +1031,7 @@ static inline struct aio_kiocb *aio_get_ + percpu_ref_get(&ctx->reqs); + req->ki_ctx = ctx; + INIT_LIST_HEAD(&req->ki_list); +- refcount_set(&req->ki_refcnt, 0); ++ refcount_set(&req->ki_refcnt, 2); + req->ki_eventfd = NULL; + return req; + } +@@ -1061,15 +1064,18 @@ out: + return ret; + } + ++static inline void iocb_destroy(struct aio_kiocb *iocb) ++{ ++ if (iocb->ki_filp) ++ fput(iocb->ki_filp); ++ percpu_ref_put(&iocb->ki_ctx->reqs); ++ kmem_cache_free(kiocb_cachep, iocb); ++} ++ + static inline void iocb_put(struct aio_kiocb *iocb) + { +- if (refcount_read(&iocb->ki_refcnt) == 0 || +- refcount_dec_and_test(&iocb->ki_refcnt)) { +- if (iocb->ki_filp) +- fput(iocb->ki_filp); +- percpu_ref_put(&iocb->ki_ctx->reqs); +- kmem_cache_free(kiocb_cachep, iocb); +- } ++ if (refcount_dec_and_test(&iocb->ki_refcnt)) ++ iocb_destroy(iocb); + } + + static void aio_fill_event(struct io_event *ev, struct aio_kiocb *iocb, +@@ -1743,9 +1749,6 @@ static ssize_t aio_poll(struct aio_kiocb + INIT_LIST_HEAD(&req->wait.entry); + init_waitqueue_func_entry(&req->wait, aio_poll_wake); + +- /* one for removal from waitqueue, one for this function */ +- refcount_set(&aiocb->ki_refcnt, 2); +- + mask = vfs_poll(req->file, &apt.pt) & req->events; + if (unlikely(!req->head)) { + /* we did not manage to set up a waitqueue, done */ +@@ -1776,7 +1779,6 @@ out: + + if (mask) + aio_poll_complete(aiocb, mask); +- iocb_put(aiocb); + return 0; + } + +@@ -1867,18 +1869,21 @@ static int __io_submit_one(struct kioctx + break; + } + ++ /* Done with the synchronous reference */ ++ iocb_put(req); ++ + /* + * If ret is 0, we'd either done aio_complete() ourselves or have + * arranged for that to be done asynchronously. Anything non-zero + * means that we need to destroy req ourselves. 
+ */ +- if (ret) +- goto out_put_req; +- return 0; ++ if (!ret) ++ return 0; ++ + out_put_req: + if (req->ki_eventfd) + eventfd_ctx_put(req->ki_eventfd); +- iocb_put(req); ++ iocb_destroy(req); + out_put_reqs_available: + put_reqs_available(ctx, 1); + return ret; diff --git a/queue-4.19/series b/queue-4.19/series index bfbc02c8cf3..b33883c7ca7 100644 --- a/queue-4.19/series +++ b/queue-4.19/series @@ -63,3 +63,20 @@ tipc-check-link-name-with-right-length-in-tipc_nl_compat_link_set.patch net-netrom-fix-error-cleanup-path-of-nr_proto_init.patch net-rds-check-address-length-before-reading-address-family.patch rxrpc-fix-race-condition-in-rxrpc_input_packet.patch +aio-clear-iocb_hipri.patch +aio-use-assigned-completion-handler.patch +aio-separate-out-ring-reservation-from-req-allocation.patch +aio-don-t-zero-entire-aio_kiocb-aio_get_req.patch +aio-use-iocb_put-instead-of-open-coding-it.patch +aio-split-out-iocb-copy-from-io_submit_one.patch +aio-abstract-out-io_event-filler-helper.patch +aio-initialize-kiocb-private-in-case-any-filesystems-expect-it.patch +aio-simplify-and-fix-fget-fput-for-io_submit.patch +pin-iocb-through-aio.patch +aio-fold-lookup_kiocb-into-its-sole-caller.patch +aio-keep-io_event-in-aio_kiocb.patch +aio-store-event-at-final-iocb_put.patch +fix-aio_poll-races.patch +x86-retpolines-raise-limit-for-generating-indirect-calls-from-switch-case.patch +x86-retpolines-disable-switch-jump-tables-when-retpolines-are-enabled.patch +mm-fix-warning-in-insert_pfn.patch diff --git a/queue-4.19/x86-retpolines-disable-switch-jump-tables-when-retpolines-are-enabled.patch b/queue-4.19/x86-retpolines-disable-switch-jump-tables-when-retpolines-are-enabled.patch new file mode 100644 index 00000000000..d4bd615c1ae --- /dev/null +++ b/queue-4.19/x86-retpolines-disable-switch-jump-tables-when-retpolines-are-enabled.patch @@ -0,0 +1,70 @@ +From a9d57ef15cbe327fe54416dd194ee0ea66ae53a4 Mon Sep 17 00:00:00 2001 +From: Daniel Borkmann +Date: Mon, 25 Mar 2019 14:56:20 +0100 +Subject: x86/retpolines: Disable switch jump tables when retpolines are enabled +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Daniel Borkmann + +commit a9d57ef15cbe327fe54416dd194ee0ea66ae53a4 upstream. + +Commit ce02ef06fcf7 ("x86, retpolines: Raise limit for generating indirect +calls from switch-case") raised the limit under retpolines to 20 switch +cases where gcc would only then start to emit jump tables, and therefore +effectively disabling the emission of slow indirect calls in this area. + +After this has been brought to attention to gcc folks [0], Martin Liska +has then fixed gcc to align with clang by avoiding to generate switch jump +tables entirely under retpolines. This is taking effect in gcc starting +from stable version 8.4.0. Given kernel supports compilation with older +versions of gcc where the fix is not being available or backported anymore, +we need to keep the extra KBUILD_CFLAGS around for some time and generally +set the -fno-jump-tables to align with what more recent gcc is doing +automatically today. + +More than 20 switch cases are not expected to be fast-path critical, but +it would still be good to align with gcc behavior for versions < 8.4.0 in +order to have consistency across supported gcc versions. vmlinux size is +slightly growing by 0.27% for older gcc. This flag is only set to work +around affected gcc, no change for clang. 
+ + [0] https://gcc.gnu.org/bugzilla/show_bug.cgi?id=86952 + +Suggested-by: Martin Liska +Signed-off-by: Daniel Borkmann +Signed-off-by: Thomas Gleixner +Cc: David Woodhouse +Cc: Linus Torvalds +Cc: Jesper Dangaard Brouer +Cc: Björn Töpel +Cc: Magnus Karlsson +Cc: Alexei Starovoitov +Cc: H.J. Lu +Cc: Alexei Starovoitov +Cc: David S. Miller +Link: https://lkml.kernel.org/r/20190325135620.14882-1-daniel@iogearbox.net +Signed-off-by: Greg Kroah-Hartman + +--- + arch/x86/Makefile | 8 ++++++-- + 1 file changed, 6 insertions(+), 2 deletions(-) + +--- a/arch/x86/Makefile ++++ b/arch/x86/Makefile +@@ -227,8 +227,12 @@ ifdef CONFIG_RETPOLINE + # Additionally, avoid generating expensive indirect jumps which + # are subject to retpolines for small number of switch cases. + # clang turns off jump table generation by default when under +- # retpoline builds, however, gcc does not for x86. +- KBUILD_CFLAGS += $(call cc-option,--param=case-values-threshold=20) ++ # retpoline builds, however, gcc does not for x86. This has ++ # only been fixed starting from gcc stable version 8.4.0 and ++ # onwards, but not for older ones. See gcc bug #86952. ++ ifndef CONFIG_CC_IS_CLANG ++ KBUILD_CFLAGS += $(call cc-option,-fno-jump-tables) ++ endif + endif + + archscripts: scripts_basic diff --git a/queue-4.19/x86-retpolines-raise-limit-for-generating-indirect-calls-from-switch-case.patch b/queue-4.19/x86-retpolines-raise-limit-for-generating-indirect-calls-from-switch-case.patch new file mode 100644 index 00000000000..9c4e9743cd3 --- /dev/null +++ b/queue-4.19/x86-retpolines-raise-limit-for-generating-indirect-calls-from-switch-case.patch @@ -0,0 +1,175 @@ +From ce02ef06fcf7a399a6276adb83f37373d10cbbe1 Mon Sep 17 00:00:00 2001 +From: Daniel Borkmann +Date: Thu, 21 Feb 2019 23:19:41 +0100 +Subject: x86, retpolines: Raise limit for generating indirect calls from switch-case +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Daniel Borkmann + +commit ce02ef06fcf7a399a6276adb83f37373d10cbbe1 upstream. + +From networking side, there are numerous attempts to get rid of indirect +calls in fast-path wherever feasible in order to avoid the cost of +retpolines, for example, just to name a few: + + * 283c16a2dfd3 ("indirect call wrappers: helpers to speed-up indirect calls of builtin") + * aaa5d90b395a ("net: use indirect call wrappers at GRO network layer") + * 028e0a476684 ("net: use indirect call wrappers at GRO transport layer") + * 356da6d0cde3 ("dma-mapping: bypass indirect calls for dma-direct") + * 09772d92cd5a ("bpf: avoid retpoline for lookup/update/delete calls on maps") + * 10870dd89e95 ("netfilter: nf_tables: add direct calls for all builtin expressions") + [...] + +Recent work on XDP from Björn and Magnus additionally found that manually +transforming the XDP return code switch statement with more than 5 cases +into if-else combination would result in a considerable speedup in XDP +layer due to avoidance of indirect calls in CONFIG_RETPOLINE enabled +builds. On i40e driver with XDP prog attached, a 20-26% speedup has been +observed [0]. Aside from XDP, there are many other places later in the +networking stack's critical path with similar switch-case +processing. Rather than fixing every XDP-enabled driver and locations in +stack by hand, it would be good to instead raise the limit where gcc would +emit expensive indirect calls from the switch under retpolines and stick +with the default as-is in case of !retpoline configured kernels. 
This would +also have the advantage that for archs where this is not necessary, we let +compiler select the underlying target optimization for these constructs and +avoid potential slow-downs by if-else hand-rewrite. + +In case of gcc, this setting is controlled by case-values-threshold which +has an architecture global default that selects 4 or 5 (latter if target +does not have a case insn that compares the bounds) where some arch back +ends like arm64 or s390 override it with their own target hooks, for +example, in gcc commit db7a90aa0de5 ("S/390: Disable prediction of indirect +branches") the threshold pretty much disables jump tables by limit of 20 +under retpoline builds. Comparing gcc's and clang's default code +generation on x86-64 under O2 level with retpoline build results in the +following outcome for 5 switch cases: + +* gcc with -mindirect-branch=thunk-inline -mindirect-branch-register: + + # gdb -batch -ex 'disassemble dispatch' ./c-switch + Dump of assembler code for function dispatch: + 0x0000000000400be0 <+0>: cmp $0x4,%edi + 0x0000000000400be3 <+3>: ja 0x400c35 + 0x0000000000400be5 <+5>: lea 0x915f8(%rip),%rdx # 0x4921e4 + 0x0000000000400bec <+12>: mov %edi,%edi + 0x0000000000400bee <+14>: movslq (%rdx,%rdi,4),%rax + 0x0000000000400bf2 <+18>: add %rdx,%rax + 0x0000000000400bf5 <+21>: callq 0x400c01 + 0x0000000000400bfa <+26>: pause + 0x0000000000400bfc <+28>: lfence + 0x0000000000400bff <+31>: jmp 0x400bfa + 0x0000000000400c01 <+33>: mov %rax,(%rsp) + 0x0000000000400c05 <+37>: retq + 0x0000000000400c06 <+38>: nopw %cs:0x0(%rax,%rax,1) + 0x0000000000400c10 <+48>: jmpq 0x400c90 + 0x0000000000400c15 <+53>: nopl (%rax) + 0x0000000000400c18 <+56>: jmpq 0x400c70 + 0x0000000000400c1d <+61>: nopl (%rax) + 0x0000000000400c20 <+64>: jmpq 0x400c50 + 0x0000000000400c25 <+69>: nopl (%rax) + 0x0000000000400c28 <+72>: jmpq 0x400c40 + 0x0000000000400c2d <+77>: nopl (%rax) + 0x0000000000400c30 <+80>: jmpq 0x400cb0 + 0x0000000000400c35 <+85>: push %rax + 0x0000000000400c36 <+86>: callq 0x40dd80 + End of assembler dump. + +* clang with -mretpoline emitting search tree: + + # gdb -batch -ex 'disassemble dispatch' ./c-switch + Dump of assembler code for function dispatch: + 0x0000000000400b30 <+0>: cmp $0x1,%edi + 0x0000000000400b33 <+3>: jle 0x400b44 + 0x0000000000400b35 <+5>: cmp $0x2,%edi + 0x0000000000400b38 <+8>: je 0x400b4d + 0x0000000000400b3a <+10>: cmp $0x3,%edi + 0x0000000000400b3d <+13>: jne 0x400b52 + 0x0000000000400b3f <+15>: jmpq 0x400c50 + 0x0000000000400b44 <+20>: test %edi,%edi + 0x0000000000400b46 <+22>: jne 0x400b5c + 0x0000000000400b48 <+24>: jmpq 0x400c20 + 0x0000000000400b4d <+29>: jmpq 0x400c40 + 0x0000000000400b52 <+34>: cmp $0x4,%edi + 0x0000000000400b55 <+37>: jne 0x400b66 + 0x0000000000400b57 <+39>: jmpq 0x400c60 + 0x0000000000400b5c <+44>: cmp $0x1,%edi + 0x0000000000400b5f <+47>: jne 0x400b66 + 0x0000000000400b61 <+49>: jmpq 0x400c30 + 0x0000000000400b66 <+54>: push %rax + 0x0000000000400b67 <+55>: callq 0x40dd20 + End of assembler dump. 
+ + For sake of comparison, clang without -mretpoline: + + # gdb -batch -ex 'disassemble dispatch' ./c-switch + Dump of assembler code for function dispatch: + 0x0000000000400b30 <+0>: cmp $0x4,%edi + 0x0000000000400b33 <+3>: ja 0x400b57 + 0x0000000000400b35 <+5>: mov %edi,%eax + 0x0000000000400b37 <+7>: jmpq *0x492148(,%rax,8) + 0x0000000000400b3e <+14>: jmpq 0x400bf0 + 0x0000000000400b43 <+19>: jmpq 0x400c30 + 0x0000000000400b48 <+24>: jmpq 0x400c10 + 0x0000000000400b4d <+29>: jmpq 0x400c20 + 0x0000000000400b52 <+34>: jmpq 0x400c00 + 0x0000000000400b57 <+39>: push %rax + 0x0000000000400b58 <+40>: callq 0x40dcf0 + End of assembler dump. + +Raising the cases to a high number (e.g. 100) will still result in similar +code generation pattern with clang and gcc as above, in other words clang +generally turns off jump table emission by having an extra expansion pass +under retpoline build to turn indirectbr instructions from their IR into +switch instructions as a built-in -mno-jump-table lowering of a switch (in +this case, even if IR input already contained an indirect branch). + +For gcc, adding --param=case-values-threshold=20 as in similar fashion as +s390 in order to raise the limit for x86 retpoline enabled builds results +in a small vmlinux size increase of only 0.13% (before=18,027,528 +after=18,051,192). For clang this option is ignored due to i) not being +needed as mentioned and ii) not having above cmdline +parameter. Non-retpoline-enabled builds with gcc continue to use the +default case-values-threshold setting, so nothing changes here. + +[0] https://lore.kernel.org/netdev/20190129095754.9390-1-bjorn.topel@gmail.com/ + and "The Path to DPDK Speeds for AF_XDP", LPC 2018, networking track: + - http://vger.kernel.org/lpc_net2018_talks/lpc18_pres_af_xdp_perf-v3.pdf + - http://vger.kernel.org/lpc_net2018_talks/lpc18_paper_af_xdp_perf-v2.pdf + +Signed-off-by: Daniel Borkmann +Signed-off-by: Thomas Gleixner +Acked-by: Jesper Dangaard Brouer +Acked-by: Björn Töpel +Acked-by: Linus Torvalds +Cc: netdev@vger.kernel.org +Cc: David S. Miller +Cc: Magnus Karlsson +Cc: Alexei Starovoitov +Cc: Peter Zijlstra +Cc: David Woodhouse +Cc: Andy Lutomirski +Cc: Borislav Petkov +Link: https://lkml.kernel.org/r/20190221221941.29358-1-daniel@iogearbox.net +Signed-off-by: Greg Kroah-Hartman + +--- + arch/x86/Makefile | 5 +++++ + 1 file changed, 5 insertions(+) + +--- a/arch/x86/Makefile ++++ b/arch/x86/Makefile +@@ -224,6 +224,11 @@ KBUILD_CFLAGS += -fno-asynchronous-unwin + # Avoid indirect branches in kernel to deal with Spectre + ifdef CONFIG_RETPOLINE + KBUILD_CFLAGS += $(RETPOLINE_CFLAGS) ++ # Additionally, avoid generating expensive indirect jumps which ++ # are subject to retpolines for small number of switch cases. ++ # clang turns off jump table generation by default when under ++ # retpoline builds, however, gcc does not for x86. ++ KBUILD_CFLAGS += $(call cc-option,--param=case-values-threshold=20) + endif + + archscripts: scripts_basic
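
---

Note: the aio changes queued above all revolve around the lifetime of an iocb submitted with IOCB_CMD_POLL — io_submit(2) racing with wakeups, io_cancel(2), and the final iocb_put(). A minimal userspace sketch that exercises that path is given below as a quick smoke test for the backport. It is not part of the series; it assumes <linux/aio_abi.h> from 4.18+ kernel headers (where IOCB_CMD_POLL is defined) and uses raw syscalls, with error handling kept minimal.

	/*
	 * Hypothetical standalone test, not part of the patch series:
	 * submit an IOCB_CMD_POLL on a pipe, trigger a wakeup, reap the event.
	 */
	#include <linux/aio_abi.h>
	#include <sys/syscall.h>
	#include <unistd.h>
	#include <string.h>
	#include <stdio.h>
	#include <poll.h>
	#include <time.h>

	static long io_setup(unsigned nr, aio_context_t *ctx)
	{
		return syscall(__NR_io_setup, nr, ctx);
	}

	static long io_submit(aio_context_t ctx, long nr, struct iocb **iocbpp)
	{
		return syscall(__NR_io_submit, ctx, nr, iocbpp);
	}

	static long io_getevents(aio_context_t ctx, long min_nr, long nr,
				 struct io_event *events, struct timespec *timeout)
	{
		return syscall(__NR_io_getevents, ctx, min_nr, nr, events, timeout);
	}

	int main(void)
	{
		aio_context_t ctx = 0;
		struct iocb cb, *cbs[1] = { &cb };
		struct io_event ev;
		int pfd[2];

		if (io_setup(8, &ctx) < 0 || pipe(pfd) < 0)
			return 1;

		memset(&cb, 0, sizeof(cb));
		cb.aio_fildes = pfd[0];
		cb.aio_lio_opcode = IOCB_CMD_POLL;	/* poll request */
		cb.aio_buf = POLLIN;			/* requested events, read by aio_poll() */

		if (io_submit(ctx, 1, cbs) != 1)
			return 1;

		/* make the pipe readable so the poll wakeup fires */
		write(pfd[1], "x", 1);

		if (io_getevents(ctx, 1, 1, &ev, NULL) != 1)
			return 1;

		printf("poll result: 0x%llx\n", (unsigned long long)ev.res);
		return 0;
	}

Submitting the poll, triggering the wakeup through the write end of the pipe, and reaping the completion walks through aio_poll(), aio_poll_wake() and the final iocb_put() ordering that the queued fixes adjust; closing the read end between io_submit() and the wakeup additionally exercises the fget/fput lifetime addressed by aio-simplify-and-fix-fget-fput-for-io_submit.patch and pin-iocb-through-aio.patch.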