From: Greg Kroah-Hartman Date: Mon, 29 Apr 2019 13:15:31 +0000 (+0200) Subject: 5.0-stable patches X-Git-Tag: v4.9.172~21 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=088b31d94dc530edbc78f28467046d5676079095;p=thirdparty%2Fkernel%2Fstable-queue.git 5.0-stable patches added patches: aio-fold-lookup_kiocb-into-its-sole-caller.patch aio-keep-io_event-in-aio_kiocb.patch aio-store-event-at-final-iocb_put.patch fix-aio_poll-races.patch pin-iocb-through-aio.patch x86-retpolines-disable-switch-jump-tables-when-retpolines-are-enabled.patch x86-retpolines-raise-limit-for-generating-indirect-calls-from-switch-case.patch --- diff --git a/queue-5.0/aio-fold-lookup_kiocb-into-its-sole-caller.patch b/queue-5.0/aio-fold-lookup_kiocb-into-its-sole-caller.patch new file mode 100644 index 00000000000..4113707990c --- /dev/null +++ b/queue-5.0/aio-fold-lookup_kiocb-into-its-sole-caller.patch @@ -0,0 +1,62 @@ +From 833f4154ed560232120bc475935ee1d6a20e159f Mon Sep 17 00:00:00 2001 +From: Al Viro +Date: Mon, 11 Mar 2019 19:00:36 -0400 +Subject: aio: fold lookup_kiocb() into its sole caller + +From: Al Viro + +commit 833f4154ed560232120bc475935ee1d6a20e159f upstream. + +Signed-off-by: Al Viro +Cc: Guenter Roeck +Signed-off-by: Greg Kroah-Hartman + +--- + fs/aio.c | 29 +++++++---------------------- + 1 file changed, 7 insertions(+), 22 deletions(-) + +--- a/fs/aio.c ++++ b/fs/aio.c +@@ -2002,24 +2002,6 @@ COMPAT_SYSCALL_DEFINE3(io_submit, compat + } + #endif + +-/* lookup_kiocb +- * Finds a given iocb for cancellation. +- */ +-static struct aio_kiocb * +-lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb) +-{ +- struct aio_kiocb *kiocb; +- +- assert_spin_locked(&ctx->ctx_lock); +- +- /* TODO: use a hash or array, this sucks. */ +- list_for_each_entry(kiocb, &ctx->active_reqs, ki_list) { +- if (kiocb->ki_user_iocb == iocb) +- return kiocb; +- } +- return NULL; +-} +- + /* sys_io_cancel: + * Attempts to cancel an iocb previously passed to io_submit. If + * the operation is successfully cancelled, the resulting event is +@@ -2048,10 +2030,13 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t + return -EINVAL; + + spin_lock_irq(&ctx->ctx_lock); +- kiocb = lookup_kiocb(ctx, iocb); +- if (kiocb) { +- ret = kiocb->ki_cancel(&kiocb->rw); +- list_del_init(&kiocb->ki_list); ++ /* TODO: use a hash or array, this sucks. */ ++ list_for_each_entry(kiocb, &ctx->active_reqs, ki_list) { ++ if (kiocb->ki_user_iocb == iocb) { ++ ret = kiocb->ki_cancel(&kiocb->rw); ++ list_del_init(&kiocb->ki_list); ++ break; ++ } + } + spin_unlock_irq(&ctx->ctx_lock); + diff --git a/queue-5.0/aio-keep-io_event-in-aio_kiocb.patch b/queue-5.0/aio-keep-io_event-in-aio_kiocb.patch new file mode 100644 index 00000000000..06f6eb5dabd --- /dev/null +++ b/queue-5.0/aio-keep-io_event-in-aio_kiocb.patch @@ -0,0 +1,106 @@ +From a9339b7855094ba11a97e8822ae038135e879e79 Mon Sep 17 00:00:00 2001 +From: Al Viro +Date: Thu, 7 Mar 2019 19:43:45 -0500 +Subject: aio: keep io_event in aio_kiocb + +From: Al Viro + +commit a9339b7855094ba11a97e8822ae038135e879e79 upstream. + +We want to separate forming the resulting io_event from putting it +into the ring buffer. + +Signed-off-by: Al Viro +Cc: Guenter Roeck +Signed-off-by: Greg Kroah-Hartman + +--- + fs/aio.c | 31 +++++++++++++------------------ + 1 file changed, 13 insertions(+), 18 deletions(-) + +--- a/fs/aio.c ++++ b/fs/aio.c +@@ -204,8 +204,7 @@ struct aio_kiocb { + struct kioctx *ki_ctx; + kiocb_cancel_fn *ki_cancel; + +- struct iocb __user *ki_user_iocb; /* user's aiocb */ +- __u64 ki_user_data; /* user's data for completion */ ++ struct io_event ki_res; + + struct list_head ki_list; /* the aio core uses this + * for cancellation */ +@@ -1084,15 +1083,6 @@ static inline void iocb_put(struct aio_k + iocb_destroy(iocb); + } + +-static void aio_fill_event(struct io_event *ev, struct aio_kiocb *iocb, +- long res, long res2) +-{ +- ev->obj = (u64)(unsigned long)iocb->ki_user_iocb; +- ev->data = iocb->ki_user_data; +- ev->res = res; +- ev->res2 = res2; +-} +- + /* aio_complete + * Called when the io request on the given iocb is complete. + */ +@@ -1104,6 +1094,8 @@ static void aio_complete(struct aio_kioc + unsigned tail, pos, head; + unsigned long flags; + ++ iocb->ki_res.res = res; ++ iocb->ki_res.res2 = res2; + /* + * Add a completion event to the ring buffer. Must be done holding + * ctx->completion_lock to prevent other code from messing with the tail +@@ -1120,14 +1112,14 @@ static void aio_complete(struct aio_kioc + ev_page = kmap_atomic(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]); + event = ev_page + pos % AIO_EVENTS_PER_PAGE; + +- aio_fill_event(event, iocb, res, res2); ++ *event = iocb->ki_res; + + kunmap_atomic(ev_page); + flush_dcache_page(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]); + +- pr_debug("%p[%u]: %p: %p %Lx %lx %lx\n", +- ctx, tail, iocb, iocb->ki_user_iocb, iocb->ki_user_data, +- res, res2); ++ pr_debug("%p[%u]: %p: %p %Lx %Lx %Lx\n", ctx, tail, iocb, ++ (void __user *)(unsigned long)iocb->ki_res.obj, ++ iocb->ki_res.data, iocb->ki_res.res, iocb->ki_res.res2); + + /* after flagging the request as done, we + * must never even look at it again +@@ -1844,8 +1836,10 @@ static int __io_submit_one(struct kioctx + goto out_put_req; + } + +- req->ki_user_iocb = user_iocb; +- req->ki_user_data = iocb->aio_data; ++ req->ki_res.obj = (u64)(unsigned long)user_iocb; ++ req->ki_res.data = iocb->aio_data; ++ req->ki_res.res = 0; ++ req->ki_res.res2 = 0; + + switch (iocb->aio_lio_opcode) { + case IOCB_CMD_PREAD: +@@ -2019,6 +2013,7 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t + struct aio_kiocb *kiocb; + int ret = -EINVAL; + u32 key; ++ u64 obj = (u64)(unsigned long)iocb; + + if (unlikely(get_user(key, &iocb->aio_key))) + return -EFAULT; +@@ -2032,7 +2027,7 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t + spin_lock_irq(&ctx->ctx_lock); + /* TODO: use a hash or array, this sucks. */ + list_for_each_entry(kiocb, &ctx->active_reqs, ki_list) { +- if (kiocb->ki_user_iocb == iocb) { ++ if (kiocb->ki_res.obj == obj) { + ret = kiocb->ki_cancel(&kiocb->rw); + list_del_init(&kiocb->ki_list); + break; diff --git a/queue-5.0/aio-store-event-at-final-iocb_put.patch b/queue-5.0/aio-store-event-at-final-iocb_put.patch new file mode 100644 index 00000000000..ac38a491308 --- /dev/null +++ b/queue-5.0/aio-store-event-at-final-iocb_put.patch @@ -0,0 +1,102 @@ +From 2bb874c0d873d13bd9b9b9c6d7b7c4edab18c8b4 Mon Sep 17 00:00:00 2001 +From: Al Viro +Date: Thu, 7 Mar 2019 19:49:55 -0500 +Subject: aio: store event at final iocb_put() + +From: Al Viro + +commit 2bb874c0d873d13bd9b9b9c6d7b7c4edab18c8b4 upstream. + +Instead of having aio_complete() set ->ki_res.{res,res2}, do that +explicitly in its callers, drop the reference (as aio_complete() +used to do) and delay the rest until the final iocb_put(). + +Signed-off-by: Al Viro +Cc: Guenter Roeck +Signed-off-by: Greg Kroah-Hartman + +--- + fs/aio.c | 33 +++++++++++++++++---------------- + 1 file changed, 17 insertions(+), 16 deletions(-) + +--- a/fs/aio.c ++++ b/fs/aio.c +@@ -1077,16 +1077,10 @@ static inline void iocb_destroy(struct a + kmem_cache_free(kiocb_cachep, iocb); + } + +-static inline void iocb_put(struct aio_kiocb *iocb) +-{ +- if (refcount_dec_and_test(&iocb->ki_refcnt)) +- iocb_destroy(iocb); +-} +- + /* aio_complete + * Called when the io request on the given iocb is complete. + */ +-static void aio_complete(struct aio_kiocb *iocb, long res, long res2) ++static void aio_complete(struct aio_kiocb *iocb) + { + struct kioctx *ctx = iocb->ki_ctx; + struct aio_ring *ring; +@@ -1094,8 +1088,6 @@ static void aio_complete(struct aio_kioc + unsigned tail, pos, head; + unsigned long flags; + +- iocb->ki_res.res = res; +- iocb->ki_res.res2 = res2; + /* + * Add a completion event to the ring buffer. Must be done holding + * ctx->completion_lock to prevent other code from messing with the tail +@@ -1161,7 +1153,14 @@ static void aio_complete(struct aio_kioc + + if (waitqueue_active(&ctx->wait)) + wake_up(&ctx->wait); +- iocb_put(iocb); ++} ++ ++static inline void iocb_put(struct aio_kiocb *iocb) ++{ ++ if (refcount_dec_and_test(&iocb->ki_refcnt)) { ++ aio_complete(iocb); ++ iocb_destroy(iocb); ++ } + } + + /* aio_read_events_ring +@@ -1435,7 +1434,9 @@ static void aio_complete_rw(struct kiocb + file_end_write(kiocb->ki_filp); + } + +- aio_complete(iocb, res, res2); ++ iocb->ki_res.res = res; ++ iocb->ki_res.res2 = res2; ++ iocb_put(iocb); + } + + static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb) +@@ -1583,11 +1584,10 @@ static ssize_t aio_write(struct kiocb *r + + static void aio_fsync_work(struct work_struct *work) + { +- struct fsync_iocb *req = container_of(work, struct fsync_iocb, work); +- int ret; ++ struct aio_kiocb *iocb = container_of(work, struct aio_kiocb, fsync.work); + +- ret = vfs_fsync(req->file, req->datasync); +- aio_complete(container_of(req, struct aio_kiocb, fsync), ret, 0); ++ iocb->ki_res.res = vfs_fsync(iocb->fsync.file, iocb->fsync.datasync); ++ iocb_put(iocb); + } + + static int aio_fsync(struct fsync_iocb *req, const struct iocb *iocb, +@@ -1608,7 +1608,8 @@ static int aio_fsync(struct fsync_iocb * + + static inline void aio_poll_complete(struct aio_kiocb *iocb, __poll_t mask) + { +- aio_complete(iocb, mangle_poll(mask), 0); ++ iocb->ki_res.res = mangle_poll(mask); ++ iocb_put(iocb); + } + + static void aio_poll_complete_work(struct work_struct *work) diff --git a/queue-5.0/fix-aio_poll-races.patch b/queue-5.0/fix-aio_poll-races.patch new file mode 100644 index 00000000000..0c78e4fbfd8 --- /dev/null +++ b/queue-5.0/fix-aio_poll-races.patch @@ -0,0 +1,226 @@ +From af5c72b1fc7a00aa484e90b0c4e0eeb582545634 Mon Sep 17 00:00:00 2001 +From: Al Viro +Date: Thu, 7 Mar 2019 21:45:41 -0500 +Subject: Fix aio_poll() races + +From: Al Viro + +commit af5c72b1fc7a00aa484e90b0c4e0eeb582545634 upstream. + +aio_poll() has to cope with several unpleasant problems: + * requests that might stay around indefinitely need to +be made visible for io_cancel(2); that must not be done to +a request already completed, though. + * in cases when ->poll() has placed us on a waitqueue, +wakeup might have happened (and request completed) before ->poll() +returns. + * worse, in some early wakeup cases request might end +up re-added into the queue later - we can't treat "woken up and +currently not in the queue" as "it's not going to stick around +indefinitely" + * ... moreover, ->poll() might have decided not to +put it on any queues to start with, and that needs to be distinguished +from the previous case + * ->poll() might have tried to put us on more than one queue. +Only the first will succeed for aio poll, so we might end up missing +wakeups. OTOH, we might very well notice that only after the +wakeup hits and request gets completed (all before ->poll() gets +around to the second poll_wait()). In that case it's too late to +decide that we have an error. + +req->woken was an attempt to deal with that. Unfortunately, it was +broken. What we need to keep track of is not that wakeup has happened - +the thing might come back after that. It's that async reference is +already gone and won't come back, so we can't (and needn't) put the +request on the list of cancellables. + +The easiest case is "request hadn't been put on any waitqueues"; we +can tell by seeing NULL apt.head, and in that case there won't be +anything async. We should either complete the request ourselves +(if vfs_poll() reports anything of interest) or return an error. + +In all other cases we get exclusion with wakeups by grabbing the +queue lock. + +If request is currently on queue and we have something interesting +from vfs_poll(), we can steal it and complete the request ourselves. + +If it's on queue and vfs_poll() has not reported anything interesting, +we either put it on the cancellable list, or, if we know that it +hadn't been put on all queues ->poll() wanted it on, we steal it and +return an error. + +If it's _not_ on queue, it's either been already dealt with (in which +case we do nothing), or there's aio_poll_complete_work() about to be +executed. In that case we either put it on the cancellable list, +or, if we know it hadn't been put on all queues ->poll() wanted it on, +simulate what cancel would've done. + +It's a lot more convoluted than I'd like it to be. Single-consumer APIs +suck, and unfortunately aio is not an exception... + +Signed-off-by: Al Viro +Cc: Guenter Roeck +Signed-off-by: Greg Kroah-Hartman + +--- + fs/aio.c | 90 ++++++++++++++++++++++++++++----------------------------------- + 1 file changed, 40 insertions(+), 50 deletions(-) + +--- a/fs/aio.c ++++ b/fs/aio.c +@@ -181,7 +181,7 @@ struct poll_iocb { + struct file *file; + struct wait_queue_head *head; + __poll_t events; +- bool woken; ++ bool done; + bool cancelled; + struct wait_queue_entry wait; + struct work_struct work; +@@ -1606,12 +1606,6 @@ static int aio_fsync(struct fsync_iocb * + return 0; + } + +-static inline void aio_poll_complete(struct aio_kiocb *iocb, __poll_t mask) +-{ +- iocb->ki_res.res = mangle_poll(mask); +- iocb_put(iocb); +-} +- + static void aio_poll_complete_work(struct work_struct *work) + { + struct poll_iocb *req = container_of(work, struct poll_iocb, work); +@@ -1637,9 +1631,11 @@ static void aio_poll_complete_work(struc + return; + } + list_del_init(&iocb->ki_list); ++ iocb->ki_res.res = mangle_poll(mask); ++ req->done = true; + spin_unlock_irq(&ctx->ctx_lock); + +- aio_poll_complete(iocb, mask); ++ iocb_put(iocb); + } + + /* assumes we are called with irqs disabled */ +@@ -1667,31 +1663,27 @@ static int aio_poll_wake(struct wait_que + __poll_t mask = key_to_poll(key); + unsigned long flags; + +- req->woken = true; +- + /* for instances that support it check for an event match first: */ +- if (mask) { +- if (!(mask & req->events)) +- return 0; ++ if (mask && !(mask & req->events)) ++ return 0; + ++ list_del_init(&req->wait.entry); ++ ++ if (mask && spin_trylock_irqsave(&iocb->ki_ctx->ctx_lock, flags)) { + /* + * Try to complete the iocb inline if we can. Use + * irqsave/irqrestore because not all filesystems (e.g. fuse) + * call this function with IRQs disabled and because IRQs + * have to be disabled before ctx_lock is obtained. + */ +- if (spin_trylock_irqsave(&iocb->ki_ctx->ctx_lock, flags)) { +- list_del(&iocb->ki_list); +- spin_unlock_irqrestore(&iocb->ki_ctx->ctx_lock, flags); +- +- list_del_init(&req->wait.entry); +- aio_poll_complete(iocb, mask); +- return 1; +- } ++ list_del(&iocb->ki_list); ++ iocb->ki_res.res = mangle_poll(mask); ++ req->done = true; ++ spin_unlock_irqrestore(&iocb->ki_ctx->ctx_lock, flags); ++ iocb_put(iocb); ++ } else { ++ schedule_work(&req->work); + } +- +- list_del_init(&req->wait.entry); +- schedule_work(&req->work); + return 1; + } + +@@ -1723,6 +1715,7 @@ static ssize_t aio_poll(struct aio_kiocb + struct kioctx *ctx = aiocb->ki_ctx; + struct poll_iocb *req = &aiocb->poll; + struct aio_poll_table apt; ++ bool cancel = false; + __poll_t mask; + + /* reject any unknown events outside the normal event mask. */ +@@ -1736,7 +1729,7 @@ static ssize_t aio_poll(struct aio_kiocb + req->events = demangle_poll(iocb->aio_buf) | EPOLLERR | EPOLLHUP; + + req->head = NULL; +- req->woken = false; ++ req->done = false; + req->cancelled = false; + + apt.pt._qproc = aio_poll_queue_proc; +@@ -1749,36 +1742,33 @@ static ssize_t aio_poll(struct aio_kiocb + init_waitqueue_func_entry(&req->wait, aio_poll_wake); + + mask = vfs_poll(req->file, &apt.pt) & req->events; +- if (unlikely(!req->head)) { +- /* we did not manage to set up a waitqueue, done */ +- goto out; +- } +- + spin_lock_irq(&ctx->ctx_lock); +- spin_lock(&req->head->lock); +- if (req->woken) { +- /* wake_up context handles the rest */ +- mask = 0; ++ if (likely(req->head)) { ++ spin_lock(&req->head->lock); ++ if (unlikely(list_empty(&req->wait.entry))) { ++ if (apt.error) ++ cancel = true; ++ apt.error = 0; ++ mask = 0; ++ } ++ if (mask || apt.error) { ++ list_del_init(&req->wait.entry); ++ } else if (cancel) { ++ WRITE_ONCE(req->cancelled, true); ++ } else if (!req->done) { /* actually waiting for an event */ ++ list_add_tail(&aiocb->ki_list, &ctx->active_reqs); ++ aiocb->ki_cancel = aio_poll_cancel; ++ } ++ spin_unlock(&req->head->lock); ++ } ++ if (mask) { /* no async, we'd stolen it */ ++ aiocb->ki_res.res = mangle_poll(mask); + apt.error = 0; +- } else if (mask || apt.error) { +- /* if we get an error or a mask we are done */ +- WARN_ON_ONCE(list_empty(&req->wait.entry)); +- list_del_init(&req->wait.entry); +- } else { +- /* actually waiting for an event */ +- list_add_tail(&aiocb->ki_list, &ctx->active_reqs); +- aiocb->ki_cancel = aio_poll_cancel; + } +- spin_unlock(&req->head->lock); + spin_unlock_irq(&ctx->ctx_lock); +- +-out: +- if (unlikely(apt.error)) +- return apt.error; +- + if (mask) +- aio_poll_complete(aiocb, mask); +- return 0; ++ iocb_put(aiocb); ++ return apt.error; + } + + static int __io_submit_one(struct kioctx *ctx, const struct iocb *iocb, diff --git a/queue-5.0/pin-iocb-through-aio.patch b/queue-5.0/pin-iocb-through-aio.patch new file mode 100644 index 00000000000..039087053c7 --- /dev/null +++ b/queue-5.0/pin-iocb-through-aio.patch @@ -0,0 +1,113 @@ +From b53119f13a04879c3bf502828d99d13726639ead Mon Sep 17 00:00:00 2001 +From: Linus Torvalds +Date: Wed, 6 Mar 2019 20:22:54 -0500 +Subject: pin iocb through aio. + +From: Linus Torvalds + +commit b53119f13a04879c3bf502828d99d13726639ead upstream. + +aio_poll() is not the only case that needs file pinned; worse, while +aio_read()/aio_write() can live without pinning iocb itself, the +proof is rather brittle and can easily break on later changes. + +Signed-off-by: Linus Torvalds +Signed-off-by: Al Viro +Cc: Guenter Roeck +Signed-off-by: Greg Kroah-Hartman + +--- + fs/aio.c | 37 +++++++++++++++++++++---------------- + 1 file changed, 21 insertions(+), 16 deletions(-) + +--- a/fs/aio.c ++++ b/fs/aio.c +@@ -1022,6 +1022,9 @@ static bool get_reqs_available(struct ki + /* aio_get_req + * Allocate a slot for an aio request. + * Returns NULL if no requests are free. ++ * ++ * The refcount is initialized to 2 - one for the async op completion, ++ * one for the synchronous code that does this. + */ + static inline struct aio_kiocb *aio_get_req(struct kioctx *ctx) + { +@@ -1034,7 +1037,7 @@ static inline struct aio_kiocb *aio_get_ + percpu_ref_get(&ctx->reqs); + req->ki_ctx = ctx; + INIT_LIST_HEAD(&req->ki_list); +- refcount_set(&req->ki_refcnt, 0); ++ refcount_set(&req->ki_refcnt, 2); + req->ki_eventfd = NULL; + return req; + } +@@ -1067,15 +1070,18 @@ out: + return ret; + } + ++static inline void iocb_destroy(struct aio_kiocb *iocb) ++{ ++ if (iocb->ki_filp) ++ fput(iocb->ki_filp); ++ percpu_ref_put(&iocb->ki_ctx->reqs); ++ kmem_cache_free(kiocb_cachep, iocb); ++} ++ + static inline void iocb_put(struct aio_kiocb *iocb) + { +- if (refcount_read(&iocb->ki_refcnt) == 0 || +- refcount_dec_and_test(&iocb->ki_refcnt)) { +- if (iocb->ki_filp) +- fput(iocb->ki_filp); +- percpu_ref_put(&iocb->ki_ctx->reqs); +- kmem_cache_free(kiocb_cachep, iocb); +- } ++ if (refcount_dec_and_test(&iocb->ki_refcnt)) ++ iocb_destroy(iocb); + } + + static void aio_fill_event(struct io_event *ev, struct aio_kiocb *iocb, +@@ -1749,9 +1755,6 @@ static ssize_t aio_poll(struct aio_kiocb + INIT_LIST_HEAD(&req->wait.entry); + init_waitqueue_func_entry(&req->wait, aio_poll_wake); + +- /* one for removal from waitqueue, one for this function */ +- refcount_set(&aiocb->ki_refcnt, 2); +- + mask = vfs_poll(req->file, &apt.pt) & req->events; + if (unlikely(!req->head)) { + /* we did not manage to set up a waitqueue, done */ +@@ -1782,7 +1785,6 @@ out: + + if (mask) + aio_poll_complete(aiocb, mask); +- iocb_put(aiocb); + return 0; + } + +@@ -1873,18 +1875,21 @@ static int __io_submit_one(struct kioctx + break; + } + ++ /* Done with the synchronous reference */ ++ iocb_put(req); ++ + /* + * If ret is 0, we'd either done aio_complete() ourselves or have + * arranged for that to be done asynchronously. Anything non-zero + * means that we need to destroy req ourselves. + */ +- if (ret) +- goto out_put_req; +- return 0; ++ if (!ret) ++ return 0; ++ + out_put_req: + if (req->ki_eventfd) + eventfd_ctx_put(req->ki_eventfd); +- iocb_put(req); ++ iocb_destroy(req); + out_put_reqs_available: + put_reqs_available(ctx, 1); + return ret; diff --git a/queue-5.0/series b/queue-5.0/series index 12cfe01fae7..4e963239266 100644 --- a/queue-5.0/series +++ b/queue-5.0/series @@ -61,3 +61,10 @@ tipc-check-link-name-with-right-length-in-tipc_nl_compat_link_set.patch net-netrom-fix-error-cleanup-path-of-nr_proto_init.patch net-rds-check-address-length-before-reading-address-family.patch rxrpc-fix-race-condition-in-rxrpc_input_packet.patch +pin-iocb-through-aio.patch +aio-fold-lookup_kiocb-into-its-sole-caller.patch +aio-keep-io_event-in-aio_kiocb.patch +aio-store-event-at-final-iocb_put.patch +fix-aio_poll-races.patch +x86-retpolines-raise-limit-for-generating-indirect-calls-from-switch-case.patch +x86-retpolines-disable-switch-jump-tables-when-retpolines-are-enabled.patch diff --git a/queue-5.0/x86-retpolines-disable-switch-jump-tables-when-retpolines-are-enabled.patch b/queue-5.0/x86-retpolines-disable-switch-jump-tables-when-retpolines-are-enabled.patch new file mode 100644 index 00000000000..6657d2e7296 --- /dev/null +++ b/queue-5.0/x86-retpolines-disable-switch-jump-tables-when-retpolines-are-enabled.patch @@ -0,0 +1,70 @@ +From a9d57ef15cbe327fe54416dd194ee0ea66ae53a4 Mon Sep 17 00:00:00 2001 +From: Daniel Borkmann +Date: Mon, 25 Mar 2019 14:56:20 +0100 +Subject: x86/retpolines: Disable switch jump tables when retpolines are enabled +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Daniel Borkmann + +commit a9d57ef15cbe327fe54416dd194ee0ea66ae53a4 upstream. + +Commit ce02ef06fcf7 ("x86, retpolines: Raise limit for generating indirect +calls from switch-case") raised the limit under retpolines to 20 switch +cases where gcc would only then start to emit jump tables, and therefore +effectively disabling the emission of slow indirect calls in this area. + +After this has been brought to attention to gcc folks [0], Martin Liska +has then fixed gcc to align with clang by avoiding to generate switch jump +tables entirely under retpolines. This is taking effect in gcc starting +from stable version 8.4.0. Given kernel supports compilation with older +versions of gcc where the fix is not being available or backported anymore, +we need to keep the extra KBUILD_CFLAGS around for some time and generally +set the -fno-jump-tables to align with what more recent gcc is doing +automatically today. + +More than 20 switch cases are not expected to be fast-path critical, but +it would still be good to align with gcc behavior for versions < 8.4.0 in +order to have consistency across supported gcc versions. vmlinux size is +slightly growing by 0.27% for older gcc. This flag is only set to work +around affected gcc, no change for clang. + + [0] https://gcc.gnu.org/bugzilla/show_bug.cgi?id=86952 + +Suggested-by: Martin Liska +Signed-off-by: Daniel Borkmann +Signed-off-by: Thomas Gleixner +Cc: David Woodhouse +Cc: Linus Torvalds +Cc: Jesper Dangaard Brouer +Cc: Björn Töpel +Cc: Magnus Karlsson +Cc: Alexei Starovoitov +Cc: H.J. Lu +Cc: Alexei Starovoitov +Cc: David S. Miller +Link: https://lkml.kernel.org/r/20190325135620.14882-1-daniel@iogearbox.net +Signed-off-by: Greg Kroah-Hartman + +--- + arch/x86/Makefile | 8 ++++++-- + 1 file changed, 6 insertions(+), 2 deletions(-) + +--- a/arch/x86/Makefile ++++ b/arch/x86/Makefile +@@ -220,8 +220,12 @@ ifdef CONFIG_RETPOLINE + # Additionally, avoid generating expensive indirect jumps which + # are subject to retpolines for small number of switch cases. + # clang turns off jump table generation by default when under +- # retpoline builds, however, gcc does not for x86. +- KBUILD_CFLAGS += $(call cc-option,--param=case-values-threshold=20) ++ # retpoline builds, however, gcc does not for x86. This has ++ # only been fixed starting from gcc stable version 8.4.0 and ++ # onwards, but not for older ones. See gcc bug #86952. ++ ifndef CONFIG_CC_IS_CLANG ++ KBUILD_CFLAGS += $(call cc-option,-fno-jump-tables) ++ endif + endif + + archscripts: scripts_basic diff --git a/queue-5.0/x86-retpolines-raise-limit-for-generating-indirect-calls-from-switch-case.patch b/queue-5.0/x86-retpolines-raise-limit-for-generating-indirect-calls-from-switch-case.patch new file mode 100644 index 00000000000..73c21f1af65 --- /dev/null +++ b/queue-5.0/x86-retpolines-raise-limit-for-generating-indirect-calls-from-switch-case.patch @@ -0,0 +1,175 @@ +From ce02ef06fcf7a399a6276adb83f37373d10cbbe1 Mon Sep 17 00:00:00 2001 +From: Daniel Borkmann +Date: Thu, 21 Feb 2019 23:19:41 +0100 +Subject: x86, retpolines: Raise limit for generating indirect calls from switch-case +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Daniel Borkmann + +commit ce02ef06fcf7a399a6276adb83f37373d10cbbe1 upstream. + +From networking side, there are numerous attempts to get rid of indirect +calls in fast-path wherever feasible in order to avoid the cost of +retpolines, for example, just to name a few: + + * 283c16a2dfd3 ("indirect call wrappers: helpers to speed-up indirect calls of builtin") + * aaa5d90b395a ("net: use indirect call wrappers at GRO network layer") + * 028e0a476684 ("net: use indirect call wrappers at GRO transport layer") + * 356da6d0cde3 ("dma-mapping: bypass indirect calls for dma-direct") + * 09772d92cd5a ("bpf: avoid retpoline for lookup/update/delete calls on maps") + * 10870dd89e95 ("netfilter: nf_tables: add direct calls for all builtin expressions") + [...] + +Recent work on XDP from Björn and Magnus additionally found that manually +transforming the XDP return code switch statement with more than 5 cases +into if-else combination would result in a considerable speedup in XDP +layer due to avoidance of indirect calls in CONFIG_RETPOLINE enabled +builds. On i40e driver with XDP prog attached, a 20-26% speedup has been +observed [0]. Aside from XDP, there are many other places later in the +networking stack's critical path with similar switch-case +processing. Rather than fixing every XDP-enabled driver and locations in +stack by hand, it would be good to instead raise the limit where gcc would +emit expensive indirect calls from the switch under retpolines and stick +with the default as-is in case of !retpoline configured kernels. This would +also have the advantage that for archs where this is not necessary, we let +compiler select the underlying target optimization for these constructs and +avoid potential slow-downs by if-else hand-rewrite. + +In case of gcc, this setting is controlled by case-values-threshold which +has an architecture global default that selects 4 or 5 (latter if target +does not have a case insn that compares the bounds) where some arch back +ends like arm64 or s390 override it with their own target hooks, for +example, in gcc commit db7a90aa0de5 ("S/390: Disable prediction of indirect +branches") the threshold pretty much disables jump tables by limit of 20 +under retpoline builds. Comparing gcc's and clang's default code +generation on x86-64 under O2 level with retpoline build results in the +following outcome for 5 switch cases: + +* gcc with -mindirect-branch=thunk-inline -mindirect-branch-register: + + # gdb -batch -ex 'disassemble dispatch' ./c-switch + Dump of assembler code for function dispatch: + 0x0000000000400be0 <+0>: cmp $0x4,%edi + 0x0000000000400be3 <+3>: ja 0x400c35 + 0x0000000000400be5 <+5>: lea 0x915f8(%rip),%rdx # 0x4921e4 + 0x0000000000400bec <+12>: mov %edi,%edi + 0x0000000000400bee <+14>: movslq (%rdx,%rdi,4),%rax + 0x0000000000400bf2 <+18>: add %rdx,%rax + 0x0000000000400bf5 <+21>: callq 0x400c01 + 0x0000000000400bfa <+26>: pause + 0x0000000000400bfc <+28>: lfence + 0x0000000000400bff <+31>: jmp 0x400bfa + 0x0000000000400c01 <+33>: mov %rax,(%rsp) + 0x0000000000400c05 <+37>: retq + 0x0000000000400c06 <+38>: nopw %cs:0x0(%rax,%rax,1) + 0x0000000000400c10 <+48>: jmpq 0x400c90 + 0x0000000000400c15 <+53>: nopl (%rax) + 0x0000000000400c18 <+56>: jmpq 0x400c70 + 0x0000000000400c1d <+61>: nopl (%rax) + 0x0000000000400c20 <+64>: jmpq 0x400c50 + 0x0000000000400c25 <+69>: nopl (%rax) + 0x0000000000400c28 <+72>: jmpq 0x400c40 + 0x0000000000400c2d <+77>: nopl (%rax) + 0x0000000000400c30 <+80>: jmpq 0x400cb0 + 0x0000000000400c35 <+85>: push %rax + 0x0000000000400c36 <+86>: callq 0x40dd80 + End of assembler dump. + +* clang with -mretpoline emitting search tree: + + # gdb -batch -ex 'disassemble dispatch' ./c-switch + Dump of assembler code for function dispatch: + 0x0000000000400b30 <+0>: cmp $0x1,%edi + 0x0000000000400b33 <+3>: jle 0x400b44 + 0x0000000000400b35 <+5>: cmp $0x2,%edi + 0x0000000000400b38 <+8>: je 0x400b4d + 0x0000000000400b3a <+10>: cmp $0x3,%edi + 0x0000000000400b3d <+13>: jne 0x400b52 + 0x0000000000400b3f <+15>: jmpq 0x400c50 + 0x0000000000400b44 <+20>: test %edi,%edi + 0x0000000000400b46 <+22>: jne 0x400b5c + 0x0000000000400b48 <+24>: jmpq 0x400c20 + 0x0000000000400b4d <+29>: jmpq 0x400c40 + 0x0000000000400b52 <+34>: cmp $0x4,%edi + 0x0000000000400b55 <+37>: jne 0x400b66 + 0x0000000000400b57 <+39>: jmpq 0x400c60 + 0x0000000000400b5c <+44>: cmp $0x1,%edi + 0x0000000000400b5f <+47>: jne 0x400b66 + 0x0000000000400b61 <+49>: jmpq 0x400c30 + 0x0000000000400b66 <+54>: push %rax + 0x0000000000400b67 <+55>: callq 0x40dd20 + End of assembler dump. + + For sake of comparison, clang without -mretpoline: + + # gdb -batch -ex 'disassemble dispatch' ./c-switch + Dump of assembler code for function dispatch: + 0x0000000000400b30 <+0>: cmp $0x4,%edi + 0x0000000000400b33 <+3>: ja 0x400b57 + 0x0000000000400b35 <+5>: mov %edi,%eax + 0x0000000000400b37 <+7>: jmpq *0x492148(,%rax,8) + 0x0000000000400b3e <+14>: jmpq 0x400bf0 + 0x0000000000400b43 <+19>: jmpq 0x400c30 + 0x0000000000400b48 <+24>: jmpq 0x400c10 + 0x0000000000400b4d <+29>: jmpq 0x400c20 + 0x0000000000400b52 <+34>: jmpq 0x400c00 + 0x0000000000400b57 <+39>: push %rax + 0x0000000000400b58 <+40>: callq 0x40dcf0 + End of assembler dump. + +Raising the cases to a high number (e.g. 100) will still result in similar +code generation pattern with clang and gcc as above, in other words clang +generally turns off jump table emission by having an extra expansion pass +under retpoline build to turn indirectbr instructions from their IR into +switch instructions as a built-in -mno-jump-table lowering of a switch (in +this case, even if IR input already contained an indirect branch). + +For gcc, adding --param=case-values-threshold=20 as in similar fashion as +s390 in order to raise the limit for x86 retpoline enabled builds results +in a small vmlinux size increase of only 0.13% (before=18,027,528 +after=18,051,192). For clang this option is ignored due to i) not being +needed as mentioned and ii) not having above cmdline +parameter. Non-retpoline-enabled builds with gcc continue to use the +default case-values-threshold setting, so nothing changes here. + +[0] https://lore.kernel.org/netdev/20190129095754.9390-1-bjorn.topel@gmail.com/ + and "The Path to DPDK Speeds for AF_XDP", LPC 2018, networking track: + - http://vger.kernel.org/lpc_net2018_talks/lpc18_pres_af_xdp_perf-v3.pdf + - http://vger.kernel.org/lpc_net2018_talks/lpc18_paper_af_xdp_perf-v2.pdf + +Signed-off-by: Daniel Borkmann +Signed-off-by: Thomas Gleixner +Acked-by: Jesper Dangaard Brouer +Acked-by: Björn Töpel +Acked-by: Linus Torvalds +Cc: netdev@vger.kernel.org +Cc: David S. Miller +Cc: Magnus Karlsson +Cc: Alexei Starovoitov +Cc: Peter Zijlstra +Cc: David Woodhouse +Cc: Andy Lutomirski +Cc: Borislav Petkov +Link: https://lkml.kernel.org/r/20190221221941.29358-1-daniel@iogearbox.net +Signed-off-by: Greg Kroah-Hartman + +--- + arch/x86/Makefile | 5 +++++ + 1 file changed, 5 insertions(+) + +--- a/arch/x86/Makefile ++++ b/arch/x86/Makefile +@@ -217,6 +217,11 @@ KBUILD_CFLAGS += -fno-asynchronous-unwin + # Avoid indirect branches in kernel to deal with Spectre + ifdef CONFIG_RETPOLINE + KBUILD_CFLAGS += $(RETPOLINE_CFLAGS) ++ # Additionally, avoid generating expensive indirect jumps which ++ # are subject to retpolines for small number of switch cases. ++ # clang turns off jump table generation by default when under ++ # retpoline builds, however, gcc does not for x86. ++ KBUILD_CFLAGS += $(call cc-option,--param=case-values-threshold=20) + endif + + archscripts: scripts_basic