From fc70f12d533dbcf0a2991ff22f5da0c0db090c95 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 1 Sep 2021 10:25:13 +0200 Subject: [PATCH] 5.13-stable patches added patches: pipe-avoid-unnecessary-epollet-wakeups-under-normal-loads.patch pipe-do-fasync-notifications-for-every-pipe-io-not-just-state-changes.patch tipc-call-tipc_wait_for_connect-only-when-dlen-is-not-0.patch --- ...y-epollet-wakeups-under-normal-loads.patch | 119 ++++++++++++++++ ...every-pipe-io-not-just-state-changes.patch | 127 ++++++++++++++++++ queue-5.13/series | 3 + ..._for_connect-only-when-dlen-is-not-0.patch | 45 +++++++ 4 files changed, 294 insertions(+) create mode 100644 queue-5.13/pipe-avoid-unnecessary-epollet-wakeups-under-normal-loads.patch create mode 100644 queue-5.13/pipe-do-fasync-notifications-for-every-pipe-io-not-just-state-changes.patch create mode 100644 queue-5.13/tipc-call-tipc_wait_for_connect-only-when-dlen-is-not-0.patch diff --git a/queue-5.13/pipe-avoid-unnecessary-epollet-wakeups-under-normal-loads.patch b/queue-5.13/pipe-avoid-unnecessary-epollet-wakeups-under-normal-loads.patch new file mode 100644 index 00000000000..66fda4dd222 --- /dev/null +++ b/queue-5.13/pipe-avoid-unnecessary-epollet-wakeups-under-normal-loads.patch @@ -0,0 +1,119 @@ +From 3b844826b6c6affa80755254da322b017358a2f4 Mon Sep 17 00:00:00 2001 +From: Linus Torvalds +Date: Thu, 5 Aug 2021 10:04:43 -0700 +Subject: pipe: avoid unnecessary EPOLLET wakeups under normal loads + +From: Linus Torvalds + +commit 3b844826b6c6affa80755254da322b017358a2f4 upstream. + +I had forgotten just how sensitive hackbench is to extra pipe wakeups, +and commit 3a34b13a88ca ("pipe: make pipe writes always wake up +readers") ended up causing a quite noticeable regression on larger +machines. + +Now, hackbench isn't necessarily a hugely meaningful benchmark, and it's +not clear that this matters in real life all that much, but as Mel +points out, it's used often enough when comparing kernels and so the +performance regression shows up like a sore thumb. + +It's easy enough to fix at least for the common cases where pipes are +used purely for data transfer, and you never have any exciting poll +usage at all. So set a special 'poll_usage' flag when there is polling +activity, and make the ugly "EPOLLET has crazy legacy expectations" +semantics explicit to only that case. + +I would love to limit it to just the broken EPOLLET case, but the pipe +code can't see the difference between epoll and regular select/poll, so +any non-read/write waiting will trigger the extra wakeup behavior. That +is sufficient for at least the hackbench case. + +Apart from making the odd extra wakeup cases more explicitly about +EPOLLET, this also makes the extra wakeup be at the _end_ of the pipe +write, not at the first write chunk. That is actually much saner +semantics (as much as you can call any of the legacy edge-triggered +expectations for EPOLLET "sane") since it means that you know the wakeup +will happen once the write is done, rather than possibly in the middle +of one. + +[ For stable people: I'm putting a "Fixes" tag on this, but I leave it + up to you to decide whether you actually want to backport it or not. + It likely has no impact outside of synthetic benchmarks - Linus ] + +Link: https://lore.kernel.org/lkml/20210802024945.GA8372@xsang-OptiPlex-9020/ +Fixes: 3a34b13a88ca ("pipe: make pipe writes always wake up readers") +Reported-by: kernel test robot +Tested-by: Sandeep Patil +Tested-by: Mel Gorman +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman +--- + fs/pipe.c | 15 +++++++++------ + include/linux/pipe_fs_i.h | 2 ++ + 2 files changed, 11 insertions(+), 6 deletions(-) + +--- a/fs/pipe.c ++++ b/fs/pipe.c +@@ -444,9 +444,6 @@ pipe_write(struct kiocb *iocb, struct io + #endif + + /* +- * Epoll nonsensically wants a wakeup whether the pipe +- * was already empty or not. +- * + * If it wasn't empty we try to merge new data into + * the last buffer. + * +@@ -455,9 +452,9 @@ pipe_write(struct kiocb *iocb, struct io + * spanning multiple pages. + */ + head = pipe->head; +- was_empty = true; ++ was_empty = pipe_empty(head, pipe->tail); + chars = total_len & (PAGE_SIZE-1); +- if (chars && !pipe_empty(head, pipe->tail)) { ++ if (chars && !was_empty) { + unsigned int mask = pipe->ring_size - 1; + struct pipe_buffer *buf = &pipe->bufs[(head - 1) & mask]; + int offset = buf->offset + buf->len; +@@ -590,8 +587,11 @@ out: + * This is particularly important for small writes, because of + * how (for example) the GNU make jobserver uses small writes to + * wake up pending jobs ++ * ++ * Epoll nonsensically wants a wakeup whether the pipe ++ * was already empty or not. + */ +- if (was_empty) { ++ if (was_empty || pipe->poll_usage) { + wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM); + kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); + } +@@ -654,6 +654,9 @@ pipe_poll(struct file *filp, poll_table + struct pipe_inode_info *pipe = filp->private_data; + unsigned int head, tail; + ++ /* Epoll has some historical nasty semantics, this enables them */ ++ pipe->poll_usage = 1; ++ + /* + * Reading pipe state only -- no need for acquiring the semaphore. + * +--- a/include/linux/pipe_fs_i.h ++++ b/include/linux/pipe_fs_i.h +@@ -48,6 +48,7 @@ struct pipe_buffer { + * @files: number of struct file referring this pipe (protected by ->i_lock) + * @r_counter: reader counter + * @w_counter: writer counter ++ * @poll_usage: is this pipe used for epoll, which has crazy wakeups? + * @fasync_readers: reader side fasync + * @fasync_writers: writer side fasync + * @bufs: the circular array of pipe buffers +@@ -70,6 +71,7 @@ struct pipe_inode_info { + unsigned int files; + unsigned int r_counter; + unsigned int w_counter; ++ unsigned int poll_usage; + struct page *tmp_page; + struct fasync_struct *fasync_readers; + struct fasync_struct *fasync_writers; diff --git a/queue-5.13/pipe-do-fasync-notifications-for-every-pipe-io-not-just-state-changes.patch b/queue-5.13/pipe-do-fasync-notifications-for-every-pipe-io-not-just-state-changes.patch new file mode 100644 index 00000000000..d63d9e7ff5c --- /dev/null +++ b/queue-5.13/pipe-do-fasync-notifications-for-every-pipe-io-not-just-state-changes.patch @@ -0,0 +1,127 @@ +From fe67f4dd8daa252eb9aa7acb61555f3cc3c1ce4c Mon Sep 17 00:00:00 2001 +From: Linus Torvalds +Date: Tue, 24 Aug 2021 10:39:25 -0700 +Subject: pipe: do FASYNC notifications for every pipe IO, not just state changes + +From: Linus Torvalds + +commit fe67f4dd8daa252eb9aa7acb61555f3cc3c1ce4c upstream. + +It turns out that the SIGIO/FASYNC situation is almost exactly the same +as the EPOLLET case was: user space really wants to be notified after +every operation. + +Now, in a perfect world it should be sufficient to only notify user +space on "state transitions" when the IO state changes (ie when a pipe +goes from unreadable to readable, or from unwritable to writable). User +space should then do as much as possible - fully emptying the buffer or +what not - and we'll notify it again the next time the state changes. + +But as with EPOLLET, we have at least one case (stress-ng) where the +kernel sent SIGIO due to the pipe being marked for asynchronous +notification, but the user space signal handler then didn't actually +necessarily read it all before returning (it read more than what was +written, but since there could be multiple writes, it could leave data +pending). + +The user space code then expected to get another SIGIO for subsequent +writes - even though the pipe had been readable the whole time - and +would only then read more. + +This is arguably a user space bug - and Colin King already fixed the +stress-ng code in question - but the kernel regression rules are clear: +it doesn't matter if kernel people think that user space did something +silly and wrong. What matters is that it used to work. + +So if user space depends on specific historical kernel behavior, it's a +regression when that behavior changes. It's on us: we were silly to +have that non-optimal historical behavior, and our old kernel behavior +was what user space was tested against. + +Because of how the FASYNC notification was tied to wakeup behavior, this +was first broken by commits f467a6a66419 and 1b6b26ae7053 ("pipe: fix +and clarify pipe read/write wakeup logic"), but at the time it seems +nobody noticed. Probably because the stress-ng problem case ends up +being timing-dependent too. + +It was then unwittingly fixed by commit 3a34b13a88ca ("pipe: make pipe +writes always wake up readers") only to be broken again when by commit +3b844826b6c6 ("pipe: avoid unnecessary EPOLLET wakeups under normal +loads"). + +And at that point the kernel test robot noticed the performance +refression in the stress-ng.sigio.ops_per_sec case. So the "Fixes" tag +below is somewhat ad hoc, but it matches when the issue was noticed. + +Fix it for good (knock wood) by simply making the kill_fasync() case +separate from the wakeup case. FASYNC is quite rare, and we clearly +shouldn't even try to use the "avoid unnecessary wakeups" logic for it. + +Link: https://lore.kernel.org/lkml/20210824151337.GC27667@xsang-OptiPlex-9020/ +Fixes: 3b844826b6c6 ("pipe: avoid unnecessary EPOLLET wakeups under normal loads") +Reported-by: kernel test robot +Tested-by: Oliver Sang +Cc: Eric Biederman +Cc: Colin Ian King +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman +--- + fs/pipe.c | 20 ++++++++------------ + 1 file changed, 8 insertions(+), 12 deletions(-) + +--- a/fs/pipe.c ++++ b/fs/pipe.c +@@ -363,10 +363,9 @@ pipe_read(struct kiocb *iocb, struct iov + * _very_ unlikely case that the pipe was full, but we got + * no data. + */ +- if (unlikely(was_full)) { ++ if (unlikely(was_full)) + wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM); +- kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); +- } ++ kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); + + /* + * But because we didn't read anything, at this point we can +@@ -385,12 +384,11 @@ pipe_read(struct kiocb *iocb, struct iov + wake_next_reader = false; + __pipe_unlock(pipe); + +- if (was_full) { ++ if (was_full) + wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM); +- kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); +- } + if (wake_next_reader) + wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM); ++ kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); + if (ret > 0) + file_accessed(filp); + return ret; +@@ -565,10 +563,9 @@ pipe_write(struct kiocb *iocb, struct io + * become empty while we dropped the lock. + */ + __pipe_unlock(pipe); +- if (was_empty) { ++ if (was_empty) + wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM); +- kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); +- } ++ kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); + wait_event_interruptible_exclusive(pipe->wr_wait, pipe_writable(pipe)); + __pipe_lock(pipe); + was_empty = pipe_empty(pipe->head, pipe->tail); +@@ -591,10 +588,9 @@ out: + * Epoll nonsensically wants a wakeup whether the pipe + * was already empty or not. + */ +- if (was_empty || pipe->poll_usage) { ++ if (was_empty || pipe->poll_usage) + wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM); +- kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); +- } ++ kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); + if (wake_next_writer) + wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM); + if (ret > 0 && sb_start_write_trylock(file_inode(filp)->i_sb)) { diff --git a/queue-5.13/series b/queue-5.13/series index 5076c53227c..f7431092ad2 100644 --- a/queue-5.13/series +++ b/queue-5.13/series @@ -95,3 +95,6 @@ drm-nouveau-kms-nv50-workaround-efi-gop-window-chann.patch platform-x86-gigabyte-wmi-add-support-for-b450m-s2h-.patch net-rds-dma_map_sg-is-entitled-to-merge-entries.patch arm64-initialize-all-of-cnthctl_el2.patch +pipe-avoid-unnecessary-epollet-wakeups-under-normal-loads.patch +pipe-do-fasync-notifications-for-every-pipe-io-not-just-state-changes.patch +tipc-call-tipc_wait_for_connect-only-when-dlen-is-not-0.patch diff --git a/queue-5.13/tipc-call-tipc_wait_for_connect-only-when-dlen-is-not-0.patch b/queue-5.13/tipc-call-tipc_wait_for_connect-only-when-dlen-is-not-0.patch new file mode 100644 index 00000000000..f80774f636c --- /dev/null +++ b/queue-5.13/tipc-call-tipc_wait_for_connect-only-when-dlen-is-not-0.patch @@ -0,0 +1,45 @@ +From 7387a72c5f84f0dfb57618f9e4770672c0d2e4c9 Mon Sep 17 00:00:00 2001 +From: Xin Long +Date: Sun, 15 Aug 2021 03:13:36 -0400 +Subject: tipc: call tipc_wait_for_connect only when dlen is not 0 + +From: Xin Long + +commit 7387a72c5f84f0dfb57618f9e4770672c0d2e4c9 upstream. + +__tipc_sendmsg() is called to send SYN packet by either tipc_sendmsg() +or tipc_connect(). The difference is in tipc_connect(), it will call +tipc_wait_for_connect() after __tipc_sendmsg() to wait until connecting +is done. So there's no need to wait in __tipc_sendmsg() for this case. + +This patch is to fix it by calling tipc_wait_for_connect() only when dlen +is not 0 in __tipc_sendmsg(), which means it's called by tipc_connect(). + +Note this also fixes the failure in tipcutils/test/ptts/: + + # ./tipcTS & + # ./tipcTC 9 + (hang) + +Fixes: 36239dab6da7 ("tipc: fix implicit-connect for SYN+") +Reported-by: Shuang Li +Signed-off-by: Xin Long +Acked-by: Jon Maloy +Signed-off-by: David S. Miller +Cc: Paul Gortmaker +Signed-off-by: Greg Kroah-Hartman +--- + net/tipc/socket.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/net/tipc/socket.c ++++ b/net/tipc/socket.c +@@ -1528,7 +1528,7 @@ static int __tipc_sendmsg(struct socket + + if (unlikely(syn && !rc)) { + tipc_set_sk_state(sk, TIPC_CONNECTING); +- if (timeout) { ++ if (dlen && timeout) { + timeout = msecs_to_jiffies(timeout); + tipc_wait_for_connect(sock, &timeout); + } -- 2.47.3