From dd9f37f4a405dfdc987db327b9f90d040e167319 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Sun, 4 Dec 2022 16:51:07 +0100 Subject: [PATCH] 5.4-stable patches added patches: epoll-call-final-ep_events_available-check-under-the-lock.patch epoll-check-for-events-when-removing-a-timed-out-thread-from-the-wait-queue.patch nvme-ensure-subsystem-reset-is-single-threaded.patch nvme-restrict-management-ioctls-to-admin.patch tracing-ring-buffer-have-polling-block-on-watermark.patch --- ...vents_available-check-under-the-lock.patch | 124 ++++++++++++ ...timed-out-thread-from-the-wait-queue.patch | 125 ++++++++++++ ...e-subsystem-reset-is-single-threaded.patch | 67 +++++++ ...-restrict-management-ioctls-to-admin.patch | 41 ++++ queue-5.4/series | 5 + ...ffer-have-polling-block-on-watermark.patch | 186 ++++++++++++++++++ 6 files changed, 548 insertions(+) create mode 100644 queue-5.4/epoll-call-final-ep_events_available-check-under-the-lock.patch create mode 100644 queue-5.4/epoll-check-for-events-when-removing-a-timed-out-thread-from-the-wait-queue.patch create mode 100644 queue-5.4/nvme-ensure-subsystem-reset-is-single-threaded.patch create mode 100644 queue-5.4/nvme-restrict-management-ioctls-to-admin.patch create mode 100644 queue-5.4/tracing-ring-buffer-have-polling-block-on-watermark.patch diff --git a/queue-5.4/epoll-call-final-ep_events_available-check-under-the-lock.patch b/queue-5.4/epoll-call-final-ep_events_available-check-under-the-lock.patch new file mode 100644 index 00000000000..275be3dc6f8 --- /dev/null +++ b/queue-5.4/epoll-call-final-ep_events_available-check-under-the-lock.patch @@ -0,0 +1,124 @@ +From 65759097d804d2a9ad2b687db436319704ba7019 Mon Sep 17 00:00:00 2001 +From: Roman Penyaev +Date: Wed, 13 May 2020 17:50:38 -0700 +Subject: epoll: call final ep_events_available() check under the lock + +From: Roman Penyaev + +commit 65759097d804d2a9ad2b687db436319704ba7019 upstream. + +There is a possible race when ep_scan_ready_list() leaves ->rdllist and +->obflist empty for a short period of time although some events are +pending. It is quite likely that ep_events_available() observes empty +lists and goes to sleep. + +Since commit 339ddb53d373 ("fs/epoll: remove unnecessary wakeups of +nested epoll") we are conservative in wakeups (there is only one place +for wakeup and this is ep_poll_callback()), thus ep_events_available() +must always observe correct state of two lists. + +The easiest and correct way is to do the final check under the lock. +This does not impact the performance, since lock is taken anyway for +adding a wait entry to the wait queue. + +The discussion of the problem can be found here: + + https://lore.kernel.org/linux-fsdevel/a2f22c3c-c25a-4bda-8339-a7bdaf17849e@akamai.com/ + +In this patch barrierless __set_current_state() is used. This is safe +since waitqueue_active() is called under the same lock on wakeup side. + +Short-circuit for fatal signals (i.e. fatal_signal_pending() check) is +moved to the line just before actual events harvesting routine. This is +fully compliant to what is said in the comment of the patch where the +actual fatal_signal_pending() check was added: c257a340ede0 ("fs, epoll: +short circuit fetching events if thread has been killed"). 
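+
+As an illustration of the rule being applied here, a minimal userspace
+sketch (pthreads, not the kernel code): the waiter's final "anything
+available?" check must happen under the same lock the waker takes, so
+that an event published between an unlocked check and the sleep cannot
+be missed.  Build with -pthread:
+
+  #include <pthread.h>
+  #include <stdbool.h>
+
+  static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
+  static pthread_cond_t  cond = PTHREAD_COND_INITIALIZER;
+  static bool events_available;   /* stands in for ->rdllist/->ovflist */
+
+  static void consumer_wait(void)
+  {
+          pthread_mutex_lock(&lock);
+          /* Final check under the lock, as ep_poll() now does. */
+          while (!events_available)
+                  pthread_cond_wait(&cond, &lock);
+          events_available = false;      /* "harvest" the event */
+          pthread_mutex_unlock(&lock);
+  }
+
+  static void *producer(void *arg)
+  {
+          (void)arg;
+          pthread_mutex_lock(&lock);
+          events_available = true;       /* publish under the same lock... */
+          pthread_cond_signal(&cond);    /* ...so the wakeup cannot be lost */
+          pthread_mutex_unlock(&lock);
+          return NULL;
+  }
+
+  int main(void)
+  {
+          pthread_t t;
+
+          pthread_create(&t, NULL, producer, NULL);
+          consumer_wait();               /* returns once the event is seen */
+          pthread_join(t, NULL);
+          return 0;
+  }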
+ +Fixes: 339ddb53d373 ("fs/epoll: remove unnecessary wakeups of nested epoll") +Reported-by: Jason Baron +Reported-by: Randy Dunlap +Signed-off-by: Roman Penyaev +Signed-off-by: Andrew Morton +Reviewed-by: Jason Baron +Cc: Khazhismel Kumykov +Cc: Alexander Viro +Cc: +Link: http://lkml.kernel.org/r/20200505145609.1865152-1-rpenyaev@suse.de +Signed-off-by: Linus Torvalds +Acked-by: Thadeu Lima de Souza Cascardo +Signed-off-by: Rishabh Bhatnagar +Signed-off-by: Greg Kroah-Hartman +--- + fs/eventpoll.c | 47 +++++++++++++++++++++++++++-------------------- + 1 file changed, 27 insertions(+), 20 deletions(-) + +--- a/fs/eventpoll.c ++++ b/fs/eventpoll.c +@@ -1905,33 +1905,31 @@ fetch_events: + init_wait(&wait); + wait.func = ep_autoremove_wake_function; + write_lock_irq(&ep->lock); +- __add_wait_queue_exclusive(&ep->wq, &wait); +- write_unlock_irq(&ep->lock); +- + /* +- * We don't want to sleep if the ep_poll_callback() sends us +- * a wakeup in between. That's why we set the task state +- * to TASK_INTERRUPTIBLE before doing the checks. ++ * Barrierless variant, waitqueue_active() is called under ++ * the same lock on wakeup ep_poll_callback() side, so it ++ * is safe to avoid an explicit barrier. + */ +- set_current_state(TASK_INTERRUPTIBLE); ++ __set_current_state(TASK_INTERRUPTIBLE); ++ + /* +- * Always short-circuit for fatal signals to allow +- * threads to make a timely exit without the chance of +- * finding more events available and fetching +- * repeatedly. ++ * Do the final check under the lock. ep_scan_ready_list() ++ * plays with two lists (->rdllist and ->ovflist) and there ++ * is always a race when both lists are empty for short ++ * period of time although events are pending, so lock is ++ * important. + */ +- if (fatal_signal_pending(current)) { +- res = -EINTR; +- break; ++ eavail = ep_events_available(ep); ++ if (!eavail) { ++ if (signal_pending(current)) ++ res = -EINTR; ++ else ++ __add_wait_queue_exclusive(&ep->wq, &wait); + } ++ write_unlock_irq(&ep->lock); + +- eavail = ep_events_available(ep); +- if (eavail) +- break; +- if (signal_pending(current)) { +- res = -EINTR; ++ if (eavail || res) + break; +- } + + if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS)) { + timed_out = 1; +@@ -1952,6 +1950,15 @@ fetch_events: + } + + send_events: ++ if (fatal_signal_pending(current)) { ++ /* ++ * Always short-circuit for fatal signals to allow ++ * threads to make a timely exit without the chance of ++ * finding more events available and fetching ++ * repeatedly. ++ */ ++ res = -EINTR; ++ } + /* + * Try to transfer events to user space. In case we get 0 events and + * there's still timeout left over, we go trying again in search of diff --git a/queue-5.4/epoll-check-for-events-when-removing-a-timed-out-thread-from-the-wait-queue.patch b/queue-5.4/epoll-check-for-events-when-removing-a-timed-out-thread-from-the-wait-queue.patch new file mode 100644 index 00000000000..f8065eca3d9 --- /dev/null +++ b/queue-5.4/epoll-check-for-events-when-removing-a-timed-out-thread-from-the-wait-queue.patch @@ -0,0 +1,125 @@ +From 289caf5d8f6c61c6d2b7fd752a7f483cd153f182 Mon Sep 17 00:00:00 2001 +From: Soheil Hassas Yeganeh +Date: Fri, 18 Dec 2020 14:01:44 -0800 +Subject: epoll: check for events when removing a timed out thread from the wait queue + +From: Soheil Hassas Yeganeh + +commit 289caf5d8f6c61c6d2b7fd752a7f483cd153f182 upstream. + +Patch series "simplify ep_poll". 
+
+This patch series is a followup based on the suggestions and feedback by
+Linus:
+https://lkml.kernel.org/r/CAHk-=wizk=OxUyQPbO8MS41w2Pag1kniUV5WdD5qWL-gq1kjDA@mail.gmail.com
+
+The first patch in the series is a fix for the epoll race in presence of
+timeouts, so that it can be cleanly backported to all affected stable
+kernels.
+
+The rest of the patch series simplifies the ep_poll() implementation. Some
+of these simplifications result in minor performance enhancements as well.
+We have kept these changes under self tests and internal benchmarks for a
+few days, and there are minor (1-2%) performance enhancements as a result.
+
+This patch (of 8):
+
+After abc610e01c66 ("fs/epoll: avoid barrier after an epoll_wait(2)
+timeout"), we break out of the ep_poll loop upon timeout, without checking
+whether there are any new events available. Prior to that patch series we
+always called ep_events_available() after exiting the loop.
+
+This can cause races and missed wakeups. For example, consider the
+following scenario reported by Guantao Liu:
+
+Suppose we have an eventfd added using EPOLLET to an epollfd.
+
+Thread 1: Sleeps for just below 5 ms and then writes to an eventfd.
+Thread 2: Calls epoll_wait with a timeout of 5 ms. If it sees an
+          event of the eventfd, it will write back on that fd.
+Thread 3: Calls epoll_wait with a negative timeout.
+
+Prior to abc610e01c66, it is guaranteed that Thread 3 will be woken up
+either by Thread 1 or by Thread 2. After abc610e01c66, Thread 3 can be
+blocked indefinitely if Thread 2 sees a timeout right before the write to
+the eventfd by Thread 1. Thread 2 will be woken up from
+schedule_hrtimeout_range and, with eavail 0, it will not call
+ep_send_events().
+
+To fix this issue:
+1) Simplify the timed_out case as suggested by Linus.
+2) While holding the lock, recheck whether the thread was woken up
+   after its timeout expired.
+
+Note that (2) is different from Linus' original suggestion: it does not set
+"eavail = ep_events_available(ep)", to avoid unnecessary contention (when
+there are too many timed-out threads and a small number of events), as
+well as races mentioned in the discussion thread.
+
+This is the first patch in the series so that the backport to stable
+releases is straightforward.
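+
+For reference, the scenario above can be written as a small userspace
+program (a rough, timing-dependent sketch: on kernels without this fix
+the final epoll_wait() can block forever, with the fix it always
+returns).  Build with -pthread:
+
+  #include <pthread.h>
+  #include <stdint.h>
+  #include <stdio.h>
+  #include <unistd.h>
+  #include <sys/epoll.h>
+  #include <sys/eventfd.h>
+
+  static int epfd, efd;
+
+  static void *writer(void *arg)           /* Thread 1 */
+  {
+          uint64_t v = 1;
+
+          (void)arg;
+          usleep(4900);                    /* just below the 5 ms timeout */
+          write(efd, &v, sizeof(v));
+          return NULL;
+  }
+
+  static void *short_waiter(void *arg)     /* Thread 2 */
+  {
+          struct epoll_event ev;
+          uint64_t v = 1;
+
+          (void)arg;
+          if (epoll_wait(epfd, &ev, 1, 5) > 0)
+                  write(efd, &v, sizeof(v));  /* write back on the fd */
+          return NULL;
+  }
+
+  int main(void)
+  {
+          struct epoll_event ev = { .events = EPOLLIN | EPOLLET };
+          pthread_t t1, t2;
+
+          epfd = epoll_create1(0);
+          efd = eventfd(0, EFD_NONBLOCK);
+          ev.data.fd = efd;
+          epoll_ctl(epfd, EPOLL_CTL_ADD, efd, &ev);
+
+          pthread_create(&t1, NULL, writer, NULL);
+          pthread_create(&t2, NULL, short_waiter, NULL);
+
+          /* Thread 3: must be woken up by Thread 1 or Thread 2. */
+          epoll_wait(epfd, &ev, 1, -1);
+          puts("woken up");
+
+          pthread_join(t1, NULL);
+          pthread_join(t2, NULL);
+          return 0;
+  }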
+ +Link: https://lkml.kernel.org/r/20201106231635.3528496-1-soheil.kdev@gmail.com +Link: https://lkml.kernel.org/r/CAHk-=wizk=OxUyQPbO8MS41w2Pag1kniUV5WdD5qWL-gq1kjDA@mail.gmail.com +Link: https://lkml.kernel.org/r/20201106231635.3528496-2-soheil.kdev@gmail.com +Fixes: abc610e01c66 ("fs/epoll: avoid barrier after an epoll_wait(2) timeout") +Signed-off-by: Soheil Hassas Yeganeh +Tested-by: Guantao Liu +Suggested-by: Linus Torvalds +Reported-by: Guantao Liu +Reviewed-by: Eric Dumazet +Reviewed-by: Willem de Bruijn +Reviewed-by: Khazhismel Kumykov +Reviewed-by: Davidlohr Bueso +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Acked-by: Thadeu Lima de Souza Cascardo +Signed-off-by: Rishabh Bhatnagar +Signed-off-by: Greg Kroah-Hartman +--- + fs/eventpoll.c | 27 +++++++++++++++++---------- + 1 file changed, 17 insertions(+), 10 deletions(-) + +--- a/fs/eventpoll.c ++++ b/fs/eventpoll.c +@@ -1928,23 +1928,30 @@ fetch_events: + } + write_unlock_irq(&ep->lock); + +- if (eavail || res) +- break; +- +- if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS)) { +- timed_out = 1; +- break; +- } +- +- /* We were woken up, thus go and try to harvest some events */ ++ if (!eavail && !res) ++ timed_out = !schedule_hrtimeout_range(to, slack, ++ HRTIMER_MODE_ABS); ++ ++ /* ++ * We were woken up, thus go and try to harvest some events. ++ * If timed out and still on the wait queue, recheck eavail ++ * carefully under lock, below. ++ */ + eavail = 1; +- + } while (0); + + __set_current_state(TASK_RUNNING); + + if (!list_empty_careful(&wait.entry)) { + write_lock_irq(&ep->lock); ++ /* ++ * If the thread timed out and is not on the wait queue, it ++ * means that the thread was woken up after its timeout expired ++ * before it could reacquire the lock. Thus, when wait.entry is ++ * empty, it needs to harvest events. ++ */ ++ if (timed_out) ++ eavail = list_empty(&wait.entry); + __remove_wait_queue(&ep->wq, &wait); + write_unlock_irq(&ep->lock); + } diff --git a/queue-5.4/nvme-ensure-subsystem-reset-is-single-threaded.patch b/queue-5.4/nvme-ensure-subsystem-reset-is-single-threaded.patch new file mode 100644 index 00000000000..ff628adc8c5 --- /dev/null +++ b/queue-5.4/nvme-ensure-subsystem-reset-is-single-threaded.patch @@ -0,0 +1,67 @@ +From 1e866afd4bcdd01a70a5eddb4371158d3035ce03 Mon Sep 17 00:00:00 2001 +From: Keith Busch +Date: Thu, 22 Sep 2022 08:13:47 -0700 +Subject: nvme: ensure subsystem reset is single threaded + +From: Keith Busch + +commit 1e866afd4bcdd01a70a5eddb4371158d3035ce03 upstream. + +The subsystem reset writes to a register, so we have to ensure the +device state is capable of handling that otherwise the driver may access +unmapped registers. Use the state machine to ensure the subsystem reset +doesn't try to write registers on a device already undergoing this type +of reset. 
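+
+The idea can be pictured with a generic sketch (illustration only, not
+the actual nvme code): a single atomic state transition decides which
+caller owns the reset, and every other caller backs off with -EBUSY
+instead of touching the hardware:
+
+  #include <errno.h>
+  #include <stdatomic.h>
+  #include <stdio.h>
+
+  enum ctrl_state { CTRL_LIVE, CTRL_RESETTING };
+
+  static _Atomic enum ctrl_state state = CTRL_LIVE;
+
+  static int subsystem_reset(void)
+  {
+          enum ctrl_state expected = CTRL_LIVE;
+
+          /* Only one caller wins the LIVE -> RESETTING transition. */
+          if (!atomic_compare_exchange_strong(&state, &expected,
+                                              CTRL_RESETTING))
+                  return -EBUSY;         /* a reset is already in flight */
+
+          /* Safe to touch the reset register: we own RESETTING. */
+          printf("writing reset register\n");
+
+          /* ...tear down and re-enable, then return to LIVE. */
+          atomic_store(&state, CTRL_LIVE);
+          return 0;
+  }
+
+  int main(void)
+  {
+          /* A concurrent second caller would get -EBUSY instead. */
+          return subsystem_reset();
+  }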
+ +Link: https://bugzilla.kernel.org/show_bug.cgi?id=214771 +Signed-off-by: Keith Busch +Signed-off-by: Christoph Hellwig +Signed-off-by: Ovidiu Panait +Signed-off-by: Greg Kroah-Hartman +--- + drivers/nvme/host/nvme.h | 16 +++++++++++++--- + 1 file changed, 13 insertions(+), 3 deletions(-) + +--- a/drivers/nvme/host/nvme.h ++++ b/drivers/nvme/host/nvme.h +@@ -422,11 +422,23 @@ static inline void nvme_fault_inject_fin + static inline void nvme_should_fail(struct request *req) {} + #endif + ++bool nvme_wait_reset(struct nvme_ctrl *ctrl); ++int nvme_try_sched_reset(struct nvme_ctrl *ctrl); ++ + static inline int nvme_reset_subsystem(struct nvme_ctrl *ctrl) + { ++ int ret; ++ + if (!ctrl->subsystem) + return -ENOTTY; +- return ctrl->ops->reg_write32(ctrl, NVME_REG_NSSR, 0x4E564D65); ++ if (!nvme_wait_reset(ctrl)) ++ return -EBUSY; ++ ++ ret = ctrl->ops->reg_write32(ctrl, NVME_REG_NSSR, 0x4E564D65); ++ if (ret) ++ return ret; ++ ++ return nvme_try_sched_reset(ctrl); + } + + /* +@@ -473,7 +485,6 @@ void nvme_cancel_tagset(struct nvme_ctrl + void nvme_cancel_admin_tagset(struct nvme_ctrl *ctrl); + bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, + enum nvme_ctrl_state new_state); +-bool nvme_wait_reset(struct nvme_ctrl *ctrl); + int nvme_disable_ctrl(struct nvme_ctrl *ctrl); + int nvme_enable_ctrl(struct nvme_ctrl *ctrl); + int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl); +@@ -525,7 +536,6 @@ int nvme_set_queue_count(struct nvme_ctr + void nvme_stop_keep_alive(struct nvme_ctrl *ctrl); + int nvme_reset_ctrl(struct nvme_ctrl *ctrl); + int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl); +-int nvme_try_sched_reset(struct nvme_ctrl *ctrl); + int nvme_delete_ctrl(struct nvme_ctrl *ctrl); + + int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, diff --git a/queue-5.4/nvme-restrict-management-ioctls-to-admin.patch b/queue-5.4/nvme-restrict-management-ioctls-to-admin.patch new file mode 100644 index 00000000000..1b587aa9ea0 --- /dev/null +++ b/queue-5.4/nvme-restrict-management-ioctls-to-admin.patch @@ -0,0 +1,41 @@ +From 23e085b2dead13b51fe86d27069895b740f749c0 Mon Sep 17 00:00:00 2001 +From: Keith Busch +Date: Thu, 22 Sep 2022 07:54:06 -0700 +Subject: nvme: restrict management ioctls to admin + +From: Keith Busch + +commit 23e085b2dead13b51fe86d27069895b740f749c0 upstream. + +The passthrough commands already have this restriction, but the other +operations do not. Require the same capabilities for all users as all of +these operations, which include resets and rescans, can be disruptive. 
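+
+For reference, a small userspace probe of the new behaviour (a sketch:
+/dev/nvme0 is only an example path, and the caller is assumed to have
+open permission on the node but no CAP_SYS_ADMIN):
+
+  #include <errno.h>
+  #include <fcntl.h>
+  #include <stdio.h>
+  #include <string.h>
+  #include <unistd.h>
+  #include <sys/ioctl.h>
+  #include <linux/nvme_ioctl.h>
+
+  int main(void)
+  {
+          int fd = open("/dev/nvme0", O_RDONLY);
+
+          if (fd < 0) {
+                  perror("open /dev/nvme0");
+                  return 1;
+          }
+
+          if (ioctl(fd, NVME_IOCTL_RESCAN) < 0)
+                  /* Without CAP_SYS_ADMIN this now fails with EACCES. */
+                  printf("NVME_IOCTL_RESCAN: %s\n", strerror(errno));
+          else
+                  printf("NVME_IOCTL_RESCAN: allowed (CAP_SYS_ADMIN)\n");
+
+          close(fd);
+          return 0;
+  }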
+ +Signed-off-by: Keith Busch +Signed-off-by: Christoph Hellwig +Signed-off-by: Greg Kroah-Hartman +Signed-off-by: Ovidiu Panait +--- + drivers/nvme/host/core.c | 6 ++++++ + 1 file changed, 6 insertions(+) + +--- a/drivers/nvme/host/core.c ++++ b/drivers/nvme/host/core.c +@@ -3027,11 +3027,17 @@ static long nvme_dev_ioctl(struct file * + case NVME_IOCTL_IO_CMD: + return nvme_dev_user_cmd(ctrl, argp); + case NVME_IOCTL_RESET: ++ if (!capable(CAP_SYS_ADMIN)) ++ return -EACCES; + dev_warn(ctrl->device, "resetting controller\n"); + return nvme_reset_ctrl_sync(ctrl); + case NVME_IOCTL_SUBSYS_RESET: ++ if (!capable(CAP_SYS_ADMIN)) ++ return -EACCES; + return nvme_reset_subsystem(ctrl); + case NVME_IOCTL_RESCAN: ++ if (!capable(CAP_SYS_ADMIN)) ++ return -EACCES; + nvme_queue_scan(ctrl); + return 0; + default: diff --git a/queue-5.4/series b/queue-5.4/series index 4fc04ed9e27..2bcb95d87ec 100644 --- a/queue-5.4/series +++ b/queue-5.4/series @@ -145,3 +145,8 @@ selftests-net-add-delete-nexthop-route-warning-test.patch selftests-net-fix-nexthop-warning-cleanup-double-ip-.patch ipv4-handle-attempt-to-delete-multipath-route-when-f.patch ipv4-fix-route-deletion-when-nexthop-info-is-not-spe.patch +tracing-ring-buffer-have-polling-block-on-watermark.patch +epoll-call-final-ep_events_available-check-under-the-lock.patch +epoll-check-for-events-when-removing-a-timed-out-thread-from-the-wait-queue.patch +nvme-restrict-management-ioctls-to-admin.patch +nvme-ensure-subsystem-reset-is-single-threaded.patch diff --git a/queue-5.4/tracing-ring-buffer-have-polling-block-on-watermark.patch b/queue-5.4/tracing-ring-buffer-have-polling-block-on-watermark.patch new file mode 100644 index 00000000000..ae4f6d534c8 --- /dev/null +++ b/queue-5.4/tracing-ring-buffer-have-polling-block-on-watermark.patch @@ -0,0 +1,186 @@ +From 42fb0a1e84ff525ebe560e2baf9451ab69127e2b Mon Sep 17 00:00:00 2001 +From: "Steven Rostedt (Google)" +Date: Thu, 20 Oct 2022 23:14:27 -0400 +Subject: tracing/ring-buffer: Have polling block on watermark + +From: Steven Rostedt (Google) + +commit 42fb0a1e84ff525ebe560e2baf9451ab69127e2b upstream. + +Currently the way polling works on the ring buffer is broken. It will +return immediately if there's any data in the ring buffer whereas a read +will block until the watermark (defined by the tracefs buffer_percent file) +is hit. + +That is, a select() or poll() will return as if there's data available, +but then the following read will block. This is broken for the way +select()s and poll()s are supposed to work. + +Have the polling on the ring buffer also block the same way reads and +splice does on the ring buffer. 
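+
+From userspace the change can be observed with a sketch like the
+following (paths assume tracefs is mounted at /sys/kernel/tracing and
+that cpu0 exists): with buffer_percent set to 50, poll() on a per-CPU
+trace_pipe_raw now blocks until half of the buffer is dirty, just as
+read() and splice() already did:
+
+  #include <fcntl.h>
+  #include <poll.h>
+  #include <stdio.h>
+  #include <unistd.h>
+
+  int main(void)
+  {
+          struct pollfd pfd;
+          int fd;
+
+          /* Wake pollers only once 50% of the ring buffer is dirty. */
+          fd = open("/sys/kernel/tracing/buffer_percent", O_WRONLY);
+          if (fd >= 0) {
+                  write(fd, "50", 2);
+                  close(fd);
+          }
+
+          pfd.fd = open("/sys/kernel/tracing/per_cpu/cpu0/trace_pipe_raw",
+                        O_RDONLY | O_NONBLOCK);
+          if (pfd.fd < 0) {
+                  perror("trace_pipe_raw");
+                  return 1;
+          }
+          pfd.events = POLLIN;
+
+          /* Blocks until the watermark is hit (or forever if idle). */
+          if (poll(&pfd, 1, -1) > 0 && (pfd.revents & POLLIN))
+                  puts("watermark reached, data ready");
+
+          close(pfd.fd);
+          return 0;
+  }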
+ +Link: https://lkml.kernel.org/r/20221020231427.41be3f26@gandalf.local.home + +Cc: Linux Trace Kernel +Cc: Masami Hiramatsu +Cc: Mathieu Desnoyers +Cc: Primiano Tucci +Cc: stable@vger.kernel.org +Fixes: 1e0d6714aceb7 ("ring-buffer: Do not wake up a splice waiter when page is not full") +Signed-off-by: Steven Rostedt (Google) +Signed-off-by: Greg Kroah-Hartman +--- + include/linux/ring_buffer.h | 2 - + kernel/trace/ring_buffer.c | 54 ++++++++++++++++++++++++++++---------------- + kernel/trace/trace.c | 2 - + 3 files changed, 37 insertions(+), 21 deletions(-) + +--- a/include/linux/ring_buffer.h ++++ b/include/linux/ring_buffer.h +@@ -99,7 +99,7 @@ __ring_buffer_alloc(unsigned long size, + + int ring_buffer_wait(struct ring_buffer *buffer, int cpu, int full); + __poll_t ring_buffer_poll_wait(struct ring_buffer *buffer, int cpu, +- struct file *filp, poll_table *poll_table); ++ struct file *filp, poll_table *poll_table, int full); + + + #define RING_BUFFER_ALL_CPUS -1 +--- a/kernel/trace/ring_buffer.c ++++ b/kernel/trace/ring_buffer.c +@@ -566,6 +566,21 @@ size_t ring_buffer_nr_dirty_pages(struct + return cnt - read; + } + ++static __always_inline bool full_hit(struct ring_buffer *buffer, int cpu, int full) ++{ ++ struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; ++ size_t nr_pages; ++ size_t dirty; ++ ++ nr_pages = cpu_buffer->nr_pages; ++ if (!nr_pages || !full) ++ return true; ++ ++ dirty = ring_buffer_nr_dirty_pages(buffer, cpu); ++ ++ return (dirty * 100) > (full * nr_pages); ++} ++ + /* + * rb_wake_up_waiters - wake up tasks waiting for ring buffer input + * +@@ -661,22 +676,20 @@ int ring_buffer_wait(struct ring_buffer + !ring_buffer_empty_cpu(buffer, cpu)) { + unsigned long flags; + bool pagebusy; +- size_t nr_pages; +- size_t dirty; ++ bool done; + + if (!full) + break; + + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page; +- nr_pages = cpu_buffer->nr_pages; +- dirty = ring_buffer_nr_dirty_pages(buffer, cpu); ++ done = !pagebusy && full_hit(buffer, cpu, full); ++ + if (!cpu_buffer->shortest_full || + cpu_buffer->shortest_full > full) + cpu_buffer->shortest_full = full; + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); +- if (!pagebusy && +- (!nr_pages || (dirty * 100) > full * nr_pages)) ++ if (done) + break; + } + +@@ -697,6 +710,7 @@ int ring_buffer_wait(struct ring_buffer + * @cpu: the cpu buffer to wait on + * @filp: the file descriptor + * @poll_table: The poll descriptor ++ * @full: wait until the percentage of pages are available, if @cpu != RING_BUFFER_ALL_CPUS + * + * If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon + * as data is added to any of the @buffer's cpu buffers. Otherwise +@@ -706,14 +720,14 @@ int ring_buffer_wait(struct ring_buffer + * zero otherwise. 
+ */ + __poll_t ring_buffer_poll_wait(struct ring_buffer *buffer, int cpu, +- struct file *filp, poll_table *poll_table) ++ struct file *filp, poll_table *poll_table, int full) + { + struct ring_buffer_per_cpu *cpu_buffer; + struct rb_irq_work *work; + +- if (cpu == RING_BUFFER_ALL_CPUS) ++ if (cpu == RING_BUFFER_ALL_CPUS) { + work = &buffer->irq_work; +- else { ++ } else { + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return -EINVAL; + +@@ -721,8 +735,14 @@ __poll_t ring_buffer_poll_wait(struct ri + work = &cpu_buffer->irq_work; + } + +- poll_wait(filp, &work->waiters, poll_table); +- work->waiters_pending = true; ++ if (full) { ++ poll_wait(filp, &work->full_waiters, poll_table); ++ work->full_waiters_pending = true; ++ } else { ++ poll_wait(filp, &work->waiters, poll_table); ++ work->waiters_pending = true; ++ } ++ + /* + * There's a tight race between setting the waiters_pending and + * checking if the ring buffer is empty. Once the waiters_pending bit +@@ -738,6 +758,9 @@ __poll_t ring_buffer_poll_wait(struct ri + */ + smp_mb(); + ++ if (full) ++ return full_hit(buffer, cpu, full) ? EPOLLIN | EPOLLRDNORM : 0; ++ + if ((cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) || + (cpu != RING_BUFFER_ALL_CPUS && !ring_buffer_empty_cpu(buffer, cpu))) + return EPOLLIN | EPOLLRDNORM; +@@ -2640,10 +2663,6 @@ static void rb_commit(struct ring_buffer + static __always_inline void + rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer) + { +- size_t nr_pages; +- size_t dirty; +- size_t full; +- + if (buffer->irq_work.waiters_pending) { + buffer->irq_work.waiters_pending = false; + /* irq_work_queue() supplies it's own memory barriers */ +@@ -2667,10 +2686,7 @@ rb_wakeups(struct ring_buffer *buffer, s + + cpu_buffer->last_pages_touch = local_read(&cpu_buffer->pages_touched); + +- full = cpu_buffer->shortest_full; +- nr_pages = cpu_buffer->nr_pages; +- dirty = ring_buffer_nr_dirty_pages(buffer, cpu_buffer->cpu); +- if (full && nr_pages && (dirty * 100) <= full * nr_pages) ++ if (!full_hit(buffer, cpu_buffer->cpu, cpu_buffer->shortest_full)) + return; + + cpu_buffer->irq_work.wakeup_full = true; +--- a/kernel/trace/trace.c ++++ b/kernel/trace/trace.c +@@ -5993,7 +5993,7 @@ trace_poll(struct trace_iterator *iter, + return EPOLLIN | EPOLLRDNORM; + else + return ring_buffer_poll_wait(iter->trace_buffer->buffer, iter->cpu_file, +- filp, poll_table); ++ filp, poll_table, iter->tr->buffer_percent); + } + + static __poll_t -- 2.47.3