From c814cf80ae96504efdd93196eac4c3481a0ad1f1 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 12 May 2020 13:35:23 +0200 Subject: [PATCH] 5.4-stable patches added patches: arm64-hugetlb-avoid-potential-null-dereference.patch ceph-demote-quotarealm-lookup-warning-to-a-debug-message.patch ceph-fix-endianness-bug-when-handling-mds-session-feature-bits.patch coredump-fix-crash-when-umh-is-disabled.patch drm-ingenic-drm-add-module_device_table.patch epoll-atomically-remove-wait-entry-on-wake-up.patch eventpoll-fix-missing-wakeup-for-ovflist-in-ep_poll_callback.patch iocost-protect-iocg-abs_vdebt-with-iocg-waitq.lock.patch ipc-mqueue.c-change-__do_notify-to-bypass-check_kill_permission.patch mm-limit-boost_watermark-on-small-zones.patch mm-page_alloc-fix-watchdog-soft-lockups-during-set_zone_contiguous.patch riscv-set-max_pfn-to-the-pfn-of-the-last-page.patch staging-gasket-check-the-return-value-of-gasket_get_bar_index.patch --- ...tlb-avoid-potential-null-dereference.patch | 56 ++++ ...lm-lookup-warning-to-a-debug-message.patch | 40 +++ ...en-handling-mds-session-feature-bits.patch | 48 +++ ...edump-fix-crash-when-umh-is-disabled.patch | 123 +++++++ ...-ingenic-drm-add-module_device_table.patch | 33 ++ ...mically-remove-wait-entry-on-wake-up.patch | 178 +++++++++++ ...keup-for-ovflist-in-ep_poll_callback.patch | 77 +++++ ...-iocg-abs_vdebt-with-iocg-waitq.lock.patch | 300 ++++++++++++++++++ ...tify-to-bypass-check_kill_permission.patch | 149 +++++++++ ...limit-boost_watermark-on-small-zones.patch | 82 +++++ ...t-lockups-during-set_zone_contiguous.patch | 62 ++++ ...-max_pfn-to-the-pfn-of-the-last-page.patch | 70 ++++ queue-5.4/series | 13 + ...return-value-of-gasket_get_bar_index.patch | 39 +++ 14 files changed, 1270 insertions(+) create mode 100644 queue-5.4/arm64-hugetlb-avoid-potential-null-dereference.patch create mode 100644 queue-5.4/ceph-demote-quotarealm-lookup-warning-to-a-debug-message.patch create mode 100644 queue-5.4/ceph-fix-endianness-bug-when-handling-mds-session-feature-bits.patch create mode 100644 queue-5.4/coredump-fix-crash-when-umh-is-disabled.patch create mode 100644 queue-5.4/drm-ingenic-drm-add-module_device_table.patch create mode 100644 queue-5.4/epoll-atomically-remove-wait-entry-on-wake-up.patch create mode 100644 queue-5.4/eventpoll-fix-missing-wakeup-for-ovflist-in-ep_poll_callback.patch create mode 100644 queue-5.4/iocost-protect-iocg-abs_vdebt-with-iocg-waitq.lock.patch create mode 100644 queue-5.4/ipc-mqueue.c-change-__do_notify-to-bypass-check_kill_permission.patch create mode 100644 queue-5.4/mm-limit-boost_watermark-on-small-zones.patch create mode 100644 queue-5.4/mm-page_alloc-fix-watchdog-soft-lockups-during-set_zone_contiguous.patch create mode 100644 queue-5.4/riscv-set-max_pfn-to-the-pfn-of-the-last-page.patch create mode 100644 queue-5.4/staging-gasket-check-the-return-value-of-gasket_get_bar_index.patch diff --git a/queue-5.4/arm64-hugetlb-avoid-potential-null-dereference.patch b/queue-5.4/arm64-hugetlb-avoid-potential-null-dereference.patch new file mode 100644 index 00000000000..0c40d51f6f4 --- /dev/null +++ b/queue-5.4/arm64-hugetlb-avoid-potential-null-dereference.patch @@ -0,0 +1,56 @@ +From 027d0c7101f50cf03aeea9eebf484afd4920c8d3 Mon Sep 17 00:00:00 2001 +From: Mark Rutland +Date: Tue, 5 May 2020 13:59:30 +0100 +Subject: arm64: hugetlb: avoid potential NULL dereference +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Mark Rutland + +commit 
027d0c7101f50cf03aeea9eebf484afd4920c8d3 upstream. + +The static analyzer in GCC 10 spotted that in huge_pte_alloc() we may +pass a NULL pmdp into pte_alloc_map() when pmd_alloc() returns NULL: + +| CC arch/arm64/mm/pageattr.o +| CC arch/arm64/mm/hugetlbpage.o +| from arch/arm64/mm/hugetlbpage.c:10: +| arch/arm64/mm/hugetlbpage.c: In function ‘huge_pte_alloc’: +| ./arch/arm64/include/asm/pgtable-types.h:28:24: warning: dereference of NULL ‘pmdp’ [CWE-690] [-Wanalyzer-null-dereference] +| ./arch/arm64/include/asm/pgtable.h:436:26: note: in expansion of macro ‘pmd_val’ +| arch/arm64/mm/hugetlbpage.c:242:10: note: in expansion of macro ‘pte_alloc_map’ +| |arch/arm64/mm/hugetlbpage.c:232:10: +| |./arch/arm64/include/asm/pgtable-types.h:28:24: +| ./arch/arm64/include/asm/pgtable.h:436:26: note: in expansion of macro ‘pmd_val’ +| arch/arm64/mm/hugetlbpage.c:242:10: note: in expansion of macro ‘pte_alloc_map’ + +This can only occur when the kernel cannot allocate a page, and so is +unlikely to happen in practice before other systems start failing. + +We can avoid this by bailing out if pmd_alloc() fails, as we do earlier +in the function if pud_alloc() fails. + +Fixes: 66b3923a1a0f ("arm64: hugetlb: add support for PTE contiguous bit") +Signed-off-by: Mark Rutland +Reported-by: Kyrill Tkachov +Cc: # 4.5.x- +Cc: Will Deacon +Signed-off-by: Catalin Marinas +Signed-off-by: Greg Kroah-Hartman + +--- + arch/arm64/mm/hugetlbpage.c | 2 ++ + 1 file changed, 2 insertions(+) + +--- a/arch/arm64/mm/hugetlbpage.c ++++ b/arch/arm64/mm/hugetlbpage.c +@@ -230,6 +230,8 @@ pte_t *huge_pte_alloc(struct mm_struct * + ptep = (pte_t *)pudp; + } else if (sz == (CONT_PTE_SIZE)) { + pmdp = pmd_alloc(mm, pudp, addr); ++ if (!pmdp) ++ return NULL; + + WARN_ON(addr & (sz - 1)); + /* diff --git a/queue-5.4/ceph-demote-quotarealm-lookup-warning-to-a-debug-message.patch b/queue-5.4/ceph-demote-quotarealm-lookup-warning-to-a-debug-message.patch new file mode 100644 index 00000000000..17d5931de34 --- /dev/null +++ b/queue-5.4/ceph-demote-quotarealm-lookup-warning-to-a-debug-message.patch @@ -0,0 +1,40 @@ +From 12ae44a40a1be891bdc6463f8c7072b4ede746ef Mon Sep 17 00:00:00 2001 +From: Luis Henriques +Date: Tue, 5 May 2020 13:59:02 +0100 +Subject: ceph: demote quotarealm lookup warning to a debug message + +From: Luis Henriques + +commit 12ae44a40a1be891bdc6463f8c7072b4ede746ef upstream. + +A misconfigured cephx can easily result in having the kernel client +flooding the logs with: + + ceph: Can't lookup inode 1 (err: -13) + +Change this message to debug level. 
+
+Cc: stable@vger.kernel.org
+URL: https://tracker.ceph.com/issues/44546
+Signed-off-by: Luis Henriques
+Reviewed-by: Jeff Layton
+Signed-off-by: Ilya Dryomov
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ fs/ceph/quota.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/fs/ceph/quota.c
++++ b/fs/ceph/quota.c
+@@ -159,8 +159,8 @@ static struct inode *lookup_quotarealm_i
+ 	}
+ 
+ 	if (IS_ERR(in)) {
+-		pr_warn("Can't lookup inode %llx (err: %ld)\n",
+-			realm->ino, PTR_ERR(in));
++		dout("Can't lookup inode %llx (err: %ld)\n",
++		     realm->ino, PTR_ERR(in));
+ 		qri->timeout = jiffies + msecs_to_jiffies(60 * 1000); /* XXX */
+ 	} else {
+ 		qri->timeout = 0;
diff --git a/queue-5.4/ceph-fix-endianness-bug-when-handling-mds-session-feature-bits.patch b/queue-5.4/ceph-fix-endianness-bug-when-handling-mds-session-feature-bits.patch
new file mode 100644
index 00000000000..b1d7cd7e8d0
--- /dev/null
+++ b/queue-5.4/ceph-fix-endianness-bug-when-handling-mds-session-feature-bits.patch
@@ -0,0 +1,48 @@
+From 0fa8263367db9287aa0632f96c1a5f93cc478150 Mon Sep 17 00:00:00 2001
+From: Jeff Layton
+Date: Tue, 28 Apr 2020 08:10:22 -0400
+Subject: ceph: fix endianness bug when handling MDS session feature bits
+
+From: Jeff Layton
+
+commit 0fa8263367db9287aa0632f96c1a5f93cc478150 upstream.
+
+Eduard reported a problem mounting cephfs on s390 arch. The feature
+mask sent by the MDS is little-endian, so we need to convert it
+before storing and testing against it.
+
+Cc: stable@vger.kernel.org
+Reported-and-Tested-by: Eduard Shishkin
+Signed-off-by: Jeff Layton
+Reviewed-by: "Yan, Zheng"
+Signed-off-by: Ilya Dryomov
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ fs/ceph/mds_client.c | 8 +++-----
+ 1 file changed, 3 insertions(+), 5 deletions(-)
+
+--- a/fs/ceph/mds_client.c
++++ b/fs/ceph/mds_client.c
+@@ -3072,8 +3072,7 @@ static void handle_session(struct ceph_m
+ 	void *end = p + msg->front.iov_len;
+ 	struct ceph_mds_session_head *h;
+ 	u32 op;
+-	u64 seq;
+-	unsigned long features = 0;
++	u64 seq, features = 0;
+ 	int wake = 0;
+ 	bool blacklisted = false;
+ 
+@@ -3092,9 +3091,8 @@ static void handle_session(struct ceph_m
+ 			goto bad;
+ 		/* version >= 3, feature bits */
+ 		ceph_decode_32_safe(&p, end, len, bad);
+-		ceph_decode_need(&p, end, len, bad);
+-		memcpy(&features, p, min_t(size_t, len, sizeof(features)));
+-		p += len;
++		ceph_decode_64_safe(&p, end, features, bad);
++		p += len - sizeof(features);
+ 	}
+ 
+ 	mutex_lock(&mdsc->mutex);
diff --git a/queue-5.4/coredump-fix-crash-when-umh-is-disabled.patch b/queue-5.4/coredump-fix-crash-when-umh-is-disabled.patch
new file mode 100644
index 00000000000..ef5c0349b15
--- /dev/null
+++ b/queue-5.4/coredump-fix-crash-when-umh-is-disabled.patch
@@ -0,0 +1,123 @@
+From 3740d93e37902b31159a82da2d5c8812ed825404 Mon Sep 17 00:00:00 2001
+From: Luis Chamberlain
+Date: Thu, 16 Apr 2020 16:28:59 +0000
+Subject: coredump: fix crash when umh is disabled
+
+From: Luis Chamberlain
+
+commit 3740d93e37902b31159a82da2d5c8812ed825404 upstream.
+
+Commit 64e90a8acb859 ("Introduce STATIC_USERMODEHELPER to mediate
+call_usermodehelper()") added the option to disable all
+call_usermodehelper() calls by setting STATIC_USERMODEHELPER_PATH to
+an empty string. When this is done, and crashdump is triggered, it
+will crash on null pointer dereference, since we make assumptions
+over what call_usermodehelper_exec() did.
+
+This has been reported by Sergey when one triggers a coredump
+with the following configuration:
+
+```
+CONFIG_STATIC_USERMODEHELPER=y
+CONFIG_STATIC_USERMODEHELPER_PATH=""
+kernel.core_pattern = |/usr/lib/systemd/systemd-coredump %P %u %g %s %t %c %h %e
+```
+
+The way disabling the umh was designed was that call_usermodehelper_exec()
+would just return early, without an error. But coredump assumes
+certain variables are set up for us when this happens, and calls
+file_start_write(cprm.file) with a NULL file.
+
+[ 2.819676] BUG: kernel NULL pointer dereference, address: 0000000000000020
+[ 2.819859] #PF: supervisor read access in kernel mode
+[ 2.820035] #PF: error_code(0x0000) - not-present page
+[ 2.820188] PGD 0 P4D 0
+[ 2.820305] Oops: 0000 [#1] SMP PTI
+[ 2.820436] CPU: 2 PID: 89 Comm: a Not tainted 5.7.0-rc1+ #7
+[ 2.820680] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS ?-20190711_202441-buildvm-armv7-10.arm.fedoraproject.org-2.fc31 04/01/2014
+[ 2.821150] RIP: 0010:do_coredump+0xd80/0x1060
+[ 2.821385] Code: e8 95 11 ed ff 48 c7 c6 cc a7 b4 81 48 8d bd 28 ff
+ff ff 89 c2 e8 70 f1 ff ff 41 89 c2 85 c0 0f 84 72 f7 ff ff e9 b4 fe ff
+ff <48> 8b 57 20 0f b7 02 66 25 00 f0 66 3d 00 8
+0 0f 84 9c 01 00 00 44
+[ 2.822014] RSP: 0000:ffffc9000029bcb8 EFLAGS: 00010246
+[ 2.822339] RAX: 0000000000000000 RBX: ffff88803f860000 RCX: 000000000000000a
+[ 2.822746] RDX: 0000000000000009 RSI: 0000000000000282 RDI: 0000000000000000
+[ 2.823141] RBP: ffffc9000029bde8 R08: 0000000000000000 R09: ffffc9000029bc00
+[ 2.823508] R10: 0000000000000001 R11: ffff88803dec90be R12: ffffffff81c39da0
+[ 2.823902] R13: ffff88803de84400 R14: 0000000000000000 R15: 0000000000000000
+[ 2.824285] FS: 00007fee08183540(0000) GS:ffff88803e480000(0000) knlGS:0000000000000000
+[ 2.824767] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+[ 2.825111] CR2: 0000000000000020 CR3: 000000003f856005 CR4: 0000000000060ea0
+[ 2.825479] Call Trace:
+[ 2.825790] get_signal+0x11e/0x720
+[ 2.826087] do_signal+0x1d/0x670
+[ 2.826361] ? force_sig_info_to_task+0xc1/0xf0
+[ 2.826691] ? force_sig_fault+0x3c/0x40
+[ 2.826996] ? do_trap+0xc9/0x100
+[ 2.827179] exit_to_usermode_loop+0x49/0x90
+[ 2.827359] prepare_exit_to_usermode+0x77/0xb0
+[ 2.827559] ? 
invalid_op+0xa/0x30 +[ 2.827747] ret_from_intr+0x20/0x20 +[ 2.827921] RIP: 0033:0x55e2c76d2129 +[ 2.828107] Code: 2d ff ff ff e8 68 ff ff ff 5d c6 05 18 2f 00 00 01 +c3 0f 1f 80 00 00 00 00 c3 0f 1f 80 00 00 00 00 e9 7b ff ff ff 55 48 89 +e5 <0f> 0b b8 00 00 00 00 5d c3 66 2e 0f 1f 84 0 +0 00 00 00 00 0f 1f 40 +[ 2.828603] RSP: 002b:00007fffeba5e080 EFLAGS: 00010246 +[ 2.828801] RAX: 000055e2c76d2125 RBX: 0000000000000000 RCX: 00007fee0817c718 +[ 2.829034] RDX: 00007fffeba5e188 RSI: 00007fffeba5e178 RDI: 0000000000000001 +[ 2.829257] RBP: 00007fffeba5e080 R08: 0000000000000000 R09: 00007fee08193c00 +[ 2.829482] R10: 0000000000000009 R11: 0000000000000000 R12: 000055e2c76d2040 +[ 2.829727] R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000 +[ 2.829964] CR2: 0000000000000020 +[ 2.830149] ---[ end trace ceed83d8c68a1bf1 ]--- +``` + +Cc: # v4.11+ +Fixes: 64e90a8acb85 ("Introduce STATIC_USERMODEHELPER to mediate call_usermodehelper()") +BugLink: https://bugzilla.kernel.org/show_bug.cgi?id=199795 +Reported-by: Tony Vroon +Reported-by: Sergey Kvachonok +Tested-by: Sergei Trofimovich +Signed-off-by: Luis Chamberlain +Link: https://lore.kernel.org/r/20200416162859.26518-1-mcgrof@kernel.org +Signed-off-by: Greg Kroah-Hartman +Signed-off-by: Greg Kroah-Hartman + +--- + fs/coredump.c | 8 ++++++++ + kernel/umh.c | 5 +++++ + 2 files changed, 13 insertions(+) + +--- a/fs/coredump.c ++++ b/fs/coredump.c +@@ -788,6 +788,14 @@ void do_coredump(const kernel_siginfo_t + if (displaced) + put_files_struct(displaced); + if (!dump_interrupted()) { ++ /* ++ * umh disabled with CONFIG_STATIC_USERMODEHELPER_PATH="" would ++ * have this set to NULL. ++ */ ++ if (!cprm.file) { ++ pr_info("Core dump to |%s disabled\n", cn.corename); ++ goto close_fail; ++ } + file_start_write(cprm.file); + core_dumped = binfmt->core_dump(&cprm); + file_end_write(cprm.file); +--- a/kernel/umh.c ++++ b/kernel/umh.c +@@ -544,6 +544,11 @@ EXPORT_SYMBOL_GPL(fork_usermode_blob); + * Runs a user-space application. The application is started + * asynchronously if wait is not set, and runs as a child of system workqueues. + * (ie. it runs with full root capabilities and optimized affinity). ++ * ++ * Note: successful return value does not guarantee the helper was called at ++ * all. You can't rely on sub_info->{init,cleanup} being called even for ++ * UMH_WAIT_* wait modes as STATIC_USERMODEHELPER_PATH="" turns all helpers ++ * into a successful no-op. + */ + int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait) + { diff --git a/queue-5.4/drm-ingenic-drm-add-module_device_table.patch b/queue-5.4/drm-ingenic-drm-add-module_device_table.patch new file mode 100644 index 00000000000..b7691a07923 --- /dev/null +++ b/queue-5.4/drm-ingenic-drm-add-module_device_table.patch @@ -0,0 +1,33 @@ +From c59359a02d14a7256cd508a4886b7d2012df2363 Mon Sep 17 00:00:00 2001 +From: "H. Nikolaus Schaller" +Date: Mon, 4 May 2020 08:35:12 +0200 +Subject: drm: ingenic-drm: add MODULE_DEVICE_TABLE + +From: H. Nikolaus Schaller + +commit c59359a02d14a7256cd508a4886b7d2012df2363 upstream. + +so that the driver can load by matching the device tree +if compiled as module. + +Cc: stable@vger.kernel.org # v5.3+ +Fixes: 90b86fcc47b4 ("DRM: Add KMS driver for the Ingenic JZ47xx SoCs") +Signed-off-by: H. 
Nikolaus Schaller
+Signed-off-by: Paul Cercueil
+Link: https://patchwork.freedesktop.org/patch/msgid/1694a29b7a3449b6b662cec33d1b33f2ee0b174a.1588574111.git.hns@goldelico.com
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ drivers/gpu/drm/ingenic/ingenic-drm.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/drivers/gpu/drm/ingenic/ingenic-drm.c
++++ b/drivers/gpu/drm/ingenic/ingenic-drm.c
+@@ -824,6 +824,7 @@ static const struct of_device_id ingenic
+ 	{ .compatible = "ingenic,jz4725b-lcd", .data = &jz4725b_soc_info },
+ 	{ /* sentinel */ },
+ };
++MODULE_DEVICE_TABLE(of, ingenic_drm_of_match);
+ 
+ static struct platform_driver ingenic_drm_driver = {
+ 	.driver = {
diff --git a/queue-5.4/epoll-atomically-remove-wait-entry-on-wake-up.patch b/queue-5.4/epoll-atomically-remove-wait-entry-on-wake-up.patch
new file mode 100644
index 00000000000..9fcf0449c1d
--- /dev/null
+++ b/queue-5.4/epoll-atomically-remove-wait-entry-on-wake-up.patch
@@ -0,0 +1,178 @@
+From 412895f03cbf9633298111cb4dfde13b7720e2c5 Mon Sep 17 00:00:00 2001
+From: Roman Penyaev
+Date: Thu, 7 May 2020 18:36:16 -0700
+Subject: epoll: atomically remove wait entry on wake up
+
+From: Roman Penyaev
+
+commit 412895f03cbf9633298111cb4dfde13b7720e2c5 upstream.
+
+This patch does two things:
+
+ - fixes a lost wakeup introduced by commit 339ddb53d373 ("fs/epoll:
+   remove unnecessary wakeups of nested epoll")
+
+ - improves performance for events delivery.
+
+The description of the problem is the following: if N (>1) threads are
+waiting on ep->wq for new events and M (>1) events come, it is quite
+likely that >1 wakeups hit the same wait queue entry, because there is
+quite a big window between __add_wait_queue_exclusive() and the
+following __remove_wait_queue() calls in ep_poll() function.
+
+This can lead to lost wakeups, because the thread that was woken up may
+not handle all of the events in ->rdllist. (The problem is described in
+more detail here: https://lkml.org/lkml/2019/10/7/905)
+
+The idea of the current patch is to use init_wait() instead of
+init_waitqueue_entry().
+
+Internally init_wait() sets autoremove_wake_function as a callback,
+which removes the wait entry atomically (under the wq locks) from the
+list, so the next incoming wakeup hits the next wait entry in the wait
+queue, preventing lost wakeups.
+
+The problem is reliably reproduced by the epoll60 test case [1].
+
+Wait entry removal on wakeup also has performance benefits, because
+there is no need to take ep->lock and remove the wait entry from the
+queue after a successful wakeup.
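+
+For reference, the wake callback that init_wait() installs looks
+roughly like this (autoremove_wake_function() in kernel/sched/wait.c);
+the entry is deleted under the waitqueue lock, atomically with the
+wakeup itself, which is what lets the next wakeup reach the next
+waiter:
+
+    int autoremove_wake_function(struct wait_queue_entry *wq_entry,
+                                 unsigned mode, int sync, void *key)
+    {
+        int ret = default_wake_function(wq_entry, mode, sync, key);
+
+        /* woken successfully: dequeue so later wakeups pick another waiter */
+        if (ret)
+            list_del_init(&wq_entry->entry);
+        return ret;
+    }
+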
Here is the timing output of the epoll60 +test case: + + With explicit wakeup from ep_scan_ready_list() (the state of the + code prior 339ddb53d373): + + real 0m6.970s + user 0m49.786s + sys 0m0.113s + + After this patch: + + real 0m5.220s + user 0m36.879s + sys 0m0.019s + +The other testcase is the stress-epoll [2], where one thread consumes +all the events and other threads produce many events: + + With explicit wakeup from ep_scan_ready_list() (the state of the + code prior 339ddb53d373): + + threads events/ms run-time ms + 8 5427 1474 + 16 6163 2596 + 32 6824 4689 + 64 7060 9064 + 128 6991 18309 + + After this patch: + + threads events/ms run-time ms + 8 5598 1429 + 16 7073 2262 + 32 7502 4265 + 64 7640 8376 + 128 7634 16767 + + (number of "events/ms" represents event bandwidth, thus higher is + better; number of "run-time ms" represents overall time spent + doing the benchmark, thus lower is better) + +[1] tools/testing/selftests/filesystems/epoll/epoll_wakeup_test.c +[2] https://github.com/rouming/test-tools/blob/master/stress-epoll.c + +Signed-off-by: Roman Penyaev +Signed-off-by: Andrew Morton +Reviewed-by: Jason Baron +Cc: Khazhismel Kumykov +Cc: Alexander Viro +Cc: Heiher +Cc: +Link: http://lkml.kernel.org/r/20200430130326.1368509-2-rpenyaev@suse.de +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + fs/eventpoll.c | 43 ++++++++++++++++++++++++------------------- + 1 file changed, 24 insertions(+), 19 deletions(-) + +--- a/fs/eventpoll.c ++++ b/fs/eventpoll.c +@@ -1827,7 +1827,6 @@ static int ep_poll(struct eventpoll *ep, + { + int res = 0, eavail, timed_out = 0; + u64 slack = 0; +- bool waiter = false; + wait_queue_entry_t wait; + ktime_t expires, *to = NULL; + +@@ -1872,21 +1871,23 @@ fetch_events: + */ + ep_reset_busy_poll_napi_id(ep); + +- /* +- * We don't have any available event to return to the caller. We need +- * to sleep here, and we will be woken by ep_poll_callback() when events +- * become available. +- */ +- if (!waiter) { +- waiter = true; +- init_waitqueue_entry(&wait, current); +- ++ do { ++ /* ++ * Internally init_wait() uses autoremove_wake_function(), ++ * thus wait entry is removed from the wait queue on each ++ * wakeup. Why it is important? In case of several waiters ++ * each new wakeup will hit the next waiter, giving it the ++ * chance to harvest new event. Otherwise wakeup can be ++ * lost. This is also good performance-wise, because on ++ * normal wakeup path no need to call __remove_wait_queue() ++ * explicitly, thus ep->lock is not taken, which halts the ++ * event delivery. ++ */ ++ init_wait(&wait); + write_lock_irq(&ep->lock); + __add_wait_queue_exclusive(&ep->wq, &wait); + write_unlock_irq(&ep->lock); +- } + +- for (;;) { + /* + * We don't want to sleep if the ep_poll_callback() sends us + * a wakeup in between. That's why we set the task state +@@ -1916,10 +1917,20 @@ fetch_events: + timed_out = 1; + break; + } +- } ++ ++ /* We were woken up, thus go and try to harvest some events */ ++ eavail = 1; ++ ++ } while (0); + + __set_current_state(TASK_RUNNING); + ++ if (!list_empty_careful(&wait.entry)) { ++ write_lock_irq(&ep->lock); ++ __remove_wait_queue(&ep->wq, &wait); ++ write_unlock_irq(&ep->lock); ++ } ++ + send_events: + /* + * Try to transfer events to user space. 
In case we get 0 events and +@@ -1930,12 +1941,6 @@ send_events: + !(res = ep_send_events(ep, events, maxevents)) && !timed_out) + goto fetch_events; + +- if (waiter) { +- write_lock_irq(&ep->lock); +- __remove_wait_queue(&ep->wq, &wait); +- write_unlock_irq(&ep->lock); +- } +- + return res; + } + diff --git a/queue-5.4/eventpoll-fix-missing-wakeup-for-ovflist-in-ep_poll_callback.patch b/queue-5.4/eventpoll-fix-missing-wakeup-for-ovflist-in-ep_poll_callback.patch new file mode 100644 index 00000000000..ea3e7eaedbd --- /dev/null +++ b/queue-5.4/eventpoll-fix-missing-wakeup-for-ovflist-in-ep_poll_callback.patch @@ -0,0 +1,77 @@ +From 0c54a6a44bf3d41e76ce3f583a6ece267618df2e Mon Sep 17 00:00:00 2001 +From: Khazhismel Kumykov +Date: Thu, 7 May 2020 18:35:59 -0700 +Subject: eventpoll: fix missing wakeup for ovflist in ep_poll_callback + +From: Khazhismel Kumykov + +commit 0c54a6a44bf3d41e76ce3f583a6ece267618df2e upstream. + +In the event that we add to ovflist, before commit 339ddb53d373 +("fs/epoll: remove unnecessary wakeups of nested epoll") we would be +woken up by ep_scan_ready_list, and did no wakeup in ep_poll_callback. + +With that wakeup removed, if we add to ovflist here, we may never wake +up. Rather than adding back the ep_scan_ready_list wakeup - which was +resulting in unnecessary wakeups, trigger a wake-up in ep_poll_callback. + +We noticed that one of our workloads was missing wakeups starting with +339ddb53d373 and upon manual inspection, this wakeup seemed missing to me. +With this patch added, we no longer see missing wakeups. I haven't yet +tried to make a small reproducer, but the existing kselftests in +filesystem/epoll passed for me with this patch. + +[khazhy@google.com: use if/elif instead of goto + cleanup suggested by Roman] + Link: http://lkml.kernel.org/r/20200424190039.192373-1-khazhy@google.com +Fixes: 339ddb53d373 ("fs/epoll: remove unnecessary wakeups of nested epoll") +Signed-off-by: Khazhismel Kumykov +Signed-off-by: Andrew Morton +Reviewed-by: Roman Penyaev +Cc: Alexander Viro +Cc: Roman Penyaev +Cc: Heiher +Cc: Jason Baron +Cc: +Link: http://lkml.kernel.org/r/20200424025057.118641-1-khazhy@google.com +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + fs/eventpoll.c | 18 +++++++++--------- + 1 file changed, 9 insertions(+), 9 deletions(-) + +--- a/fs/eventpoll.c ++++ b/fs/eventpoll.c +@@ -1176,6 +1176,10 @@ static inline bool chain_epi_lockless(st + { + struct eventpoll *ep = epi->ep; + ++ /* Fast preliminary check */ ++ if (epi->next != EP_UNACTIVE_PTR) ++ return false; ++ + /* Check that the same epi has not been just chained from another CPU */ + if (cmpxchg(&epi->next, EP_UNACTIVE_PTR, NULL) != EP_UNACTIVE_PTR) + return false; +@@ -1242,16 +1246,12 @@ static int ep_poll_callback(wait_queue_e + * chained in ep->ovflist and requeued later on. + */ + if (READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR) { +- if (epi->next == EP_UNACTIVE_PTR && +- chain_epi_lockless(epi)) ++ if (chain_epi_lockless(epi)) ++ ep_pm_stay_awake_rcu(epi); ++ } else if (!ep_is_linked(epi)) { ++ /* In the usual case, add event to ready list. 
*/ ++ if (list_add_tail_lockless(&epi->rdllink, &ep->rdllist)) + ep_pm_stay_awake_rcu(epi); +- goto out_unlock; +- } +- +- /* If this file is already in the ready list we exit soon */ +- if (!ep_is_linked(epi) && +- list_add_tail_lockless(&epi->rdllink, &ep->rdllist)) { +- ep_pm_stay_awake_rcu(epi); + } + + /* diff --git a/queue-5.4/iocost-protect-iocg-abs_vdebt-with-iocg-waitq.lock.patch b/queue-5.4/iocost-protect-iocg-abs_vdebt-with-iocg-waitq.lock.patch new file mode 100644 index 00000000000..583091e76bd --- /dev/null +++ b/queue-5.4/iocost-protect-iocg-abs_vdebt-with-iocg-waitq.lock.patch @@ -0,0 +1,300 @@ +From 0b80f9866e6bbfb905140ed8787ff2af03652c0c Mon Sep 17 00:00:00 2001 +From: Tejun Heo +Date: Mon, 4 May 2020 19:27:54 -0400 +Subject: iocost: protect iocg->abs_vdebt with iocg->waitq.lock + +From: Tejun Heo + +commit 0b80f9866e6bbfb905140ed8787ff2af03652c0c upstream. + +abs_vdebt is an atomic_64 which tracks how much over budget a given cgroup +is and controls the activation of use_delay mechanism. Once a cgroup goes +over budget from forced IOs, it has to pay it back with its future budget. +The progress guarantee on debt paying comes from the iocg being active - +active iocgs are processed by the periodic timer, which ensures that as time +passes the debts dissipate and the iocg returns to normal operation. + +However, both iocg activation and vdebt handling are asynchronous and a +sequence like the following may happen. + +1. The iocg is in the process of being deactivated by the periodic timer. + +2. A bio enters ioc_rqos_throttle(), calls iocg_activate() which returns + without anything because it still sees that the iocg is already active. + +3. The iocg is deactivated. + +4. The bio from #2 is over budget but needs to be forced. It increases + abs_vdebt and goes over the threshold and enables use_delay. + +5. IO control is enabled for the iocg's subtree and now IOs are attributed + to the descendant cgroups and the iocg itself no longer issues IOs. + +This leaves the iocg with stuck abs_vdebt - it has debt but inactive and no +further IOs which can activate it. This can end up unduly punishing all the +descendants cgroups. + +The usual throttling path has the same issue - the iocg must be active while +throttled to ensure that future event will wake it up - and solves the +problem by synchronizing the throttling path with a spinlock. abs_vdebt +handling is another form of overage handling and shares a lot of +characteristics including the fact that it isn't in the hottest path. + +This patch fixes the above and other possible races by strictly +synchronizing abs_vdebt and use_delay handling with iocg->waitq.lock. 
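+
+In short, every site that touches abs_vdebt now follows this pattern
+(a sketch distilled from the hunks below, not literal kernel code):
+
+    spin_lock_irq(&iocg->waitq.lock);
+    if (!list_empty(&iocg->active_list)) {
+        /* holding waitq.lock keeps the iocg from being deactivated */
+        iocg->abs_vdebt += abs_cost;    /* plain u64 now, no atomics */
+        iocg_kick_delay(iocg, &now, cost);
+    } else {
+        /* unlikely: iocg already offline, just issue the IO */
+        iocg_commit_bio(iocg, bio, cost);
+    }
+    spin_unlock_irq(&iocg->waitq.lock);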
+ +Signed-off-by: Tejun Heo +Reported-by: Vlad Dmitriev +Cc: stable@vger.kernel.org # v5.4+ +Fixes: e1518f63f246 ("blk-iocost: Don't let merges push vtime into the future") +Signed-off-by: Jens Axboe +Signed-off-by: Greg Kroah-Hartman + +--- + block/blk-iocost.c | 117 ++++++++++++++++++++++++----------------- + tools/cgroup/iocost_monitor.py | 7 ++ + 2 files changed, 77 insertions(+), 47 deletions(-) + +--- a/block/blk-iocost.c ++++ b/block/blk-iocost.c +@@ -469,7 +469,7 @@ struct ioc_gq { + */ + atomic64_t vtime; + atomic64_t done_vtime; +- atomic64_t abs_vdebt; ++ u64 abs_vdebt; + u64 last_vtime; + + /* +@@ -1145,7 +1145,7 @@ static void iocg_kick_waitq(struct ioc_g + struct iocg_wake_ctx ctx = { .iocg = iocg }; + u64 margin_ns = (u64)(ioc->period_us * + WAITQ_TIMER_MARGIN_PCT / 100) * NSEC_PER_USEC; +- u64 abs_vdebt, vdebt, vshortage, expires, oexpires; ++ u64 vdebt, vshortage, expires, oexpires; + s64 vbudget; + u32 hw_inuse; + +@@ -1155,18 +1155,15 @@ static void iocg_kick_waitq(struct ioc_g + vbudget = now->vnow - atomic64_read(&iocg->vtime); + + /* pay off debt */ +- abs_vdebt = atomic64_read(&iocg->abs_vdebt); +- vdebt = abs_cost_to_cost(abs_vdebt, hw_inuse); ++ vdebt = abs_cost_to_cost(iocg->abs_vdebt, hw_inuse); + if (vdebt && vbudget > 0) { + u64 delta = min_t(u64, vbudget, vdebt); + u64 abs_delta = min(cost_to_abs_cost(delta, hw_inuse), +- abs_vdebt); ++ iocg->abs_vdebt); + + atomic64_add(delta, &iocg->vtime); + atomic64_add(delta, &iocg->done_vtime); +- atomic64_sub(abs_delta, &iocg->abs_vdebt); +- if (WARN_ON_ONCE(atomic64_read(&iocg->abs_vdebt) < 0)) +- atomic64_set(&iocg->abs_vdebt, 0); ++ iocg->abs_vdebt -= abs_delta; + } + + /* +@@ -1222,12 +1219,18 @@ static bool iocg_kick_delay(struct ioc_g + u64 expires, oexpires; + u32 hw_inuse; + ++ lockdep_assert_held(&iocg->waitq.lock); ++ + /* debt-adjust vtime */ + current_hweight(iocg, NULL, &hw_inuse); +- vtime += abs_cost_to_cost(atomic64_read(&iocg->abs_vdebt), hw_inuse); ++ vtime += abs_cost_to_cost(iocg->abs_vdebt, hw_inuse); + +- /* clear or maintain depending on the overage */ +- if (time_before_eq64(vtime, now->vnow)) { ++ /* ++ * Clear or maintain depending on the overage. Non-zero vdebt is what ++ * guarantees that @iocg is online and future iocg_kick_delay() will ++ * clear use_delay. Don't leave it on when there's no vdebt. ++ */ ++ if (!iocg->abs_vdebt || time_before_eq64(vtime, now->vnow)) { + blkcg_clear_delay(blkg); + return false; + } +@@ -1261,9 +1264,12 @@ static enum hrtimer_restart iocg_delay_t + { + struct ioc_gq *iocg = container_of(timer, struct ioc_gq, delay_timer); + struct ioc_now now; ++ unsigned long flags; + ++ spin_lock_irqsave(&iocg->waitq.lock, flags); + ioc_now(iocg->ioc, &now); + iocg_kick_delay(iocg, &now, 0); ++ spin_unlock_irqrestore(&iocg->waitq.lock, flags); + + return HRTIMER_NORESTART; + } +@@ -1371,14 +1377,13 @@ static void ioc_timer_fn(struct timer_li + * should have woken up in the last period and expire idle iocgs. 
+	 */
+	list_for_each_entry_safe(iocg, tiocg, &ioc->active_iocgs, active_list) {
+-		if (!waitqueue_active(&iocg->waitq) &&
+-		    !atomic64_read(&iocg->abs_vdebt) && !iocg_is_idle(iocg))
++		if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt &&
++		    !iocg_is_idle(iocg))
+ 			continue;
+ 
+ 		spin_lock(&iocg->waitq.lock);
+ 
+-		if (waitqueue_active(&iocg->waitq) ||
+-		    atomic64_read(&iocg->abs_vdebt)) {
++		if (waitqueue_active(&iocg->waitq) || iocg->abs_vdebt) {
+ 			/* might be oversleeping vtime / hweight changes, kick */
+ 			iocg_kick_waitq(iocg, &now);
+ 			iocg_kick_delay(iocg, &now, 0);
+@@ -1721,28 +1726,49 @@ static void ioc_rqos_throttle(struct rq_
+ 	 * tests are racy but the races aren't systemic - we only miss once
+ 	 * in a while which is fine.
+ 	 */
+-	if (!waitqueue_active(&iocg->waitq) &&
+-	    !atomic64_read(&iocg->abs_vdebt) &&
++	if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt &&
+ 	    time_before_eq64(vtime + cost, now.vnow)) {
+ 		iocg_commit_bio(iocg, bio, cost);
+ 		return;
+ 	}
+ 
+ 	/*
+-	 * We're over budget. If @bio has to be issued regardless,
+-	 * remember the abs_cost instead of advancing vtime.
+-	 * iocg_kick_waitq() will pay off the debt before waking more IOs.
++	 * We activated above but w/o any synchronization. Deactivation is
++	 * synchronized with waitq.lock and we won't get deactivated as long
++	 * as we're waiting or has debt, so we're good if we're activated
++	 * here. In the unlikely case that we aren't, just issue the IO.
++	 */
++	spin_lock_irq(&iocg->waitq.lock);
++
++	if (unlikely(list_empty(&iocg->active_list))) {
++		spin_unlock_irq(&iocg->waitq.lock);
++		iocg_commit_bio(iocg, bio, cost);
++		return;
++	}
++
++	/*
++	 * We're over budget. If @bio has to be issued regardless, remember
++	 * the abs_cost instead of advancing vtime. iocg_kick_waitq() will pay
++	 * off the debt before waking more IOs.
++	 *
+ 	 * This way, the debt is continuously paid off each period with the
+-	 * actual budget available to the cgroup. If we just wound vtime,
+-	 * we would incorrectly use the current hw_inuse for the entire
+-	 * amount which, for example, can lead to the cgroup staying
+-	 * blocked for a long time even with substantially raised hw_inuse.
++	 * actual budget available to the cgroup. If we just wound vtime, we
++	 * would incorrectly use the current hw_inuse for the entire amount
++	 * which, for example, can lead to the cgroup staying blocked for a
++	 * long time even with substantially raised hw_inuse.
++	 *
++	 * An iocg with vdebt should stay online so that the timer can keep
++	 * deducting its vdebt and [de]activate use_delay mechanism
++	 * accordingly. We don't want to race against the timer trying to
++	 * clear them and leave @iocg inactive w/ dangling use_delay heavily
++	 * penalizing the cgroup and its descendants.
+ 	 */
+ 	if (bio_issue_as_root_blkg(bio) || fatal_signal_pending(current)) {
+-		atomic64_add(abs_cost, &iocg->abs_vdebt);
++		iocg->abs_vdebt += abs_cost;
+ 		if (iocg_kick_delay(iocg, &now, cost))
+ 			blkcg_schedule_throttle(rqos->q,
+ 					(bio->bi_opf & REQ_SWAP) == REQ_SWAP);
++		spin_unlock_irq(&iocg->waitq.lock);
+ 		return;
+ 	}
+ 
+@@ -1759,20 +1785,6 @@ static void ioc_rqos_throttle(struct rq_
+ 	 * All waiters are on iocg->waitq and the wait states are
+ 	 * synchronized using waitq.lock.
+ 	 */
+-	spin_lock_irq(&iocg->waitq.lock);
+-
+-	/*
+-	 * We activated above but w/o any synchronization. Deactivation is
+-	 * synchronized with waitq.lock and we won't get deactivated as
+-	 * long as we're waiting, so we're good if we're activated here. 
+- * In the unlikely case that we are deactivated, just issue the IO. +- */ +- if (unlikely(list_empty(&iocg->active_list))) { +- spin_unlock_irq(&iocg->waitq.lock); +- iocg_commit_bio(iocg, bio, cost); +- return; +- } +- + init_waitqueue_func_entry(&wait.wait, iocg_wake_fn); + wait.wait.private = current; + wait.bio = bio; +@@ -1804,6 +1816,7 @@ static void ioc_rqos_merge(struct rq_qos + struct ioc_now now; + u32 hw_inuse; + u64 abs_cost, cost; ++ unsigned long flags; + + /* bypass if disabled or for root cgroup */ + if (!ioc->enabled || !iocg->level) +@@ -1823,15 +1836,28 @@ static void ioc_rqos_merge(struct rq_qos + iocg->cursor = bio_end; + + /* +- * Charge if there's enough vtime budget and the existing request +- * has cost assigned. Otherwise, account it as debt. See debt +- * handling in ioc_rqos_throttle() for details. ++ * Charge if there's enough vtime budget and the existing request has ++ * cost assigned. + */ + if (rq->bio && rq->bio->bi_iocost_cost && +- time_before_eq64(atomic64_read(&iocg->vtime) + cost, now.vnow)) ++ time_before_eq64(atomic64_read(&iocg->vtime) + cost, now.vnow)) { + iocg_commit_bio(iocg, bio, cost); +- else +- atomic64_add(abs_cost, &iocg->abs_vdebt); ++ return; ++ } ++ ++ /* ++ * Otherwise, account it as debt if @iocg is online, which it should ++ * be for the vast majority of cases. See debt handling in ++ * ioc_rqos_throttle() for details. ++ */ ++ spin_lock_irqsave(&iocg->waitq.lock, flags); ++ if (likely(!list_empty(&iocg->active_list))) { ++ iocg->abs_vdebt += abs_cost; ++ iocg_kick_delay(iocg, &now, cost); ++ } else { ++ iocg_commit_bio(iocg, bio, cost); ++ } ++ spin_unlock_irqrestore(&iocg->waitq.lock, flags); + } + + static void ioc_rqos_done_bio(struct rq_qos *rqos, struct bio *bio) +@@ -2001,7 +2027,6 @@ static void ioc_pd_init(struct blkg_poli + iocg->ioc = ioc; + atomic64_set(&iocg->vtime, now.vnow); + atomic64_set(&iocg->done_vtime, now.vnow); +- atomic64_set(&iocg->abs_vdebt, 0); + atomic64_set(&iocg->active_period, atomic64_read(&ioc->cur_period)); + INIT_LIST_HEAD(&iocg->active_list); + iocg->hweight_active = HWEIGHT_WHOLE; +--- a/tools/cgroup/iocost_monitor.py ++++ b/tools/cgroup/iocost_monitor.py +@@ -159,7 +159,12 @@ class IocgStat: + else: + self.inflight_pct = 0 + +- self.debt_ms = iocg.abs_vdebt.counter.value_() / VTIME_PER_USEC / 1000 ++ # vdebt used to be an atomic64_t and is now u64, support both ++ try: ++ self.debt_ms = iocg.abs_vdebt.counter.value_() / VTIME_PER_USEC / 1000 ++ except: ++ self.debt_ms = iocg.abs_vdebt.value_() / VTIME_PER_USEC / 1000 ++ + self.use_delay = blkg.use_delay.counter.value_() + self.delay_ms = blkg.delay_nsec.counter.value_() / 1_000_000 + diff --git a/queue-5.4/ipc-mqueue.c-change-__do_notify-to-bypass-check_kill_permission.patch b/queue-5.4/ipc-mqueue.c-change-__do_notify-to-bypass-check_kill_permission.patch new file mode 100644 index 00000000000..0b7e7469df6 --- /dev/null +++ b/queue-5.4/ipc-mqueue.c-change-__do_notify-to-bypass-check_kill_permission.patch @@ -0,0 +1,149 @@ +From b5f2006144c6ae941726037120fa1001ddede784 Mon Sep 17 00:00:00 2001 +From: Oleg Nesterov +Date: Thu, 7 May 2020 18:35:39 -0700 +Subject: ipc/mqueue.c: change __do_notify() to bypass check_kill_permission() + +From: Oleg Nesterov + +commit b5f2006144c6ae941726037120fa1001ddede784 upstream. + +Commit cc731525f26a ("signal: Remove kernel interal si_code magic") +changed the value of SI_FROMUSER(SI_MESGQ), this means that mq_notify() no +longer works if the sender doesn't have rights to send a signal. 
+ +Change __do_notify() to use do_send_sig_info() instead of kill_pid_info() +to avoid check_kill_permission(). + +This needs the additional notify.sigev_signo != 0 check, shouldn't we +change do_mq_notify() to deny sigev_signo == 0 ? + +Test-case: + + #include + #include + #include + #include + #include + + static int notified; + + static void sigh(int sig) + { + notified = 1; + } + + int main(void) + { + signal(SIGIO, sigh); + + int fd = mq_open("/mq", O_RDWR|O_CREAT, 0666, NULL); + assert(fd >= 0); + + struct sigevent se = { + .sigev_notify = SIGEV_SIGNAL, + .sigev_signo = SIGIO, + }; + assert(mq_notify(fd, &se) == 0); + + if (!fork()) { + assert(setuid(1) == 0); + mq_send(fd, "",1,0); + return 0; + } + + wait(NULL); + mq_unlink("/mq"); + assert(notified); + return 0; + } + +[manfred@colorfullife.com: 1) Add self_exec_id evaluation so that the implementation matches do_notify_parent 2) use PIDTYPE_TGID everywhere] +Fixes: cc731525f26a ("signal: Remove kernel interal si_code magic") +Reported-by: Yoji +Signed-off-by: Oleg Nesterov +Signed-off-by: Manfred Spraul +Signed-off-by: Andrew Morton +Acked-by: "Eric W. Biederman" +Cc: Davidlohr Bueso +Cc: Markus Elfring +Cc: <1vier1@web.de> +Cc: +Link: http://lkml.kernel.org/r/e2a782e4-eab9-4f5c-c749-c07a8f7a4e66@colorfullife.com +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + ipc/mqueue.c | 34 ++++++++++++++++++++++++++-------- + 1 file changed, 26 insertions(+), 8 deletions(-) + +--- a/ipc/mqueue.c ++++ b/ipc/mqueue.c +@@ -82,6 +82,7 @@ struct mqueue_inode_info { + + struct sigevent notify; + struct pid *notify_owner; ++ u32 notify_self_exec_id; + struct user_namespace *notify_user_ns; + struct user_struct *user; /* user who created, for accounting */ + struct sock *notify_sock; +@@ -709,28 +710,44 @@ static void __do_notify(struct mqueue_in + * synchronously. */ + if (info->notify_owner && + info->attr.mq_curmsgs == 1) { +- struct kernel_siginfo sig_i; + switch (info->notify.sigev_notify) { + case SIGEV_NONE: + break; +- case SIGEV_SIGNAL: +- /* sends signal */ ++ case SIGEV_SIGNAL: { ++ struct kernel_siginfo sig_i; ++ struct task_struct *task; ++ ++ /* do_mq_notify() accepts sigev_signo == 0, why?? */ ++ if (!info->notify.sigev_signo) ++ break; + + clear_siginfo(&sig_i); + sig_i.si_signo = info->notify.sigev_signo; + sig_i.si_errno = 0; + sig_i.si_code = SI_MESGQ; + sig_i.si_value = info->notify.sigev_value; +- /* map current pid/uid into info->owner's namespaces */ + rcu_read_lock(); ++ /* map current pid/uid into info->owner's namespaces */ + sig_i.si_pid = task_tgid_nr_ns(current, + ns_of_pid(info->notify_owner)); +- sig_i.si_uid = from_kuid_munged(info->notify_user_ns, current_uid()); ++ sig_i.si_uid = from_kuid_munged(info->notify_user_ns, ++ current_uid()); ++ /* ++ * We can't use kill_pid_info(), this signal should ++ * bypass check_kill_permission(). It is from kernel ++ * but si_fromuser() can't know this. ++ * We do check the self_exec_id, to avoid sending ++ * signals to programs that don't expect them. 
++ */ ++ task = pid_task(info->notify_owner, PIDTYPE_TGID); ++ if (task && task->self_exec_id == ++ info->notify_self_exec_id) { ++ do_send_sig_info(info->notify.sigev_signo, ++ &sig_i, task, PIDTYPE_TGID); ++ } + rcu_read_unlock(); +- +- kill_pid_info(info->notify.sigev_signo, +- &sig_i, info->notify_owner); + break; ++ } + case SIGEV_THREAD: + set_cookie(info->notify_cookie, NOTIFY_WOKENUP); + netlink_sendskb(info->notify_sock, info->notify_cookie); +@@ -1315,6 +1332,7 @@ retry: + info->notify.sigev_signo = notification->sigev_signo; + info->notify.sigev_value = notification->sigev_value; + info->notify.sigev_notify = SIGEV_SIGNAL; ++ info->notify_self_exec_id = current->self_exec_id; + break; + } + diff --git a/queue-5.4/mm-limit-boost_watermark-on-small-zones.patch b/queue-5.4/mm-limit-boost_watermark-on-small-zones.patch new file mode 100644 index 00000000000..eabe7f83f90 --- /dev/null +++ b/queue-5.4/mm-limit-boost_watermark-on-small-zones.patch @@ -0,0 +1,82 @@ +From 14f69140ff9c92a0928547ceefb153a842e8492c Mon Sep 17 00:00:00 2001 +From: Henry Willard +Date: Thu, 7 May 2020 18:36:27 -0700 +Subject: mm: limit boost_watermark on small zones + +From: Henry Willard + +commit 14f69140ff9c92a0928547ceefb153a842e8492c upstream. + +Commit 1c30844d2dfe ("mm: reclaim small amounts of memory when an +external fragmentation event occurs") adds a boost_watermark() function +which increases the min watermark in a zone by at least +pageblock_nr_pages or the number of pages in a page block. + +On Arm64, with 64K pages and 512M huge pages, this is 8192 pages or +512M. It does this regardless of the number of managed pages managed in +the zone or the likelihood of success. + +This can put the zone immediately under water in terms of allocating +pages from the zone, and can cause a small machine to fail immediately +due to OoM. Unlike set_recommended_min_free_kbytes(), which +substantially increases min_free_kbytes and is tied to THP, +boost_watermark() can be called even if THP is not active. + +The problem is most likely to appear on architectures such as Arm64 +where pageblock_nr_pages is very large. + +It is desirable to run the kdump capture kernel in as small a space as +possible to avoid wasting memory. In some architectures, such as Arm64, +there are restrictions on where the capture kernel can run, and +therefore, the space available. A capture kernel running in 768M can +fail due to OoM immediately after boost_watermark() sets the min in zone +DMA32, where most of the memory is, to 512M. It fails even though there +is over 500M of free memory. With boost_watermark() suppressed, the +capture kernel can run successfully in 448M. + +This patch limits boost_watermark() to boosting a zone's min watermark +only when there are enough pages that the boost will produce positive +results. In this case that is estimated to be four times as many pages +as pageblock_nr_pages. + +Mel said: + +: There is no harm in marking it stable. Clearly it does not happen very +: often but it's not impossible. 32-bit x86 is a lot less common now +: which would previously have been vulnerable to triggering this easily. +: ppc64 has a larger base page size but typically only has one zone. +: arm64 is likely the most vulnerable, particularly when CMA is +: configured with a small movable zone. 
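+
+To make the new cutoff concrete with the Arm64 example above (64K base
+pages, 512M pageblocks; illustrative arithmetic, not from the patch):
+
+    pageblock_nr_pages = 512M / 64K = 8192 pages
+    boost cutoff       = 4 * 8192   = 32768 pages (2G)
+
+    768M zone: 12288 managed pages  < 32768 -> boost skipped
+    4G zone:   65536 managed pages >= 32768 -> boost allowed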
+ +Fixes: 1c30844d2dfe ("mm: reclaim small amounts of memory when an external fragmentation event occurs") +Signed-off-by: Henry Willard +Signed-off-by: Andrew Morton +Reviewed-by: David Hildenbrand +Acked-by: Mel Gorman +Cc: Vlastimil Babka +Cc: +Link: http://lkml.kernel.org/r/1588294148-6586-1-git-send-email-henry.willard@oracle.com +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + mm/page_alloc.c | 8 ++++++++ + 1 file changed, 8 insertions(+) + +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -2351,6 +2351,14 @@ static inline void boost_watermark(struc + + if (!watermark_boost_factor) + return; ++ /* ++ * Don't bother in zones that are unlikely to produce results. ++ * On small machines, including kdump capture kernels running ++ * in a small area, boosting the watermark can cause an out of ++ * memory situation immediately. ++ */ ++ if ((pageblock_nr_pages * 4) > zone_managed_pages(zone)) ++ return; + + max_boost = mult_frac(zone->_watermark[WMARK_HIGH], + watermark_boost_factor, 10000); diff --git a/queue-5.4/mm-page_alloc-fix-watchdog-soft-lockups-during-set_zone_contiguous.patch b/queue-5.4/mm-page_alloc-fix-watchdog-soft-lockups-during-set_zone_contiguous.patch new file mode 100644 index 00000000000..5e9c5951118 --- /dev/null +++ b/queue-5.4/mm-page_alloc-fix-watchdog-soft-lockups-during-set_zone_contiguous.patch @@ -0,0 +1,62 @@ +From e84fe99b68ce353c37ceeecc95dce9696c976556 Mon Sep 17 00:00:00 2001 +From: David Hildenbrand +Date: Thu, 7 May 2020 18:35:46 -0700 +Subject: mm/page_alloc: fix watchdog soft lockups during set_zone_contiguous() + +From: David Hildenbrand + +commit e84fe99b68ce353c37ceeecc95dce9696c976556 upstream. + +Without CONFIG_PREEMPT, it can happen that we get soft lockups detected, +e.g., while booting up. + + watchdog: BUG: soft lockup - CPU#0 stuck for 22s! [swapper/0:1] + CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.6.0-next-20200331+ #4 + Hardware name: Red Hat KVM, BIOS 1.11.1-4.module+el8.1.0+4066+0f1aadab 04/01/2014 + RIP: __pageblock_pfn_to_page+0x134/0x1c0 + Call Trace: + set_zone_contiguous+0x56/0x70 + page_alloc_init_late+0x166/0x176 + kernel_init_freeable+0xfa/0x255 + kernel_init+0xa/0x106 + ret_from_fork+0x35/0x40 + +The issue becomes visible when having a lot of memory (e.g., 4TB) +assigned to a single NUMA node - a system that can easily be created +using QEMU. Inside VMs on a hypervisor with quite some memory +overcommit, this is fairly easy to trigger. 
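+
+A sketch of the patched loop in set_zone_contiguous() (see the hunk
+below); the only change is the reschedule point once per pageblock:
+
+    for (; block_start_pfn < zone_end_pfn(zone);
+         block_start_pfn = block_end_pfn,
+         block_end_pfn += pageblock_nr_pages) {
+        block_end_pfn = min(block_end_pfn, zone_end_pfn(zone));
+        if (!__pageblock_pfn_to_page(block_start_pfn,
+                                     block_end_pfn, zone))
+            return;
+        cond_resched();    /* the fix: give the watchdog a chance */
+    }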
+
+Signed-off-by: David Hildenbrand
+Signed-off-by: Andrew Morton
+Reviewed-by: Pavel Tatashin
+Reviewed-by: Pankaj Gupta
+Reviewed-by: Baoquan He
+Reviewed-by: Shile Zhang
+Acked-by: Michal Hocko
+Cc: Kirill Tkhai
+Cc: Shile Zhang
+Cc: Pavel Tatashin
+Cc: Daniel Jordan
+Cc: Michal Hocko
+Cc: Alexander Duyck
+Cc: Baoquan He
+Cc: Oscar Salvador
+Cc:
+Link: http://lkml.kernel.org/r/20200416073417.5003-1-david@redhat.com
+Signed-off-by: Linus Torvalds
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ mm/page_alloc.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/mm/page_alloc.c
++++ b/mm/page_alloc.c
+@@ -1555,6 +1555,7 @@ void set_zone_contiguous(struct zone *zo
+ 		if (!__pageblock_pfn_to_page(block_start_pfn,
+ 					     block_end_pfn, zone))
+ 			return;
++		cond_resched();
+ 	}
+ 
+ 	/* We confirm that there is no hole */
diff --git a/queue-5.4/riscv-set-max_pfn-to-the-pfn-of-the-last-page.patch b/queue-5.4/riscv-set-max_pfn-to-the-pfn-of-the-last-page.patch
new file mode 100644
index 00000000000..1960d4c4d32
--- /dev/null
+++ b/queue-5.4/riscv-set-max_pfn-to-the-pfn-of-the-last-page.patch
@@ -0,0 +1,70 @@
+From c749bb2d554825e007cbc43b791f54e124dadfce Mon Sep 17 00:00:00 2001
+From: Vincent Chen
+Date: Mon, 27 Apr 2020 14:59:24 +0800
+Subject: riscv: set max_pfn to the PFN of the last page
+
+From: Vincent Chen
+
+commit c749bb2d554825e007cbc43b791f54e124dadfce upstream.
+
+Currently, max_pfn equals zero. I found that this prevents users from
+getting some page information through /proc, such as kpagecount, in the
+v5.6 kernel because of new sanity checks. The following message is
+displayed by the stress-ng test suite with the command "stress-ng
+--verbose --physpage 1 -t 1" on a HiFive Unleashed board.
+
+ # stress-ng --verbose --physpage 1 -t 1
+ stress-ng: debug: [109] 4 processors online, 4 processors configured
+ stress-ng: info: [109] dispatching hogs: 1 physpage
+ stress-ng: debug: [109] cache allocate: reducing cache level from L3 (too high) to L0
+ stress-ng: debug: [109] get_cpu_cache: invalid cache_level: 0
+ stress-ng: info: [109] cache allocate: using built-in defaults as no suitable cache found
+ stress-ng: debug: [109] cache allocate: default cache size: 2048K
+ stress-ng: debug: [109] starting stressors
+ stress-ng: debug: [109] 1 stressor spawned
+ stress-ng: debug: [110] stress-ng-physpage: started [110] (instance 0)
+ stress-ng: error: [110] stress-ng-physpage: cannot read page count for address 0x3fd34de000 in /proc/kpagecount, errno=0 (Success)
+ stress-ng: error: [110] stress-ng-physpage: cannot read page count for address 0x3fd32db078 in /proc/kpagecount, errno=0 (Success)
+ ...
+ stress-ng: error: [110] stress-ng-physpage: cannot read page count for address 0x3fd32db078 in /proc/kpagecount, errno=0 (Success)
+ stress-ng: debug: [110] stress-ng-physpage: exited [110] (instance 0)
+ stress-ng: debug: [109] process [110] terminated
+ stress-ng: info: [109] successful run completed in 1.00s
+ #
+
+After applying this patch, the kernel can pass the test.
+ + # stress-ng --verbose --physpage 1 -t 1 + stress-ng: debug: [104] 4 processors online, 4 processors configured stress-ng: info: [104] dispatching hogs: 1 physpage + stress-ng: info: [104] cache allocate: using defaults, can't determine cache details from sysfs + stress-ng: debug: [104] cache allocate: default cache size: 2048K + stress-ng: debug: [104] starting stressors + stress-ng: debug: [104] 1 stressor spawned + stress-ng: debug: [105] stress-ng-physpage: started [105] (instance 0) stress-ng: debug: [105] stress-ng-physpage: exited [105] (instance 0) stress-ng: debug: [104] process [105] terminated + stress-ng: info: [104] successful run completed in 1.01s + # + +Cc: stable@vger.kernel.org +Signed-off-by: Vincent Chen +Reviewed-by: Anup Patel +Reviewed-by: Yash Shah +Tested-by: Yash Shah +Signed-off-by: Palmer Dabbelt +Signed-off-by: Greg Kroah-Hartman + +--- + arch/riscv/mm/init.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +--- a/arch/riscv/mm/init.c ++++ b/arch/riscv/mm/init.c +@@ -116,7 +116,8 @@ void __init setup_bootmem(void) + memblock_reserve(vmlinux_start, vmlinux_end - vmlinux_start); + + set_max_mapnr(PFN_DOWN(mem_size)); +- max_low_pfn = PFN_DOWN(memblock_end_of_DRAM()); ++ max_pfn = PFN_DOWN(memblock_end_of_DRAM()); ++ max_low_pfn = max_pfn; + + #ifdef CONFIG_BLK_DEV_INITRD + setup_initrd(); diff --git a/queue-5.4/series b/queue-5.4/series index a9e44c4f080..444b81594bc 100644 --- a/queue-5.4/series +++ b/queue-5.4/series @@ -52,3 +52,16 @@ kvm-s390-remove-false-warn_on_once-for-the-pqap-instruction.patch kvm-vmx-explicitly-clear-rflags.cf-and-rflags.zf-in-vm-exit-rsb-path.patch kvm-arm-vgic-fix-limit-condition-when-writing-to-gicd_iactiver.patch kvm-arm64-fix-32bit-pc-wrap-around.patch +arm64-hugetlb-avoid-potential-null-dereference.patch +drm-ingenic-drm-add-module_device_table.patch +ipc-mqueue.c-change-__do_notify-to-bypass-check_kill_permission.patch +epoll-atomically-remove-wait-entry-on-wake-up.patch +eventpoll-fix-missing-wakeup-for-ovflist-in-ep_poll_callback.patch +mm-page_alloc-fix-watchdog-soft-lockups-during-set_zone_contiguous.patch +mm-limit-boost_watermark-on-small-zones.patch +ceph-fix-endianness-bug-when-handling-mds-session-feature-bits.patch +ceph-demote-quotarealm-lookup-warning-to-a-debug-message.patch +staging-gasket-check-the-return-value-of-gasket_get_bar_index.patch +coredump-fix-crash-when-umh-is-disabled.patch +riscv-set-max_pfn-to-the-pfn-of-the-last-page.patch +iocost-protect-iocg-abs_vdebt-with-iocg-waitq.lock.patch diff --git a/queue-5.4/staging-gasket-check-the-return-value-of-gasket_get_bar_index.patch b/queue-5.4/staging-gasket-check-the-return-value-of-gasket_get_bar_index.patch new file mode 100644 index 00000000000..6144b7633a9 --- /dev/null +++ b/queue-5.4/staging-gasket-check-the-return-value-of-gasket_get_bar_index.patch @@ -0,0 +1,39 @@ +From 769acc3656d93aaacada814939743361d284fd87 Mon Sep 17 00:00:00 2001 +From: Oscar Carter +Date: Fri, 1 May 2020 17:51:18 +0200 +Subject: staging: gasket: Check the return value of gasket_get_bar_index() + +From: Oscar Carter + +commit 769acc3656d93aaacada814939743361d284fd87 upstream. + +Check the return value of gasket_get_bar_index function as it can return +a negative one (-EINVAL). If this happens, a negative index is used in +the "gasket_dev->bar_data" array. 
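+
+The guarded sequence in do_map_region() then reads, in sketch form
+(mirroring the hunk below):
+
+    bar_index = gasket_get_bar_index(gasket_dev, ...);
+    if (bar_index < 0)
+        /* -EINVAL: never use it to index bar_data[] */
+        return DO_MAP_REGION_INVALID;
+    phys_base = gasket_dev->bar_data[bar_index].phys_base + phys_offset;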
+ +Addresses-Coverity-ID: 1438542 ("Negative array index read") +Fixes: 9a69f5087ccc2 ("drivers/staging: Gasket driver framework + Apex driver") +Signed-off-by: Oscar Carter +Cc: stable +Reviewed-by: Richard Yeh +Link: https://lore.kernel.org/r/20200501155118.13380-1-oscar.carter@gmx.com +Signed-off-by: Greg Kroah-Hartman +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/staging/gasket/gasket_core.c | 4 ++++ + 1 file changed, 4 insertions(+) + +--- a/drivers/staging/gasket/gasket_core.c ++++ b/drivers/staging/gasket/gasket_core.c +@@ -926,6 +926,10 @@ do_map_region(const struct gasket_dev *g + gasket_get_bar_index(gasket_dev, + (vma->vm_pgoff << PAGE_SHIFT) + + driver_desc->legacy_mmap_address_offset); ++ ++ if (bar_index < 0) ++ return DO_MAP_REGION_INVALID; ++ + phys_base = gasket_dev->bar_data[bar_index].phys_base + phys_offset; + while (mapped_bytes < map_length) { + /* -- 2.47.3