git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
5.4-stable patches
author Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Tue, 12 May 2020 11:35:23 +0000 (13:35 +0200)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Tue, 12 May 2020 11:35:23 +0000 (13:35 +0200)
added patches:
arm64-hugetlb-avoid-potential-null-dereference.patch
ceph-demote-quotarealm-lookup-warning-to-a-debug-message.patch
ceph-fix-endianness-bug-when-handling-mds-session-feature-bits.patch
coredump-fix-crash-when-umh-is-disabled.patch
drm-ingenic-drm-add-module_device_table.patch
epoll-atomically-remove-wait-entry-on-wake-up.patch
eventpoll-fix-missing-wakeup-for-ovflist-in-ep_poll_callback.patch
iocost-protect-iocg-abs_vdebt-with-iocg-waitq.lock.patch
ipc-mqueue.c-change-__do_notify-to-bypass-check_kill_permission.patch
mm-limit-boost_watermark-on-small-zones.patch
mm-page_alloc-fix-watchdog-soft-lockups-during-set_zone_contiguous.patch
riscv-set-max_pfn-to-the-pfn-of-the-last-page.patch
staging-gasket-check-the-return-value-of-gasket_get_bar_index.patch

14 files changed:
queue-5.4/arm64-hugetlb-avoid-potential-null-dereference.patch [new file with mode: 0644]
queue-5.4/ceph-demote-quotarealm-lookup-warning-to-a-debug-message.patch [new file with mode: 0644]
queue-5.4/ceph-fix-endianness-bug-when-handling-mds-session-feature-bits.patch [new file with mode: 0644]
queue-5.4/coredump-fix-crash-when-umh-is-disabled.patch [new file with mode: 0644]
queue-5.4/drm-ingenic-drm-add-module_device_table.patch [new file with mode: 0644]
queue-5.4/epoll-atomically-remove-wait-entry-on-wake-up.patch [new file with mode: 0644]
queue-5.4/eventpoll-fix-missing-wakeup-for-ovflist-in-ep_poll_callback.patch [new file with mode: 0644]
queue-5.4/iocost-protect-iocg-abs_vdebt-with-iocg-waitq.lock.patch [new file with mode: 0644]
queue-5.4/ipc-mqueue.c-change-__do_notify-to-bypass-check_kill_permission.patch [new file with mode: 0644]
queue-5.4/mm-limit-boost_watermark-on-small-zones.patch [new file with mode: 0644]
queue-5.4/mm-page_alloc-fix-watchdog-soft-lockups-during-set_zone_contiguous.patch [new file with mode: 0644]
queue-5.4/riscv-set-max_pfn-to-the-pfn-of-the-last-page.patch [new file with mode: 0644]
queue-5.4/series
queue-5.4/staging-gasket-check-the-return-value-of-gasket_get_bar_index.patch [new file with mode: 0644]

diff --git a/queue-5.4/arm64-hugetlb-avoid-potential-null-dereference.patch b/queue-5.4/arm64-hugetlb-avoid-potential-null-dereference.patch
new file mode 100644 (file)
index 0000000..0c40d51
--- /dev/null
@@ -0,0 +1,56 @@
+From 027d0c7101f50cf03aeea9eebf484afd4920c8d3 Mon Sep 17 00:00:00 2001
+From: Mark Rutland <mark.rutland@arm.com>
+Date: Tue, 5 May 2020 13:59:30 +0100
+Subject: arm64: hugetlb: avoid potential NULL dereference
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Mark Rutland <mark.rutland@arm.com>
+
+commit 027d0c7101f50cf03aeea9eebf484afd4920c8d3 upstream.
+
+The static analyzer in GCC 10 spotted that in huge_pte_alloc() we may
+pass a NULL pmdp into pte_alloc_map() when pmd_alloc() returns NULL:
+
+|   CC      arch/arm64/mm/pageattr.o
+|   CC      arch/arm64/mm/hugetlbpage.o
+|                  from arch/arm64/mm/hugetlbpage.c:10:
+| arch/arm64/mm/hugetlbpage.c: In function ‘huge_pte_alloc’:
+| ./arch/arm64/include/asm/pgtable-types.h:28:24: warning: dereference of NULL ‘pmdp’ [CWE-690] [-Wanalyzer-null-dereference]
+| ./arch/arm64/include/asm/pgtable.h:436:26: note: in expansion of macro ‘pmd_val’
+| arch/arm64/mm/hugetlbpage.c:242:10: note: in expansion of macro ‘pte_alloc_map’
+|     |arch/arm64/mm/hugetlbpage.c:232:10:
+|     |./arch/arm64/include/asm/pgtable-types.h:28:24:
+| ./arch/arm64/include/asm/pgtable.h:436:26: note: in expansion of macro ‘pmd_val’
+| arch/arm64/mm/hugetlbpage.c:242:10: note: in expansion of macro ‘pte_alloc_map’
+
+This can only occur when the kernel cannot allocate a page, and so is
+unlikely to happen in practice before other systems start failing.
+
+We can avoid this by bailing out if pmd_alloc() fails, as we do earlier
+in the function if pud_alloc() fails.
+
+Fixes: 66b3923a1a0f ("arm64: hugetlb: add support for PTE contiguous bit")
+Signed-off-by: Mark Rutland <mark.rutland@arm.com>
+Reported-by: Kyrill Tkachov <kyrylo.tkachov@arm.com>
+Cc: <stable@vger.kernel.org> # 4.5.x-
+Cc: Will Deacon <will@kernel.org>
+Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/arm64/mm/hugetlbpage.c |    2 ++
+ 1 file changed, 2 insertions(+)
+
+--- a/arch/arm64/mm/hugetlbpage.c
++++ b/arch/arm64/mm/hugetlbpage.c
+@@ -230,6 +230,8 @@ pte_t *huge_pte_alloc(struct mm_struct *
+               ptep = (pte_t *)pudp;
+       } else if (sz == (CONT_PTE_SIZE)) {
+               pmdp = pmd_alloc(mm, pudp, addr);
++              if (!pmdp)
++                      return NULL;
+               WARN_ON(addr & (sz - 1));
+               /*
diff --git a/queue-5.4/ceph-demote-quotarealm-lookup-warning-to-a-debug-message.patch b/queue-5.4/ceph-demote-quotarealm-lookup-warning-to-a-debug-message.patch
new file mode 100644 (file)
index 0000000..17d5931
--- /dev/null
@@ -0,0 +1,40 @@
+From 12ae44a40a1be891bdc6463f8c7072b4ede746ef Mon Sep 17 00:00:00 2001
+From: Luis Henriques <lhenriques@suse.com>
+Date: Tue, 5 May 2020 13:59:02 +0100
+Subject: ceph: demote quotarealm lookup warning to a debug message
+
+From: Luis Henriques <lhenriques@suse.com>
+
+commit 12ae44a40a1be891bdc6463f8c7072b4ede746ef upstream.
+
+A misconfigured cephx can easily result in having the kernel client
+flooding the logs with:
+
+  ceph: Can't lookup inode 1 (err: -13)
+
+Change this message to debug level.
+
+Cc: stable@vger.kernel.org
+URL: https://tracker.ceph.com/issues/44546
+Signed-off-by: Luis Henriques <lhenriques@suse.com>
+Reviewed-by: Jeff Layton <jlayton@kernel.org>
+Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/ceph/quota.c |    4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/fs/ceph/quota.c
++++ b/fs/ceph/quota.c
+@@ -159,8 +159,8 @@ static struct inode *lookup_quotarealm_i
+       }
+       if (IS_ERR(in)) {
+-              pr_warn("Can't lookup inode %llx (err: %ld)\n",
+-                      realm->ino, PTR_ERR(in));
++              dout("Can't lookup inode %llx (err: %ld)\n",
++                   realm->ino, PTR_ERR(in));
+               qri->timeout = jiffies + msecs_to_jiffies(60 * 1000); /* XXX */
+       } else {
+               qri->timeout = 0;
diff --git a/queue-5.4/ceph-fix-endianness-bug-when-handling-mds-session-feature-bits.patch b/queue-5.4/ceph-fix-endianness-bug-when-handling-mds-session-feature-bits.patch
new file mode 100644 (file)
index 0000000..b1d7cd7
--- /dev/null
@@ -0,0 +1,48 @@
+From 0fa8263367db9287aa0632f96c1a5f93cc478150 Mon Sep 17 00:00:00 2001
+From: Jeff Layton <jlayton@kernel.org>
+Date: Tue, 28 Apr 2020 08:10:22 -0400
+Subject: ceph: fix endianness bug when handling MDS session feature bits
+
+From: Jeff Layton <jlayton@kernel.org>
+
+commit 0fa8263367db9287aa0632f96c1a5f93cc478150 upstream.
+
+Eduard reported a problem mounting cephfs on s390 arch. The feature
+mask sent by the MDS is little-endian, so we need to convert it
+before storing and testing against it.
+
+Cc: stable@vger.kernel.org
+Reported-and-Tested-by: Eduard Shishkin <edward6@linux.ibm.com>
+Signed-off-by: Jeff Layton <jlayton@kernel.org>
+Reviewed-by: "Yan, Zheng" <zyan@redhat.com>
+Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/ceph/mds_client.c |    8 +++-----
+ 1 file changed, 3 insertions(+), 5 deletions(-)
+
+--- a/fs/ceph/mds_client.c
++++ b/fs/ceph/mds_client.c
+@@ -3072,8 +3072,7 @@ static void handle_session(struct ceph_m
+       void *end = p + msg->front.iov_len;
+       struct ceph_mds_session_head *h;
+       u32 op;
+-      u64 seq;
+-      unsigned long features = 0;
++      u64 seq, features = 0;
+       int wake = 0;
+       bool blacklisted = false;
+@@ -3092,9 +3091,8 @@ static void handle_session(struct ceph_m
+                       goto bad;
+               /* version >= 3, feature bits */
+               ceph_decode_32_safe(&p, end, len, bad);
+-              ceph_decode_need(&p, end, len, bad);
+-              memcpy(&features, p, min_t(size_t, len, sizeof(features)));
+-              p += len;
++              ceph_decode_64_safe(&p, end, features, bad);
++              p += len - sizeof(features);
+       }
+       mutex_lock(&mdsc->mutex);
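For illustration only, a minimal user-space sketch of the byte-order problem this fix addresses (the names and the glibc le64toh() call are assumptions for the example, not part of the kernel patch): copying the on-wire little-endian feature mask straight into a host integer only yields the intended bits on little-endian machines; on big-endian s390 the value has to be converted explicitly, which is what the switch to ceph_decode_64_safe() accomplishes.

```
#include <endian.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	/* wire format: feature bits 0 and 2 set, encoded little-endian */
	const unsigned char wire[8] = { 0x05, 0, 0, 0, 0, 0, 0, 0 };
	uint64_t raw, features;

	/* buggy pattern: raw copy, result depends on host byte order */
	memcpy(&raw, wire, sizeof(raw));

	/* fixed pattern: decode the value as little-endian explicitly */
	memcpy(&features, wire, sizeof(features));
	features = le64toh(features);

	printf("raw copy: %#llx  decoded: %#llx\n",
	       (unsigned long long)raw, (unsigned long long)features);
	return 0;
}
```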
diff --git a/queue-5.4/coredump-fix-crash-when-umh-is-disabled.patch b/queue-5.4/coredump-fix-crash-when-umh-is-disabled.patch
new file mode 100644 (file)
index 0000000..ef5c034
--- /dev/null
@@ -0,0 +1,123 @@
+From 3740d93e37902b31159a82da2d5c8812ed825404 Mon Sep 17 00:00:00 2001
+From: Luis Chamberlain <mcgrof@kernel.org>
+Date: Thu, 16 Apr 2020 16:28:59 +0000
+Subject: coredump: fix crash when umh is disabled
+
+From: Luis Chamberlain <mcgrof@kernel.org>
+
+commit 3740d93e37902b31159a82da2d5c8812ed825404 upstream.
+
+Commit 64e90a8acb859 ("Introduce STATIC_USERMODEHELPER to mediate
+call_usermodehelper()") added the option to disable all
+call_usermodehelper() calls by setting STATIC_USERMODEHELPER_PATH to
+an empty string. When this is done, and crashdump is triggered, it
+will crash on a NULL pointer dereference, since we make assumptions
+about what call_usermodehelper_exec() did.
+
+This has been reported by Sergey when one triggers a coredump
+with the following configuration:
+
+```
+CONFIG_STATIC_USERMODEHELPER=y
+CONFIG_STATIC_USERMODEHELPER_PATH=""
+kernel.core_pattern = |/usr/lib/systemd/systemd-coredump %P %u %g %s %t %c %h %e
+```
+
+The way disabling the umh was designed was that call_usermodehelper_exec()
+would just return early, without an error. But coredump assumes
+certain variables are set up for us when this happens, and calls
+file_start_write(cprm.file) with a NULL file.
+
+[    2.819676] BUG: kernel NULL pointer dereference, address: 0000000000000020
+[    2.819859] #PF: supervisor read access in kernel mode
+[    2.820035] #PF: error_code(0x0000) - not-present page
+[    2.820188] PGD 0 P4D 0
+[    2.820305] Oops: 0000 [#1] SMP PTI
+[    2.820436] CPU: 2 PID: 89 Comm: a Not tainted 5.7.0-rc1+ #7
+[    2.820680] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS ?-20190711_202441-buildvm-armv7-10.arm.fedoraproject.org-2.fc31 04/01/2014
+[    2.821150] RIP: 0010:do_coredump+0xd80/0x1060
+[    2.821385] Code: e8 95 11 ed ff 48 c7 c6 cc a7 b4 81 48 8d bd 28 ff
+ff ff 89 c2 e8 70 f1 ff ff 41 89 c2 85 c0 0f 84 72 f7 ff ff e9 b4 fe ff
+ff <48> 8b 57 20 0f b7 02 66 25 00 f0 66 3d 00 8
+0 0f 84 9c 01 00 00 44
+[    2.822014] RSP: 0000:ffffc9000029bcb8 EFLAGS: 00010246
+[    2.822339] RAX: 0000000000000000 RBX: ffff88803f860000 RCX: 000000000000000a
+[    2.822746] RDX: 0000000000000009 RSI: 0000000000000282 RDI: 0000000000000000
+[    2.823141] RBP: ffffc9000029bde8 R08: 0000000000000000 R09: ffffc9000029bc00
+[    2.823508] R10: 0000000000000001 R11: ffff88803dec90be R12: ffffffff81c39da0
+[    2.823902] R13: ffff88803de84400 R14: 0000000000000000 R15: 0000000000000000
+[    2.824285] FS:  00007fee08183540(0000) GS:ffff88803e480000(0000) knlGS:0000000000000000
+[    2.824767] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+[    2.825111] CR2: 0000000000000020 CR3: 000000003f856005 CR4: 0000000000060ea0
+[    2.825479] Call Trace:
+[    2.825790]  get_signal+0x11e/0x720
+[    2.826087]  do_signal+0x1d/0x670
+[    2.826361]  ? force_sig_info_to_task+0xc1/0xf0
+[    2.826691]  ? force_sig_fault+0x3c/0x40
+[    2.826996]  ? do_trap+0xc9/0x100
+[    2.827179]  exit_to_usermode_loop+0x49/0x90
+[    2.827359]  prepare_exit_to_usermode+0x77/0xb0
+[    2.827559]  ? invalid_op+0xa/0x30
+[    2.827747]  ret_from_intr+0x20/0x20
+[    2.827921] RIP: 0033:0x55e2c76d2129
+[    2.828107] Code: 2d ff ff ff e8 68 ff ff ff 5d c6 05 18 2f 00 00 01
+c3 0f 1f 80 00 00 00 00 c3 0f 1f 80 00 00 00 00 e9 7b ff ff ff 55 48 89
+e5 <0f> 0b b8 00 00 00 00 5d c3 66 2e 0f 1f 84 0
+0 00 00 00 00 0f 1f 40
+[    2.828603] RSP: 002b:00007fffeba5e080 EFLAGS: 00010246
+[    2.828801] RAX: 000055e2c76d2125 RBX: 0000000000000000 RCX: 00007fee0817c718
+[    2.829034] RDX: 00007fffeba5e188 RSI: 00007fffeba5e178 RDI: 0000000000000001
+[    2.829257] RBP: 00007fffeba5e080 R08: 0000000000000000 R09: 00007fee08193c00
+[    2.829482] R10: 0000000000000009 R11: 0000000000000000 R12: 000055e2c76d2040
+[    2.829727] R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000
+[    2.829964] CR2: 0000000000000020
+[    2.830149] ---[ end trace ceed83d8c68a1bf1 ]---
+```
+
+Cc: <stable@vger.kernel.org> # v4.11+
+Fixes: 64e90a8acb85 ("Introduce STATIC_USERMODEHELPER to mediate call_usermodehelper()")
+BugLink: https://bugzilla.kernel.org/show_bug.cgi?id=199795
+Reported-by: Tony Vroon <chainsaw@gentoo.org>
+Reported-by: Sergey Kvachonok <ravenexp@gmail.com>
+Tested-by: Sergei Trofimovich <slyfox@gentoo.org>
+Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
+Link: https://lore.kernel.org/r/20200416162859.26518-1-mcgrof@kernel.org
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/coredump.c |    8 ++++++++
+ kernel/umh.c  |    5 +++++
+ 2 files changed, 13 insertions(+)
+
+--- a/fs/coredump.c
++++ b/fs/coredump.c
+@@ -788,6 +788,14 @@ void do_coredump(const kernel_siginfo_t
+       if (displaced)
+               put_files_struct(displaced);
+       if (!dump_interrupted()) {
++              /*
++               * umh disabled with CONFIG_STATIC_USERMODEHELPER_PATH="" would
++               * have this set to NULL.
++               */
++              if (!cprm.file) {
++                      pr_info("Core dump to |%s disabled\n", cn.corename);
++                      goto close_fail;
++              }
+               file_start_write(cprm.file);
+               core_dumped = binfmt->core_dump(&cprm);
+               file_end_write(cprm.file);
+--- a/kernel/umh.c
++++ b/kernel/umh.c
+@@ -544,6 +544,11 @@ EXPORT_SYMBOL_GPL(fork_usermode_blob);
+  * Runs a user-space application.  The application is started
+  * asynchronously if wait is not set, and runs as a child of system workqueues.
+  * (ie. it runs with full root capabilities and optimized affinity).
++ *
++ * Note: successful return value does not guarantee the helper was called at
++ * all. You can't rely on sub_info->{init,cleanup} being called even for
++ * UMH_WAIT_* wait modes as STATIC_USERMODEHELPER_PATH="" turns all helpers
++ * into a successful no-op.
+  */
+ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
+ {
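As a rough user-space analog of the failure mode (hypothetical names, not the kernel code): a helper that has been compiled into a no-op can report success without producing the resource, so the caller has to check the handle before using it, just as do_coredump() now checks cprm.file.

```
#include <stdio.h>

struct dump_ctx { FILE *file; };

/* Stand-in for the umh pipe setup: with the helper disabled it is a
 * successful no-op and leaves ctx->file NULL. */
static int spawn_dump_helper(struct dump_ctx *ctx, int helper_enabled)
{
	if (!helper_enabled)
		return 0;		/* "success", but nothing was started */
	ctx->file = stdout;		/* pretend the pipe was created */
	return 0;
}

int main(void)
{
	struct dump_ctx ctx = { 0 };

	if (spawn_dump_helper(&ctx, 0) == 0) {
		if (!ctx.file) {	/* the guard the patch adds */
			fprintf(stderr, "core dump helper disabled, skipping\n");
			return 0;
		}
		fputs("core data\n", ctx.file);
	}
	return 0;
}
```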
diff --git a/queue-5.4/drm-ingenic-drm-add-module_device_table.patch b/queue-5.4/drm-ingenic-drm-add-module_device_table.patch
new file mode 100644 (file)
index 0000000..b7691a0
--- /dev/null
@@ -0,0 +1,33 @@
+From c59359a02d14a7256cd508a4886b7d2012df2363 Mon Sep 17 00:00:00 2001
+From: "H. Nikolaus Schaller" <hns@goldelico.com>
+Date: Mon, 4 May 2020 08:35:12 +0200
+Subject: drm: ingenic-drm: add MODULE_DEVICE_TABLE
+
+From: H. Nikolaus Schaller <hns@goldelico.com>
+
+commit c59359a02d14a7256cd508a4886b7d2012df2363 upstream.
+
+so that the driver can load by matching the device tree
+if compiled as module.
+
+Cc: stable@vger.kernel.org # v5.3+
+Fixes: 90b86fcc47b4 ("DRM: Add KMS driver for the Ingenic JZ47xx SoCs")
+Signed-off-by: H. Nikolaus Schaller <hns@goldelico.com>
+Signed-off-by: Paul Cercueil <paul@crapouillou.net>
+Link: https://patchwork.freedesktop.org/patch/msgid/1694a29b7a3449b6b662cec33d1b33f2ee0b174a.1588574111.git.hns@goldelico.com
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/gpu/drm/ingenic/ingenic-drm.c |    1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/drivers/gpu/drm/ingenic/ingenic-drm.c
++++ b/drivers/gpu/drm/ingenic/ingenic-drm.c
+@@ -824,6 +824,7 @@ static const struct of_device_id ingenic
+       { .compatible = "ingenic,jz4725b-lcd", .data = &jz4725b_soc_info },
+       { /* sentinel */ },
+ };
++MODULE_DEVICE_TABLE(of, ingenic_drm_of_match);
+ static struct platform_driver ingenic_drm_driver = {
+       .driver = {
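For background, a kernel-style sketch (hypothetical driver name and compatible string, not the ingenic-drm source) of what MODULE_DEVICE_TABLE(of, ...) buys: it records the OF match table as module alias information, so udev/modprobe can auto-load the module when the device tree exposes a matching node.

```
#include <linux/module.h>
#include <linux/of.h>
#include <linux/platform_device.h>

static const struct of_device_id example_of_match[] = {
	{ .compatible = "vendor,example-lcd" },	/* hypothetical */
	{ /* sentinel */ },
};
/* Without this line the built-in match table still works, but the module
 * gets no "of:..." alias and is never auto-loaded when built as a module. */
MODULE_DEVICE_TABLE(of, example_of_match);

static struct platform_driver example_driver = {
	.driver = {
		.name = "example-lcd",
		.of_match_table = example_of_match,
	},
};
module_platform_driver(example_driver);

MODULE_LICENSE("GPL");
```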
diff --git a/queue-5.4/epoll-atomically-remove-wait-entry-on-wake-up.patch b/queue-5.4/epoll-atomically-remove-wait-entry-on-wake-up.patch
new file mode 100644 (file)
index 0000000..9fcf044
--- /dev/null
@@ -0,0 +1,178 @@
+From 412895f03cbf9633298111cb4dfde13b7720e2c5 Mon Sep 17 00:00:00 2001
+From: Roman Penyaev <rpenyaev@suse.de>
+Date: Thu, 7 May 2020 18:36:16 -0700
+Subject: epoll: atomically remove wait entry on wake up
+
+From: Roman Penyaev <rpenyaev@suse.de>
+
+commit 412895f03cbf9633298111cb4dfde13b7720e2c5 upstream.
+
+This patch does two things:
+
+ - fixes a lost wakeup introduced by commit 339ddb53d373 ("fs/epoll:
+   remove unnecessary wakeups of nested epoll")
+
+ - improves performance for events delivery.
+
+The description of the problem is the following: if N (>1) threads are
+waiting on ep->wq for new events and M (>1) events come, it is quite
+likely that >1 wakeups hit the same wait queue entry, because there is
+quite a big window between __add_wait_queue_exclusive() and the
+following __remove_wait_queue() calls in ep_poll() function.
+
+This can lead to lost wakeups, because the thread that was woken up may
+not handle all the events in ->rdllist.  (The problem is described in
+more detail here: https://lkml.org/lkml/2019/10/7/905)
+
+The idea of the current patch is to use init_wait() instead of
+init_waitqueue_entry().
+
+Internally init_wait() sets autoremove_wake_function as a callback,
+which removes the wait entry atomically (under the wq locks) from the
+list, thus the next coming wakeup hits the next wait entry in the wait
+queue, thus preventing lost wakeups.
+
+Problem is very well reproduced by the epoll60 test case [1].
+
+Wait entry removal on wakeup also has performance benefits, because
+there is no need to take ep->lock and remove the wait entry from the
+queue after a successful wakeup.  Here is the timing output of the epoll60
+test case:
+
+  With explicit wakeup from ep_scan_ready_list() (the state of the
+  code prior 339ddb53d373):
+
+    real    0m6.970s
+    user    0m49.786s
+    sys     0m0.113s
+
+ After this patch:
+
+   real    0m5.220s
+   user    0m36.879s
+   sys     0m0.019s
+
+The other testcase is the stress-epoll [2], where one thread consumes
+all the events and other threads produce many events:
+
+  With explicit wakeup from ep_scan_ready_list() (the state of the
+  code prior 339ddb53d373):
+
+    threads  events/ms  run-time ms
+          8       5427         1474
+         16       6163         2596
+         32       6824         4689
+         64       7060         9064
+        128       6991        18309
+
+ After this patch:
+
+    threads  events/ms  run-time ms
+          8       5598         1429
+         16       7073         2262
+         32       7502         4265
+         64       7640         8376
+        128       7634        16767
+
+ (number of "events/ms" represents event bandwidth, thus higher is
+  better; number of "run-time ms" represents overall time spent
+  doing the benchmark, thus lower is better)
+
+[1] tools/testing/selftests/filesystems/epoll/epoll_wakeup_test.c
+[2] https://github.com/rouming/test-tools/blob/master/stress-epoll.c
+
+Signed-off-by: Roman Penyaev <rpenyaev@suse.de>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Reviewed-by: Jason Baron <jbaron@akamai.com>
+Cc: Khazhismel Kumykov <khazhy@google.com>
+Cc: Alexander Viro <viro@zeniv.linux.org.uk>
+Cc: Heiher <r@hev.cc>
+Cc: <stable@vger.kernel.org>
+Link: http://lkml.kernel.org/r/20200430130326.1368509-2-rpenyaev@suse.de
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/eventpoll.c |   43 ++++++++++++++++++++++++-------------------
+ 1 file changed, 24 insertions(+), 19 deletions(-)
+
+--- a/fs/eventpoll.c
++++ b/fs/eventpoll.c
+@@ -1827,7 +1827,6 @@ static int ep_poll(struct eventpoll *ep,
+ {
+       int res = 0, eavail, timed_out = 0;
+       u64 slack = 0;
+-      bool waiter = false;
+       wait_queue_entry_t wait;
+       ktime_t expires, *to = NULL;
+@@ -1872,21 +1871,23 @@ fetch_events:
+        */
+       ep_reset_busy_poll_napi_id(ep);
+-      /*
+-       * We don't have any available event to return to the caller.  We need
+-       * to sleep here, and we will be woken by ep_poll_callback() when events
+-       * become available.
+-       */
+-      if (!waiter) {
+-              waiter = true;
+-              init_waitqueue_entry(&wait, current);
+-
++      do {
++              /*
++               * Internally init_wait() uses autoremove_wake_function(),
++               * thus wait entry is removed from the wait queue on each
++               * wakeup. Why it is important? In case of several waiters
++               * each new wakeup will hit the next waiter, giving it the
++               * chance to harvest new event. Otherwise wakeup can be
++               * lost. This is also good performance-wise, because on
++               * normal wakeup path no need to call __remove_wait_queue()
++               * explicitly, thus ep->lock is not taken, which halts the
++               * event delivery.
++               */
++              init_wait(&wait);
+               write_lock_irq(&ep->lock);
+               __add_wait_queue_exclusive(&ep->wq, &wait);
+               write_unlock_irq(&ep->lock);
+-      }
+-      for (;;) {
+               /*
+                * We don't want to sleep if the ep_poll_callback() sends us
+                * a wakeup in between. That's why we set the task state
+@@ -1916,10 +1917,20 @@ fetch_events:
+                       timed_out = 1;
+                       break;
+               }
+-      }
++
++              /* We were woken up, thus go and try to harvest some events */
++              eavail = 1;
++
++      } while (0);
+       __set_current_state(TASK_RUNNING);
++      if (!list_empty_careful(&wait.entry)) {
++              write_lock_irq(&ep->lock);
++              __remove_wait_queue(&ep->wq, &wait);
++              write_unlock_irq(&ep->lock);
++      }
++
+ send_events:
+       /*
+        * Try to transfer events to user space. In case we get 0 events and
+@@ -1930,12 +1941,6 @@ send_events:
+           !(res = ep_send_events(ep, events, maxevents)) && !timed_out)
+               goto fetch_events;
+-      if (waiter) {
+-              write_lock_irq(&ep->lock);
+-              __remove_wait_queue(&ep->wq, &wait);
+-              write_unlock_irq(&ep->lock);
+-      }
+-
+       return res;
+ }
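For reference, a simplified kernel-style sketch of the wait pattern this patch switches to (generic wait helpers, not the actual fs/eventpoll.c code, which manipulates ep->wq under ep->lock directly): init_wait() installs autoremove_wake_function(), so each wake-up removes the entry under the queue lock and the next wake-up reaches the next exclusive waiter instead of being lost.

```
#include <linux/errno.h>
#include <linux/sched/signal.h>
#include <linux/types.h>
#include <linux/wait.h>

/* Sleep until cond() is true; each wake-up auto-removes the entry, so
 * subsequent wake-ups reach the other exclusive waiters. */
static int wait_for_condition(wait_queue_head_t *wq, bool (*cond)(void))
{
	wait_queue_entry_t wait;

	init_wait(&wait);	/* uses autoremove_wake_function() */
	for (;;) {
		prepare_to_wait_exclusive(wq, &wait, TASK_INTERRUPTIBLE);
		if (cond())
			break;
		if (signal_pending(current)) {
			finish_wait(wq, &wait);
			return -ERESTARTSYS;
		}
		schedule();
	}
	finish_wait(wq, &wait);	/* removes the entry if still queued */
	return 0;
}
```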
diff --git a/queue-5.4/eventpoll-fix-missing-wakeup-for-ovflist-in-ep_poll_callback.patch b/queue-5.4/eventpoll-fix-missing-wakeup-for-ovflist-in-ep_poll_callback.patch
new file mode 100644 (file)
index 0000000..ea3e7ea
--- /dev/null
@@ -0,0 +1,77 @@
+From 0c54a6a44bf3d41e76ce3f583a6ece267618df2e Mon Sep 17 00:00:00 2001
+From: Khazhismel Kumykov <khazhy@google.com>
+Date: Thu, 7 May 2020 18:35:59 -0700
+Subject: eventpoll: fix missing wakeup for ovflist in ep_poll_callback
+
+From: Khazhismel Kumykov <khazhy@google.com>
+
+commit 0c54a6a44bf3d41e76ce3f583a6ece267618df2e upstream.
+
+In the event that we add to ovflist, before commit 339ddb53d373
+("fs/epoll: remove unnecessary wakeups of nested epoll") we would be
+woken up by ep_scan_ready_list, and did no wakeup in ep_poll_callback.
+
+With that wakeup removed, if we add to ovflist here, we may never wake
+up.  Rather than adding back the ep_scan_ready_list wakeup - which was
+resulting in unnecessary wakeups, trigger a wake-up in ep_poll_callback.
+
+We noticed that one of our workloads was missing wakeups starting with
+339ddb53d373 and upon manual inspection, this wakeup seemed missing to me.
+With this patch added, we no longer see missing wakeups.  I haven't yet
+tried to make a small reproducer, but the existing kselftests in
+filesystem/epoll passed for me with this patch.
+
+[khazhy@google.com: use if/elif instead of goto + cleanup suggested by Roman]
+  Link: http://lkml.kernel.org/r/20200424190039.192373-1-khazhy@google.com
+Fixes: 339ddb53d373 ("fs/epoll: remove unnecessary wakeups of nested epoll")
+Signed-off-by: Khazhismel Kumykov <khazhy@google.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Reviewed-by: Roman Penyaev <rpenyaev@suse.de>
+Cc: Alexander Viro <viro@zeniv.linux.org.uk>
+Cc: Roman Penyaev <rpenyaev@suse.de>
+Cc: Heiher <r@hev.cc>
+Cc: Jason Baron <jbaron@akamai.com>
+Cc: <stable@vger.kernel.org>
+Link: http://lkml.kernel.org/r/20200424025057.118641-1-khazhy@google.com
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/eventpoll.c |   18 +++++++++---------
+ 1 file changed, 9 insertions(+), 9 deletions(-)
+
+--- a/fs/eventpoll.c
++++ b/fs/eventpoll.c
+@@ -1176,6 +1176,10 @@ static inline bool chain_epi_lockless(st
+ {
+       struct eventpoll *ep = epi->ep;
++      /* Fast preliminary check */
++      if (epi->next != EP_UNACTIVE_PTR)
++              return false;
++
+       /* Check that the same epi has not been just chained from another CPU */
+       if (cmpxchg(&epi->next, EP_UNACTIVE_PTR, NULL) != EP_UNACTIVE_PTR)
+               return false;
+@@ -1242,16 +1246,12 @@ static int ep_poll_callback(wait_queue_e
+        * chained in ep->ovflist and requeued later on.
+        */
+       if (READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR) {
+-              if (epi->next == EP_UNACTIVE_PTR &&
+-                  chain_epi_lockless(epi))
++              if (chain_epi_lockless(epi))
++                      ep_pm_stay_awake_rcu(epi);
++      } else if (!ep_is_linked(epi)) {
++              /* In the usual case, add event to ready list. */
++              if (list_add_tail_lockless(&epi->rdllink, &ep->rdllist))
+                       ep_pm_stay_awake_rcu(epi);
+-              goto out_unlock;
+-      }
+-
+-      /* If this file is already in the ready list we exit soon */
+-      if (!ep_is_linked(epi) &&
+-          list_add_tail_lockless(&epi->rdllink, &ep->rdllist)) {
+-              ep_pm_stay_awake_rcu(epi);
+       }
+       /*
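As a side note on the refactor above, a small user-space sketch (C11 atomics, hypothetical names) of the "fast preliminary check" now done inside chain_epi_lockless(): a plain read filters out the common already-chained case before paying for the cmpxchg.

```
#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>

#define UNACTIVE_PTR ((void *)-1L)	/* stands in for EP_UNACTIVE_PTR */

static bool try_chain(_Atomic(void *) *next)
{
	void *expected = UNACTIVE_PTR;

	/* fast preliminary check: already chained by another CPU */
	if (atomic_load_explicit(next, memory_order_relaxed) != UNACTIVE_PTR)
		return false;

	/* slow path: claim the entry; losing the race is fine */
	return atomic_compare_exchange_strong(next, &expected, NULL);
}
```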
diff --git a/queue-5.4/iocost-protect-iocg-abs_vdebt-with-iocg-waitq.lock.patch b/queue-5.4/iocost-protect-iocg-abs_vdebt-with-iocg-waitq.lock.patch
new file mode 100644 (file)
index 0000000..583091e
--- /dev/null
@@ -0,0 +1,300 @@
+From 0b80f9866e6bbfb905140ed8787ff2af03652c0c Mon Sep 17 00:00:00 2001
+From: Tejun Heo <tj@kernel.org>
+Date: Mon, 4 May 2020 19:27:54 -0400
+Subject: iocost: protect iocg->abs_vdebt with iocg->waitq.lock
+
+From: Tejun Heo <tj@kernel.org>
+
+commit 0b80f9866e6bbfb905140ed8787ff2af03652c0c upstream.
+
+abs_vdebt is an atomic64_t which tracks how much over budget a given cgroup
+is and controls the activation of use_delay mechanism. Once a cgroup goes
+over budget from forced IOs, it has to pay it back with its future budget.
+The progress guarantee on debt paying comes from the iocg being active -
+active iocgs are processed by the periodic timer, which ensures that as time
+passes the debts dissipate and the iocg returns to normal operation.
+
+However, both iocg activation and vdebt handling are asynchronous and a
+sequence like the following may happen.
+
+1. The iocg is in the process of being deactivated by the periodic timer.
+
+2. A bio enters ioc_rqos_throttle(), calls iocg_activate() which returns
+   without anything because it still sees that the iocg is already active.
+
+3. The iocg is deactivated.
+
+4. The bio from #2 is over budget but needs to be forced. It increases
+   abs_vdebt and goes over the threshold and enables use_delay.
+
+5. IO control is enabled for the iocg's subtree and now IOs are attributed
+   to the descendant cgroups and the iocg itself no longer issues IOs.
+
+This leaves the iocg with stuck abs_vdebt - it has debt but inactive and no
+further IOs which can activate it. This can end up unduly punishing all the
+descendants cgroups.
+
+The usual throttling path has the same issue - the iocg must be active while
+throttled to ensure that a future event will wake it up - and solves the
+problem by synchronizing the throttling path with a spinlock. abs_vdebt
+handling is another form of overage handling and shares a lot of
+characteristics including the fact that it isn't in the hottest path.
+
+This patch fixes the above and other possible races by strictly
+synchronizing abs_vdebt and use_delay handling with iocg->waitq.lock.
+
+Signed-off-by: Tejun Heo <tj@kernel.org>
+Reported-by: Vlad Dmitriev <vvd@fb.com>
+Cc: stable@vger.kernel.org # v5.4+
+Fixes: e1518f63f246 ("blk-iocost: Don't let merges push vtime into the future")
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ block/blk-iocost.c             |  117 ++++++++++++++++++++++++-----------------
+ tools/cgroup/iocost_monitor.py |    7 ++
+ 2 files changed, 77 insertions(+), 47 deletions(-)
+
+--- a/block/blk-iocost.c
++++ b/block/blk-iocost.c
+@@ -469,7 +469,7 @@ struct ioc_gq {
+        */
+       atomic64_t                      vtime;
+       atomic64_t                      done_vtime;
+-      atomic64_t                      abs_vdebt;
++      u64                             abs_vdebt;
+       u64                             last_vtime;
+       /*
+@@ -1145,7 +1145,7 @@ static void iocg_kick_waitq(struct ioc_g
+       struct iocg_wake_ctx ctx = { .iocg = iocg };
+       u64 margin_ns = (u64)(ioc->period_us *
+                             WAITQ_TIMER_MARGIN_PCT / 100) * NSEC_PER_USEC;
+-      u64 abs_vdebt, vdebt, vshortage, expires, oexpires;
++      u64 vdebt, vshortage, expires, oexpires;
+       s64 vbudget;
+       u32 hw_inuse;
+@@ -1155,18 +1155,15 @@ static void iocg_kick_waitq(struct ioc_g
+       vbudget = now->vnow - atomic64_read(&iocg->vtime);
+       /* pay off debt */
+-      abs_vdebt = atomic64_read(&iocg->abs_vdebt);
+-      vdebt = abs_cost_to_cost(abs_vdebt, hw_inuse);
++      vdebt = abs_cost_to_cost(iocg->abs_vdebt, hw_inuse);
+       if (vdebt && vbudget > 0) {
+               u64 delta = min_t(u64, vbudget, vdebt);
+               u64 abs_delta = min(cost_to_abs_cost(delta, hw_inuse),
+-                                  abs_vdebt);
++                                  iocg->abs_vdebt);
+               atomic64_add(delta, &iocg->vtime);
+               atomic64_add(delta, &iocg->done_vtime);
+-              atomic64_sub(abs_delta, &iocg->abs_vdebt);
+-              if (WARN_ON_ONCE(atomic64_read(&iocg->abs_vdebt) < 0))
+-                      atomic64_set(&iocg->abs_vdebt, 0);
++              iocg->abs_vdebt -= abs_delta;
+       }
+       /*
+@@ -1222,12 +1219,18 @@ static bool iocg_kick_delay(struct ioc_g
+       u64 expires, oexpires;
+       u32 hw_inuse;
++      lockdep_assert_held(&iocg->waitq.lock);
++
+       /* debt-adjust vtime */
+       current_hweight(iocg, NULL, &hw_inuse);
+-      vtime += abs_cost_to_cost(atomic64_read(&iocg->abs_vdebt), hw_inuse);
++      vtime += abs_cost_to_cost(iocg->abs_vdebt, hw_inuse);
+-      /* clear or maintain depending on the overage */
+-      if (time_before_eq64(vtime, now->vnow)) {
++      /*
++       * Clear or maintain depending on the overage. Non-zero vdebt is what
++       * guarantees that @iocg is online and future iocg_kick_delay() will
++       * clear use_delay. Don't leave it on when there's no vdebt.
++       */
++      if (!iocg->abs_vdebt || time_before_eq64(vtime, now->vnow)) {
+               blkcg_clear_delay(blkg);
+               return false;
+       }
+@@ -1261,9 +1264,12 @@ static enum hrtimer_restart iocg_delay_t
+ {
+       struct ioc_gq *iocg = container_of(timer, struct ioc_gq, delay_timer);
+       struct ioc_now now;
++      unsigned long flags;
++      spin_lock_irqsave(&iocg->waitq.lock, flags);
+       ioc_now(iocg->ioc, &now);
+       iocg_kick_delay(iocg, &now, 0);
++      spin_unlock_irqrestore(&iocg->waitq.lock, flags);
+       return HRTIMER_NORESTART;
+ }
+@@ -1371,14 +1377,13 @@ static void ioc_timer_fn(struct timer_li
+        * should have woken up in the last period and expire idle iocgs.
+        */
+       list_for_each_entry_safe(iocg, tiocg, &ioc->active_iocgs, active_list) {
+-              if (!waitqueue_active(&iocg->waitq) &&
+-                  !atomic64_read(&iocg->abs_vdebt) && !iocg_is_idle(iocg))
++              if (!waitqueue_active(&iocg->waitq) && iocg->abs_vdebt &&
++                  !iocg_is_idle(iocg))
+                       continue;
+               spin_lock(&iocg->waitq.lock);
+-              if (waitqueue_active(&iocg->waitq) ||
+-                  atomic64_read(&iocg->abs_vdebt)) {
++              if (waitqueue_active(&iocg->waitq) || iocg->abs_vdebt) {
+                       /* might be oversleeping vtime / hweight changes, kick */
+                       iocg_kick_waitq(iocg, &now);
+                       iocg_kick_delay(iocg, &now, 0);
+@@ -1721,28 +1726,49 @@ static void ioc_rqos_throttle(struct rq_
+        * tests are racy but the races aren't systemic - we only miss once
+        * in a while which is fine.
+        */
+-      if (!waitqueue_active(&iocg->waitq) &&
+-          !atomic64_read(&iocg->abs_vdebt) &&
++      if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt &&
+           time_before_eq64(vtime + cost, now.vnow)) {
+               iocg_commit_bio(iocg, bio, cost);
+               return;
+       }
+       /*
+-       * We're over budget.  If @bio has to be issued regardless,
+-       * remember the abs_cost instead of advancing vtime.
+-       * iocg_kick_waitq() will pay off the debt before waking more IOs.
++       * We activated above but w/o any synchronization. Deactivation is
++       * synchronized with waitq.lock and we won't get deactivated as long
++       * as we're waiting or has debt, so we're good if we're activated
++       * here. In the unlikely case that we aren't, just issue the IO.
++       */
++      spin_lock_irq(&iocg->waitq.lock);
++
++      if (unlikely(list_empty(&iocg->active_list))) {
++              spin_unlock_irq(&iocg->waitq.lock);
++              iocg_commit_bio(iocg, bio, cost);
++              return;
++      }
++
++      /*
++       * We're over budget. If @bio has to be issued regardless, remember
++       * the abs_cost instead of advancing vtime. iocg_kick_waitq() will pay
++       * off the debt before waking more IOs.
++       *
+        * This way, the debt is continuously paid off each period with the
+-       * actual budget available to the cgroup.  If we just wound vtime,
+-       * we would incorrectly use the current hw_inuse for the entire
+-       * amount which, for example, can lead to the cgroup staying
+-       * blocked for a long time even with substantially raised hw_inuse.
++       * actual budget available to the cgroup. If we just wound vtime, we
++       * would incorrectly use the current hw_inuse for the entire amount
++       * which, for example, can lead to the cgroup staying blocked for a
++       * long time even with substantially raised hw_inuse.
++       *
++       * An iocg with vdebt should stay online so that the timer can keep
++       * deducting its vdebt and [de]activate use_delay mechanism
++       * accordingly. We don't want to race against the timer trying to
++       * clear them and leave @iocg inactive w/ dangling use_delay heavily
++       * penalizing the cgroup and its descendants.
+        */
+       if (bio_issue_as_root_blkg(bio) || fatal_signal_pending(current)) {
+-              atomic64_add(abs_cost, &iocg->abs_vdebt);
++              iocg->abs_vdebt += abs_cost;
+               if (iocg_kick_delay(iocg, &now, cost))
+                       blkcg_schedule_throttle(rqos->q,
+                                       (bio->bi_opf & REQ_SWAP) == REQ_SWAP);
++              spin_unlock_irq(&iocg->waitq.lock);
+               return;
+       }
+@@ -1759,20 +1785,6 @@ static void ioc_rqos_throttle(struct rq_
+        * All waiters are on iocg->waitq and the wait states are
+        * synchronized using waitq.lock.
+        */
+-      spin_lock_irq(&iocg->waitq.lock);
+-
+-      /*
+-       * We activated above but w/o any synchronization.  Deactivation is
+-       * synchronized with waitq.lock and we won't get deactivated as
+-       * long as we're waiting, so we're good if we're activated here.
+-       * In the unlikely case that we are deactivated, just issue the IO.
+-       */
+-      if (unlikely(list_empty(&iocg->active_list))) {
+-              spin_unlock_irq(&iocg->waitq.lock);
+-              iocg_commit_bio(iocg, bio, cost);
+-              return;
+-      }
+-
+       init_waitqueue_func_entry(&wait.wait, iocg_wake_fn);
+       wait.wait.private = current;
+       wait.bio = bio;
+@@ -1804,6 +1816,7 @@ static void ioc_rqos_merge(struct rq_qos
+       struct ioc_now now;
+       u32 hw_inuse;
+       u64 abs_cost, cost;
++      unsigned long flags;
+       /* bypass if disabled or for root cgroup */
+       if (!ioc->enabled || !iocg->level)
+@@ -1823,15 +1836,28 @@ static void ioc_rqos_merge(struct rq_qos
+               iocg->cursor = bio_end;
+       /*
+-       * Charge if there's enough vtime budget and the existing request
+-       * has cost assigned.  Otherwise, account it as debt.  See debt
+-       * handling in ioc_rqos_throttle() for details.
++       * Charge if there's enough vtime budget and the existing request has
++       * cost assigned.
+        */
+       if (rq->bio && rq->bio->bi_iocost_cost &&
+-          time_before_eq64(atomic64_read(&iocg->vtime) + cost, now.vnow))
++          time_before_eq64(atomic64_read(&iocg->vtime) + cost, now.vnow)) {
+               iocg_commit_bio(iocg, bio, cost);
+-      else
+-              atomic64_add(abs_cost, &iocg->abs_vdebt);
++              return;
++      }
++
++      /*
++       * Otherwise, account it as debt if @iocg is online, which it should
++       * be for the vast majority of cases. See debt handling in
++       * ioc_rqos_throttle() for details.
++       */
++      spin_lock_irqsave(&iocg->waitq.lock, flags);
++      if (likely(!list_empty(&iocg->active_list))) {
++              iocg->abs_vdebt += abs_cost;
++              iocg_kick_delay(iocg, &now, cost);
++      } else {
++              iocg_commit_bio(iocg, bio, cost);
++      }
++      spin_unlock_irqrestore(&iocg->waitq.lock, flags);
+ }
+ static void ioc_rqos_done_bio(struct rq_qos *rqos, struct bio *bio)
+@@ -2001,7 +2027,6 @@ static void ioc_pd_init(struct blkg_poli
+       iocg->ioc = ioc;
+       atomic64_set(&iocg->vtime, now.vnow);
+       atomic64_set(&iocg->done_vtime, now.vnow);
+-      atomic64_set(&iocg->abs_vdebt, 0);
+       atomic64_set(&iocg->active_period, atomic64_read(&ioc->cur_period));
+       INIT_LIST_HEAD(&iocg->active_list);
+       iocg->hweight_active = HWEIGHT_WHOLE;
+--- a/tools/cgroup/iocost_monitor.py
++++ b/tools/cgroup/iocost_monitor.py
+@@ -159,7 +159,12 @@ class IocgStat:
+         else:
+             self.inflight_pct = 0
+-        self.debt_ms = iocg.abs_vdebt.counter.value_() / VTIME_PER_USEC / 1000
++        # vdebt used to be an atomic64_t and is now u64, support both
++        try:
++            self.debt_ms = iocg.abs_vdebt.counter.value_() / VTIME_PER_USEC / 1000
++        except:
++            self.debt_ms = iocg.abs_vdebt.value_() / VTIME_PER_USEC / 1000
++
+         self.use_delay = blkg.use_delay.counter.value_()
+         self.delay_ms = blkg.delay_nsec.counter.value_() / 1_000_000
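In user-space terms, the locking rule this patch establishes could be sketched like this (a hypothetical pthread analog, not the blk-iocost code): the debt counter becomes a plain integer that is only touched under the same lock that serializes (de)activation, so debt can no longer be added to a group after it has been deactivated.

```
#include <pthread.h>
#include <stdbool.h>
#include <stdint.h>

struct group {
	pthread_mutex_t lock;	/* stands in for iocg->waitq.lock */
	bool active;		/* stands in for !list_empty(&active_list) */
	uint64_t abs_vdebt;	/* plain integer, only touched under lock */
};

/* Charge cost as debt only while the group is still active. */
static bool charge_debt(struct group *g, uint64_t abs_cost)
{
	bool charged = false;

	pthread_mutex_lock(&g->lock);
	if (g->active) {
		g->abs_vdebt += abs_cost;
		charged = true;
	}
	pthread_mutex_unlock(&g->lock);
	return charged;		/* caller issues the IO directly if false */
}
```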
diff --git a/queue-5.4/ipc-mqueue.c-change-__do_notify-to-bypass-check_kill_permission.patch b/queue-5.4/ipc-mqueue.c-change-__do_notify-to-bypass-check_kill_permission.patch
new file mode 100644 (file)
index 0000000..0b7e746
--- /dev/null
@@ -0,0 +1,149 @@
+From b5f2006144c6ae941726037120fa1001ddede784 Mon Sep 17 00:00:00 2001
+From: Oleg Nesterov <oleg@redhat.com>
+Date: Thu, 7 May 2020 18:35:39 -0700
+Subject: ipc/mqueue.c: change __do_notify() to bypass check_kill_permission()
+
+From: Oleg Nesterov <oleg@redhat.com>
+
+commit b5f2006144c6ae941726037120fa1001ddede784 upstream.
+
+Commit cc731525f26a ("signal: Remove kernel interal si_code magic")
+changed the value of SI_FROMUSER(SI_MESGQ), this means that mq_notify() no
+longer works if the sender doesn't have rights to send a signal.
+
+Change __do_notify() to use do_send_sig_info() instead of kill_pid_info()
+to avoid check_kill_permission().
+
+This needs the additional notify.sigev_signo != 0 check, shouldn't we
+change do_mq_notify() to deny sigev_signo == 0 ?
+
+Test-case:
+
+       #include <signal.h>
+       #include <mqueue.h>
+       #include <unistd.h>
+       #include <sys/wait.h>
+       #include <assert.h>
+
+       static int notified;
+
+       static void sigh(int sig)
+       {
+               notified = 1;
+       }
+
+       int main(void)
+       {
+               signal(SIGIO, sigh);
+
+               int fd = mq_open("/mq", O_RDWR|O_CREAT, 0666, NULL);
+               assert(fd >= 0);
+
+               struct sigevent se = {
+                       .sigev_notify   = SIGEV_SIGNAL,
+                       .sigev_signo    = SIGIO,
+               };
+               assert(mq_notify(fd, &se) == 0);
+
+               if (!fork()) {
+                       assert(setuid(1) == 0);
+                       mq_send(fd, "",1,0);
+                       return 0;
+               }
+
+               wait(NULL);
+               mq_unlink("/mq");
+               assert(notified);
+               return 0;
+       }
+
+[manfred@colorfullife.com: 1) Add self_exec_id evaluation so that the implementation matches do_notify_parent 2) use PIDTYPE_TGID everywhere]
+Fixes: cc731525f26a ("signal: Remove kernel interal si_code magic")
+Reported-by: Yoji <yoji.fujihar.min@gmail.com>
+Signed-off-by: Oleg Nesterov <oleg@redhat.com>
+Signed-off-by: Manfred Spraul <manfred@colorfullife.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Acked-by: "Eric W. Biederman" <ebiederm@xmission.com>
+Cc: Davidlohr Bueso <dave@stgolabs.net>
+Cc: Markus Elfring <elfring@users.sourceforge.net>
+Cc: <1vier1@web.de>
+Cc: <stable@vger.kernel.org>
+Link: http://lkml.kernel.org/r/e2a782e4-eab9-4f5c-c749-c07a8f7a4e66@colorfullife.com
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ ipc/mqueue.c |   34 ++++++++++++++++++++++++++--------
+ 1 file changed, 26 insertions(+), 8 deletions(-)
+
+--- a/ipc/mqueue.c
++++ b/ipc/mqueue.c
+@@ -82,6 +82,7 @@ struct mqueue_inode_info {
+       struct sigevent notify;
+       struct pid *notify_owner;
++      u32 notify_self_exec_id;
+       struct user_namespace *notify_user_ns;
+       struct user_struct *user;       /* user who created, for accounting */
+       struct sock *notify_sock;
+@@ -709,28 +710,44 @@ static void __do_notify(struct mqueue_in
+        * synchronously. */
+       if (info->notify_owner &&
+           info->attr.mq_curmsgs == 1) {
+-              struct kernel_siginfo sig_i;
+               switch (info->notify.sigev_notify) {
+               case SIGEV_NONE:
+                       break;
+-              case SIGEV_SIGNAL:
+-                      /* sends signal */
++              case SIGEV_SIGNAL: {
++                      struct kernel_siginfo sig_i;
++                      struct task_struct *task;
++
++                      /* do_mq_notify() accepts sigev_signo == 0, why?? */
++                      if (!info->notify.sigev_signo)
++                              break;
+                       clear_siginfo(&sig_i);
+                       sig_i.si_signo = info->notify.sigev_signo;
+                       sig_i.si_errno = 0;
+                       sig_i.si_code = SI_MESGQ;
+                       sig_i.si_value = info->notify.sigev_value;
+-                      /* map current pid/uid into info->owner's namespaces */
+                       rcu_read_lock();
++                      /* map current pid/uid into info->owner's namespaces */
+                       sig_i.si_pid = task_tgid_nr_ns(current,
+                                               ns_of_pid(info->notify_owner));
+-                      sig_i.si_uid = from_kuid_munged(info->notify_user_ns, current_uid());
++                      sig_i.si_uid = from_kuid_munged(info->notify_user_ns,
++                                              current_uid());
++                      /*
++                       * We can't use kill_pid_info(), this signal should
++                       * bypass check_kill_permission(). It is from kernel
++                       * but si_fromuser() can't know this.
++                       * We do check the self_exec_id, to avoid sending
++                       * signals to programs that don't expect them.
++                       */
++                      task = pid_task(info->notify_owner, PIDTYPE_TGID);
++                      if (task && task->self_exec_id ==
++                                              info->notify_self_exec_id) {
++                              do_send_sig_info(info->notify.sigev_signo,
++                                              &sig_i, task, PIDTYPE_TGID);
++                      }
+                       rcu_read_unlock();
+-
+-                      kill_pid_info(info->notify.sigev_signo,
+-                                    &sig_i, info->notify_owner);
+                       break;
++              }
+               case SIGEV_THREAD:
+                       set_cookie(info->notify_cookie, NOTIFY_WOKENUP);
+                       netlink_sendskb(info->notify_sock, info->notify_cookie);
+@@ -1315,6 +1332,7 @@ retry:
+                       info->notify.sigev_signo = notification->sigev_signo;
+                       info->notify.sigev_value = notification->sigev_value;
+                       info->notify.sigev_notify = SIGEV_SIGNAL;
++                      info->notify_self_exec_id = current->self_exec_id;
+                       break;
+               }
diff --git a/queue-5.4/mm-limit-boost_watermark-on-small-zones.patch b/queue-5.4/mm-limit-boost_watermark-on-small-zones.patch
new file mode 100644 (file)
index 0000000..eabe7f8
--- /dev/null
@@ -0,0 +1,82 @@
+From 14f69140ff9c92a0928547ceefb153a842e8492c Mon Sep 17 00:00:00 2001
+From: Henry Willard <henry.willard@oracle.com>
+Date: Thu, 7 May 2020 18:36:27 -0700
+Subject: mm: limit boost_watermark on small zones
+
+From: Henry Willard <henry.willard@oracle.com>
+
+commit 14f69140ff9c92a0928547ceefb153a842e8492c upstream.
+
+Commit 1c30844d2dfe ("mm: reclaim small amounts of memory when an
+external fragmentation event occurs") adds a boost_watermark() function
+which increases the min watermark in a zone by at least
+pageblock_nr_pages or the number of pages in a page block.
+
+On Arm64, with 64K pages and 512M huge pages, this is 8192 pages or
+512M.  It does this regardless of the number of pages managed in
+the zone or the likelihood of success.
+
+This can put the zone immediately under water in terms of allocating
+pages from the zone, and can cause a small machine to fail immediately
+due to OoM.  Unlike set_recommended_min_free_kbytes(), which
+substantially increases min_free_kbytes and is tied to THP,
+boost_watermark() can be called even if THP is not active.
+
+The problem is most likely to appear on architectures such as Arm64
+where pageblock_nr_pages is very large.
+
+It is desirable to run the kdump capture kernel in as small a space as
+possible to avoid wasting memory.  In some architectures, such as Arm64,
+there are restrictions on where the capture kernel can run, and
+therefore, the space available.  A capture kernel running in 768M can
+fail due to OoM immediately after boost_watermark() sets the min in zone
+DMA32, where most of the memory is, to 512M.  It fails even though there
+is over 500M of free memory.  With boost_watermark() suppressed, the
+capture kernel can run successfully in 448M.
+
+This patch limits boost_watermark() to boosting a zone's min watermark
+only when there are enough pages that the boost will produce positive
+results.  In this case that is estimated to be four times as many pages
+as pageblock_nr_pages.
+
+Mel said:
+
+: There is no harm in marking it stable.  Clearly it does not happen very
+: often but it's not impossible.  32-bit x86 is a lot less common now
+: which would previously have been vulnerable to triggering this easily.
+: ppc64 has a larger base page size but typically only has one zone.
+: arm64 is likely the most vulnerable, particularly when CMA is
+: configured with a small movable zone.
+
+Fixes: 1c30844d2dfe ("mm: reclaim small amounts of memory when an external fragmentation event occurs")
+Signed-off-by: Henry Willard <henry.willard@oracle.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Reviewed-by: David Hildenbrand <david@redhat.com>
+Acked-by: Mel Gorman <mgorman@techsingularity.net>
+Cc: Vlastimil Babka <vbabka@suse.cz>
+Cc: <stable@vger.kernel.org>
+Link: http://lkml.kernel.org/r/1588294148-6586-1-git-send-email-henry.willard@oracle.com
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/page_alloc.c |    8 ++++++++
+ 1 file changed, 8 insertions(+)
+
+--- a/mm/page_alloc.c
++++ b/mm/page_alloc.c
+@@ -2351,6 +2351,14 @@ static inline void boost_watermark(struc
+       if (!watermark_boost_factor)
+               return;
++      /*
++       * Don't bother in zones that are unlikely to produce results.
++       * On small machines, including kdump capture kernels running
++       * in a small area, boosting the watermark can cause an out of
++       * memory situation immediately.
++       */
++      if ((pageblock_nr_pages * 4) > zone_managed_pages(zone))
++              return;
+       max_boost = mult_frac(zone->_watermark[WMARK_HIGH],
+                       watermark_boost_factor, 10000);
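To make the numbers in the changelog concrete, a tiny worked example (the 64K/512M/768M figures come from the commit message; this is arithmetic only, not kernel code): with 64K base pages a 512M pageblock is 8192 pages, and the new check refuses to boost unless the zone manages more than four pageblocks, which a 768M kdump zone does not.

```
#include <stdio.h>

int main(void)
{
	unsigned long page_size = 64UL * 1024;			/* 64K pages */
	unsigned long pageblock_bytes = 512UL * 1024 * 1024;	/* 512M block */
	unsigned long pageblock_nr_pages = pageblock_bytes / page_size;
	unsigned long zone_managed = (768UL * 1024 * 1024) / page_size;

	printf("pageblock_nr_pages = %lu\n", pageblock_nr_pages);  /* 8192 */
	printf("zone manages %lu pages\n", zone_managed);          /* 12288 */
	/* New check: skip the boost unless the zone has > 4 pageblocks. */
	printf("boost allowed: %s\n",
	       (pageblock_nr_pages * 4 > zone_managed) ? "no" : "yes");
	return 0;
}
```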
diff --git a/queue-5.4/mm-page_alloc-fix-watchdog-soft-lockups-during-set_zone_contiguous.patch b/queue-5.4/mm-page_alloc-fix-watchdog-soft-lockups-during-set_zone_contiguous.patch
new file mode 100644 (file)
index 0000000..5e9c595
--- /dev/null
@@ -0,0 +1,62 @@
+From e84fe99b68ce353c37ceeecc95dce9696c976556 Mon Sep 17 00:00:00 2001
+From: David Hildenbrand <david@redhat.com>
+Date: Thu, 7 May 2020 18:35:46 -0700
+Subject: mm/page_alloc: fix watchdog soft lockups during set_zone_contiguous()
+
+From: David Hildenbrand <david@redhat.com>
+
+commit e84fe99b68ce353c37ceeecc95dce9696c976556 upstream.
+
+Without CONFIG_PREEMPT, it can happen that we get soft lockups detected,
+e.g., while booting up.
+
+  watchdog: BUG: soft lockup - CPU#0 stuck for 22s! [swapper/0:1]
+  CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.6.0-next-20200331+ #4
+  Hardware name: Red Hat KVM, BIOS 1.11.1-4.module+el8.1.0+4066+0f1aadab 04/01/2014
+  RIP: __pageblock_pfn_to_page+0x134/0x1c0
+  Call Trace:
+   set_zone_contiguous+0x56/0x70
+   page_alloc_init_late+0x166/0x176
+   kernel_init_freeable+0xfa/0x255
+   kernel_init+0xa/0x106
+   ret_from_fork+0x35/0x40
+
+The issue becomes visible when having a lot of memory (e.g., 4TB)
+assigned to a single NUMA node - a system that can easily be created
+using QEMU.  Inside VMs on a hypervisor with quite some memory
+overcommit, this is fairly easy to trigger.
+
+Signed-off-by: David Hildenbrand <david@redhat.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Reviewed-by: Pavel Tatashin <pasha.tatashin@soleen.com>
+Reviewed-by: Pankaj Gupta <pankaj.gupta.linux@gmail.com>
+Reviewed-by: Baoquan He <bhe@redhat.com>
+Reviewed-by: Shile Zhang <shile.zhang@linux.alibaba.com>
+Acked-by: Michal Hocko <mhocko@suse.com>
+Cc: Kirill Tkhai <ktkhai@virtuozzo.com>
+Cc: Shile Zhang <shile.zhang@linux.alibaba.com>
+Cc: Pavel Tatashin <pasha.tatashin@soleen.com>
+Cc: Daniel Jordan <daniel.m.jordan@oracle.com>
+Cc: Michal Hocko <mhocko@kernel.org>
+Cc: Alexander Duyck <alexander.duyck@gmail.com>
+Cc: Baoquan He <bhe@redhat.com>
+Cc: Oscar Salvador <osalvador@suse.de>
+Cc: <stable@vger.kernel.org>
+Link: http://lkml.kernel.org/r/20200416073417.5003-1-david@redhat.com
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/page_alloc.c |    1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/mm/page_alloc.c
++++ b/mm/page_alloc.c
+@@ -1555,6 +1555,7 @@ void set_zone_contiguous(struct zone *zo
+               if (!__pageblock_pfn_to_page(block_start_pfn,
+                                            block_end_pfn, zone))
+                       return;
++              cond_resched();
+       }
+       /* We confirm that there is no hole */
diff --git a/queue-5.4/riscv-set-max_pfn-to-the-pfn-of-the-last-page.patch b/queue-5.4/riscv-set-max_pfn-to-the-pfn-of-the-last-page.patch
new file mode 100644 (file)
index 0000000..1960d4c
--- /dev/null
@@ -0,0 +1,70 @@
+From c749bb2d554825e007cbc43b791f54e124dadfce Mon Sep 17 00:00:00 2001
+From: Vincent Chen <vincent.chen@sifive.com>
+Date: Mon, 27 Apr 2020 14:59:24 +0800
+Subject: riscv: set max_pfn to the PFN of the last page
+
+From: Vincent Chen <vincent.chen@sifive.com>
+
+commit c749bb2d554825e007cbc43b791f54e124dadfce upstream.
+
+The current max_pfn equals zero. I found that this prevents users from
+getting some page information through /proc, such as kpagecount, in the
+v5.6 kernel because of new sanity checks. The following message is
+displayed by the stress-ng test suite with the command "stress-ng
+--verbose --physpage 1 -t 1" on the HiFive Unleashed board.
+
+ # stress-ng --verbose --physpage 1 -t 1
+ stress-ng: debug: [109] 4 processors online, 4 processors configured
+ stress-ng: info: [109] dispatching hogs: 1 physpage
+ stress-ng: debug: [109] cache allocate: reducing cache level from L3 (too high) to L0
+ stress-ng: debug: [109] get_cpu_cache: invalid cache_level: 0
+ stress-ng: info: [109] cache allocate: using built-in defaults as no suitable cache found
+ stress-ng: debug: [109] cache allocate: default cache size: 2048K
+ stress-ng: debug: [109] starting stressors
+ stress-ng: debug: [109] 1 stressor spawned
+ stress-ng: debug: [110] stress-ng-physpage: started [110] (instance 0)
+ stress-ng: error: [110] stress-ng-physpage: cannot read page count for address 0x3fd34de000 in /proc/kpagecount, errno=0 (Success)
+ stress-ng: error: [110] stress-ng-physpage: cannot read page count for address 0x3fd32db078 in /proc/kpagecount, errno=0 (Success)
+ ...
+ stress-ng: error: [110] stress-ng-physpage: cannot read page count for address 0x3fd32db078 in /proc/kpagecount, errno=0 (Success)
+ stress-ng: debug: [110] stress-ng-physpage: exited [110] (instance 0)
+ stress-ng: debug: [109] process [110] terminated
+ stress-ng: info: [109] successful run completed in 1.00s
+ #
+
+After applying this patch, the kernel can pass the test.
+
+ # stress-ng --verbose --physpage 1 -t 1
+ stress-ng: debug: [104] 4 processors online, 4 processors configured stress-ng: info: [104] dispatching hogs: 1 physpage
+ stress-ng: info: [104] cache allocate: using defaults, can't determine cache details from sysfs
+ stress-ng: debug: [104] cache allocate: default cache size: 2048K
+ stress-ng: debug: [104] starting stressors
+ stress-ng: debug: [104] 1 stressor spawned
+ stress-ng: debug: [105] stress-ng-physpage: started [105] (instance 0) stress-ng: debug: [105] stress-ng-physpage: exited [105] (instance 0) stress-ng: debug: [104] process [105] terminated
+ stress-ng: info: [104] successful run completed in 1.01s
+ #
+
+Cc: stable@vger.kernel.org
+Signed-off-by: Vincent Chen <vincent.chen@sifive.com>
+Reviewed-by: Anup Patel <anup@brainfault.org>
+Reviewed-by: Yash Shah <yash.shah@sifive.com>
+Tested-by: Yash Shah <yash.shah@sifive.com>
+Signed-off-by: Palmer Dabbelt <palmerdabbelt@google.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/riscv/mm/init.c |    3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+--- a/arch/riscv/mm/init.c
++++ b/arch/riscv/mm/init.c
+@@ -116,7 +116,8 @@ void __init setup_bootmem(void)
+       memblock_reserve(vmlinux_start, vmlinux_end - vmlinux_start);
+       set_max_mapnr(PFN_DOWN(mem_size));
+-      max_low_pfn = PFN_DOWN(memblock_end_of_DRAM());
++      max_pfn = PFN_DOWN(memblock_end_of_DRAM());
++      max_low_pfn = max_pfn;
+ #ifdef CONFIG_BLK_DEV_INITRD
+       setup_initrd();
diff --git a/queue-5.4/series b/queue-5.4/series
index a9e44c4f08044099d75e5ada347a3501eeafed0e..444b81594bc1b1ee66a720a1fa4306e756e1b249 100644 (file)
--- a/queue-5.4/series
@@ -52,3 +52,16 @@ kvm-s390-remove-false-warn_on_once-for-the-pqap-instruction.patch
 kvm-vmx-explicitly-clear-rflags.cf-and-rflags.zf-in-vm-exit-rsb-path.patch
 kvm-arm-vgic-fix-limit-condition-when-writing-to-gicd_iactiver.patch
 kvm-arm64-fix-32bit-pc-wrap-around.patch
+arm64-hugetlb-avoid-potential-null-dereference.patch
+drm-ingenic-drm-add-module_device_table.patch
+ipc-mqueue.c-change-__do_notify-to-bypass-check_kill_permission.patch
+epoll-atomically-remove-wait-entry-on-wake-up.patch
+eventpoll-fix-missing-wakeup-for-ovflist-in-ep_poll_callback.patch
+mm-page_alloc-fix-watchdog-soft-lockups-during-set_zone_contiguous.patch
+mm-limit-boost_watermark-on-small-zones.patch
+ceph-fix-endianness-bug-when-handling-mds-session-feature-bits.patch
+ceph-demote-quotarealm-lookup-warning-to-a-debug-message.patch
+staging-gasket-check-the-return-value-of-gasket_get_bar_index.patch
+coredump-fix-crash-when-umh-is-disabled.patch
+riscv-set-max_pfn-to-the-pfn-of-the-last-page.patch
+iocost-protect-iocg-abs_vdebt-with-iocg-waitq.lock.patch
diff --git a/queue-5.4/staging-gasket-check-the-return-value-of-gasket_get_bar_index.patch b/queue-5.4/staging-gasket-check-the-return-value-of-gasket_get_bar_index.patch
new file mode 100644 (file)
index 0000000..6144b76
--- /dev/null
@@ -0,0 +1,39 @@
+From 769acc3656d93aaacada814939743361d284fd87 Mon Sep 17 00:00:00 2001
+From: Oscar Carter <oscar.carter@gmx.com>
+Date: Fri, 1 May 2020 17:51:18 +0200
+Subject: staging: gasket: Check the return value of gasket_get_bar_index()
+
+From: Oscar Carter <oscar.carter@gmx.com>
+
+commit 769acc3656d93aaacada814939743361d284fd87 upstream.
+
+Check the return value of gasket_get_bar_index function as it can return
+a negative one (-EINVAL). If this happens, a negative index is used in
+the "gasket_dev->bar_data" array.
+
+Addresses-Coverity-ID: 1438542 ("Negative array index read")
+Fixes: 9a69f5087ccc2 ("drivers/staging: Gasket driver framework + Apex driver")
+Signed-off-by: Oscar Carter <oscar.carter@gmx.com>
+Cc: stable <stable@vger.kernel.org>
+Reviewed-by: Richard Yeh <rcy@google.com>
+Link: https://lore.kernel.org/r/20200501155118.13380-1-oscar.carter@gmx.com
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/staging/gasket/gasket_core.c |    4 ++++
+ 1 file changed, 4 insertions(+)
+
+--- a/drivers/staging/gasket/gasket_core.c
++++ b/drivers/staging/gasket/gasket_core.c
+@@ -926,6 +926,10 @@ do_map_region(const struct gasket_dev *g
+               gasket_get_bar_index(gasket_dev,
+                                    (vma->vm_pgoff << PAGE_SHIFT) +
+                                    driver_desc->legacy_mmap_address_offset);
++
++      if (bar_index < 0)
++              return DO_MAP_REGION_INVALID;
++
+       phys_base = gasket_dev->bar_data[bar_index].phys_base + phys_offset;
+       while (mapped_bytes < map_length) {
+               /*
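The same defensive pattern in a self-contained user-space sketch (hypothetical helper and sizes, not the gasket driver itself): any lookup that can return a negative errno has to be checked before its result is used as an array index.

```
#include <errno.h>

#define NUM_BARS 6
static struct { unsigned long phys_base; } bar_data[NUM_BARS];

/* Stand-in for gasket_get_bar_index(): returns the BAR or -EINVAL. */
static int get_bar_index(unsigned long offset)
{
	return (offset < NUM_BARS) ? (int)offset : -EINVAL;
}

static int map_region(unsigned long offset, unsigned long *phys)
{
	int bar_index = get_bar_index(offset);

	if (bar_index < 0)	/* reject before indexing bar_data[] */
		return -EINVAL;
	*phys = bar_data[bar_index].phys_base;
	return 0;
}
```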