From c814cf80ae96504efdd93196eac4c3481a0ad1f1 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 12 May 2020 13:35:23 +0200 Subject: [PATCH] 5.4-stable patches added patches: arm64-hugetlb-avoid-potential-null-dereference.patch ceph-demote-quotarealm-lookup-warning-to-a-debug-message.patch ceph-fix-endianness-bug-when-handling-mds-session-feature-bits.patch coredump-fix-crash-when-umh-is-disabled.patch drm-ingenic-drm-add-module_device_table.patch epoll-atomically-remove-wait-entry-on-wake-up.patch eventpoll-fix-missing-wakeup-for-ovflist-in-ep_poll_callback.patch iocost-protect-iocg-abs_vdebt-with-iocg-waitq.lock.patch ipc-mqueue.c-change-__do_notify-to-bypass-check_kill_permission.patch mm-limit-boost_watermark-on-small-zones.patch mm-page_alloc-fix-watchdog-soft-lockups-during-set_zone_contiguous.patch riscv-set-max_pfn-to-the-pfn-of-the-last-page.patch staging-gasket-check-the-return-value-of-gasket_get_bar_index.patch --- ...tlb-avoid-potential-null-dereference.patch | 56 ++++ ...lm-lookup-warning-to-a-debug-message.patch | 40 +++ ...en-handling-mds-session-feature-bits.patch | 48 +++ ...edump-fix-crash-when-umh-is-disabled.patch | 123 +++++++ ...-ingenic-drm-add-module_device_table.patch | 33 ++ ...mically-remove-wait-entry-on-wake-up.patch | 178 +++++++++++ ...keup-for-ovflist-in-ep_poll_callback.patch | 77 +++++ ...-iocg-abs_vdebt-with-iocg-waitq.lock.patch | 300 ++++++++++++++++++ ...tify-to-bypass-check_kill_permission.patch | 149 +++++++++ ...limit-boost_watermark-on-small-zones.patch | 82 +++++ ...t-lockups-during-set_zone_contiguous.patch | 62 ++++ ...-max_pfn-to-the-pfn-of-the-last-page.patch | 70 ++++ queue-5.4/series | 13 + ...return-value-of-gasket_get_bar_index.patch | 39 +++ 14 files changed, 1270 insertions(+) create mode 100644 queue-5.4/arm64-hugetlb-avoid-potential-null-dereference.patch create mode 100644 queue-5.4/ceph-demote-quotarealm-lookup-warning-to-a-debug-message.patch create mode 100644 queue-5.4/ceph-fix-endianness-bug-when-handling-mds-session-feature-bits.patch create mode 100644 queue-5.4/coredump-fix-crash-when-umh-is-disabled.patch create mode 100644 queue-5.4/drm-ingenic-drm-add-module_device_table.patch create mode 100644 queue-5.4/epoll-atomically-remove-wait-entry-on-wake-up.patch create mode 100644 queue-5.4/eventpoll-fix-missing-wakeup-for-ovflist-in-ep_poll_callback.patch create mode 100644 queue-5.4/iocost-protect-iocg-abs_vdebt-with-iocg-waitq.lock.patch create mode 100644 queue-5.4/ipc-mqueue.c-change-__do_notify-to-bypass-check_kill_permission.patch create mode 100644 queue-5.4/mm-limit-boost_watermark-on-small-zones.patch create mode 100644 queue-5.4/mm-page_alloc-fix-watchdog-soft-lockups-during-set_zone_contiguous.patch create mode 100644 queue-5.4/riscv-set-max_pfn-to-the-pfn-of-the-last-page.patch create mode 100644 queue-5.4/staging-gasket-check-the-return-value-of-gasket_get_bar_index.patch diff --git a/queue-5.4/arm64-hugetlb-avoid-potential-null-dereference.patch b/queue-5.4/arm64-hugetlb-avoid-potential-null-dereference.patch new file mode 100644 index 00000000000..0c40d51f6f4 --- /dev/null +++ b/queue-5.4/arm64-hugetlb-avoid-potential-null-dereference.patch @@ -0,0 +1,56 @@ +From 027d0c7101f50cf03aeea9eebf484afd4920c8d3 Mon Sep 17 00:00:00 2001 +From: Mark Rutland +Date: Tue, 5 May 2020 13:59:30 +0100 +Subject: arm64: hugetlb: avoid potential NULL dereference +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Mark Rutland + +commit 
027d0c7101f50cf03aeea9eebf484afd4920c8d3 upstream. + +The static analyzer in GCC 10 spotted that in huge_pte_alloc() we may +pass a NULL pmdp into pte_alloc_map() when pmd_alloc() returns NULL: + +| CC arch/arm64/mm/pageattr.o +| CC arch/arm64/mm/hugetlbpage.o +| from arch/arm64/mm/hugetlbpage.c:10: +| arch/arm64/mm/hugetlbpage.c: In function ‘huge_pte_alloc’: +| ./arch/arm64/include/asm/pgtable-types.h:28:24: warning: dereference of NULL ‘pmdp’ [CWE-690] [-Wanalyzer-null-dereference] +| ./arch/arm64/include/asm/pgtable.h:436:26: note: in expansion of macro ‘pmd_val’ +| arch/arm64/mm/hugetlbpage.c:242:10: note: in expansion of macro ‘pte_alloc_map’ +| |arch/arm64/mm/hugetlbpage.c:232:10: +| |./arch/arm64/include/asm/pgtable-types.h:28:24: +| ./arch/arm64/include/asm/pgtable.h:436:26: note: in expansion of macro ‘pmd_val’ +| arch/arm64/mm/hugetlbpage.c:242:10: note: in expansion of macro ‘pte_alloc_map’ + +This can only occur when the kernel cannot allocate a page, and so is +unlikely to happen in practice before other systems start failing. + +We can avoid this by bailing out if pmd_alloc() fails, as we do earlier +in the function if pud_alloc() fails. + +Fixes: 66b3923a1a0f ("arm64: hugetlb: add support for PTE contiguous bit") +Signed-off-by: Mark Rutland +Reported-by: Kyrill Tkachov +Cc: # 4.5.x- +Cc: Will Deacon +Signed-off-by: Catalin Marinas +Signed-off-by: Greg Kroah-Hartman + +--- + arch/arm64/mm/hugetlbpage.c | 2 ++ + 1 file changed, 2 insertions(+) + +--- a/arch/arm64/mm/hugetlbpage.c ++++ b/arch/arm64/mm/hugetlbpage.c +@@ -230,6 +230,8 @@ pte_t *huge_pte_alloc(struct mm_struct * + ptep = (pte_t *)pudp; + } else if (sz == (CONT_PTE_SIZE)) { + pmdp = pmd_alloc(mm, pudp, addr); ++ if (!pmdp) ++ return NULL; + + WARN_ON(addr & (sz - 1)); + /* diff --git a/queue-5.4/ceph-demote-quotarealm-lookup-warning-to-a-debug-message.patch b/queue-5.4/ceph-demote-quotarealm-lookup-warning-to-a-debug-message.patch new file mode 100644 index 00000000000..17d5931de34 --- /dev/null +++ b/queue-5.4/ceph-demote-quotarealm-lookup-warning-to-a-debug-message.patch @@ -0,0 +1,40 @@ +From 12ae44a40a1be891bdc6463f8c7072b4ede746ef Mon Sep 17 00:00:00 2001 +From: Luis Henriques +Date: Tue, 5 May 2020 13:59:02 +0100 +Subject: ceph: demote quotarealm lookup warning to a debug message + +From: Luis Henriques + +commit 12ae44a40a1be891bdc6463f8c7072b4ede746ef upstream. + +A misconfigured cephx can easily result in having the kernel client +flooding the logs with: + + ceph: Can't lookup inode 1 (err: -13) + +Change this message to debug level. 
+
+Cc: stable@vger.kernel.org
+URL: https://tracker.ceph.com/issues/44546
+Signed-off-by: Luis Henriques
+Reviewed-by: Jeff Layton
+Signed-off-by: Ilya Dryomov
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ fs/ceph/quota.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/fs/ceph/quota.c
++++ b/fs/ceph/quota.c
+@@ -159,8 +159,8 @@ static struct inode *lookup_quotarealm_i
+ 	}
+ 
+ 	if (IS_ERR(in)) {
+-		pr_warn("Can't lookup inode %llx (err: %ld)\n",
+-			realm->ino, PTR_ERR(in));
++		dout("Can't lookup inode %llx (err: %ld)\n",
++		     realm->ino, PTR_ERR(in));
+ 		qri->timeout = jiffies + msecs_to_jiffies(60 * 1000); /* XXX */
+ 	} else {
+ 		qri->timeout = 0;
diff --git a/queue-5.4/ceph-fix-endianness-bug-when-handling-mds-session-feature-bits.patch b/queue-5.4/ceph-fix-endianness-bug-when-handling-mds-session-feature-bits.patch
new file mode 100644
index 00000000000..b1d7cd7e8d0
--- /dev/null
+++ b/queue-5.4/ceph-fix-endianness-bug-when-handling-mds-session-feature-bits.patch
@@ -0,0 +1,48 @@
+From 0fa8263367db9287aa0632f96c1a5f93cc478150 Mon Sep 17 00:00:00 2001
+From: Jeff Layton
+Date: Tue, 28 Apr 2020 08:10:22 -0400
+Subject: ceph: fix endianness bug when handling MDS session feature bits
+
+From: Jeff Layton
+
+commit 0fa8263367db9287aa0632f96c1a5f93cc478150 upstream.
+
+Eduard reported a problem mounting cephfs on s390 arch. The feature
+mask sent by the MDS is little-endian, so we need to convert it
+before storing and testing against it.
+
+Cc: stable@vger.kernel.org
+Reported-and-Tested-by: Eduard Shishkin
+Signed-off-by: Jeff Layton
+Reviewed-by: "Yan, Zheng"
+Signed-off-by: Ilya Dryomov
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ fs/ceph/mds_client.c | 8 +++-----
+ 1 file changed, 3 insertions(+), 5 deletions(-)
+
+--- a/fs/ceph/mds_client.c
++++ b/fs/ceph/mds_client.c
+@@ -3072,8 +3072,7 @@ static void handle_session(struct ceph_m
+ 	void *end = p + msg->front.iov_len;
+ 	struct ceph_mds_session_head *h;
+ 	u32 op;
+-	u64 seq;
+-	unsigned long features = 0;
++	u64 seq, features = 0;
+ 	int wake = 0;
+ 	bool blacklisted = false;
+ 
+@@ -3092,9 +3091,8 @@ static void handle_session(struct ceph_m
+ 			goto bad;
+ 		/* version >= 3, feature bits */
+ 		ceph_decode_32_safe(&p, end, len, bad);
+-		ceph_decode_need(&p, end, len, bad);
+-		memcpy(&features, p, min_t(size_t, len, sizeof(features)));
+-		p += len;
++		ceph_decode_64_safe(&p, end, features, bad);
++		p += len - sizeof(features);
+ 	}
+ 
+ 	mutex_lock(&mdsc->mutex);
diff --git a/queue-5.4/coredump-fix-crash-when-umh-is-disabled.patch b/queue-5.4/coredump-fix-crash-when-umh-is-disabled.patch
new file mode 100644
index 00000000000..ef5c0349b15
--- /dev/null
+++ b/queue-5.4/coredump-fix-crash-when-umh-is-disabled.patch
@@ -0,0 +1,123 @@
+From 3740d93e37902b31159a82da2d5c8812ed825404 Mon Sep 17 00:00:00 2001
+From: Luis Chamberlain
+Date: Thu, 16 Apr 2020 16:28:59 +0000
+Subject: coredump: fix crash when umh is disabled
+
+From: Luis Chamberlain
+
+commit 3740d93e37902b31159a82da2d5c8812ed825404 upstream.
+
+Commit 64e90a8acb859 ("Introduce STATIC_USERMODEHELPER to mediate
+call_usermodehelper()") added the option to disable all
+call_usermodehelper() calls by setting STATIC_USERMODEHELPER_PATH to
+an empty string. When this is done, and crashdump is triggered, it
+will crash on null pointer dereference, since we make assumptions
+over what call_usermodehelper_exec() did.
+
+This has been reported by Sergey when one triggers a coredump
+with the following configuration:
+
+```
+CONFIG_STATIC_USERMODEHELPER=y
+CONFIG_STATIC_USERMODEHELPER_PATH=""
+kernel.core_pattern = |/usr/lib/systemd/systemd-coredump %P %u %g %s %t %c %h %e
+```
+
+The way disabling the umh was designed was that call_usermodehelper_exec()
+would just return early, without an error. But coredump assumes
+certain variables are set up for us when this happens, and calls
+file_start_write(cprm.file) with a NULL file.
+
+[ 2.819676] BUG: kernel NULL pointer dereference, address: 0000000000000020
+[ 2.819859] #PF: supervisor read access in kernel mode
+[ 2.820035] #PF: error_code(0x0000) - not-present page
+[ 2.820188] PGD 0 P4D 0
+[ 2.820305] Oops: 0000 [#1] SMP PTI
+[ 2.820436] CPU: 2 PID: 89 Comm: a Not tainted 5.7.0-rc1+ #7
+[ 2.820680] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS ?-20190711_202441-buildvm-armv7-10.arm.fedoraproject.org-2.fc31 04/01/2014
+[ 2.821150] RIP: 0010:do_coredump+0xd80/0x1060
+[ 2.821385] Code: e8 95 11 ed ff 48 c7 c6 cc a7 b4 81 48 8d bd 28 ff
+ff ff 89 c2 e8 70 f1 ff ff 41 89 c2 85 c0 0f 84 72 f7 ff ff e9 b4 fe ff
+ff <48> 8b 57 20 0f b7 02 66 25 00 f0 66 3d 00 8
+0 0f 84 9c 01 00 00 44
+[ 2.822014] RSP: 0000:ffffc9000029bcb8 EFLAGS: 00010246
+[ 2.822339] RAX: 0000000000000000 RBX: ffff88803f860000 RCX: 000000000000000a
+[ 2.822746] RDX: 0000000000000009 RSI: 0000000000000282 RDI: 0000000000000000
+[ 2.823141] RBP: ffffc9000029bde8 R08: 0000000000000000 R09: ffffc9000029bc00
+[ 2.823508] R10: 0000000000000001 R11: ffff88803dec90be R12: ffffffff81c39da0
+[ 2.823902] R13: ffff88803de84400 R14: 0000000000000000 R15: 0000000000000000
+[ 2.824285] FS: 00007fee08183540(0000) GS:ffff88803e480000(0000) knlGS:0000000000000000
+[ 2.824767] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+[ 2.825111] CR2: 0000000000000020 CR3: 000000003f856005 CR4: 0000000000060ea0
+[ 2.825479] Call Trace:
+[ 2.825790] get_signal+0x11e/0x720
+[ 2.826087] do_signal+0x1d/0x670
+[ 2.826361] ? force_sig_info_to_task+0xc1/0xf0
+[ 2.826691] ? force_sig_fault+0x3c/0x40
+[ 2.826996] ? do_trap+0xc9/0x100
+[ 2.827179] exit_to_usermode_loop+0x49/0x90
+[ 2.827359] prepare_exit_to_usermode+0x77/0xb0
+[ 2.827559] ? 
invalid_op+0xa/0x30 +[ 2.827747] ret_from_intr+0x20/0x20 +[ 2.827921] RIP: 0033:0x55e2c76d2129 +[ 2.828107] Code: 2d ff ff ff e8 68 ff ff ff 5d c6 05 18 2f 00 00 01 +c3 0f 1f 80 00 00 00 00 c3 0f 1f 80 00 00 00 00 e9 7b ff ff ff 55 48 89 +e5 <0f> 0b b8 00 00 00 00 5d c3 66 2e 0f 1f 84 0 +0 00 00 00 00 0f 1f 40 +[ 2.828603] RSP: 002b:00007fffeba5e080 EFLAGS: 00010246 +[ 2.828801] RAX: 000055e2c76d2125 RBX: 0000000000000000 RCX: 00007fee0817c718 +[ 2.829034] RDX: 00007fffeba5e188 RSI: 00007fffeba5e178 RDI: 0000000000000001 +[ 2.829257] RBP: 00007fffeba5e080 R08: 0000000000000000 R09: 00007fee08193c00 +[ 2.829482] R10: 0000000000000009 R11: 0000000000000000 R12: 000055e2c76d2040 +[ 2.829727] R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000 +[ 2.829964] CR2: 0000000000000020 +[ 2.830149] ---[ end trace ceed83d8c68a1bf1 ]--- +``` + +Cc: # v4.11+ +Fixes: 64e90a8acb85 ("Introduce STATIC_USERMODEHELPER to mediate call_usermodehelper()") +BugLink: https://bugzilla.kernel.org/show_bug.cgi?id=199795 +Reported-by: Tony Vroon +Reported-by: Sergey Kvachonok +Tested-by: Sergei Trofimovich +Signed-off-by: Luis Chamberlain +Link: https://lore.kernel.org/r/20200416162859.26518-1-mcgrof@kernel.org +Signed-off-by: Greg Kroah-Hartman +Signed-off-by: Greg Kroah-Hartman + +--- + fs/coredump.c | 8 ++++++++ + kernel/umh.c | 5 +++++ + 2 files changed, 13 insertions(+) + +--- a/fs/coredump.c ++++ b/fs/coredump.c +@@ -788,6 +788,14 @@ void do_coredump(const kernel_siginfo_t + if (displaced) + put_files_struct(displaced); + if (!dump_interrupted()) { ++ /* ++ * umh disabled with CONFIG_STATIC_USERMODEHELPER_PATH="" would ++ * have this set to NULL. ++ */ ++ if (!cprm.file) { ++ pr_info("Core dump to |%s disabled\n", cn.corename); ++ goto close_fail; ++ } + file_start_write(cprm.file); + core_dumped = binfmt->core_dump(&cprm); + file_end_write(cprm.file); +--- a/kernel/umh.c ++++ b/kernel/umh.c +@@ -544,6 +544,11 @@ EXPORT_SYMBOL_GPL(fork_usermode_blob); + * Runs a user-space application. The application is started + * asynchronously if wait is not set, and runs as a child of system workqueues. + * (ie. it runs with full root capabilities and optimized affinity). ++ * ++ * Note: successful return value does not guarantee the helper was called at ++ * all. You can't rely on sub_info->{init,cleanup} being called even for ++ * UMH_WAIT_* wait modes as STATIC_USERMODEHELPER_PATH="" turns all helpers ++ * into a successful no-op. + */ + int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait) + { diff --git a/queue-5.4/drm-ingenic-drm-add-module_device_table.patch b/queue-5.4/drm-ingenic-drm-add-module_device_table.patch new file mode 100644 index 00000000000..b7691a07923 --- /dev/null +++ b/queue-5.4/drm-ingenic-drm-add-module_device_table.patch @@ -0,0 +1,33 @@ +From c59359a02d14a7256cd508a4886b7d2012df2363 Mon Sep 17 00:00:00 2001 +From: "H. Nikolaus Schaller" +Date: Mon, 4 May 2020 08:35:12 +0200 +Subject: drm: ingenic-drm: add MODULE_DEVICE_TABLE + +From: H. Nikolaus Schaller + +commit c59359a02d14a7256cd508a4886b7d2012df2363 upstream. + +so that the driver can load by matching the device tree +if compiled as module. + +Cc: stable@vger.kernel.org # v5.3+ +Fixes: 90b86fcc47b4 ("DRM: Add KMS driver for the Ingenic JZ47xx SoCs") +Signed-off-by: H. 
Nikolaus Schaller
+Signed-off-by: Paul Cercueil
+Link: https://patchwork.freedesktop.org/patch/msgid/1694a29b7a3449b6b662cec33d1b33f2ee0b174a.1588574111.git.hns@goldelico.com
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ drivers/gpu/drm/ingenic/ingenic-drm.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/drivers/gpu/drm/ingenic/ingenic-drm.c
++++ b/drivers/gpu/drm/ingenic/ingenic-drm.c
+@@ -824,6 +824,7 @@ static const struct of_device_id ingenic
+ 	{ .compatible = "ingenic,jz4725b-lcd", .data = &jz4725b_soc_info },
+ 	{ /* sentinel */ },
+ };
++MODULE_DEVICE_TABLE(of, ingenic_drm_of_match);
+ 
+ static struct platform_driver ingenic_drm_driver = {
+ 	.driver = {
diff --git a/queue-5.4/epoll-atomically-remove-wait-entry-on-wake-up.patch b/queue-5.4/epoll-atomically-remove-wait-entry-on-wake-up.patch
new file mode 100644
index 00000000000..9fcf0449c1d
--- /dev/null
+++ b/queue-5.4/epoll-atomically-remove-wait-entry-on-wake-up.patch
@@ -0,0 +1,178 @@
+From 412895f03cbf9633298111cb4dfde13b7720e2c5 Mon Sep 17 00:00:00 2001
+From: Roman Penyaev
+Date: Thu, 7 May 2020 18:36:16 -0700
+Subject: epoll: atomically remove wait entry on wake up
+
+From: Roman Penyaev
+
+commit 412895f03cbf9633298111cb4dfde13b7720e2c5 upstream.
+
+This patch does two things:
+
+ - fixes a lost wakeup introduced by commit 339ddb53d373 ("fs/epoll:
+   remove unnecessary wakeups of nested epoll")
+
+ - improves performance for events delivery.
+
+The description of the problem is the following: if N (>1) threads are
+waiting on ep->wq for new events and M (>1) events come, it is quite
+likely that >1 wakeups hit the same wait queue entry, because there is
+quite a big window between __add_wait_queue_exclusive() and the
+following __remove_wait_queue() calls in ep_poll() function.
+
+This can lead to lost wakeups, because the thread that was woken up may
+not handle all of the events in ->rdllist. (The problem is described in
+more detail here: https://lkml.org/lkml/2019/10/7/905)
+
+The idea of the current patch is to use init_wait() instead of
+init_waitqueue_entry().
+
+Internally init_wait() sets autoremove_wake_function as a callback,
+which removes the wait entry atomically (under the wq locks) from the
+list, so the next incoming wakeup hits the next wait entry in the wait
+queue, preventing lost wakeups.
+
+The problem is reliably reproduced by the epoll60 test case [1].
+
+Wait entry removal on wakeup also has performance benefits, because
+there is no need to take ep->lock and remove the wait entry from the
+queue after a successful wakeup.
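+
+For reference, the wake callback that init_wait() installs looks
+roughly like this (autoremove_wake_function() in kernel/sched/wait.c);
+the entry is deleted under the waitqueue lock, atomically with the
+wakeup itself, which is what lets the next wakeup reach the next
+waiter:
+
+    int autoremove_wake_function(struct wait_queue_entry *wq_entry,
+                                 unsigned mode, int sync, void *key)
+    {
+        int ret = default_wake_function(wq_entry, mode, sync, key);
+
+        /* woken successfully: dequeue so later wakeups pick another waiter */
+        if (ret)
+            list_del_init(&wq_entry->entry);
+        return ret;
+    }
+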
Here is the timing output of the epoll60 +test case: + + With explicit wakeup from ep_scan_ready_list() (the state of the + code prior 339ddb53d373): + + real 0m6.970s + user 0m49.786s + sys 0m0.113s + + After this patch: + + real 0m5.220s + user 0m36.879s + sys 0m0.019s + +The other testcase is the stress-epoll [2], where one thread consumes +all the events and other threads produce many events: + + With explicit wakeup from ep_scan_ready_list() (the state of the + code prior 339ddb53d373): + + threads events/ms run-time ms + 8 5427 1474 + 16 6163 2596 + 32 6824 4689 + 64 7060 9064 + 128 6991 18309 + + After this patch: + + threads events/ms run-time ms + 8 5598 1429 + 16 7073 2262 + 32 7502 4265 + 64 7640 8376 + 128 7634 16767 + + (number of "events/ms" represents event bandwidth, thus higher is + better; number of "run-time ms" represents overall time spent + doing the benchmark, thus lower is better) + +[1] tools/testing/selftests/filesystems/epoll/epoll_wakeup_test.c +[2] https://github.com/rouming/test-tools/blob/master/stress-epoll.c + +Signed-off-by: Roman Penyaev +Signed-off-by: Andrew Morton +Reviewed-by: Jason Baron +Cc: Khazhismel Kumykov +Cc: Alexander Viro +Cc: Heiher +Cc: +Link: http://lkml.kernel.org/r/20200430130326.1368509-2-rpenyaev@suse.de +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + fs/eventpoll.c | 43 ++++++++++++++++++++++++------------------- + 1 file changed, 24 insertions(+), 19 deletions(-) + +--- a/fs/eventpoll.c ++++ b/fs/eventpoll.c +@@ -1827,7 +1827,6 @@ static int ep_poll(struct eventpoll *ep, + { + int res = 0, eavail, timed_out = 0; + u64 slack = 0; +- bool waiter = false; + wait_queue_entry_t wait; + ktime_t expires, *to = NULL; + +@@ -1872,21 +1871,23 @@ fetch_events: + */ + ep_reset_busy_poll_napi_id(ep); + +- /* +- * We don't have any available event to return to the caller. We need +- * to sleep here, and we will be woken by ep_poll_callback() when events +- * become available. +- */ +- if (!waiter) { +- waiter = true; +- init_waitqueue_entry(&wait, current); +- ++ do { ++ /* ++ * Internally init_wait() uses autoremove_wake_function(), ++ * thus wait entry is removed from the wait queue on each ++ * wakeup. Why it is important? In case of several waiters ++ * each new wakeup will hit the next waiter, giving it the ++ * chance to harvest new event. Otherwise wakeup can be ++ * lost. This is also good performance-wise, because on ++ * normal wakeup path no need to call __remove_wait_queue() ++ * explicitly, thus ep->lock is not taken, which halts the ++ * event delivery. ++ */ ++ init_wait(&wait); + write_lock_irq(&ep->lock); + __add_wait_queue_exclusive(&ep->wq, &wait); + write_unlock_irq(&ep->lock); +- } + +- for (;;) { + /* + * We don't want to sleep if the ep_poll_callback() sends us + * a wakeup in between. That's why we set the task state +@@ -1916,10 +1917,20 @@ fetch_events: + timed_out = 1; + break; + } +- } ++ ++ /* We were woken up, thus go and try to harvest some events */ ++ eavail = 1; ++ ++ } while (0); + + __set_current_state(TASK_RUNNING); + ++ if (!list_empty_careful(&wait.entry)) { ++ write_lock_irq(&ep->lock); ++ __remove_wait_queue(&ep->wq, &wait); ++ write_unlock_irq(&ep->lock); ++ } ++ + send_events: + /* + * Try to transfer events to user space. 
In case we get 0 events and +@@ -1930,12 +1941,6 @@ send_events: + !(res = ep_send_events(ep, events, maxevents)) && !timed_out) + goto fetch_events; + +- if (waiter) { +- write_lock_irq(&ep->lock); +- __remove_wait_queue(&ep->wq, &wait); +- write_unlock_irq(&ep->lock); +- } +- + return res; + } + diff --git a/queue-5.4/eventpoll-fix-missing-wakeup-for-ovflist-in-ep_poll_callback.patch b/queue-5.4/eventpoll-fix-missing-wakeup-for-ovflist-in-ep_poll_callback.patch new file mode 100644 index 00000000000..ea3e7eaedbd --- /dev/null +++ b/queue-5.4/eventpoll-fix-missing-wakeup-for-ovflist-in-ep_poll_callback.patch @@ -0,0 +1,77 @@ +From 0c54a6a44bf3d41e76ce3f583a6ece267618df2e Mon Sep 17 00:00:00 2001 +From: Khazhismel Kumykov +Date: Thu, 7 May 2020 18:35:59 -0700 +Subject: eventpoll: fix missing wakeup for ovflist in ep_poll_callback + +From: Khazhismel Kumykov + +commit 0c54a6a44bf3d41e76ce3f583a6ece267618df2e upstream. + +In the event that we add to ovflist, before commit 339ddb53d373 +("fs/epoll: remove unnecessary wakeups of nested epoll") we would be +woken up by ep_scan_ready_list, and did no wakeup in ep_poll_callback. + +With that wakeup removed, if we add to ovflist here, we may never wake +up. Rather than adding back the ep_scan_ready_list wakeup - which was +resulting in unnecessary wakeups, trigger a wake-up in ep_poll_callback. + +We noticed that one of our workloads was missing wakeups starting with +339ddb53d373 and upon manual inspection, this wakeup seemed missing to me. +With this patch added, we no longer see missing wakeups. I haven't yet +tried to make a small reproducer, but the existing kselftests in +filesystem/epoll passed for me with this patch. + +[khazhy@google.com: use if/elif instead of goto + cleanup suggested by Roman] + Link: http://lkml.kernel.org/r/20200424190039.192373-1-khazhy@google.com +Fixes: 339ddb53d373 ("fs/epoll: remove unnecessary wakeups of nested epoll") +Signed-off-by: Khazhismel Kumykov +Signed-off-by: Andrew Morton +Reviewed-by: Roman Penyaev +Cc: Alexander Viro +Cc: Roman Penyaev +Cc: Heiher +Cc: Jason Baron +Cc: +Link: http://lkml.kernel.org/r/20200424025057.118641-1-khazhy@google.com +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + fs/eventpoll.c | 18 +++++++++--------- + 1 file changed, 9 insertions(+), 9 deletions(-) + +--- a/fs/eventpoll.c ++++ b/fs/eventpoll.c +@@ -1176,6 +1176,10 @@ static inline bool chain_epi_lockless(st + { + struct eventpoll *ep = epi->ep; + ++ /* Fast preliminary check */ ++ if (epi->next != EP_UNACTIVE_PTR) ++ return false; ++ + /* Check that the same epi has not been just chained from another CPU */ + if (cmpxchg(&epi->next, EP_UNACTIVE_PTR, NULL) != EP_UNACTIVE_PTR) + return false; +@@ -1242,16 +1246,12 @@ static int ep_poll_callback(wait_queue_e + * chained in ep->ovflist and requeued later on. + */ + if (READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR) { +- if (epi->next == EP_UNACTIVE_PTR && +- chain_epi_lockless(epi)) ++ if (chain_epi_lockless(epi)) ++ ep_pm_stay_awake_rcu(epi); ++ } else if (!ep_is_linked(epi)) { ++ /* In the usual case, add event to ready list. 
*/ ++ if (list_add_tail_lockless(&epi->rdllink, &ep->rdllist)) + ep_pm_stay_awake_rcu(epi); +- goto out_unlock; +- } +- +- /* If this file is already in the ready list we exit soon */ +- if (!ep_is_linked(epi) && +- list_add_tail_lockless(&epi->rdllink, &ep->rdllist)) { +- ep_pm_stay_awake_rcu(epi); + } + + /* diff --git a/queue-5.4/iocost-protect-iocg-abs_vdebt-with-iocg-waitq.lock.patch b/queue-5.4/iocost-protect-iocg-abs_vdebt-with-iocg-waitq.lock.patch new file mode 100644 index 00000000000..583091e76bd --- /dev/null +++ b/queue-5.4/iocost-protect-iocg-abs_vdebt-with-iocg-waitq.lock.patch @@ -0,0 +1,300 @@ +From 0b80f9866e6bbfb905140ed8787ff2af03652c0c Mon Sep 17 00:00:00 2001 +From: Tejun Heo +Date: Mon, 4 May 2020 19:27:54 -0400 +Subject: iocost: protect iocg->abs_vdebt with iocg->waitq.lock + +From: Tejun Heo + +commit 0b80f9866e6bbfb905140ed8787ff2af03652c0c upstream. + +abs_vdebt is an atomic_64 which tracks how much over budget a given cgroup +is and controls the activation of use_delay mechanism. Once a cgroup goes +over budget from forced IOs, it has to pay it back with its future budget. +The progress guarantee on debt paying comes from the iocg being active - +active iocgs are processed by the periodic timer, which ensures that as time +passes the debts dissipate and the iocg returns to normal operation. + +However, both iocg activation and vdebt handling are asynchronous and a +sequence like the following may happen. + +1. The iocg is in the process of being deactivated by the periodic timer. + +2. A bio enters ioc_rqos_throttle(), calls iocg_activate() which returns + without anything because it still sees that the iocg is already active. + +3. The iocg is deactivated. + +4. The bio from #2 is over budget but needs to be forced. It increases + abs_vdebt and goes over the threshold and enables use_delay. + +5. IO control is enabled for the iocg's subtree and now IOs are attributed + to the descendant cgroups and the iocg itself no longer issues IOs. + +This leaves the iocg with stuck abs_vdebt - it has debt but inactive and no +further IOs which can activate it. This can end up unduly punishing all the +descendants cgroups. + +The usual throttling path has the same issue - the iocg must be active while +throttled to ensure that future event will wake it up - and solves the +problem by synchronizing the throttling path with a spinlock. abs_vdebt +handling is another form of overage handling and shares a lot of +characteristics including the fact that it isn't in the hottest path. + +This patch fixes the above and other possible races by strictly +synchronizing abs_vdebt and use_delay handling with iocg->waitq.lock. 
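+
+In short, every site that touches abs_vdebt now follows this pattern
+(a sketch distilled from the hunks below, not literal kernel code):
+
+    spin_lock_irq(&iocg->waitq.lock);
+    if (!list_empty(&iocg->active_list)) {
+        /* holding waitq.lock keeps the iocg from being deactivated */
+        iocg->abs_vdebt += abs_cost;    /* plain u64 now, no atomics */
+        iocg_kick_delay(iocg, &now, cost);
+    } else {
+        /* unlikely: iocg already offline, just issue the IO */
+        iocg_commit_bio(iocg, bio, cost);
+    }
+    spin_unlock_irq(&iocg->waitq.lock);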
+ +Signed-off-by: Tejun Heo +Reported-by: Vlad Dmitriev +Cc: stable@vger.kernel.org # v5.4+ +Fixes: e1518f63f246 ("blk-iocost: Don't let merges push vtime into the future") +Signed-off-by: Jens Axboe +Signed-off-by: Greg Kroah-Hartman + +--- + block/blk-iocost.c | 117 ++++++++++++++++++++++++----------------- + tools/cgroup/iocost_monitor.py | 7 ++ + 2 files changed, 77 insertions(+), 47 deletions(-) + +--- a/block/blk-iocost.c ++++ b/block/blk-iocost.c +@@ -469,7 +469,7 @@ struct ioc_gq { + */ + atomic64_t vtime; + atomic64_t done_vtime; +- atomic64_t abs_vdebt; ++ u64 abs_vdebt; + u64 last_vtime; + + /* +@@ -1145,7 +1145,7 @@ static void iocg_kick_waitq(struct ioc_g + struct iocg_wake_ctx ctx = { .iocg = iocg }; + u64 margin_ns = (u64)(ioc->period_us * + WAITQ_TIMER_MARGIN_PCT / 100) * NSEC_PER_USEC; +- u64 abs_vdebt, vdebt, vshortage, expires, oexpires; ++ u64 vdebt, vshortage, expires, oexpires; + s64 vbudget; + u32 hw_inuse; + +@@ -1155,18 +1155,15 @@ static void iocg_kick_waitq(struct ioc_g + vbudget = now->vnow - atomic64_read(&iocg->vtime); + + /* pay off debt */ +- abs_vdebt = atomic64_read(&iocg->abs_vdebt); +- vdebt = abs_cost_to_cost(abs_vdebt, hw_inuse); ++ vdebt = abs_cost_to_cost(iocg->abs_vdebt, hw_inuse); + if (vdebt && vbudget > 0) { + u64 delta = min_t(u64, vbudget, vdebt); + u64 abs_delta = min(cost_to_abs_cost(delta, hw_inuse), +- abs_vdebt); ++ iocg->abs_vdebt); + + atomic64_add(delta, &iocg->vtime); + atomic64_add(delta, &iocg->done_vtime); +- atomic64_sub(abs_delta, &iocg->abs_vdebt); +- if (WARN_ON_ONCE(atomic64_read(&iocg->abs_vdebt) < 0)) +- atomic64_set(&iocg->abs_vdebt, 0); ++ iocg->abs_vdebt -= abs_delta; + } + + /* +@@ -1222,12 +1219,18 @@ static bool iocg_kick_delay(struct ioc_g + u64 expires, oexpires; + u32 hw_inuse; + ++ lockdep_assert_held(&iocg->waitq.lock); ++ + /* debt-adjust vtime */ + current_hweight(iocg, NULL, &hw_inuse); +- vtime += abs_cost_to_cost(atomic64_read(&iocg->abs_vdebt), hw_inuse); ++ vtime += abs_cost_to_cost(iocg->abs_vdebt, hw_inuse); + +- /* clear or maintain depending on the overage */ +- if (time_before_eq64(vtime, now->vnow)) { ++ /* ++ * Clear or maintain depending on the overage. Non-zero vdebt is what ++ * guarantees that @iocg is online and future iocg_kick_delay() will ++ * clear use_delay. Don't leave it on when there's no vdebt. ++ */ ++ if (!iocg->abs_vdebt || time_before_eq64(vtime, now->vnow)) { + blkcg_clear_delay(blkg); + return false; + } +@@ -1261,9 +1264,12 @@ static enum hrtimer_restart iocg_delay_t + { + struct ioc_gq *iocg = container_of(timer, struct ioc_gq, delay_timer); + struct ioc_now now; ++ unsigned long flags; + ++ spin_lock_irqsave(&iocg->waitq.lock, flags); + ioc_now(iocg->ioc, &now); + iocg_kick_delay(iocg, &now, 0); ++ spin_unlock_irqrestore(&iocg->waitq.lock, flags); + + return HRTIMER_NORESTART; + } +@@ -1371,14 +1377,13 @@ static void ioc_timer_fn(struct timer_li + * should have woken up in the last period and expire idle iocgs. 
+	 */
+	list_for_each_entry_safe(iocg, tiocg, &ioc->active_iocgs, active_list) {
+-		if (!waitqueue_active(&iocg->waitq) &&
+-		    !atomic64_read(&iocg->abs_vdebt) && !iocg_is_idle(iocg))
++		if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt &&
++		    !iocg_is_idle(iocg))
+ 			continue;
+ 
+ 		spin_lock(&iocg->waitq.lock);
+ 
+-		if (waitqueue_active(&iocg->waitq) ||
+-		    atomic64_read(&iocg->abs_vdebt)) {
++		if (waitqueue_active(&iocg->waitq) || iocg->abs_vdebt) {
+ 			/* might be oversleeping vtime / hweight changes, kick */
+ 			iocg_kick_waitq(iocg, &now);
+ 			iocg_kick_delay(iocg, &now, 0);
+@@ -1721,28 +1726,49 @@ static void ioc_rqos_throttle(struct rq_
+ 	 * tests are racy but the races aren't systemic - we only miss once
+ 	 * in a while which is fine.
+ 	 */
+-	if (!waitqueue_active(&iocg->waitq) &&
+-	    !atomic64_read(&iocg->abs_vdebt) &&
++	if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt &&
+ 	    time_before_eq64(vtime + cost, now.vnow)) {
+ 		iocg_commit_bio(iocg, bio, cost);
+ 		return;
+ 	}
+ 
+ 	/*
+-	 * We're over budget. If @bio has to be issued regardless,
+-	 * remember the abs_cost instead of advancing vtime.
+-	 * iocg_kick_waitq() will pay off the debt before waking more IOs.
++	 * We activated above but w/o any synchronization. Deactivation is
++	 * synchronized with waitq.lock and we won't get deactivated as long
++	 * as we're waiting or has debt, so we're good if we're activated
++	 * here. In the unlikely case that we aren't, just issue the IO.
++	 */
++	spin_lock_irq(&iocg->waitq.lock);
++
++	if (unlikely(list_empty(&iocg->active_list))) {
++		spin_unlock_irq(&iocg->waitq.lock);
++		iocg_commit_bio(iocg, bio, cost);
++		return;
++	}
++
++	/*
++	 * We're over budget. If @bio has to be issued regardless, remember
++	 * the abs_cost instead of advancing vtime. iocg_kick_waitq() will pay
++	 * off the debt before waking more IOs.
++	 *
+ 	 * This way, the debt is continuously paid off each period with the
+-	 * actual budget available to the cgroup. If we just wound vtime,
+-	 * we would incorrectly use the current hw_inuse for the entire
+-	 * amount which, for example, can lead to the cgroup staying
+-	 * blocked for a long time even with substantially raised hw_inuse.
++	 * actual budget available to the cgroup. If we just wound vtime, we
++	 * would incorrectly use the current hw_inuse for the entire amount
++	 * which, for example, can lead to the cgroup staying blocked for a
++	 * long time even with substantially raised hw_inuse.
++	 *
++	 * An iocg with vdebt should stay online so that the timer can keep
++	 * deducting its vdebt and [de]activate use_delay mechanism
++	 * accordingly. We don't want to race against the timer trying to
++	 * clear them and leave @iocg inactive w/ dangling use_delay heavily
++	 * penalizing the cgroup and its descendants.
+ 	 */
+ 	if (bio_issue_as_root_blkg(bio) || fatal_signal_pending(current)) {
+-		atomic64_add(abs_cost, &iocg->abs_vdebt);
++		iocg->abs_vdebt += abs_cost;
+ 		if (iocg_kick_delay(iocg, &now, cost))
+ 			blkcg_schedule_throttle(rqos->q,
+ 					(bio->bi_opf & REQ_SWAP) == REQ_SWAP);
++		spin_unlock_irq(&iocg->waitq.lock);
+ 		return;
+ 	}
+ 
+@@ -1759,20 +1785,6 @@ static void ioc_rqos_throttle(struct rq_
+ 	 * All waiters are on iocg->waitq and the wait states are
+ 	 * synchronized using waitq.lock.
+ 	 */
+-	spin_lock_irq(&iocg->waitq.lock);
+-
+-	/*
+-	 * We activated above but w/o any synchronization. Deactivation is
+-	 * synchronized with waitq.lock and we won't get deactivated as
+-	 * long as we're waiting, so we're good if we're activated here. 
+- * In the unlikely case that we are deactivated, just issue the IO. +- */ +- if (unlikely(list_empty(&iocg->active_list))) { +- spin_unlock_irq(&iocg->waitq.lock); +- iocg_commit_bio(iocg, bio, cost); +- return; +- } +- + init_waitqueue_func_entry(&wait.wait, iocg_wake_fn); + wait.wait.private = current; + wait.bio = bio; +@@ -1804,6 +1816,7 @@ static void ioc_rqos_merge(struct rq_qos + struct ioc_now now; + u32 hw_inuse; + u64 abs_cost, cost; ++ unsigned long flags; + + /* bypass if disabled or for root cgroup */ + if (!ioc->enabled || !iocg->level) +@@ -1823,15 +1836,28 @@ static void ioc_rqos_merge(struct rq_qos + iocg->cursor = bio_end; + + /* +- * Charge if there's enough vtime budget and the existing request +- * has cost assigned. Otherwise, account it as debt. See debt +- * handling in ioc_rqos_throttle() for details. ++ * Charge if there's enough vtime budget and the existing request has ++ * cost assigned. + */ + if (rq->bio && rq->bio->bi_iocost_cost && +- time_before_eq64(atomic64_read(&iocg->vtime) + cost, now.vnow)) ++ time_before_eq64(atomic64_read(&iocg->vtime) + cost, now.vnow)) { + iocg_commit_bio(iocg, bio, cost); +- else +- atomic64_add(abs_cost, &iocg->abs_vdebt); ++ return; ++ } ++ ++ /* ++ * Otherwise, account it as debt if @iocg is online, which it should ++ * be for the vast majority of cases. See debt handling in ++ * ioc_rqos_throttle() for details. ++ */ ++ spin_lock_irqsave(&iocg->waitq.lock, flags); ++ if (likely(!list_empty(&iocg->active_list))) { ++ iocg->abs_vdebt += abs_cost; ++ iocg_kick_delay(iocg, &now, cost); ++ } else { ++ iocg_commit_bio(iocg, bio, cost); ++ } ++ spin_unlock_irqrestore(&iocg->waitq.lock, flags); + } + + static void ioc_rqos_done_bio(struct rq_qos *rqos, struct bio *bio) +@@ -2001,7 +2027,6 @@ static void ioc_pd_init(struct blkg_poli + iocg->ioc = ioc; + atomic64_set(&iocg->vtime, now.vnow); + atomic64_set(&iocg->done_vtime, now.vnow); +- atomic64_set(&iocg->abs_vdebt, 0); + atomic64_set(&iocg->active_period, atomic64_read(&ioc->cur_period)); + INIT_LIST_HEAD(&iocg->active_list); + iocg->hweight_active = HWEIGHT_WHOLE; +--- a/tools/cgroup/iocost_monitor.py ++++ b/tools/cgroup/iocost_monitor.py +@@ -159,7 +159,12 @@ class IocgStat: + else: + self.inflight_pct = 0 + +- self.debt_ms = iocg.abs_vdebt.counter.value_() / VTIME_PER_USEC / 1000 ++ # vdebt used to be an atomic64_t and is now u64, support both ++ try: ++ self.debt_ms = iocg.abs_vdebt.counter.value_() / VTIME_PER_USEC / 1000 ++ except: ++ self.debt_ms = iocg.abs_vdebt.value_() / VTIME_PER_USEC / 1000 ++ + self.use_delay = blkg.use_delay.counter.value_() + self.delay_ms = blkg.delay_nsec.counter.value_() / 1_000_000 + diff --git a/queue-5.4/ipc-mqueue.c-change-__do_notify-to-bypass-check_kill_permission.patch b/queue-5.4/ipc-mqueue.c-change-__do_notify-to-bypass-check_kill_permission.patch new file mode 100644 index 00000000000..0b7e7469df6 --- /dev/null +++ b/queue-5.4/ipc-mqueue.c-change-__do_notify-to-bypass-check_kill_permission.patch @@ -0,0 +1,149 @@ +From b5f2006144c6ae941726037120fa1001ddede784 Mon Sep 17 00:00:00 2001 +From: Oleg Nesterov +Date: Thu, 7 May 2020 18:35:39 -0700 +Subject: ipc/mqueue.c: change __do_notify() to bypass check_kill_permission() + +From: Oleg Nesterov + +commit b5f2006144c6ae941726037120fa1001ddede784 upstream. + +Commit cc731525f26a ("signal: Remove kernel interal si_code magic") +changed the value of SI_FROMUSER(SI_MESGQ), this means that mq_notify() no +longer works if the sender doesn't have rights to send a signal. 
+ +Change __do_notify() to use do_send_sig_info() instead of kill_pid_info() +to avoid check_kill_permission(). + +This needs the additional notify.sigev_signo != 0 check, shouldn't we +change do_mq_notify() to deny sigev_signo == 0 ? + +Test-case: + + #include + #include + #include + #include + #include + + static int notified; + + static void sigh(int sig) + { + notified = 1; + } + + int main(void) + { + signal(SIGIO, sigh); + + int fd = mq_open("/mq", O_RDWR|O_CREAT, 0666, NULL); + assert(fd >= 0); + + struct sigevent se = { + .sigev_notify = SIGEV_SIGNAL, + .sigev_signo = SIGIO, + }; + assert(mq_notify(fd, &se) == 0); + + if (!fork()) { + assert(setuid(1) == 0); + mq_send(fd, "",1,0); + return 0; + } + + wait(NULL); + mq_unlink("/mq"); + assert(notified); + return 0; + } + +[manfred@colorfullife.com: 1) Add self_exec_id evaluation so that the implementation matches do_notify_parent 2) use PIDTYPE_TGID everywhere] +Fixes: cc731525f26a ("signal: Remove kernel interal si_code magic") +Reported-by: Yoji +Signed-off-by: Oleg Nesterov +Signed-off-by: Manfred Spraul +Signed-off-by: Andrew Morton +Acked-by: "Eric W. Biederman" +Cc: Davidlohr Bueso +Cc: Markus Elfring +Cc: <1vier1@web.de> +Cc: +Link: http://lkml.kernel.org/r/e2a782e4-eab9-4f5c-c749-c07a8f7a4e66@colorfullife.com +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + ipc/mqueue.c | 34 ++++++++++++++++++++++++++-------- + 1 file changed, 26 insertions(+), 8 deletions(-) + +--- a/ipc/mqueue.c ++++ b/ipc/mqueue.c +@@ -82,6 +82,7 @@ struct mqueue_inode_info { + + struct sigevent notify; + struct pid *notify_owner; ++ u32 notify_self_exec_id; + struct user_namespace *notify_user_ns; + struct user_struct *user; /* user who created, for accounting */ + struct sock *notify_sock; +@@ -709,28 +710,44 @@ static void __do_notify(struct mqueue_in + * synchronously. */ + if (info->notify_owner && + info->attr.mq_curmsgs == 1) { +- struct kernel_siginfo sig_i; + switch (info->notify.sigev_notify) { + case SIGEV_NONE: + break; +- case SIGEV_SIGNAL: +- /* sends signal */ ++ case SIGEV_SIGNAL: { ++ struct kernel_siginfo sig_i; ++ struct task_struct *task; ++ ++ /* do_mq_notify() accepts sigev_signo == 0, why?? */ ++ if (!info->notify.sigev_signo) ++ break; + + clear_siginfo(&sig_i); + sig_i.si_signo = info->notify.sigev_signo; + sig_i.si_errno = 0; + sig_i.si_code = SI_MESGQ; + sig_i.si_value = info->notify.sigev_value; +- /* map current pid/uid into info->owner's namespaces */ + rcu_read_lock(); ++ /* map current pid/uid into info->owner's namespaces */ + sig_i.si_pid = task_tgid_nr_ns(current, + ns_of_pid(info->notify_owner)); +- sig_i.si_uid = from_kuid_munged(info->notify_user_ns, current_uid()); ++ sig_i.si_uid = from_kuid_munged(info->notify_user_ns, ++ current_uid()); ++ /* ++ * We can't use kill_pid_info(), this signal should ++ * bypass check_kill_permission(). It is from kernel ++ * but si_fromuser() can't know this. ++ * We do check the self_exec_id, to avoid sending ++ * signals to programs that don't expect them. 
++ */ ++ task = pid_task(info->notify_owner, PIDTYPE_TGID); ++ if (task && task->self_exec_id == ++ info->notify_self_exec_id) { ++ do_send_sig_info(info->notify.sigev_signo, ++ &sig_i, task, PIDTYPE_TGID); ++ } + rcu_read_unlock(); +- +- kill_pid_info(info->notify.sigev_signo, +- &sig_i, info->notify_owner); + break; ++ } + case SIGEV_THREAD: + set_cookie(info->notify_cookie, NOTIFY_WOKENUP); + netlink_sendskb(info->notify_sock, info->notify_cookie); +@@ -1315,6 +1332,7 @@ retry: + info->notify.sigev_signo = notification->sigev_signo; + info->notify.sigev_value = notification->sigev_value; + info->notify.sigev_notify = SIGEV_SIGNAL; ++ info->notify_self_exec_id = current->self_exec_id; + break; + } + diff --git a/queue-5.4/mm-limit-boost_watermark-on-small-zones.patch b/queue-5.4/mm-limit-boost_watermark-on-small-zones.patch new file mode 100644 index 00000000000..eabe7f83f90 --- /dev/null +++ b/queue-5.4/mm-limit-boost_watermark-on-small-zones.patch @@ -0,0 +1,82 @@ +From 14f69140ff9c92a0928547ceefb153a842e8492c Mon Sep 17 00:00:00 2001 +From: Henry Willard +Date: Thu, 7 May 2020 18:36:27 -0700 +Subject: mm: limit boost_watermark on small zones + +From: Henry Willard + +commit 14f69140ff9c92a0928547ceefb153a842e8492c upstream. + +Commit 1c30844d2dfe ("mm: reclaim small amounts of memory when an +external fragmentation event occurs") adds a boost_watermark() function +which increases the min watermark in a zone by at least +pageblock_nr_pages or the number of pages in a page block. + +On Arm64, with 64K pages and 512M huge pages, this is 8192 pages or +512M. It does this regardless of the number of managed pages managed in +the zone or the likelihood of success. + +This can put the zone immediately under water in terms of allocating +pages from the zone, and can cause a small machine to fail immediately +due to OoM. Unlike set_recommended_min_free_kbytes(), which +substantially increases min_free_kbytes and is tied to THP, +boost_watermark() can be called even if THP is not active. + +The problem is most likely to appear on architectures such as Arm64 +where pageblock_nr_pages is very large. + +It is desirable to run the kdump capture kernel in as small a space as +possible to avoid wasting memory. In some architectures, such as Arm64, +there are restrictions on where the capture kernel can run, and +therefore, the space available. A capture kernel running in 768M can +fail due to OoM immediately after boost_watermark() sets the min in zone +DMA32, where most of the memory is, to 512M. It fails even though there +is over 500M of free memory. With boost_watermark() suppressed, the +capture kernel can run successfully in 448M. + +This patch limits boost_watermark() to boosting a zone's min watermark +only when there are enough pages that the boost will produce positive +results. In this case that is estimated to be four times as many pages +as pageblock_nr_pages. + +Mel said: + +: There is no harm in marking it stable. Clearly it does not happen very +: often but it's not impossible. 32-bit x86 is a lot less common now +: which would previously have been vulnerable to triggering this easily. +: ppc64 has a larger base page size but typically only has one zone. +: arm64 is likely the most vulnerable, particularly when CMA is +: configured with a small movable zone. 
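+
+To make the new cutoff concrete with the Arm64 example above (64K base
+pages, 512M pageblocks; illustrative arithmetic, not from the patch):
+
+    pageblock_nr_pages = 512M / 64K = 8192 pages
+    boost cutoff       = 4 * 8192   = 32768 pages (2G)
+
+    768M zone: 12288 managed pages  < 32768 -> boost skipped
+    4G zone:   65536 managed pages >= 32768 -> boost allowed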
+ +Fixes: 1c30844d2dfe ("mm: reclaim small amounts of memory when an external fragmentation event occurs") +Signed-off-by: Henry Willard +Signed-off-by: Andrew Morton +Reviewed-by: David Hildenbrand +Acked-by: Mel Gorman +Cc: Vlastimil Babka +Cc: +Link: http://lkml.kernel.org/r/1588294148-6586-1-git-send-email-henry.willard@oracle.com +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + mm/page_alloc.c | 8 ++++++++ + 1 file changed, 8 insertions(+) + +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -2351,6 +2351,14 @@ static inline void boost_watermark(struc + + if (!watermark_boost_factor) + return; ++ /* ++ * Don't bother in zones that are unlikely to produce results. ++ * On small machines, including kdump capture kernels running ++ * in a small area, boosting the watermark can cause an out of ++ * memory situation immediately. ++ */ ++ if ((pageblock_nr_pages * 4) > zone_managed_pages(zone)) ++ return; + + max_boost = mult_frac(zone->_watermark[WMARK_HIGH], + watermark_boost_factor, 10000); diff --git a/queue-5.4/mm-page_alloc-fix-watchdog-soft-lockups-during-set_zone_contiguous.patch b/queue-5.4/mm-page_alloc-fix-watchdog-soft-lockups-during-set_zone_contiguous.patch new file mode 100644 index 00000000000..5e9c5951118 --- /dev/null +++ b/queue-5.4/mm-page_alloc-fix-watchdog-soft-lockups-during-set_zone_contiguous.patch @@ -0,0 +1,62 @@ +From e84fe99b68ce353c37ceeecc95dce9696c976556 Mon Sep 17 00:00:00 2001 +From: David Hildenbrand +Date: Thu, 7 May 2020 18:35:46 -0700 +Subject: mm/page_alloc: fix watchdog soft lockups during set_zone_contiguous() + +From: David Hildenbrand + +commit e84fe99b68ce353c37ceeecc95dce9696c976556 upstream. + +Without CONFIG_PREEMPT, it can happen that we get soft lockups detected, +e.g., while booting up. + + watchdog: BUG: soft lockup - CPU#0 stuck for 22s! [swapper/0:1] + CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.6.0-next-20200331+ #4 + Hardware name: Red Hat KVM, BIOS 1.11.1-4.module+el8.1.0+4066+0f1aadab 04/01/2014 + RIP: __pageblock_pfn_to_page+0x134/0x1c0 + Call Trace: + set_zone_contiguous+0x56/0x70 + page_alloc_init_late+0x166/0x176 + kernel_init_freeable+0xfa/0x255 + kernel_init+0xa/0x106 + ret_from_fork+0x35/0x40 + +The issue becomes visible when having a lot of memory (e.g., 4TB) +assigned to a single NUMA node - a system that can easily be created +using QEMU. Inside VMs on a hypervisor with quite some memory +overcommit, this is fairly easy to trigger. 
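+
+A sketch of the patched loop in set_zone_contiguous() (see the hunk
+below); the only change is the reschedule point once per pageblock:
+
+    for (; block_start_pfn < zone_end_pfn(zone);
+         block_start_pfn = block_end_pfn,
+         block_end_pfn += pageblock_nr_pages) {
+        block_end_pfn = min(block_end_pfn, zone_end_pfn(zone));
+        if (!__pageblock_pfn_to_page(block_start_pfn,
+                                     block_end_pfn, zone))
+            return;
+        cond_resched();    /* the fix: give the watchdog a chance */
+    }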
+
+Signed-off-by: David Hildenbrand
+Signed-off-by: Andrew Morton
+Reviewed-by: Pavel Tatashin
+Reviewed-by: Pankaj Gupta
+Reviewed-by: Baoquan He
+Reviewed-by: Shile Zhang
+Acked-by: Michal Hocko
+Cc: Kirill Tkhai
+Cc: Shile Zhang
+Cc: Pavel Tatashin
+Cc: Daniel Jordan
+Cc: Michal Hocko
+Cc: Alexander Duyck
+Cc: Baoquan He
+Cc: Oscar Salvador
+Cc:
+Link: http://lkml.kernel.org/r/20200416073417.5003-1-david@redhat.com
+Signed-off-by: Linus Torvalds
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ mm/page_alloc.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/mm/page_alloc.c
++++ b/mm/page_alloc.c
+@@ -1555,6 +1555,7 @@ void set_zone_contiguous(struct zone *zo
+ 		if (!__pageblock_pfn_to_page(block_start_pfn,
+ 					     block_end_pfn, zone))
+ 			return;
++		cond_resched();
+ 	}
+ 
+ 	/* We confirm that there is no hole */
diff --git a/queue-5.4/riscv-set-max_pfn-to-the-pfn-of-the-last-page.patch b/queue-5.4/riscv-set-max_pfn-to-the-pfn-of-the-last-page.patch
new file mode 100644
index 00000000000..1960d4c4d32
--- /dev/null
+++ b/queue-5.4/riscv-set-max_pfn-to-the-pfn-of-the-last-page.patch
@@ -0,0 +1,70 @@
+From c749bb2d554825e007cbc43b791f54e124dadfce Mon Sep 17 00:00:00 2001
+From: Vincent Chen
+Date: Mon, 27 Apr 2020 14:59:24 +0800
+Subject: riscv: set max_pfn to the PFN of the last page
+
+From: Vincent Chen
+
+commit c749bb2d554825e007cbc43b791f54e124dadfce upstream.
+
+Currently, max_pfn equals zero. I found that this prevents users from
+getting some page information through /proc, such as kpagecount, in the
+v5.6 kernel because of new sanity checks. The following message is
+displayed by the stress-ng test suite with the command "stress-ng
+--verbose --physpage 1 -t 1" on a HiFive Unleashed board.
+
+ # stress-ng --verbose --physpage 1 -t 1
+ stress-ng: debug: [109] 4 processors online, 4 processors configured
+ stress-ng: info: [109] dispatching hogs: 1 physpage
+ stress-ng: debug: [109] cache allocate: reducing cache level from L3 (too high) to L0
+ stress-ng: debug: [109] get_cpu_cache: invalid cache_level: 0
+ stress-ng: info: [109] cache allocate: using built-in defaults as no suitable cache found
+ stress-ng: debug: [109] cache allocate: default cache size: 2048K
+ stress-ng: debug: [109] starting stressors
+ stress-ng: debug: [109] 1 stressor spawned
+ stress-ng: debug: [110] stress-ng-physpage: started [110] (instance 0)
+ stress-ng: error: [110] stress-ng-physpage: cannot read page count for address 0x3fd34de000 in /proc/kpagecount, errno=0 (Success)
+ stress-ng: error: [110] stress-ng-physpage: cannot read page count for address 0x3fd32db078 in /proc/kpagecount, errno=0 (Success)
+ ...
+ stress-ng: error: [110] stress-ng-physpage: cannot read page count for address 0x3fd32db078 in /proc/kpagecount, errno=0 (Success)
+ stress-ng: debug: [110] stress-ng-physpage: exited [110] (instance 0)
+ stress-ng: debug: [109] process [110] terminated
+ stress-ng: info: [109] successful run completed in 1.00s
+ #
+
+After applying this patch, the kernel can pass the test.
+ + # stress-ng --verbose --physpage 1 -t 1 + stress-ng: debug: [104] 4 processors online, 4 processors configured stress-ng: info: [104] dispatching hogs: 1 physpage + stress-ng: info: [104] cache allocate: using defaults, can't determine cache details from sysfs + stress-ng: debug: [104] cache allocate: default cache size: 2048K + stress-ng: debug: [104] starting stressors + stress-ng: debug: [104] 1 stressor spawned + stress-ng: debug: [105] stress-ng-physpage: started [105] (instance 0) stress-ng: debug: [105] stress-ng-physpage: exited [105] (instance 0) stress-ng: debug: [104] process [105] terminated + stress-ng: info: [104] successful run completed in 1.01s + # + +Cc: stable@vger.kernel.org +Signed-off-by: Vincent Chen +Reviewed-by: Anup Patel +Reviewed-by: Yash Shah +Tested-by: Yash Shah +Signed-off-by: Palmer Dabbelt +Signed-off-by: Greg Kroah-Hartman + +--- + arch/riscv/mm/init.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +--- a/arch/riscv/mm/init.c ++++ b/arch/riscv/mm/init.c +@@ -116,7 +116,8 @@ void __init setup_bootmem(void) + memblock_reserve(vmlinux_start, vmlinux_end - vmlinux_start); + + set_max_mapnr(PFN_DOWN(mem_size)); +- max_low_pfn = PFN_DOWN(memblock_end_of_DRAM()); ++ max_pfn = PFN_DOWN(memblock_end_of_DRAM()); ++ max_low_pfn = max_pfn; + + #ifdef CONFIG_BLK_DEV_INITRD + setup_initrd(); diff --git a/queue-5.4/series b/queue-5.4/series index a9e44c4f080..444b81594bc 100644 --- a/queue-5.4/series +++ b/queue-5.4/series @@ -52,3 +52,16 @@ kvm-s390-remove-false-warn_on_once-for-the-pqap-instruction.patch kvm-vmx-explicitly-clear-rflags.cf-and-rflags.zf-in-vm-exit-rsb-path.patch kvm-arm-vgic-fix-limit-condition-when-writing-to-gicd_iactiver.patch kvm-arm64-fix-32bit-pc-wrap-around.patch +arm64-hugetlb-avoid-potential-null-dereference.patch +drm-ingenic-drm-add-module_device_table.patch +ipc-mqueue.c-change-__do_notify-to-bypass-check_kill_permission.patch +epoll-atomically-remove-wait-entry-on-wake-up.patch +eventpoll-fix-missing-wakeup-for-ovflist-in-ep_poll_callback.patch +mm-page_alloc-fix-watchdog-soft-lockups-during-set_zone_contiguous.patch +mm-limit-boost_watermark-on-small-zones.patch +ceph-fix-endianness-bug-when-handling-mds-session-feature-bits.patch +ceph-demote-quotarealm-lookup-warning-to-a-debug-message.patch +staging-gasket-check-the-return-value-of-gasket_get_bar_index.patch +coredump-fix-crash-when-umh-is-disabled.patch +riscv-set-max_pfn-to-the-pfn-of-the-last-page.patch +iocost-protect-iocg-abs_vdebt-with-iocg-waitq.lock.patch diff --git a/queue-5.4/staging-gasket-check-the-return-value-of-gasket_get_bar_index.patch b/queue-5.4/staging-gasket-check-the-return-value-of-gasket_get_bar_index.patch new file mode 100644 index 00000000000..6144b7633a9 --- /dev/null +++ b/queue-5.4/staging-gasket-check-the-return-value-of-gasket_get_bar_index.patch @@ -0,0 +1,39 @@ +From 769acc3656d93aaacada814939743361d284fd87 Mon Sep 17 00:00:00 2001 +From: Oscar Carter +Date: Fri, 1 May 2020 17:51:18 +0200 +Subject: staging: gasket: Check the return value of gasket_get_bar_index() + +From: Oscar Carter + +commit 769acc3656d93aaacada814939743361d284fd87 upstream. + +Check the return value of gasket_get_bar_index function as it can return +a negative one (-EINVAL). If this happens, a negative index is used in +the "gasket_dev->bar_data" array. 
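+
+The guarded sequence in do_map_region() then reads, in sketch form
+(mirroring the hunk below):
+
+    bar_index = gasket_get_bar_index(gasket_dev, ...);
+    if (bar_index < 0)
+        /* -EINVAL: never use it to index bar_data[] */
+        return DO_MAP_REGION_INVALID;
+    phys_base = gasket_dev->bar_data[bar_index].phys_base + phys_offset;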
+ +Addresses-Coverity-ID: 1438542 ("Negative array index read") +Fixes: 9a69f5087ccc2 ("drivers/staging: Gasket driver framework + Apex driver") +Signed-off-by: Oscar Carter +Cc: stable +Reviewed-by: Richard Yeh +Link: https://lore.kernel.org/r/20200501155118.13380-1-oscar.carter@gmx.com +Signed-off-by: Greg Kroah-Hartman +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/staging/gasket/gasket_core.c | 4 ++++ + 1 file changed, 4 insertions(+) + +--- a/drivers/staging/gasket/gasket_core.c ++++ b/drivers/staging/gasket/gasket_core.c +@@ -926,6 +926,10 @@ do_map_region(const struct gasket_dev *g + gasket_get_bar_index(gasket_dev, + (vma->vm_pgoff << PAGE_SHIFT) + + driver_desc->legacy_mmap_address_offset); ++ ++ if (bar_index < 0) ++ return DO_MAP_REGION_INVALID; ++ + phys_base = gasket_dev->bar_data[bar_index].phys_base + phys_offset; + while (mapped_bytes < map_length) { + /* -- 2.47.3