From d9b1b45f117b73adf31135b1ddce814a605588c0 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Mon, 25 Apr 2022 12:27:43 +0200 Subject: [PATCH] 5.15-stable patches added patches: ata-pata_marvell-check-the-bmdma_addr-beforing-reading.patch dma-at_xdmac-fix-a-missing-check-on-list-iterator.patch dmaengine-imx-sdma-fix-init-of-uart-scripts.patch edac-synopsys-read-the-error-count-from-the-correct-register.patch memcg-sync-flush-only-if-periodic-flush-is-delayed.patch mm-hugetlb-allow-for-high-userspace-addresses.patch mm-memory-failure.c-skip-huge_zero_page-in-memory_failure.patch mm-mmu_notifier.c-fix-race-in-mmu_interval_notifier_remove.patch net-atlantic-invert-deep-par-in-pm-functions-preventing-null-derefs.patch oom_kill.c-futex-delay-the-oom-reaper-to-allow-time-for-proper-futex-cleanup.patch --- ...heck-the-bmdma_addr-beforing-reading.patch | 40 ++++ ...fix-a-missing-check-on-list-iterator.patch | 57 +++++ ...ne-imx-sdma-fix-init-of-uart-scripts.patch | 103 +++++++++ ...rror-count-from-the-correct-register.patch | 61 ++++++ ...sh-only-if-periodic-flush-is-delayed.patch | 130 +++++++++++ ...b-allow-for-high-userspace-addresses.patch | 145 ++++++++++++ ...kip-huge_zero_page-in-memory_failure.patch | 95 ++++++++ ...race-in-mmu_interval_notifier_remove.patch | 84 +++++++ ...-pm-functions-preventing-null-derefs.patch | 95 ++++++++ ...-allow-time-for-proper-futex-cleanup.patch | 206 ++++++++++++++++++ queue-5.15/series | 10 + 11 files changed, 1026 insertions(+) create mode 100644 queue-5.15/ata-pata_marvell-check-the-bmdma_addr-beforing-reading.patch create mode 100644 queue-5.15/dma-at_xdmac-fix-a-missing-check-on-list-iterator.patch create mode 100644 queue-5.15/dmaengine-imx-sdma-fix-init-of-uart-scripts.patch create mode 100644 queue-5.15/edac-synopsys-read-the-error-count-from-the-correct-register.patch create mode 100644 queue-5.15/memcg-sync-flush-only-if-periodic-flush-is-delayed.patch create mode 100644 queue-5.15/mm-hugetlb-allow-for-high-userspace-addresses.patch create mode 100644 queue-5.15/mm-memory-failure.c-skip-huge_zero_page-in-memory_failure.patch create mode 100644 queue-5.15/mm-mmu_notifier.c-fix-race-in-mmu_interval_notifier_remove.patch create mode 100644 queue-5.15/net-atlantic-invert-deep-par-in-pm-functions-preventing-null-derefs.patch create mode 100644 queue-5.15/oom_kill.c-futex-delay-the-oom-reaper-to-allow-time-for-proper-futex-cleanup.patch diff --git a/queue-5.15/ata-pata_marvell-check-the-bmdma_addr-beforing-reading.patch b/queue-5.15/ata-pata_marvell-check-the-bmdma_addr-beforing-reading.patch new file mode 100644 index 00000000000..c79f199f64d --- /dev/null +++ b/queue-5.15/ata-pata_marvell-check-the-bmdma_addr-beforing-reading.patch @@ -0,0 +1,40 @@ +From aafa9f958342db36c17ac2a7f1b841032c96feb4 Mon Sep 17 00:00:00 2001 +From: Zheyu Ma +Date: Thu, 21 Apr 2022 09:39:20 +0800 +Subject: ata: pata_marvell: Check the 'bmdma_addr' beforing reading + +From: Zheyu Ma + +commit aafa9f958342db36c17ac2a7f1b841032c96feb4 upstream. 
+
+Before detecting the cable type on the dma bar, the driver should check
+whether the 'bmdma_addr' is zero, which means the adapter does not
+support DMA; otherwise we will get the following error:
+
+[ 5.146634] Bad IO access at port 0x1 (return inb(port))
+[ 5.147206] WARNING: CPU: 2 PID: 303 at lib/iomap.c:44 ioread8+0x4a/0x60
+[ 5.150856] RIP: 0010:ioread8+0x4a/0x60
+[ 5.160238] Call Trace:
+[ 5.160470]
+[ 5.160674]  marvell_cable_detect+0x6e/0xc0 [pata_marvell]
+[ 5.161728]  ata_eh_recover+0x3520/0x6cc0
+[ 5.168075]  ata_do_eh+0x49/0x3c0
+
+Signed-off-by: Zheyu Ma
+Signed-off-by: Damien Le Moal
+Signed-off-by: Greg Kroah-Hartman
+---
+ drivers/ata/pata_marvell.c | 2 ++
+ 1 file changed, 2 insertions(+)
+
+--- a/drivers/ata/pata_marvell.c
++++ b/drivers/ata/pata_marvell.c
+@@ -83,6 +83,8 @@ static int marvell_cable_detect(struct a
+ 	switch(ap->port_no)
+ 	{
+ 	case 0:
++		if (!ap->ioaddr.bmdma_addr)
++			return ATA_CBL_PATA_UNK;
+ 		if (ioread8(ap->ioaddr.bmdma_addr + 1) & 1)
+ 			return ATA_CBL_PATA40;
+ 		return ATA_CBL_PATA80;
diff --git a/queue-5.15/dma-at_xdmac-fix-a-missing-check-on-list-iterator.patch b/queue-5.15/dma-at_xdmac-fix-a-missing-check-on-list-iterator.patch
new file mode 100644
index 00000000000..9575a7f1813
--- /dev/null
+++ b/queue-5.15/dma-at_xdmac-fix-a-missing-check-on-list-iterator.patch
@@ -0,0 +1,57 @@
+From 206680c4e46b62fd8909385e0874a36952595b85 Mon Sep 17 00:00:00 2001
+From: Xiaomeng Tong
+Date: Sun, 27 Mar 2022 14:11:54 +0800
+Subject: dma: at_xdmac: fix a missing check on list iterator
+
+From: Xiaomeng Tong
+
+commit 206680c4e46b62fd8909385e0874a36952595b85 upstream.
+
+The bug is here:
+	__func__, desc, &desc->tx_dma_desc.phys, ret, cookie, residue);
+
+The list iterator 'desc' will point to a bogus position containing
+HEAD if the list is empty or no element is found. To avoid dev_dbg()
+printing an invalid address, use a new variable 'iter' as the list
+iterator, and use the original variable 'desc' as a dedicated pointer
+to the found element.
+
+Cc: stable@vger.kernel.org
+Fixes: 82e2424635f4c ("dmaengine: xdmac: fix print warning on dma_addr_t variable")
+Signed-off-by: Xiaomeng Tong
+Link: https://lore.kernel.org/r/20220327061154.4867-1-xiam0nd.tong@gmail.com
+Signed-off-by: Vinod Koul
+Signed-off-by: Greg Kroah-Hartman
+---
+ drivers/dma/at_xdmac.c | 12 +++++++-----
+ 1 file changed, 7 insertions(+), 5 deletions(-)
+
+--- a/drivers/dma/at_xdmac.c
++++ b/drivers/dma/at_xdmac.c
+@@ -1450,7 +1450,7 @@ at_xdmac_tx_status(struct dma_chan *chan
+ {
+ 	struct at_xdmac_chan	*atchan = to_at_xdmac_chan(chan);
+ 	struct at_xdmac		*atxdmac = to_at_xdmac(atchan->chan.device);
+-	struct at_xdmac_desc	*desc, *_desc;
++	struct at_xdmac_desc	*desc, *_desc, *iter;
+ 	struct list_head	*descs_list;
+ 	enum dma_status		ret;
+ 	int			residue, retry;
+@@ -1565,11 +1565,13 @@ at_xdmac_tx_status(struct dma_chan *chan
+ 	 * microblock.
+ */ + descs_list = &desc->descs_list; +- list_for_each_entry_safe(desc, _desc, descs_list, desc_node) { +- dwidth = at_xdmac_get_dwidth(desc->lld.mbr_cfg); +- residue -= (desc->lld.mbr_ubc & 0xffffff) << dwidth; +- if ((desc->lld.mbr_nda & 0xfffffffc) == cur_nda) ++ list_for_each_entry_safe(iter, _desc, descs_list, desc_node) { ++ dwidth = at_xdmac_get_dwidth(iter->lld.mbr_cfg); ++ residue -= (iter->lld.mbr_ubc & 0xffffff) << dwidth; ++ if ((iter->lld.mbr_nda & 0xfffffffc) == cur_nda) { ++ desc = iter; + break; ++ } + } + residue += cur_ubc << dwidth; + diff --git a/queue-5.15/dmaengine-imx-sdma-fix-init-of-uart-scripts.patch b/queue-5.15/dmaengine-imx-sdma-fix-init-of-uart-scripts.patch new file mode 100644 index 00000000000..0c62d8bdcfd --- /dev/null +++ b/queue-5.15/dmaengine-imx-sdma-fix-init-of-uart-scripts.patch @@ -0,0 +1,103 @@ +From a3ae97f4c87d9570e7e9a3e3324c443757f6e29a Mon Sep 17 00:00:00 2001 +From: Kevin Groeneveld +Date: Sun, 10 Apr 2022 18:31:18 -0400 +Subject: dmaengine: imx-sdma: fix init of uart scripts + +From: Kevin Groeneveld + +commit a3ae97f4c87d9570e7e9a3e3324c443757f6e29a upstream. + +Commit b98ce2f4e32b ("dmaengine: imx-sdma: add uart rom script") broke +uart rx on imx5 when using sdma firmware from older Freescale 2.6.35 +kernel. In this case reading addr->uartXX_2_mcu_addr was going out of +bounds of the firmware memory and corrupting the uart script addresses. + +Simply adding a bounds check before accessing addr->uartXX_2_mcu_addr +does not work as the uartXX_2_mcu_addr members are now beyond the size +of the older firmware and the uart addresses would never be populated +in that case. There are other ways to fix this but overall the logic +seems clearer to me to revert the uartXX_2_mcu_ram_addr structure +entries back to uartXX_2_mcu_addr, change the newer entries to +uartXX_2_mcu_rom_addr and update the logic accordingly. + +I have tested this patch on: +1. An i.MX53 system with sdma firmware from Freescale 2.6.35 kernel. + Without this patch uart rx is broken in this scenario, with the + patch uart rx is restored. +2. An i.MX6D system with no external sdma firmware. uart is okay with + or without this patch. +3. An i.MX8MM system using current sdma-imx7d.bin firmware from + linux-firmware. uart is okay with or without this patch and I + confirmed the rom version of the uart script is being used which was + the intention and reason for commit b98ce2f4e32b ("dmaengine: + imx-sdma: add uart rom script") in the first place. 
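+
+For illustration, a minimal C sketch of the versioned-table logic this
+fix relies on (a simplified model: SCRIPT_ADDRS_V3_SIZE, the helper and
+its value are hypothetical; only the two uart fields mirror the diff
+below):
+
+```
+#include <linux/types.h>
+
+#define SCRIPT_ADDRS_V3_SIZE	43	/* hypothetical v3 table length */
+
+struct script_addrs {
+	s32 uart_2_mcu_addr;		/* RAM script, present in every layout */
+	/* ... */
+	s32 uart_2_mcu_rom_addr;	/* ROM script, only in v3+ layouts */
+};
+
+/* Older firmware ends before the ROM entries, so read them only when
+ * the table is known to be long enough; otherwise keep the RAM script.
+ */
+static void pick_uart_script(struct script_addrs *dst,
+			     const struct script_addrs *fw, int nscripts)
+{
+	if (nscripts >= SCRIPT_ADDRS_V3_SIZE && fw->uart_2_mcu_rom_addr)
+		dst->uart_2_mcu_addr = fw->uart_2_mcu_rom_addr;
+}
+```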
+ +Fixes: b98ce2f4e32b ("dmaengine: imx-sdma: add uart rom script") +Cc: stable@vger.kernel.org +Signed-off-by: Kevin Groeneveld +Reviewed-by: Lucas Stach +Reviewed-by: Fabio Estevam +Acked-by: Russell King (Oracle) +Link: https://lore.kernel.org/r/20220410223118.15086-1-kgroeneveld@lenbrook.com +Signed-off-by: Vinod Koul +Signed-off-by: Greg Kroah-Hartman +--- + drivers/dma/imx-sdma.c | 28 ++++++++++++++-------------- + 1 file changed, 14 insertions(+), 14 deletions(-) + +--- a/drivers/dma/imx-sdma.c ++++ b/drivers/dma/imx-sdma.c +@@ -198,12 +198,12 @@ struct sdma_script_start_addrs { + s32 per_2_firi_addr; + s32 mcu_2_firi_addr; + s32 uart_2_per_addr; +- s32 uart_2_mcu_ram_addr; ++ s32 uart_2_mcu_addr; + s32 per_2_app_addr; + s32 mcu_2_app_addr; + s32 per_2_per_addr; + s32 uartsh_2_per_addr; +- s32 uartsh_2_mcu_ram_addr; ++ s32 uartsh_2_mcu_addr; + s32 per_2_shp_addr; + s32 mcu_2_shp_addr; + s32 ata_2_mcu_addr; +@@ -232,8 +232,8 @@ struct sdma_script_start_addrs { + s32 mcu_2_ecspi_addr; + s32 mcu_2_sai_addr; + s32 sai_2_mcu_addr; +- s32 uart_2_mcu_addr; +- s32 uartsh_2_mcu_addr; ++ s32 uart_2_mcu_rom_addr; ++ s32 uartsh_2_mcu_rom_addr; + /* End of v3 array */ + s32 mcu_2_zqspi_addr; + /* End of v4 array */ +@@ -1780,17 +1780,17 @@ static void sdma_add_scripts(struct sdma + saddr_arr[i] = addr_arr[i]; + + /* +- * get uart_2_mcu_addr/uartsh_2_mcu_addr rom script specially because +- * they are now replaced by uart_2_mcu_ram_addr/uartsh_2_mcu_ram_addr +- * to be compatible with legacy freescale/nxp sdma firmware, and they +- * are located in the bottom part of sdma_script_start_addrs which are +- * beyond the SDMA_SCRIPT_ADDRS_ARRAY_SIZE_V1. ++ * For compatibility with NXP internal legacy kernel before 4.19 which ++ * is based on uart ram script and mainline kernel based on uart rom ++ * script, both uart ram/rom scripts are present in newer sdma ++ * firmware. Use the rom versions if they are present (V3 or newer). + */ +- if (addr->uart_2_mcu_addr) +- sdma->script_addrs->uart_2_mcu_addr = addr->uart_2_mcu_addr; +- if (addr->uartsh_2_mcu_addr) +- sdma->script_addrs->uartsh_2_mcu_addr = addr->uartsh_2_mcu_addr; +- ++ if (sdma->script_number >= SDMA_SCRIPT_ADDRS_ARRAY_SIZE_V3) { ++ if (addr->uart_2_mcu_rom_addr) ++ sdma->script_addrs->uart_2_mcu_addr = addr->uart_2_mcu_rom_addr; ++ if (addr->uartsh_2_mcu_rom_addr) ++ sdma->script_addrs->uartsh_2_mcu_addr = addr->uartsh_2_mcu_rom_addr; ++ } + } + + static void sdma_load_firmware(const struct firmware *fw, void *context) diff --git a/queue-5.15/edac-synopsys-read-the-error-count-from-the-correct-register.patch b/queue-5.15/edac-synopsys-read-the-error-count-from-the-correct-register.patch new file mode 100644 index 00000000000..7d6c370eeaf --- /dev/null +++ b/queue-5.15/edac-synopsys-read-the-error-count-from-the-correct-register.patch @@ -0,0 +1,61 @@ +From e2932d1f6f055b2af2114c7e64a26dc1b5593d0c Mon Sep 17 00:00:00 2001 +From: Shubhrajyoti Datta +Date: Thu, 14 Apr 2022 15:58:13 +0530 +Subject: EDAC/synopsys: Read the error count from the correct register + +From: Shubhrajyoti Datta + +commit e2932d1f6f055b2af2114c7e64a26dc1b5593d0c upstream. + +Currently, the error count is read wrongly from the status register. Read +the count from the proper error count register (ERRCNT). + + [ bp: Massage. 
]
+
+Fixes: b500b4a029d5 ("EDAC, synopsys: Add ECC support for ZynqMP DDR controller")
+Signed-off-by: Shubhrajyoti Datta
+Signed-off-by: Borislav Petkov
+Acked-by: Michal Simek
+Cc:
+Link: https://lore.kernel.org/r/20220414102813.4468-1-shubhrajyoti.datta@xilinx.com
+Signed-off-by: Greg Kroah-Hartman
+---
+ drivers/edac/synopsys_edac.c | 16 +++++++++++-----
+ 1 file changed, 11 insertions(+), 5 deletions(-)
+
+--- a/drivers/edac/synopsys_edac.c
++++ b/drivers/edac/synopsys_edac.c
+@@ -163,6 +163,11 @@
+ #define ECC_STAT_CECNT_SHIFT		8
+ #define ECC_STAT_BITNUM_MASK		0x7F
+
++/* ECC error count register definitions */
++#define ECC_ERRCNT_UECNT_MASK		0xFFFF0000
++#define ECC_ERRCNT_UECNT_SHIFT		16
++#define ECC_ERRCNT_CECNT_MASK		0xFFFF
++
+ /* DDR QOS Interrupt register definitions */
+ #define DDR_QOS_IRQ_STAT_OFST		0x20200
+ #define DDR_QOSUE_MASK			0x4
+@@ -418,15 +423,16 @@ static int zynqmp_get_error_info(struct
+ 	base = priv->baseaddr;
+ 	p = &priv->stat;
+
++	regval = readl(base + ECC_ERRCNT_OFST);
++	p->ce_cnt = regval & ECC_ERRCNT_CECNT_MASK;
++	p->ue_cnt = (regval & ECC_ERRCNT_UECNT_MASK) >> ECC_ERRCNT_UECNT_SHIFT;
++	if (!p->ce_cnt)
++		goto ue_err;
++
+ 	regval = readl(base + ECC_STAT_OFST);
+ 	if (!regval)
+ 		return 1;
+
+-	p->ce_cnt = (regval & ECC_STAT_CECNT_MASK) >> ECC_STAT_CECNT_SHIFT;
+-	p->ue_cnt = (regval & ECC_STAT_UECNT_MASK) >> ECC_STAT_UECNT_SHIFT;
+-	if (!p->ce_cnt)
+-		goto ue_err;
+-
+ 	p->ceinfo.bitpos = (regval & ECC_STAT_BITNUM_MASK);
+
+ 	regval = readl(base + ECC_CEADDR0_OFST);
diff --git a/queue-5.15/memcg-sync-flush-only-if-periodic-flush-is-delayed.patch b/queue-5.15/memcg-sync-flush-only-if-periodic-flush-is-delayed.patch
new file mode 100644
index 00000000000..8d87b5bd983
--- /dev/null
+++ b/queue-5.15/memcg-sync-flush-only-if-periodic-flush-is-delayed.patch
@@ -0,0 +1,130 @@
+From 9b3016154c913b2e7ec5ae5c9a42eb9e732d86aa Mon Sep 17 00:00:00 2001
+From: Shakeel Butt
+Date: Thu, 21 Apr 2022 16:35:40 -0700
+Subject: memcg: sync flush only if periodic flush is delayed
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Shakeel Butt
+
+commit 9b3016154c913b2e7ec5ae5c9a42eb9e732d86aa upstream.
+
+Daniel Dao has reported [1] a regression on workloads that may trigger a
+lot of refaults (anon and file). The underlying issue is that flushing
+rstat is expensive. Although rstat flushes are batched with (nr_cpus *
+MEMCG_BATCH) stat updates, it seems like there are workloads which
+genuinely do more stat updates than the batch value within a short
+amount of time. Since the rstat flush can happen in performance
+critical codepaths like page faults, such workloads can suffer greatly.
+
+This patch fixes this regression by making the rstat flushing
+conditional in the performance critical codepaths. More specifically,
+the kernel relies on the async periodic rstat flusher to flush the
+stats; only if the periodic flusher is delayed by more than twice its
+normal time window does the kernel allow rstat flushing from the
+performance critical codepaths.
+
+Now the question: what are the side-effects of this change? The worst
+that can happen is that the refault codepath will see lruvec stats up
+to 4 seconds old and may cause false (or missed) activations of the
+refaulted page, which may under- or overestimate the workingset size.
+That is not very concerning, as the kernel can already miss or do
+false activations.
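+
+As a compact model of the gate being added (a simplified sketch, not
+the kernel code itself; see the diff below for the real version):
+
+```
+#include <linux/jiffies.h>
+
+#define FLUSH_TIME	(2UL * HZ)	/* periodic flush interval: 2s */
+
+static u64 flush_next_time;		/* deadline, in jiffies64 */
+
+static void do_flush(void)		/* periodic worker and forced path */
+{
+	/* Re-arm the deadline 2 * FLUSH_TIME ahead: hot paths tolerate
+	 * stats that are at most ~4s stale before forcing a sync flush.
+	 */
+	flush_next_time = get_jiffies_64() + 2 * FLUSH_TIME;
+	/* ... the expensive rstat flush ... */
+}
+
+static void flush_if_delayed(void)	/* called from hot paths */
+{
+	if (time_after64(get_jiffies_64(), flush_next_time))
+		do_flush();
+}
+```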
+ +There are two more codepaths whose flushing behavior is not changed by +this patch and we may need to come to them in future. One is the +writeback stats used by dirty throttling and second is the deactivation +heuristic in the reclaim. For now keeping an eye on them and if there +is report of regression due to these codepaths, we will reevaluate then. + +Link: https://lore.kernel.org/all/CA+wXwBSyO87ZX5PVwdHm-=dBjZYECGmfnydUicUyrQqndgX2MQ@mail.gmail.com [1] +Link: https://lkml.kernel.org/r/20220304184040.1304781-1-shakeelb@google.com +Fixes: 1f828223b799 ("memcg: flush lruvec stats in the refault") +Signed-off-by: Shakeel Butt +Reported-by: Daniel Dao +Tested-by: Ivan Babrou +Cc: Michal Hocko +Cc: Roman Gushchin +Cc: Johannes Weiner +Cc: Michal Koutný +Cc: Frank Hofmann +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman +--- + include/linux/memcontrol.h | 5 +++++ + mm/memcontrol.c | 12 +++++++++++- + mm/workingset.c | 2 +- + 3 files changed, 17 insertions(+), 2 deletions(-) + +--- a/include/linux/memcontrol.h ++++ b/include/linux/memcontrol.h +@@ -1002,6 +1002,7 @@ static inline unsigned long lruvec_page_ + } + + void mem_cgroup_flush_stats(void); ++void mem_cgroup_flush_stats_delayed(void); + + void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, + int val); +@@ -1422,6 +1423,10 @@ static inline void mem_cgroup_flush_stat + { + } + ++static inline void mem_cgroup_flush_stats_delayed(void) ++{ ++} ++ + static inline void __mod_memcg_lruvec_state(struct lruvec *lruvec, + enum node_stat_item idx, int val) + { +--- a/mm/memcontrol.c ++++ b/mm/memcontrol.c +@@ -650,6 +650,9 @@ static DECLARE_DEFERRABLE_WORK(stats_flu + static DEFINE_SPINLOCK(stats_flush_lock); + static DEFINE_PER_CPU(unsigned int, stats_updates); + static atomic_t stats_flush_threshold = ATOMIC_INIT(0); ++static u64 flush_next_time; ++ ++#define FLUSH_TIME (2UL*HZ) + + static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val) + { +@@ -671,6 +674,7 @@ static void __mem_cgroup_flush_stats(voi + if (!spin_trylock_irqsave(&stats_flush_lock, flag)) + return; + ++ flush_next_time = jiffies_64 + 2*FLUSH_TIME; + cgroup_rstat_flush_irqsafe(root_mem_cgroup->css.cgroup); + atomic_set(&stats_flush_threshold, 0); + spin_unlock_irqrestore(&stats_flush_lock, flag); +@@ -682,10 +686,16 @@ void mem_cgroup_flush_stats(void) + __mem_cgroup_flush_stats(); + } + ++void mem_cgroup_flush_stats_delayed(void) ++{ ++ if (time_after64(jiffies_64, flush_next_time)) ++ mem_cgroup_flush_stats(); ++} ++ + static void flush_memcg_stats_dwork(struct work_struct *w) + { + __mem_cgroup_flush_stats(); +- queue_delayed_work(system_unbound_wq, &stats_flush_dwork, 2UL*HZ); ++ queue_delayed_work(system_unbound_wq, &stats_flush_dwork, FLUSH_TIME); + } + + /** +--- a/mm/workingset.c ++++ b/mm/workingset.c +@@ -352,7 +352,7 @@ void workingset_refault(struct page *pag + + inc_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file); + +- mem_cgroup_flush_stats(); ++ mem_cgroup_flush_stats_delayed(); + /* + * Compare the distance to the existing workingset size. 
We + * don't activate pages that couldn't stay resident even if diff --git a/queue-5.15/mm-hugetlb-allow-for-high-userspace-addresses.patch b/queue-5.15/mm-hugetlb-allow-for-high-userspace-addresses.patch new file mode 100644 index 00000000000..d45ae1789dc --- /dev/null +++ b/queue-5.15/mm-hugetlb-allow-for-high-userspace-addresses.patch @@ -0,0 +1,145 @@ +From 5f24d5a579d1eace79d505b148808a850b417d4c Mon Sep 17 00:00:00 2001 +From: Christophe Leroy +Date: Thu, 21 Apr 2022 16:35:46 -0700 +Subject: mm, hugetlb: allow for "high" userspace addresses + +From: Christophe Leroy + +commit 5f24d5a579d1eace79d505b148808a850b417d4c upstream. + +This is a fix for commit f6795053dac8 ("mm: mmap: Allow for "high" +userspace addresses") for hugetlb. + +This patch adds support for "high" userspace addresses that are +optionally supported on the system and have to be requested via a hint +mechanism ("high" addr parameter to mmap). + +Architectures such as powerpc and x86 achieve this by making changes to +their architectural versions of hugetlb_get_unmapped_area() function. +However, arm64 uses the generic version of that function. + +So take into account arch_get_mmap_base() and arch_get_mmap_end() in +hugetlb_get_unmapped_area(). To allow that, move those two macros out +of mm/mmap.c into include/linux/sched/mm.h + +If these macros are not defined in architectural code then they default +to (TASK_SIZE) and (base) so should not introduce any behavioural +changes to architectures that do not define them. + +For the time being, only ARM64 is affected by this change. + +Catalin (ARM64) said + "We should have fixed hugetlb_get_unmapped_area() as well when we added + support for 52-bit VA. The reason for commit f6795053dac8 was to + prevent normal mmap() from returning addresses above 48-bit by default + as some user-space had hard assumptions about this. + + It's a slight ABI change if you do this for hugetlb_get_unmapped_area() + but I doubt anyone would notice. It's more likely that the current + behaviour would cause issues, so I'd rather have them consistent. + + Basically when arm64 gained support for 52-bit addresses we did not + want user-space calling mmap() to suddenly get such high addresses, + otherwise we could have inadvertently broken some programs (similar + behaviour to x86 here). Hence we added commit f6795053dac8. But we + missed hugetlbfs which could still get such high mmap() addresses. 
So
+  in theory that's a potential regression that should have been
+  addressed at the same time as commit f6795053dac8 (and before arm64
+  enabled 52-bit addresses)"
+
+Link: https://lkml.kernel.org/r/ab847b6edb197bffdfe189e70fb4ac76bfe79e0d.1650033747.git.christophe.leroy@csgroup.eu
+Fixes: f6795053dac8 ("mm: mmap: Allow for "high" userspace addresses")
+Signed-off-by: Christophe Leroy
+Reviewed-by: Catalin Marinas
+Cc: Steve Capper
+Cc: Will Deacon
+Cc: [5.0.x]
+Signed-off-by: Andrew Morton
+Signed-off-by: Linus Torvalds
+Signed-off-by: Greg Kroah-Hartman
+---
+ fs/hugetlbfs/inode.c     | 9 +++++----
+ include/linux/sched/mm.h | 8 ++++++++
+ mm/mmap.c                | 8 --------
+ 3 files changed, 13 insertions(+), 12 deletions(-)
+
+--- a/fs/hugetlbfs/inode.c
++++ b/fs/hugetlbfs/inode.c
+@@ -206,7 +206,7 @@ hugetlb_get_unmapped_area_bottomup(struc
+ 	info.flags = 0;
+ 	info.length = len;
+ 	info.low_limit = current->mm->mmap_base;
+-	info.high_limit = TASK_SIZE;
++	info.high_limit = arch_get_mmap_end(addr);
+ 	info.align_mask = PAGE_MASK & ~huge_page_mask(h);
+ 	info.align_offset = 0;
+ 	return vm_unmapped_area(&info);
+@@ -222,7 +222,7 @@ hugetlb_get_unmapped_area_topdown(struct
+ 	info.flags = VM_UNMAPPED_AREA_TOPDOWN;
+ 	info.length = len;
+ 	info.low_limit = max(PAGE_SIZE, mmap_min_addr);
+-	info.high_limit = current->mm->mmap_base;
++	info.high_limit = arch_get_mmap_base(addr, current->mm->mmap_base);
+ 	info.align_mask = PAGE_MASK & ~huge_page_mask(h);
+ 	info.align_offset = 0;
+ 	addr = vm_unmapped_area(&info);
+@@ -237,7 +237,7 @@ hugetlb_get_unmapped_area_topdown(struct
+ 	VM_BUG_ON(addr != -ENOMEM);
+ 	info.flags = 0;
+ 	info.low_limit = current->mm->mmap_base;
+-	info.high_limit = TASK_SIZE;
++	info.high_limit = arch_get_mmap_end(addr);
+ 	addr = vm_unmapped_area(&info);
+ }
+
+@@ -251,6 +251,7 @@ hugetlb_get_unmapped_area(struct file *f
+ 	struct mm_struct *mm = current->mm;
+ 	struct vm_area_struct *vma;
+ 	struct hstate *h = hstate_file(file);
++	const unsigned long mmap_end = arch_get_mmap_end(addr);
+
+ 	if (len & ~huge_page_mask(h))
+ 		return -EINVAL;
+@@ -266,7 +267,7 @@ hugetlb_get_unmapped_area(struct file *f
+ 	if (addr) {
+ 		addr = ALIGN(addr, huge_page_size(h));
+ 		vma = find_vma(mm, addr);
+-		if (TASK_SIZE - len >= addr &&
++		if (mmap_end - len >= addr &&
+ 		    (!vma || addr + len <= vm_start_gap(vma)))
+ 			return addr;
+ 	}
+--- a/include/linux/sched/mm.h
++++ b/include/linux/sched/mm.h
+@@ -106,6 +106,14 @@ static inline void mm_update_next_owner(
+ #endif /* CONFIG_MEMCG */
+
+ #ifdef CONFIG_MMU
++#ifndef arch_get_mmap_end
++#define arch_get_mmap_end(addr)	(TASK_SIZE)
++#endif
++
++#ifndef arch_get_mmap_base
++#define arch_get_mmap_base(addr, base)	(base)
++#endif
++
+ extern void arch_pick_mmap_layout(struct mm_struct *mm,
+ 				  struct rlimit *rlim_stack);
+ extern unsigned long
+--- a/mm/mmap.c
++++ b/mm/mmap.c
+@@ -2113,14 +2113,6 @@ unsigned long vm_unmapped_area(struct vm
+ 	return addr;
+ }
+
+-#ifndef arch_get_mmap_end
+-#define arch_get_mmap_end(addr)	(TASK_SIZE)
+-#endif
+-
+-#ifndef arch_get_mmap_base
+-#define arch_get_mmap_base(addr, base)	(base)
+-#endif
+-
+ /* Get an address range which is currently unmapped.
+  * For shmat() with addr=0.
+ * diff --git a/queue-5.15/mm-memory-failure.c-skip-huge_zero_page-in-memory_failure.patch b/queue-5.15/mm-memory-failure.c-skip-huge_zero_page-in-memory_failure.patch new file mode 100644 index 00000000000..ff30abb616c --- /dev/null +++ b/queue-5.15/mm-memory-failure.c-skip-huge_zero_page-in-memory_failure.patch @@ -0,0 +1,95 @@ +From d173d5417fb67411e623d394aab986d847e47dad Mon Sep 17 00:00:00 2001 +From: Xu Yu +Date: Thu, 21 Apr 2022 16:35:37 -0700 +Subject: mm/memory-failure.c: skip huge_zero_page in memory_failure() + +From: Xu Yu + +commit d173d5417fb67411e623d394aab986d847e47dad upstream. + +Kernel panic when injecting memory_failure for the global +huge_zero_page, when CONFIG_DEBUG_VM is enabled, as follows. + + Injecting memory failure for pfn 0x109ff9 at process virtual address 0x20ff9000 + page:00000000fb053fc3 refcount:2 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x109e00 + head:00000000fb053fc3 order:9 compound_mapcount:0 compound_pincount:0 + flags: 0x17fffc000010001(locked|head|node=0|zone=2|lastcpupid=0x1ffff) + raw: 017fffc000010001 0000000000000000 dead000000000122 0000000000000000 + raw: 0000000000000000 0000000000000000 00000002ffffffff 0000000000000000 + page dumped because: VM_BUG_ON_PAGE(is_huge_zero_page(head)) + ------------[ cut here ]------------ + kernel BUG at mm/huge_memory.c:2499! + invalid opcode: 0000 [#1] PREEMPT SMP PTI + CPU: 6 PID: 553 Comm: split_bug Not tainted 5.18.0-rc1+ #11 + Hardware name: Alibaba Cloud Alibaba Cloud ECS, BIOS 3288b3c 04/01/2014 + RIP: 0010:split_huge_page_to_list+0x66a/0x880 + Code: 84 9b fb ff ff 48 8b 7c 24 08 31 f6 e8 9f 5d 2a 00 b8 b8 02 00 00 e9 e8 fb ff ff 48 c7 c6 e8 47 3c 82 4c b + RSP: 0018:ffffc90000dcbdf8 EFLAGS: 00010246 + RAX: 000000000000003c RBX: 0000000000000001 RCX: 0000000000000000 + RDX: 0000000000000000 RSI: ffffffff823e4c4f RDI: 00000000ffffffff + RBP: ffff88843fffdb40 R08: 0000000000000000 R09: 00000000fffeffff + R10: ffffc90000dcbc48 R11: ffffffff82d68448 R12: ffffea0004278000 + R13: ffffffff823c6203 R14: 0000000000109ff9 R15: ffffea000427fe40 + FS: 00007fc375a26740(0000) GS:ffff88842fd80000(0000) knlGS:0000000000000000 + CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 + CR2: 00007fc3757c9290 CR3: 0000000102174006 CR4: 00000000003706e0 + DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 + DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 + Call Trace: + try_to_split_thp_page+0x3a/0x130 + memory_failure+0x128/0x800 + madvise_inject_error.cold+0x8b/0xa1 + __x64_sys_madvise+0x54/0x60 + do_syscall_64+0x35/0x80 + entry_SYSCALL_64_after_hwframe+0x44/0xae + RIP: 0033:0x7fc3754f8bf9 + Code: 01 00 48 81 c4 80 00 00 00 e9 f1 fe ff ff 0f 1f 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 8 + RSP: 002b:00007ffeda93a1d8 EFLAGS: 00000217 ORIG_RAX: 000000000000001c + RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007fc3754f8bf9 + RDX: 0000000000000064 RSI: 0000000000003000 RDI: 0000000020ff9000 + RBP: 00007ffeda93a200 R08: 0000000000000000 R09: 0000000000000000 + R10: 00000000ffffffff R11: 0000000000000217 R12: 0000000000400490 + R13: 00007ffeda93a2e0 R14: 0000000000000000 R15: 0000000000000000 + +This makes huge_zero_page bail out explicitly before split in +memory_failure(), thus the panic above won't happen again. 
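+
+For reference, a hedged userspace sketch of the injection path shown in
+the trace above (assumes root, CONFIG_MEMORY_FAILURE, and THP with
+use_zero_page enabled; sizes and the alignment trick are illustrative):
+
+```
+#define _GNU_SOURCE
+#include <stdint.h>
+#include <stdio.h>
+#include <sys/mman.h>
+
+int main(void)
+{
+	size_t pmd = 2UL << 20;			/* 2MiB */
+	char *raw = mmap(NULL, 2 * pmd, PROT_READ,
+			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+	char *p;
+
+	if (raw == MAP_FAILED)
+		return 1;
+	/* Align to a PMD boundary so a huge page mapping is possible. */
+	p = (char *)(((uintptr_t)raw + pmd - 1) & ~(pmd - 1));
+	madvise(p, pmd, MADV_HUGEPAGE);
+	/* A read fault on untouched THP memory maps the huge zero page. */
+	(void)*(volatile char *)p;
+	/* Reaches memory_failure(); without the fix, CONFIG_DEBUG_VM
+	 * kernels hit the VM_BUG_ON_PAGE() splat quoted above.
+	 */
+	if (madvise(p, pmd, MADV_HWPOISON))
+		perror("madvise(MADV_HWPOISON)");
+	return 0;
+}
+```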
+ +Link: https://lkml.kernel.org/r/497d3835612610e370c74e697ea3c721d1d55b9c.1649775850.git.xuyu@linux.alibaba.com +Fixes: 6a46079cf57a ("HWPOISON: The high level memory error handler in the VM v7") +Signed-off-by: Xu Yu +Reported-by: Abaci +Suggested-by: Naoya Horiguchi +Acked-by: Naoya Horiguchi +Reviewed-by: Miaohe Lin +Cc: Anshuman Khandual +Cc: Oscar Salvador +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman +--- + mm/memory-failure.c | 13 +++++++++++++ + 1 file changed, 13 insertions(+) + +--- a/mm/memory-failure.c ++++ b/mm/memory-failure.c +@@ -1690,6 +1690,19 @@ try_again: + + if (PageTransHuge(hpage)) { + /* ++ * Bail out before SetPageHasHWPoisoned() if hpage is ++ * huge_zero_page, although PG_has_hwpoisoned is not ++ * checked in set_huge_zero_page(). ++ * ++ * TODO: Handle memory failure of huge_zero_page thoroughly. ++ */ ++ if (is_huge_zero_page(hpage)) { ++ action_result(pfn, MF_MSG_UNSPLIT_THP, MF_IGNORED); ++ res = -EBUSY; ++ goto unlock_mutex; ++ } ++ ++ /* + * The flag must be set after the refcount is bumped + * otherwise it may race with THP split. + * And the flag can't be set in get_hwpoison_page() since diff --git a/queue-5.15/mm-mmu_notifier.c-fix-race-in-mmu_interval_notifier_remove.patch b/queue-5.15/mm-mmu_notifier.c-fix-race-in-mmu_interval_notifier_remove.patch new file mode 100644 index 00000000000..8a792930f64 --- /dev/null +++ b/queue-5.15/mm-mmu_notifier.c-fix-race-in-mmu_interval_notifier_remove.patch @@ -0,0 +1,84 @@ +From 319561669a59d8e9206ab311ae5433ef92fd79d1 Mon Sep 17 00:00:00 2001 +From: Alistair Popple +Date: Thu, 21 Apr 2022 16:36:10 -0700 +Subject: mm/mmu_notifier.c: fix race in mmu_interval_notifier_remove() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Alistair Popple + +commit 319561669a59d8e9206ab311ae5433ef92fd79d1 upstream. + +In some cases it is possible for mmu_interval_notifier_remove() to race +with mn_tree_inv_end() allowing it to return while the notifier data +structure is still in use. Consider the following sequence: + + CPU0 - mn_tree_inv_end() CPU1 - mmu_interval_notifier_remove() + ----------------------------------- ------------------------------------ + spin_lock(subscriptions->lock); + seq = subscriptions->invalidate_seq; + spin_lock(subscriptions->lock); spin_unlock(subscriptions->lock); + subscriptions->invalidate_seq++; + wait_event(invalidate_seq != seq); + return; + interval_tree_remove(interval_sub); kfree(interval_sub); + spin_unlock(subscriptions->lock); + wake_up_all(); + +As the wait_event() condition is true it will return immediately. This +can lead to use-after-free type errors if the caller frees the data +structure containing the interval notifier subscription while it is +still on a deferred list. Fix this by taking the appropriate lock when +reading invalidate_seq to ensure proper synchronisation. + +I observed this whilst running stress testing during some development. +You do have to be pretty unlucky, but it leads to the usual problems of +use-after-free (memory corruption, kernel crash, difficult to diagnose +WARN_ON, etc). 
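+
+The underlying rule, as a brief generic sketch (names are generic, not
+the notifier code; the actual fix is in the diff below): a wait_event()
+condition that reads state published under a spinlock must sample that
+state under the same lock, or the waiter can return (and free its data)
+while the updater is still using it.
+
+```
+#include <linux/spinlock.h>
+#include <linux/wait.h>
+
+struct subs {
+	spinlock_t	lock;
+	unsigned long	seq;	/* written under lock by the updater */
+};
+
+static bool seq_released(struct subs *s, unsigned long seq)
+{
+	bool ret;
+
+	/* Taking the updater's lock orders this read after everything
+	 * the updater did before dropping the lock, so a true result
+	 * really means "the updater is finished with our entry".
+	 */
+	spin_lock(&s->lock);
+	ret = s->seq != seq;
+	spin_unlock(&s->lock);
+	return ret;
+}
+```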
+ +Link: https://lkml.kernel.org/r/20220420043734.476348-1-apopple@nvidia.com +Fixes: 99cb252f5e68 ("mm/mmu_notifier: add an interval tree notifier") +Signed-off-by: Alistair Popple +Signed-off-by: Jason Gunthorpe +Cc: Christian König +Cc: John Hubbard +Cc: Ralph Campbell +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman +--- + mm/mmu_notifier.c | 14 +++++++++++++- + 1 file changed, 13 insertions(+), 1 deletion(-) + +--- a/mm/mmu_notifier.c ++++ b/mm/mmu_notifier.c +@@ -1036,6 +1036,18 @@ int mmu_interval_notifier_insert_locked( + } + EXPORT_SYMBOL_GPL(mmu_interval_notifier_insert_locked); + ++static bool ++mmu_interval_seq_released(struct mmu_notifier_subscriptions *subscriptions, ++ unsigned long seq) ++{ ++ bool ret; ++ ++ spin_lock(&subscriptions->lock); ++ ret = subscriptions->invalidate_seq != seq; ++ spin_unlock(&subscriptions->lock); ++ return ret; ++} ++ + /** + * mmu_interval_notifier_remove - Remove a interval notifier + * @interval_sub: Interval subscription to unregister +@@ -1083,7 +1095,7 @@ void mmu_interval_notifier_remove(struct + lock_map_release(&__mmu_notifier_invalidate_range_start_map); + if (seq) + wait_event(subscriptions->wq, +- READ_ONCE(subscriptions->invalidate_seq) != seq); ++ mmu_interval_seq_released(subscriptions, seq)); + + /* pairs with mmgrab in mmu_interval_notifier_insert() */ + mmdrop(mm); diff --git a/queue-5.15/net-atlantic-invert-deep-par-in-pm-functions-preventing-null-derefs.patch b/queue-5.15/net-atlantic-invert-deep-par-in-pm-functions-preventing-null-derefs.patch new file mode 100644 index 00000000000..369826ad4a0 --- /dev/null +++ b/queue-5.15/net-atlantic-invert-deep-par-in-pm-functions-preventing-null-derefs.patch @@ -0,0 +1,95 @@ +From cbe6c3a8f8f4315b96e46e1a1c70393c06d95a4c Mon Sep 17 00:00:00 2001 +From: Manuel Ullmann +Date: Mon, 18 Apr 2022 00:20:01 +0200 +Subject: net: atlantic: invert deep par in pm functions, preventing null derefs + +From: Manuel Ullmann + +commit cbe6c3a8f8f4315b96e46e1a1c70393c06d95a4c upstream. + +This will reset deeply on freeze and thaw instead of suspend and +resume and prevent null pointer dereferences of the uninitialized ring +0 buffer while thawing. + +The impact is an indefinitely hanging kernel. You can't switch +consoles after this and the only possible user interaction is SysRq. + +BUG: kernel NULL pointer dereference +RIP: 0010:aq_ring_rx_fill+0xcf/0x210 [atlantic] +aq_vec_init+0x85/0xe0 [atlantic] +aq_nic_init+0xf7/0x1d0 [atlantic] +atl_resume_common+0x4f/0x100 [atlantic] +pci_pm_thaw+0x42/0xa0 + +resolves in aq_ring.o to + +``` +0000000000000ae0 : +{ +/* ... */ + baf: 48 8b 43 08 mov 0x8(%rbx),%rax + buff->flags = 0U; /* buff is NULL */ +``` + +The bug has been present since the introduction of the new pm code in +8aaa112a57c1 ("net: atlantic: refactoring pm logic") and was hidden +until 8ce84271697a ("net: atlantic: changes for multi-TC support"), +which refactored the aq_vec_{free,alloc} functions into +aq_vec_{,ring}_{free,alloc}, but is technically not wrong. The +original functions just always reinitialized the buffers on S3/S4. If +the interface is down before freezing, the bug does not occur. It does +not matter, whether the initrd contains and loads the module before +thawing. + +So the fix is to invert the boolean parameter deep in all pm function +calls, which was clearly intended to be set like that. + +First report was on Github [1], which you have to guess from the +resume logs in the posted dmesg snippet. 
Recently I posted one on +Bugzilla [2], since I did not have an AQC device so far. + +#regzbot introduced: 8ce84271697a +#regzbot from: koo5 +#regzbot monitor: https://github.com/Aquantia/AQtion/issues/32 + +Fixes: 8aaa112a57c1 ("net: atlantic: refactoring pm logic") +Link: https://github.com/Aquantia/AQtion/issues/32 [1] +Link: https://bugzilla.kernel.org/show_bug.cgi?id=215798 [2] +Cc: stable@vger.kernel.org +Reported-by: koo5 +Signed-off-by: Manuel Ullmann +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/aquantia/atlantic/aq_pci_func.c | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +--- a/drivers/net/ethernet/aquantia/atlantic/aq_pci_func.c ++++ b/drivers/net/ethernet/aquantia/atlantic/aq_pci_func.c +@@ -444,22 +444,22 @@ err_exit: + + static int aq_pm_freeze(struct device *dev) + { +- return aq_suspend_common(dev, false); ++ return aq_suspend_common(dev, true); + } + + static int aq_pm_suspend_poweroff(struct device *dev) + { +- return aq_suspend_common(dev, true); ++ return aq_suspend_common(dev, false); + } + + static int aq_pm_thaw(struct device *dev) + { +- return atl_resume_common(dev, false); ++ return atl_resume_common(dev, true); + } + + static int aq_pm_resume_restore(struct device *dev) + { +- return atl_resume_common(dev, true); ++ return atl_resume_common(dev, false); + } + + static const struct dev_pm_ops aq_pm_ops = { diff --git a/queue-5.15/oom_kill.c-futex-delay-the-oom-reaper-to-allow-time-for-proper-futex-cleanup.patch b/queue-5.15/oom_kill.c-futex-delay-the-oom-reaper-to-allow-time-for-proper-futex-cleanup.patch new file mode 100644 index 00000000000..b2ab4be0ccf --- /dev/null +++ b/queue-5.15/oom_kill.c-futex-delay-the-oom-reaper-to-allow-time-for-proper-futex-cleanup.patch @@ -0,0 +1,206 @@ +From e4a38402c36e42df28eb1a5394be87e6571fb48a Mon Sep 17 00:00:00 2001 +From: Nico Pache +Date: Thu, 21 Apr 2022 16:36:01 -0700 +Subject: oom_kill.c: futex: delay the OOM reaper to allow time for proper futex cleanup + +From: Nico Pache + +commit e4a38402c36e42df28eb1a5394be87e6571fb48a upstream. + +The pthread struct is allocated on PRIVATE|ANONYMOUS memory [1] which +can be targeted by the oom reaper. This mapping is used to store the +futex robust list head; the kernel does not keep a copy of the robust +list and instead references a userspace address to maintain the +robustness during a process death. + +A race can occur between exit_mm and the oom reaper that allows the oom +reaper to free the memory of the futex robust list before the exit path +has handled the futex death: + + CPU1 CPU2 + -------------------------------------------------------------------- + page_fault + do_exit "signal" + wake_oom_reaper + oom_reaper + oom_reap_task_mm (invalidates mm) + exit_mm + exit_mm_release + futex_exit_release + futex_cleanup + exit_robust_list + get_user (EFAULT- can't access memory) + +If the get_user EFAULT's, the kernel will be unable to recover the +waiters on the robust_list, leaving userspace mutexes hung indefinitely. + +Delay the OOM reaper, allowing more time for the exit path to perform +the futex cleanup. + +Reproducer: https://gitlab.com/jsavitz/oom_futex_reproducer + +Based on a patch by Michal Hocko. 
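+
+For background, a hedged userspace sketch of where the robust list
+lives (glibc registers one per thread during pthread setup [1]; this is
+the structure exit_robust_list() later walks):
+
+```
+#include <linux/futex.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+
+/* In glibc this head sits inside struct pthread, i.e. in the same
+ * PRIVATE|ANONYMOUS memory the oom reaper may unmap before exit.
+ */
+static struct robust_list_head head = {
+	.list		= { .next = &head.list },	/* empty: points at itself */
+	.futex_offset	= 0,
+};
+
+int main(void)
+{
+	/* If the page holding 'head' is reaped first, the exit path's
+	 * get_user() in exit_robust_list() faults (EFAULT) and waiters
+	 * on robust mutexes are never woken.
+	 */
+	return syscall(SYS_set_robust_list, &head, sizeof(head));
+}
+```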
+ +Link: https://elixir.bootlin.com/glibc/glibc-2.35/source/nptl/allocatestack.c#L370 [1] +Link: https://lkml.kernel.org/r/20220414144042.677008-1-npache@redhat.com +Fixes: 212925802454 ("mm: oom: let oom_reap_task and exit_mmap run concurrently") +Signed-off-by: Joel Savitz +Signed-off-by: Nico Pache +Co-developed-by: Joel Savitz +Suggested-by: Thomas Gleixner +Acked-by: Thomas Gleixner +Acked-by: Michal Hocko +Cc: Rafael Aquini +Cc: Waiman Long +Cc: Herton R. Krzesinski +Cc: Juri Lelli +Cc: Vincent Guittot +Cc: Dietmar Eggemann +Cc: Steven Rostedt +Cc: Ben Segall +Cc: Mel Gorman +Cc: Daniel Bristot de Oliveira +Cc: David Rientjes +Cc: Andrea Arcangeli +Cc: Davidlohr Bueso +Cc: Peter Zijlstra +Cc: Ingo Molnar +Cc: Joel Savitz +Cc: Darren Hart +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman +--- + include/linux/sched.h | 1 + mm/oom_kill.c | 54 +++++++++++++++++++++++++++++++++++++------------- + 2 files changed, 41 insertions(+), 14 deletions(-) + +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -1436,6 +1436,7 @@ struct task_struct { + int pagefault_disabled; + #ifdef CONFIG_MMU + struct task_struct *oom_reaper_list; ++ struct timer_list oom_reaper_timer; + #endif + #ifdef CONFIG_VMAP_STACK + struct vm_struct *stack_vm_area; +--- a/mm/oom_kill.c ++++ b/mm/oom_kill.c +@@ -635,7 +635,7 @@ done: + */ + set_bit(MMF_OOM_SKIP, &mm->flags); + +- /* Drop a reference taken by wake_oom_reaper */ ++ /* Drop a reference taken by queue_oom_reaper */ + put_task_struct(tsk); + } + +@@ -645,12 +645,12 @@ static int oom_reaper(void *unused) + struct task_struct *tsk = NULL; + + wait_event_freezable(oom_reaper_wait, oom_reaper_list != NULL); +- spin_lock(&oom_reaper_lock); ++ spin_lock_irq(&oom_reaper_lock); + if (oom_reaper_list != NULL) { + tsk = oom_reaper_list; + oom_reaper_list = tsk->oom_reaper_list; + } +- spin_unlock(&oom_reaper_lock); ++ spin_unlock_irq(&oom_reaper_lock); + + if (tsk) + oom_reap_task(tsk); +@@ -659,22 +659,48 @@ static int oom_reaper(void *unused) + return 0; + } + +-static void wake_oom_reaper(struct task_struct *tsk) ++static void wake_oom_reaper(struct timer_list *timer) + { +- /* mm is already queued? */ +- if (test_and_set_bit(MMF_OOM_REAP_QUEUED, &tsk->signal->oom_mm->flags)) ++ struct task_struct *tsk = container_of(timer, struct task_struct, ++ oom_reaper_timer); ++ struct mm_struct *mm = tsk->signal->oom_mm; ++ unsigned long flags; ++ ++ /* The victim managed to terminate on its own - see exit_mmap */ ++ if (test_bit(MMF_OOM_SKIP, &mm->flags)) { ++ put_task_struct(tsk); + return; ++ } + +- get_task_struct(tsk); +- +- spin_lock(&oom_reaper_lock); ++ spin_lock_irqsave(&oom_reaper_lock, flags); + tsk->oom_reaper_list = oom_reaper_list; + oom_reaper_list = tsk; +- spin_unlock(&oom_reaper_lock); ++ spin_unlock_irqrestore(&oom_reaper_lock, flags); + trace_wake_reaper(tsk->pid); + wake_up(&oom_reaper_wait); + } + ++/* ++ * Give the OOM victim time to exit naturally before invoking the oom_reaping. ++ * The timers timeout is arbitrary... the longer it is, the longer the worst ++ * case scenario for the OOM can take. If it is too small, the oom_reaper can ++ * get in the way and release resources needed by the process exit path. ++ * e.g. The futex robust list can sit in Anon|Private memory that gets reaped ++ * before the exit path is able to wake the futex waiters. ++ */ ++#define OOM_REAPER_DELAY (2*HZ) ++static void queue_oom_reaper(struct task_struct *tsk) ++{ ++ /* mm is already queued? 
*/ ++ if (test_and_set_bit(MMF_OOM_REAP_QUEUED, &tsk->signal->oom_mm->flags)) ++ return; ++ ++ get_task_struct(tsk); ++ timer_setup(&tsk->oom_reaper_timer, wake_oom_reaper, 0); ++ tsk->oom_reaper_timer.expires = jiffies + OOM_REAPER_DELAY; ++ add_timer(&tsk->oom_reaper_timer); ++} ++ + static int __init oom_init(void) + { + oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper"); +@@ -682,7 +708,7 @@ static int __init oom_init(void) + } + subsys_initcall(oom_init) + #else +-static inline void wake_oom_reaper(struct task_struct *tsk) ++static inline void queue_oom_reaper(struct task_struct *tsk) + { + } + #endif /* CONFIG_MMU */ +@@ -933,7 +959,7 @@ static void __oom_kill_process(struct ta + rcu_read_unlock(); + + if (can_oom_reap) +- wake_oom_reaper(victim); ++ queue_oom_reaper(victim); + + mmdrop(mm); + put_task_struct(victim); +@@ -969,7 +995,7 @@ static void oom_kill_process(struct oom_ + task_lock(victim); + if (task_will_free_mem(victim)) { + mark_oom_victim(victim); +- wake_oom_reaper(victim); ++ queue_oom_reaper(victim); + task_unlock(victim); + put_task_struct(victim); + return; +@@ -1067,7 +1093,7 @@ bool out_of_memory(struct oom_control *o + */ + if (task_will_free_mem(current)) { + mark_oom_victim(current); +- wake_oom_reaper(current); ++ queue_oom_reaper(current); + return true; + } + diff --git a/queue-5.15/series b/queue-5.15/series index f86bbf7bb68..bc761162613 100644 --- a/queue-5.15/series +++ b/queue-5.15/series @@ -74,3 +74,13 @@ vfs-filename_create-fix-incorrect-intent.patch nvme-add-a-quirk-to-disable-namespace-identifiers.patch nvme-pci-disable-namespace-identifiers-for-the-maxio.patch nvme-pci-disable-namespace-identifiers-for-qemu-cont.patch +edac-synopsys-read-the-error-count-from-the-correct-register.patch +mm-memory-failure.c-skip-huge_zero_page-in-memory_failure.patch +memcg-sync-flush-only-if-periodic-flush-is-delayed.patch +mm-hugetlb-allow-for-high-userspace-addresses.patch +oom_kill.c-futex-delay-the-oom-reaper-to-allow-time-for-proper-futex-cleanup.patch +mm-mmu_notifier.c-fix-race-in-mmu_interval_notifier_remove.patch +ata-pata_marvell-check-the-bmdma_addr-beforing-reading.patch +dma-at_xdmac-fix-a-missing-check-on-list-iterator.patch +dmaengine-imx-sdma-fix-init-of-uart-scripts.patch +net-atlantic-invert-deep-par-in-pm-functions-preventing-null-derefs.patch -- 2.47.3