From 1301dcfc13fa9ace680bb006f918d1f12c5a005c Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman
Date: Mon, 25 Apr 2022 12:27:35 +0200
Subject: [PATCH] 5.10-stable patches

added patches:
  ata-pata_marvell-check-the-bmdma_addr-beforing-reading.patch
  dma-at_xdmac-fix-a-missing-check-on-list-iterator.patch
  edac-synopsys-read-the-error-count-from-the-correct-register.patch
  mm-hugetlb-allow-for-high-userspace-addresses.patch
  mm-mmu_notifier.c-fix-race-in-mmu_interval_notifier_remove.patch
  net-atlantic-invert-deep-par-in-pm-functions-preventing-null-derefs.patch
  oom_kill.c-futex-delay-the-oom-reaper-to-allow-time-for-proper-futex-cleanup.patch
---
 ...heck-the-bmdma_addr-beforing-reading.patch |  40 ++++
 ...fix-a-missing-check-on-list-iterator.patch |  57 +++++
 ...rror-count-from-the-correct-register.patch |  61 ++++++
 ...b-allow-for-high-userspace-addresses.patch | 145 ++++++++++++
 ...race-in-mmu_interval_notifier_remove.patch |  84 +++++++
 ...-pm-functions-preventing-null-derefs.patch |  95 ++++++++
 ...-allow-time-for-proper-futex-cleanup.patch | 206 ++++++++++++++++++
 queue-5.10/series                             |   7 +
 8 files changed, 695 insertions(+)
 create mode 100644 queue-5.10/ata-pata_marvell-check-the-bmdma_addr-beforing-reading.patch
 create mode 100644 queue-5.10/dma-at_xdmac-fix-a-missing-check-on-list-iterator.patch
 create mode 100644 queue-5.10/edac-synopsys-read-the-error-count-from-the-correct-register.patch
 create mode 100644 queue-5.10/mm-hugetlb-allow-for-high-userspace-addresses.patch
 create mode 100644 queue-5.10/mm-mmu_notifier.c-fix-race-in-mmu_interval_notifier_remove.patch
 create mode 100644 queue-5.10/net-atlantic-invert-deep-par-in-pm-functions-preventing-null-derefs.patch
 create mode 100644 queue-5.10/oom_kill.c-futex-delay-the-oom-reaper-to-allow-time-for-proper-futex-cleanup.patch

diff --git a/queue-5.10/ata-pata_marvell-check-the-bmdma_addr-beforing-reading.patch b/queue-5.10/ata-pata_marvell-check-the-bmdma_addr-beforing-reading.patch
new file mode 100644
index 00000000000..c79f199f64d
--- /dev/null
+++ b/queue-5.10/ata-pata_marvell-check-the-bmdma_addr-beforing-reading.patch
@@ -0,0 +1,40 @@
+From aafa9f958342db36c17ac2a7f1b841032c96feb4 Mon Sep 17 00:00:00 2001
+From: Zheyu Ma
+Date: Thu, 21 Apr 2022 09:39:20 +0800
+Subject: ata: pata_marvell: Check the 'bmdma_addr' beforing reading
+
+From: Zheyu Ma
+
+commit aafa9f958342db36c17ac2a7f1b841032c96feb4 upstream.
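
The whole change comes down to one guard: never read through an optional
BMDMA base that was never assigned. A minimal user-space sketch of that
pattern, with simplified stand-in types rather than the real libata API:

```
/*
 * Hedged sketch of the guard this patch adds: bail out before reading
 * through an optional BMDMA base that was never set. The types and
 * constants are simplified stand-ins, not the real libata ones.
 */
#include <stddef.h>
#include <stdio.h>

enum cable_type { CBL_UNK, CBL_PATA40, CBL_PATA80 };

struct port {
	const unsigned char *bmdma_addr;	/* NULL if DMA is unsupported */
};

static enum cable_type cable_detect(const struct port *ap)
{
	if (!ap->bmdma_addr)			/* the added check */
		return CBL_UNK;
	return (ap->bmdma_addr[1] & 1) ? CBL_PATA40 : CBL_PATA80;
}

int main(void)
{
	const unsigned char regs[2] = { 0x00, 0x01 };
	struct port no_dma = { NULL }, with_dma = { regs };

	/* prints "0 1": the DMA-less port reports an unknown cable
	 * instead of faulting on the read */
	printf("%d %d\n", cable_detect(&no_dma), cable_detect(&with_dma));
	return 0;
}
```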
+
+Before detecting the cable type on the dma bar, the driver should check
+whether 'bmdma_addr' is zero, which means the adapter does not support
+DMA; otherwise we will get the following error:
+
+[    5.146634] Bad IO access at port 0x1 (return inb(port))
+[    5.147206] WARNING: CPU: 2 PID: 303 at lib/iomap.c:44 ioread8+0x4a/0x60
+[    5.150856] RIP: 0010:ioread8+0x4a/0x60
+[    5.160238] Call Trace:
+[    5.160470]  <TASK>
+[    5.160674]  marvell_cable_detect+0x6e/0xc0 [pata_marvell]
+[    5.161728]  ata_eh_recover+0x3520/0x6cc0
+[    5.168075]  ata_do_eh+0x49/0x3c0
+
+Signed-off-by: Zheyu Ma
+Signed-off-by: Damien Le Moal
+Signed-off-by: Greg Kroah-Hartman
+---
+ drivers/ata/pata_marvell.c |    2 ++
+ 1 file changed, 2 insertions(+)
+
+--- a/drivers/ata/pata_marvell.c
++++ b/drivers/ata/pata_marvell.c
+@@ -83,6 +83,8 @@ static int marvell_cable_detect(struct a
+ 	switch(ap->port_no)
+ 	{
+ 	case 0:
++		if (!ap->ioaddr.bmdma_addr)
++			return ATA_CBL_PATA_UNK;
+ 		if (ioread8(ap->ioaddr.bmdma_addr + 1) & 1)
+ 			return ATA_CBL_PATA40;
+ 		return ATA_CBL_PATA80;
diff --git a/queue-5.10/dma-at_xdmac-fix-a-missing-check-on-list-iterator.patch b/queue-5.10/dma-at_xdmac-fix-a-missing-check-on-list-iterator.patch
new file mode 100644
index 00000000000..25b5d4de86a
--- /dev/null
+++ b/queue-5.10/dma-at_xdmac-fix-a-missing-check-on-list-iterator.patch
@@ -0,0 +1,57 @@
+From 206680c4e46b62fd8909385e0874a36952595b85 Mon Sep 17 00:00:00 2001
+From: Xiaomeng Tong
+Date: Sun, 27 Mar 2022 14:11:54 +0800
+Subject: dma: at_xdmac: fix a missing check on list iterator
+
+From: Xiaomeng Tong
+
+commit 206680c4e46b62fd8909385e0874a36952595b85 upstream.
+
+The bug is here:
+	__func__, desc, &desc->tx_dma_desc.phys, ret, cookie, residue);
+
+The list iterator 'desc' will point to a bogus position containing
+HEAD if the list is empty or no element is found. To keep dev_dbg()
+from printing an invalid address, use a new variable 'iter' as the
+list iterator, and use the original variable 'desc' as a dedicated
+pointer to the found element.
+
+Cc: stable@vger.kernel.org
+Fixes: 82e2424635f4c ("dmaengine: xdmac: fix print warning on dma_addr_t variable")
+Signed-off-by: Xiaomeng Tong
+Link: https://lore.kernel.org/r/20220327061154.4867-1-xiam0nd.tong@gmail.com
+Signed-off-by: Vinod Koul
+Signed-off-by: Greg Kroah-Hartman
+---
+ drivers/dma/at_xdmac.c |   12 +++++++-----
+ 1 file changed, 7 insertions(+), 5 deletions(-)
+
+--- a/drivers/dma/at_xdmac.c
++++ b/drivers/dma/at_xdmac.c
+@@ -1390,7 +1390,7 @@ at_xdmac_tx_status(struct dma_chan *chan
+ {
+ 	struct at_xdmac_chan	*atchan = to_at_xdmac_chan(chan);
+ 	struct at_xdmac		*atxdmac = to_at_xdmac(atchan->chan.device);
+-	struct at_xdmac_desc	*desc, *_desc;
++	struct at_xdmac_desc	*desc, *_desc, *iter;
+ 	struct list_head	*descs_list;
+ 	enum dma_status		ret;
+ 	int			residue, retry;
+@@ -1505,11 +1505,13 @@ at_xdmac_tx_status(struct dma_chan
+ 	 * microblock.
+ 	 */
+ 	descs_list = &desc->descs_list;
+-	list_for_each_entry_safe(desc, _desc, descs_list, desc_node) {
+-		dwidth = at_xdmac_get_dwidth(desc->lld.mbr_cfg);
+-		residue -= (desc->lld.mbr_ubc & 0xffffff) << dwidth;
+-		if ((desc->lld.mbr_nda & 0xfffffffc) == cur_nda)
++	list_for_each_entry_safe(iter, _desc, descs_list, desc_node) {
++		dwidth = at_xdmac_get_dwidth(iter->lld.mbr_cfg);
++		residue -= (iter->lld.mbr_ubc & 0xffffff) << dwidth;
++		if ((iter->lld.mbr_nda & 0xfffffffc) == cur_nda) {
++			desc = iter;
+ 			break;
++		}
+ 	}
+ 	residue += cur_ubc << dwidth;
+
diff --git a/queue-5.10/edac-synopsys-read-the-error-count-from-the-correct-register.patch b/queue-5.10/edac-synopsys-read-the-error-count-from-the-correct-register.patch
new file mode 100644
index 00000000000..7d6c370eeaf
--- /dev/null
+++ b/queue-5.10/edac-synopsys-read-the-error-count-from-the-correct-register.patch
@@ -0,0 +1,61 @@
+From e2932d1f6f055b2af2114c7e64a26dc1b5593d0c Mon Sep 17 00:00:00 2001
+From: Shubhrajyoti Datta
+Date: Thu, 14 Apr 2022 15:58:13 +0530
+Subject: EDAC/synopsys: Read the error count from the correct register
+
+From: Shubhrajyoti Datta
+
+commit e2932d1f6f055b2af2114c7e64a26dc1b5593d0c upstream.
+
+Currently, the error count is read wrongly from the status register. Read
+the count from the proper error count register (ERRCNT).
+
+  [ bp: Massage. ]
+
+Fixes: b500b4a029d5 ("EDAC, synopsys: Add ECC support for ZynqMP DDR controller")
+Signed-off-by: Shubhrajyoti Datta
+Signed-off-by: Borislav Petkov
+Acked-by: Michal Simek
+Cc:
+Link: https://lore.kernel.org/r/20220414102813.4468-1-shubhrajyoti.datta@xilinx.com
+Signed-off-by: Greg Kroah-Hartman
+---
+ drivers/edac/synopsys_edac.c |   16 +++++++++++-----
+ 1 file changed, 11 insertions(+), 5 deletions(-)
+
+--- a/drivers/edac/synopsys_edac.c
++++ b/drivers/edac/synopsys_edac.c
+@@ -163,6 +163,11 @@
+ #define ECC_STAT_CECNT_SHIFT		8
+ #define ECC_STAT_BITNUM_MASK		0x7F
+
++/* ECC error count register definitions */
++#define ECC_ERRCNT_UECNT_MASK		0xFFFF0000
++#define ECC_ERRCNT_UECNT_SHIFT		16
++#define ECC_ERRCNT_CECNT_MASK		0xFFFF
++
+ /* DDR QOS Interrupt register definitions */
+ #define DDR_QOS_IRQ_STAT_OFST		0x20200
+ #define DDR_QOSUE_MASK			0x4
+
+@@ -418,15 +423,16 @@ static int zynqmp_get_error_info(struct
+ 	base = priv->baseaddr;
+ 	p = &priv->stat;
+
++	regval = readl(base + ECC_ERRCNT_OFST);
++	p->ce_cnt = regval & ECC_ERRCNT_CECNT_MASK;
++	p->ue_cnt = (regval & ECC_ERRCNT_UECNT_MASK) >> ECC_ERRCNT_UECNT_SHIFT;
++	if (!p->ce_cnt)
++		goto ue_err;
++
+ 	regval = readl(base + ECC_STAT_OFST);
+ 	if (!regval)
+ 		return 1;
+
+-	p->ce_cnt = (regval & ECC_STAT_CECNT_MASK) >> ECC_STAT_CECNT_SHIFT;
+-	p->ue_cnt = (regval & ECC_STAT_UECNT_MASK) >> ECC_STAT_UECNT_SHIFT;
+-	if (!p->ce_cnt)
+-		goto ue_err;
+-
+ 	p->ceinfo.bitpos = (regval & ECC_STAT_BITNUM_MASK);
+
+ 	regval = readl(base + ECC_CEADDR0_OFST);
diff --git a/queue-5.10/mm-hugetlb-allow-for-high-userspace-addresses.patch b/queue-5.10/mm-hugetlb-allow-for-high-userspace-addresses.patch
new file mode 100644
index 00000000000..50541d66623
--- /dev/null
+++ b/queue-5.10/mm-hugetlb-allow-for-high-userspace-addresses.patch
@@ -0,0 +1,145 @@
+From 5f24d5a579d1eace79d505b148808a850b417d4c Mon Sep 17 00:00:00 2001
+From: Christophe Leroy
+Date: Thu, 21 Apr 2022 16:35:46 -0700
+Subject: mm, hugetlb: allow for "high" userspace addresses
+
+From: Christophe Leroy
+
+commit 5f24d5a579d1eace79d505b148808a850b417d4c upstream.
+
+This is a fix for commit f6795053dac8 ("mm: mmap: Allow for "high"
+userspace addresses") for hugetlb.
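
An aside on the at_xdmac change above: the iterator pitfall it fixes is
worth a standalone sketch. This is a hedged model transposed to a plain
array scan, with hypothetical names rather than the dmaengine code; the
point is that after a search loop only a dedicated "found" pointer may be
dereferenced, never the loop cursor:

```
/*
 * Hedged model of the list-iterator pitfall: keep the loop cursor and
 * the "found element" pointer separate, and test the latter for NULL.
 * All names here are hypothetical, not the at_xdmac ones.
 */
#include <stdio.h>

struct desc { unsigned int nda; };

static struct desc *find_desc(struct desc *table, int n, unsigned int nda)
{
	struct desc *found = NULL;	/* plays the role of 'desc' */

	for (int i = 0; i < n; i++) {	/* 'i' plays the role of 'iter' */
		if (table[i].nda == nda) {
			found = &table[i];
			break;
		}
	}
	return found;			/* NULL when nothing matched */
}

int main(void)
{
	struct desc table[] = { { 0x10 }, { 0x20 } };
	struct desc *d = find_desc(table, 2, 0x30);

	/* the fixed code checks before printing; so do we */
	if (d)
		printf("found nda=%#x\n", d->nda);
	else
		puts("no match; nothing to dereference");
	return 0;
}
```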
+
+This patch adds support for "high" userspace addresses that are
+optionally supported on the system and have to be requested via a hint
+mechanism ("high" addr parameter to mmap).
+
+Architectures such as powerpc and x86 achieve this by making changes to
+their architectural versions of the hugetlb_get_unmapped_area() function.
+However, arm64 uses the generic version of that function.
+
+So take into account arch_get_mmap_base() and arch_get_mmap_end() in
+hugetlb_get_unmapped_area(). To allow that, move those two macros out
+of mm/mmap.c into include/linux/sched/mm.h.
+
+If these macros are not defined in architectural code then they default
+to (TASK_SIZE) and (base) so should not introduce any behavioural
+changes to architectures that do not define them.
+
+For the time being, only ARM64 is affected by this change.
+
+Catalin (ARM64) said
+ "We should have fixed hugetlb_get_unmapped_area() as well when we added
+  support for 52-bit VA. The reason for commit f6795053dac8 was to
+  prevent normal mmap() from returning addresses above 48-bit by default
+  as some user-space had hard assumptions about this.
+
+  It's a slight ABI change if you do this for hugetlb_get_unmapped_area()
+  but I doubt anyone would notice. It's more likely that the current
+  behaviour would cause issues, so I'd rather have them consistent.
+
+  Basically when arm64 gained support for 52-bit addresses we did not
+  want user-space calling mmap() to suddenly get such high addresses,
+  otherwise we could have inadvertently broken some programs (similar
+  behaviour to x86 here). Hence we added commit f6795053dac8. But we
+  missed hugetlbfs which could still get such high mmap() addresses. So
+  in theory that's a potential regression that should have been addressed
+  at the same time as commit f6795053dac8 (and before arm64 enabled
+  52-bit addresses)"
+
+Link: https://lkml.kernel.org/r/ab847b6edb197bffdfe189e70fb4ac76bfe79e0d.1650033747.git.christophe.leroy@csgroup.eu
+Fixes: f6795053dac8 ("mm: mmap: Allow for "high" userspace addresses")
+Signed-off-by: Christophe Leroy
+Reviewed-by: Catalin Marinas
+Cc: Steve Capper
+Cc: Will Deacon
+Cc: [5.0.x]
+Signed-off-by: Andrew Morton
+Signed-off-by: Linus Torvalds
+Signed-off-by: Greg Kroah-Hartman
+---
+ fs/hugetlbfs/inode.c     |    9 +++++----
+ include/linux/sched/mm.h |    8 ++++++++
+ mm/mmap.c                |    8 --------
+ 3 files changed, 13 insertions(+), 12 deletions(-)
+
+--- a/fs/hugetlbfs/inode.c
++++ b/fs/hugetlbfs/inode.c
+@@ -206,7 +206,7 @@ hugetlb_get_unmapped_area_bottomup(struc
+ 	info.flags = 0;
+ 	info.length = len;
+ 	info.low_limit = current->mm->mmap_base;
+-	info.high_limit = TASK_SIZE;
++	info.high_limit = arch_get_mmap_end(addr);
+ 	info.align_mask = PAGE_MASK & ~huge_page_mask(h);
+ 	info.align_offset = 0;
+ 	return vm_unmapped_area(&info);
+@@ -222,7 +222,7 @@ hugetlb_get_unmapped_area_topdown(struct
+ 	info.flags = VM_UNMAPPED_AREA_TOPDOWN;
+ 	info.length = len;
+ 	info.low_limit = max(PAGE_SIZE, mmap_min_addr);
+-	info.high_limit = current->mm->mmap_base;
++	info.high_limit = arch_get_mmap_base(addr, current->mm->mmap_base);
+ 	info.align_mask = PAGE_MASK & ~huge_page_mask(h);
+ 	info.align_offset = 0;
+ 	addr = vm_unmapped_area(&info);
+@@ -237,7 +237,7 @@ hugetlb_get_unmapped_area_topdown(struct
+ 		VM_BUG_ON(addr != -ENOMEM);
+ 		info.flags = 0;
+ 		info.low_limit = current->mm->mmap_base;
+-		info.high_limit = TASK_SIZE;
++		info.high_limit = arch_get_mmap_end(addr);
+ 		addr = vm_unmapped_area(&info);
+ 	}
+
+@@ -251,6 +251,7 @@ hugetlb_get_unmapped_area(struct file *f
+ 	struct mm_struct *mm = current->mm;
+ 	struct vm_area_struct *vma;
+ 	struct hstate *h = hstate_file(file);
++	const unsigned long mmap_end = arch_get_mmap_end(addr);
+
+ 	if (len & ~huge_page_mask(h))
+ 		return -EINVAL;
+@@ -266,7 +267,7 @@ hugetlb_get_unmapped_area(struct file *f
+ 	if (addr) {
+ 		addr = ALIGN(addr, huge_page_size(h));
+ 		vma = find_vma(mm, addr);
+-		if (TASK_SIZE - len >= addr &&
++		if (mmap_end - len >= addr &&
+ 		    (!vma || addr + len <= vm_start_gap(vma)))
+ 			return addr;
+ 	}
+--- a/include/linux/sched/mm.h
++++ b/include/linux/sched/mm.h
+@@ -106,6 +106,14 @@ static inline void mm_update_next_owner(
+ #endif /* CONFIG_MEMCG */
+
+ #ifdef CONFIG_MMU
++#ifndef arch_get_mmap_end
++#define arch_get_mmap_end(addr)	(TASK_SIZE)
++#endif
++
++#ifndef arch_get_mmap_base
++#define arch_get_mmap_base(addr, base)	(base)
++#endif
++
+ extern void arch_pick_mmap_layout(struct mm_struct *mm,
+ 				  struct rlimit *rlim_stack);
+ extern unsigned long
+--- a/mm/mmap.c
++++ b/mm/mmap.c
+@@ -2140,14 +2140,6 @@ unsigned long vm_unmapped_area(struct vm
+ 	return addr;
+ }
+
+-#ifndef arch_get_mmap_end
+-#define arch_get_mmap_end(addr)	(TASK_SIZE)
+-#endif
+-
+-#ifndef arch_get_mmap_base
+-#define arch_get_mmap_base(addr, base)	(base)
+-#endif
+-
+ /* Get an address range which is currently unmapped.
+  * For shmat() with addr=0.
+  *
diff --git a/queue-5.10/mm-mmu_notifier.c-fix-race-in-mmu_interval_notifier_remove.patch b/queue-5.10/mm-mmu_notifier.c-fix-race-in-mmu_interval_notifier_remove.patch
new file mode 100644
index 00000000000..27502266eb3
--- /dev/null
+++ b/queue-5.10/mm-mmu_notifier.c-fix-race-in-mmu_interval_notifier_remove.patch
@@ -0,0 +1,84 @@
+From 319561669a59d8e9206ab311ae5433ef92fd79d1 Mon Sep 17 00:00:00 2001
+From: Alistair Popple
+Date: Thu, 21 Apr 2022 16:36:10 -0700
+Subject: mm/mmu_notifier.c: fix race in mmu_interval_notifier_remove()
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Alistair Popple
+
+commit 319561669a59d8e9206ab311ae5433ef92fd79d1 upstream.
+
+In some cases it is possible for mmu_interval_notifier_remove() to race
+with mn_tree_inv_end() allowing it to return while the notifier data
+structure is still in use. Consider the following sequence:
+
+  CPU0 - mn_tree_inv_end()            CPU1 - mmu_interval_notifier_remove()
+  ----------------------------------- ------------------------------------
+                                      spin_lock(subscriptions->lock);
+                                      seq = subscriptions->invalidate_seq;
+  spin_lock(subscriptions->lock);     spin_unlock(subscriptions->lock);
+  subscriptions->invalidate_seq++;
+                                      wait_event(invalidate_seq != seq);
+                                      return;
+  interval_tree_remove(interval_sub); kfree(interval_sub);
+  spin_unlock(subscriptions->lock);
+  wake_up_all();
+
+As the wait_event() condition is true it will return immediately. This
+can lead to use-after-free type errors if the caller frees the data
+structure containing the interval notifier subscription while it is
+still on a deferred list. Fix this by taking the appropriate lock when
+reading invalidate_seq to ensure proper synchronisation.
+
+I observed this whilst running stress testing during some development.
+You do have to be pretty unlucky, but it leads to the usual problems of
+use-after-free (memory corruption, kernel crash, difficult to diagnose
+WARN_ON, etc).
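
A hedged user-space model of the rule the fix applies: re-read the
sequence counter under the same lock the updater holds, so a waiter can
no longer observe the bumped value while the updater is still touching
the shared structure. pthreads stand in for the kernel spinlock, and the
wait_event() loop itself is omitted; the names are hypothetical:

```
/*
 * Hedged model of the locked condition check added by the patch.
 * seq_released() mirrors mmu_interval_seq_released(): the comparison
 * happens only while holding the lock the updater also takes.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct subscriptions {
	pthread_mutex_t lock;
	unsigned long invalidate_seq;
};

static bool seq_released(struct subscriptions *s, unsigned long seq)
{
	bool ret;

	pthread_mutex_lock(&s->lock);
	ret = s->invalidate_seq != seq;
	pthread_mutex_unlock(&s->lock);
	return ret;
}

int main(void)
{
	struct subscriptions s = { PTHREAD_MUTEX_INITIALIZER, 1 };
	unsigned long seq = s.invalidate_seq;

	printf("released before bump: %d\n", seq_released(&s, seq)); /* 0 */
	s.invalidate_seq++;	/* the updater bumps it (locked in real code) */
	printf("released after bump:  %d\n", seq_released(&s, seq)); /* 1 */
	return 0;
}
```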
+
+Link: https://lkml.kernel.org/r/20220420043734.476348-1-apopple@nvidia.com
+Fixes: 99cb252f5e68 ("mm/mmu_notifier: add an interval tree notifier")
+Signed-off-by: Alistair Popple
+Signed-off-by: Jason Gunthorpe
+Cc: Christian König
+Cc: John Hubbard
+Cc: Ralph Campbell
+Cc:
+Signed-off-by: Andrew Morton
+Signed-off-by: Linus Torvalds
+Signed-off-by: Greg Kroah-Hartman
+---
+ mm/mmu_notifier.c |   14 +++++++++++++-
+ 1 file changed, 13 insertions(+), 1 deletion(-)
+
+--- a/mm/mmu_notifier.c
++++ b/mm/mmu_notifier.c
+@@ -1043,6 +1043,18 @@ int mmu_interval_notifier_insert_locked(
+ }
+ EXPORT_SYMBOL_GPL(mmu_interval_notifier_insert_locked);
+
++static bool
++mmu_interval_seq_released(struct mmu_notifier_subscriptions *subscriptions,
++			  unsigned long seq)
++{
++	bool ret;
++
++	spin_lock(&subscriptions->lock);
++	ret = subscriptions->invalidate_seq != seq;
++	spin_unlock(&subscriptions->lock);
++	return ret;
++}
++
+ /**
+  * mmu_interval_notifier_remove - Remove a interval notifier
+  * @interval_sub: Interval subscription to unregister
+@@ -1090,7 +1102,7 @@ void mmu_interval_notifier_remove(struct
+ 	lock_map_release(&__mmu_notifier_invalidate_range_start_map);
+ 	if (seq)
+ 		wait_event(subscriptions->wq,
+-			   READ_ONCE(subscriptions->invalidate_seq) != seq);
++			   mmu_interval_seq_released(subscriptions, seq));
+
+ 	/* pairs with mmgrab in mmu_interval_notifier_insert() */
+ 	mmdrop(mm);
diff --git a/queue-5.10/net-atlantic-invert-deep-par-in-pm-functions-preventing-null-derefs.patch b/queue-5.10/net-atlantic-invert-deep-par-in-pm-functions-preventing-null-derefs.patch
new file mode 100644
index 00000000000..8f4ebbb569a
--- /dev/null
+++ b/queue-5.10/net-atlantic-invert-deep-par-in-pm-functions-preventing-null-derefs.patch
@@ -0,0 +1,95 @@
+From cbe6c3a8f8f4315b96e46e1a1c70393c06d95a4c Mon Sep 17 00:00:00 2001
+From: Manuel Ullmann
+Date: Mon, 18 Apr 2022 00:20:01 +0200
+Subject: net: atlantic: invert deep par in pm functions, preventing null derefs
+
+From: Manuel Ullmann
+
+commit cbe6c3a8f8f4315b96e46e1a1c70393c06d95a4c upstream.
+
+This will reset deeply on freeze and thaw instead of suspend and
+resume and prevent null pointer dereferences of the uninitialized ring
+0 buffer while thawing.
+
+The impact is an indefinitely hanging kernel. You can't switch
+consoles after this and the only possible user interaction is SysRq.
+
+BUG: kernel NULL pointer dereference
+RIP: 0010:aq_ring_rx_fill+0xcf/0x210 [atlantic]
+aq_vec_init+0x85/0xe0 [atlantic]
+aq_nic_init+0xf7/0x1d0 [atlantic]
+atl_resume_common+0x4f/0x100 [atlantic]
+pci_pm_thaw+0x42/0xa0
+
+resolves in aq_ring.o to
+
+```
+0000000000000ae0 <aq_ring_rx_fill>:
+{
+/* ... */
+ baf:	48 8b 43 08          	mov    0x8(%rbx),%rax
+	buff->flags = 0U; /* buff is NULL */
+```
+
+The bug has been present since the introduction of the new pm code in
+8aaa112a57c1 ("net: atlantic: refactoring pm logic") and was hidden
+until 8ce84271697a ("net: atlantic: changes for multi-TC support"),
+which refactored the aq_vec_{free,alloc} functions into
+aq_vec_{,ring}_{free,alloc}, but is technically not wrong. The
+original functions just always reinitialized the buffers on S3/S4. If
+the interface is down before freezing, the bug does not occur. It does
+not matter whether the initrd contains and loads the module before
+thawing.
+
+So the fix is to invert the boolean parameter 'deep' in all PM function
+calls, which was clearly intended to be set like that.
+
+The first report was on GitHub [1], which you have to guess from the
+resume logs in the posted dmesg snippet.
+Recently I posted one on Bugzilla [2], since I did not previously have
+an AQC device.
+
+#regzbot introduced: 8ce84271697a
+#regzbot from: koo5
+#regzbot monitor: https://github.com/Aquantia/AQtion/issues/32
+
+Fixes: 8aaa112a57c1 ("net: atlantic: refactoring pm logic")
+Link: https://github.com/Aquantia/AQtion/issues/32 [1]
+Link: https://bugzilla.kernel.org/show_bug.cgi?id=215798 [2]
+Cc: stable@vger.kernel.org
+Reported-by: koo5
+Signed-off-by: Manuel Ullmann
+Signed-off-by: David S. Miller
+Signed-off-by: Greg Kroah-Hartman
+---
+ drivers/net/ethernet/aquantia/atlantic/aq_pci_func.c |    8 ++++----
+ 1 file changed, 4 insertions(+), 4 deletions(-)
+
+--- a/drivers/net/ethernet/aquantia/atlantic/aq_pci_func.c
++++ b/drivers/net/ethernet/aquantia/atlantic/aq_pci_func.c
+@@ -450,22 +450,22 @@ err_exit:
+
+ static int aq_pm_freeze(struct device *dev)
+ {
+-	return aq_suspend_common(dev, false);
++	return aq_suspend_common(dev, true);
+ }
+
+ static int aq_pm_suspend_poweroff(struct device *dev)
+ {
+-	return aq_suspend_common(dev, true);
++	return aq_suspend_common(dev, false);
+ }
+
+ static int aq_pm_thaw(struct device *dev)
+ {
+-	return atl_resume_common(dev, false);
++	return atl_resume_common(dev, true);
+ }
+
+ static int aq_pm_resume_restore(struct device *dev)
+ {
+-	return atl_resume_common(dev, true);
++	return atl_resume_common(dev, false);
+ }
+
+ static const struct dev_pm_ops aq_pm_ops = {
diff --git a/queue-5.10/oom_kill.c-futex-delay-the-oom-reaper-to-allow-time-for-proper-futex-cleanup.patch b/queue-5.10/oom_kill.c-futex-delay-the-oom-reaper-to-allow-time-for-proper-futex-cleanup.patch
new file mode 100644
index 00000000000..58f1d2b162d
--- /dev/null
+++ b/queue-5.10/oom_kill.c-futex-delay-the-oom-reaper-to-allow-time-for-proper-futex-cleanup.patch
@@ -0,0 +1,206 @@
+From e4a38402c36e42df28eb1a5394be87e6571fb48a Mon Sep 17 00:00:00 2001
+From: Nico Pache
+Date: Thu, 21 Apr 2022 16:36:01 -0700
+Subject: oom_kill.c: futex: delay the OOM reaper to allow time for proper futex cleanup
+
+From: Nico Pache
+
+commit e4a38402c36e42df28eb1a5394be87e6571fb48a upstream.
+
+The pthread struct is allocated on PRIVATE|ANONYMOUS memory [1] which
+can be targeted by the oom reaper. This mapping is used to store the
+futex robust list head; the kernel does not keep a copy of the robust
+list and instead references a userspace address to maintain the
+robustness during a process death.
+
+A race can occur between exit_mm and the oom reaper that allows the oom
+reaper to free the memory of the futex robust list before the exit path
+has handled the futex death:
+
+    CPU1                               CPU2
+    --------------------------------------------------------------------
+    page_fault
+    do_exit "signal"
+    wake_oom_reaper
+                                       oom_reaper
+                                       oom_reap_task_mm (invalidates mm)
+    exit_mm
+    exit_mm_release
+      futex_exit_release
+        futex_cleanup
+          exit_robust_list
+            get_user (EFAULT- can't access memory)
+
+If the get_user EFAULT's, the kernel will be unable to recover the
+waiters on the robust_list, leaving userspace mutexes hung indefinitely.
+
+Delay the OOM reaper, allowing more time for the exit path to perform
+the futex cleanup.
+
+Reproducer: https://gitlab.com/jsavitz/oom_futex_reproducer
+
+Based on a patch by Michal Hocko.
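
A hedged sketch of that scheme: arm a grace-period timer instead of
reaping at once, and skip the reap when the victim's exit path finished
first (the role of the MMF_OOM_SKIP test in the patch). A sleeping
thread stands in for the kernel timer, and all names are hypothetical:

```
/*
 * Hedged model of the delayed reap: the reaper only acts after a grace
 * period, and not at all if the victim already exited cleanly.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

#define REAPER_DELAY_SECS 2		/* mirrors OOM_REAPER_DELAY (2*HZ) */

static atomic_bool victim_exited;	/* stands in for MMF_OOM_SKIP */

static void *delayed_reaper(void *arg)
{
	(void)arg;
	sleep(REAPER_DELAY_SECS);	/* grace period for futex cleanup */
	if (atomic_load(&victim_exited))
		puts("victim exited on its own; nothing to reap");
	else
		puts("grace period expired; reaping");
	return NULL;
}

int main(void)
{
	pthread_t reaper;

	pthread_create(&reaper, NULL, delayed_reaper, NULL);
	atomic_store(&victim_exited, true);	/* the exit path wins here */
	pthread_join(reaper, NULL);
	return 0;
}
```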
+
+Link: https://elixir.bootlin.com/glibc/glibc-2.35/source/nptl/allocatestack.c#L370 [1]
+Link: https://lkml.kernel.org/r/20220414144042.677008-1-npache@redhat.com
+Fixes: 212925802454 ("mm: oom: let oom_reap_task and exit_mmap run concurrently")
+Signed-off-by: Joel Savitz
+Signed-off-by: Nico Pache
+Co-developed-by: Joel Savitz
+Suggested-by: Thomas Gleixner
+Acked-by: Thomas Gleixner
+Acked-by: Michal Hocko
+Cc: Rafael Aquini
+Cc: Waiman Long
+Cc: Herton R. Krzesinski
+Cc: Juri Lelli
+Cc: Vincent Guittot
+Cc: Dietmar Eggemann
+Cc: Steven Rostedt
+Cc: Ben Segall
+Cc: Mel Gorman
+Cc: Daniel Bristot de Oliveira
+Cc: David Rientjes
+Cc: Andrea Arcangeli
+Cc: Davidlohr Bueso
+Cc: Peter Zijlstra
+Cc: Ingo Molnar
+Cc: Joel Savitz
+Cc: Darren Hart
+Cc:
+Signed-off-by: Andrew Morton
+Signed-off-by: Linus Torvalds
+Signed-off-by: Greg Kroah-Hartman
+---
+ include/linux/sched.h |    1 +
+ mm/oom_kill.c         |   54 +++++++++++++++++++++++++++++++++++++-------------
+ 2 files changed, 41 insertions(+), 14 deletions(-)
+
+--- a/include/linux/sched.h
++++ b/include/linux/sched.h
+@@ -1325,6 +1325,7 @@ struct task_struct {
+ 	int				pagefault_disabled;
+ #ifdef CONFIG_MMU
+ 	struct task_struct		*oom_reaper_list;
++	struct timer_list		oom_reaper_timer;
+ #endif
+ #ifdef CONFIG_VMAP_STACK
+ 	struct vm_struct		*stack_vm_area;
+--- a/mm/oom_kill.c
++++ b/mm/oom_kill.c
+@@ -633,7 +633,7 @@ done:
+ 	 */
+ 	set_bit(MMF_OOM_SKIP, &mm->flags);
+
+-	/* Drop a reference taken by wake_oom_reaper */
++	/* Drop a reference taken by queue_oom_reaper */
+ 	put_task_struct(tsk);
+ }
+
+@@ -643,12 +643,12 @@ static int oom_reaper(void *unused)
+ 		struct task_struct *tsk = NULL;
+
+ 		wait_event_freezable(oom_reaper_wait, oom_reaper_list != NULL);
+-		spin_lock(&oom_reaper_lock);
++		spin_lock_irq(&oom_reaper_lock);
+ 		if (oom_reaper_list != NULL) {
+ 			tsk = oom_reaper_list;
+ 			oom_reaper_list = tsk->oom_reaper_list;
+ 		}
+-		spin_unlock(&oom_reaper_lock);
++		spin_unlock_irq(&oom_reaper_lock);
+
+ 		if (tsk)
+ 			oom_reap_task(tsk);
+@@ -657,22 +657,48 @@ static int oom_reaper(void *unused)
+ 	return 0;
+ }
+
+-static void wake_oom_reaper(struct task_struct *tsk)
++static void wake_oom_reaper(struct timer_list *timer)
+ {
+-	/* mm is already queued? */
+-	if (test_and_set_bit(MMF_OOM_REAP_QUEUED, &tsk->signal->oom_mm->flags))
++	struct task_struct *tsk = container_of(timer, struct task_struct,
++					       oom_reaper_timer);
++	struct mm_struct *mm = tsk->signal->oom_mm;
++	unsigned long flags;
++
++	/* The victim managed to terminate on its own - see exit_mmap */
++	if (test_bit(MMF_OOM_SKIP, &mm->flags)) {
++		put_task_struct(tsk);
+ 		return;
++	}
+
+-	get_task_struct(tsk);
+-
+-	spin_lock(&oom_reaper_lock);
++	spin_lock_irqsave(&oom_reaper_lock, flags);
+ 	tsk->oom_reaper_list = oom_reaper_list;
+ 	oom_reaper_list = tsk;
+-	spin_unlock(&oom_reaper_lock);
++	spin_unlock_irqrestore(&oom_reaper_lock, flags);
+ 	trace_wake_reaper(tsk->pid);
+ 	wake_up(&oom_reaper_wait);
+ }
+
++/*
++ * Give the OOM victim time to exit naturally before invoking the oom_reaping.
++ * The timers timeout is arbitrary... the longer it is, the longer the worst
++ * case scenario for the OOM can take. If it is too small, the oom_reaper can
++ * get in the way and release resources needed by the process exit path.
++ * e.g. The futex robust list can sit in Anon|Private memory that gets reaped
++ * before the exit path is able to wake the futex waiters.
++ */
++#define OOM_REAPER_DELAY (2*HZ)
++static void queue_oom_reaper(struct task_struct *tsk)
++{
++	/* mm is already queued? */
++	if (test_and_set_bit(MMF_OOM_REAP_QUEUED, &tsk->signal->oom_mm->flags))
++		return;
++
++	get_task_struct(tsk);
++	timer_setup(&tsk->oom_reaper_timer, wake_oom_reaper, 0);
++	tsk->oom_reaper_timer.expires = jiffies + OOM_REAPER_DELAY;
++	add_timer(&tsk->oom_reaper_timer);
++}
++
+ static int __init oom_init(void)
+ {
+ 	oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper");
+@@ -680,7 +706,7 @@ static int __init oom_init(void)
+ }
+ subsys_initcall(oom_init)
+ #else
+-static inline void wake_oom_reaper(struct task_struct *tsk)
++static inline void queue_oom_reaper(struct task_struct *tsk)
+ {
+ }
+ #endif /* CONFIG_MMU */
+@@ -931,7 +957,7 @@ static void __oom_kill_process(struct ta
+ 	rcu_read_unlock();
+
+ 	if (can_oom_reap)
+-		wake_oom_reaper(victim);
++		queue_oom_reaper(victim);
+
+ 	mmdrop(mm);
+ 	put_task_struct(victim);
+@@ -967,7 +993,7 @@ static void oom_kill_process(struct oom_
+ 	task_lock(victim);
+ 	if (task_will_free_mem(victim)) {
+ 		mark_oom_victim(victim);
+-		wake_oom_reaper(victim);
++		queue_oom_reaper(victim);
+ 		task_unlock(victim);
+ 		put_task_struct(victim);
+ 		return;
+@@ -1065,7 +1091,7 @@ bool out_of_memory(struct oom_control *o
+ 	 */
+ 	if (task_will_free_mem(current)) {
+ 		mark_oom_victim(current);
+-		wake_oom_reaper(current);
++		queue_oom_reaper(current);
+ 		return true;
+ 	}
+
diff --git a/queue-5.10/series b/queue-5.10/series
index 02531252248..f9ac328ab35 100644
--- a/queue-5.10/series
+++ b/queue-5.10/series
@@ -48,3 +48,10 @@ scsi-qedi-fix-failed-disconnect-handling.patch
 stat-fix-inconsistency-between-struct-stat-and-struc.patch
 nvme-add-a-quirk-to-disable-namespace-identifiers.patch
 nvme-pci-disable-namespace-identifiers-for-qemu-cont.patch
+edac-synopsys-read-the-error-count-from-the-correct-register.patch
+mm-hugetlb-allow-for-high-userspace-addresses.patch
+oom_kill.c-futex-delay-the-oom-reaper-to-allow-time-for-proper-futex-cleanup.patch
+mm-mmu_notifier.c-fix-race-in-mmu_interval_notifier_remove.patch
+ata-pata_marvell-check-the-bmdma_addr-beforing-reading.patch
+dma-at_xdmac-fix-a-missing-check-on-list-iterator.patch
+net-atlantic-invert-deep-par-in-pm-functions-preventing-null-derefs.patch
-- 
2.47.3