From: Greg Kroah-Hartman
Date: Thu, 18 Jun 2020 16:21:56 +0000 (+0200)
Subject: 4.19-stable patches
X-Git-Tag: v4.4.228~49
X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=a78e258651210c78cb377bf99fc3f0011f8e4484;p=thirdparty%2Fkernel%2Fstable-queue.git

4.19-stable patches

added patches:
	mm-initialize-deferred-pages-with-interrupts-enabled.patch
	mm-thp-make-the-thp-mapcount-atomic-against-__split_huge_pmd_locked.patch
---

diff --git a/queue-4.19/mm-initialize-deferred-pages-with-interrupts-enabled.patch b/queue-4.19/mm-initialize-deferred-pages-with-interrupts-enabled.patch
new file mode 100644
index 00000000000..1b5179b4c92
--- /dev/null
+++ b/queue-4.19/mm-initialize-deferred-pages-with-interrupts-enabled.patch
@@ -0,0 +1,108 @@
+From 3d060856adfc59afb9d029c233141334cfaba418 Mon Sep 17 00:00:00 2001
+From: Pavel Tatashin
+Date: Wed, 3 Jun 2020 15:59:24 -0700
+Subject: mm: initialize deferred pages with interrupts enabled
+
+From: Pavel Tatashin
+
+commit 3d060856adfc59afb9d029c233141334cfaba418 upstream.
+
+Initializing struct pages is a long task and keeping interrupts disabled
+for the duration of this operation introduces a number of problems.
+
+1. jiffies are not updated for a long period of time, and thus incorrect time
+   is reported. See the proposed solution and discussion here:
+   lkml/20200311123848.118638-1-shile.zhang@linux.alibaba.com
+2. It prevents further improving deferred page initialization by allowing
+   intra-node multi-threading.
+
+We are keeping interrupts disabled to solve a rather theoretical problem
+that was never observed in the real world (see 3a2d7fa8a3d5).
+
+Let's keep interrupts enabled. In case we ever encounter a scenario where
+an interrupt thread wants to allocate a large amount of memory this early
+in boot, we can deal with that by growing the zone (see deferred_grow_zone())
+by the needed amount before starting the deferred_init_memmap() threads.
+
+Before:
+[ 1.232459] node 0 initialised, 12058412 pages in 1ms
+
+After:
+[ 1.632580] node 0 initialised, 12051227 pages in 436ms
+
+Fixes: 3a2d7fa8a3d5 ("mm: disable interrupts while initializing deferred pages")
+Reported-by: Shile Zhang
+Signed-off-by: Pavel Tatashin
+Signed-off-by: Andrew Morton
+Reviewed-by: Daniel Jordan
+Reviewed-by: David Hildenbrand
+Acked-by: Michal Hocko
+Acked-by: Vlastimil Babka
+Cc: Dan Williams
+Cc: James Morris
+Cc: Kirill Tkhai
+Cc: Sasha Levin
+Cc: Yiqian Wei
+Cc: [4.17+]
+Link: http://lkml.kernel.org/r/20200403140952.17177-3-pasha.tatashin@soleen.com
+Signed-off-by: Linus Torvalds
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ include/linux/mmzone.h | 2 ++
+ mm/page_alloc.c | 19 +++++++------------
+ 2 files changed, 9 insertions(+), 12 deletions(-)
+
+--- a/include/linux/mmzone.h
++++ b/include/linux/mmzone.h
+@@ -638,6 +638,8 @@ typedef struct pglist_data {
+ /*
+ * Must be held any time you expect node_start_pfn, node_present_pages
+ * or node_spanned_pages stay constant.
++ * Also synchronizes pgdat->first_deferred_pfn during deferred page
++ * init.
+ *
+ * pgdat_resize_lock() and pgdat_resize_unlock() are provided to
+ * manipulate node_size_lock without checking for CONFIG_MEMORY_HOTPLUG
+--- a/mm/page_alloc.c
++++ b/mm/page_alloc.c
+@@ -1586,6 +1586,13 @@ static int __init deferred_init_memmap(v
+ BUG_ON(pgdat->first_deferred_pfn > pgdat_end_pfn(pgdat));
+ pgdat->first_deferred_pfn = ULONG_MAX;
+
++ /*
++ * Once we unlock here, the zone cannot be grown anymore, thus if an
++ * interrupt thread must allocate this early in boot, zone must be
++ * pre-grown prior to start of deferred page initialization.
++ */
++ pgdat_resize_unlock(pgdat, &flags);
++
+ /* Only the highest zone is deferred so find it */
+ for (zid = 0; zid < MAX_NR_ZONES; zid++) {
+ zone = pgdat->node_zones + zid;
+@@ -1610,7 +1617,6 @@ static int __init deferred_init_memmap(v
+ epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa));
+ deferred_free_pages(nid, zid, spfn, epfn);
+ }
+- pgdat_resize_unlock(pgdat, &flags);
+
+ /* Sanity check that the next zone really is unpopulated */
+ WARN_ON(++zid < MAX_NR_ZONES && populated_zone(++zone));
+@@ -1657,17 +1663,6 @@ deferred_grow_zone(struct zone *zone, un
+ pgdat_resize_lock(pgdat, &flags);
+
+ /*
+- * If deferred pages have been initialized while we were waiting for
+- * the lock, return true, as the zone was grown. The caller will retry
+- * this zone. We won't return to this function since the caller also
+- * has this static branch.
+- */
+- if (!static_branch_unlikely(&deferred_pages)) {
+- pgdat_resize_unlock(pgdat, &flags);
+- return true;
+- }
+-
+- /*
+ * If someone grew this zone while we were waiting for spinlock, return
+ * true, as there might be enough pages already.
+ */
diff --git a/queue-4.19/mm-thp-make-the-thp-mapcount-atomic-against-__split_huge_pmd_locked.patch b/queue-4.19/mm-thp-make-the-thp-mapcount-atomic-against-__split_huge_pmd_locked.patch
new file mode 100644
index 00000000000..19b3bba59a2
--- /dev/null
+++ b/queue-4.19/mm-thp-make-the-thp-mapcount-atomic-against-__split_huge_pmd_locked.patch
@@ -0,0 +1,102 @@
+From c444eb564fb16645c172d550359cb3d75fe8a040 Mon Sep 17 00:00:00 2001
+From: Andrea Arcangeli
+Date: Wed, 27 May 2020 19:06:24 -0400
+Subject: mm: thp: make the THP mapcount atomic against __split_huge_pmd_locked()
+
+From: Andrea Arcangeli
+
+commit c444eb564fb16645c172d550359cb3d75fe8a040 upstream.
+
+Write-protect anon page faults require an accurate mapcount to decide
+whether to break the COW or not. This is implemented in the THP path
+with reuse_swap_page() ->
+page_trans_huge_map_swapcount()/page_trans_huge_mapcount().
+
+If the COW triggers while the other processes sharing the page are
+under a huge pmd split, to do an accurate reading, we must ensure the
+mapcount isn't computed while it's being transferred from the head
+page to the tail pages.
+
+reuse_swap_page() already runs serialized by the page lock, so it's
+enough to also take the page lock around __split_huge_pmd_locked(), in
+order to add the missing serialization.
+
+Note: the commit in "Fixes" is just to facilitate the backporting,
+because the code before that commit didn't try to do an accurate THP
+mapcount calculation and instead used page_count() to decide whether
+to COW or not. Both the page_count and the pin_count are THP-wide
+refcounts, so they're inaccurate if used in
+reuse_swap_page(). Reverting that commit (besides the unrelated fix to
+the local anon_vma assignment) would also have opened the window for
+memory corruption side effects in certain workloads, as documented in
+that commit's header.
+ +Signed-off-by: Andrea Arcangeli +Suggested-by: Jann Horn +Reported-by: Jann Horn +Acked-by: Kirill A. Shutemov +Fixes: 6d0a07edd17c ("mm: thp: calculate the mapcount correctly for THP pages during WP faults") +Cc: stable@vger.kernel.org +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + mm/huge_memory.c | 31 ++++++++++++++++++++++++++++--- + 1 file changed, 28 insertions(+), 3 deletions(-) + +--- a/mm/huge_memory.c ++++ b/mm/huge_memory.c +@@ -2273,6 +2273,8 @@ void __split_huge_pmd(struct vm_area_str + spinlock_t *ptl; + struct mm_struct *mm = vma->vm_mm; + unsigned long haddr = address & HPAGE_PMD_MASK; ++ bool was_locked = false; ++ pmd_t _pmd; + + mmu_notifier_invalidate_range_start(mm, haddr, haddr + HPAGE_PMD_SIZE); + ptl = pmd_lock(mm, pmd); +@@ -2282,11 +2284,32 @@ void __split_huge_pmd(struct vm_area_str + * pmd against. Otherwise we can end up replacing wrong page. + */ + VM_BUG_ON(freeze && !page); +- if (page && page != pmd_page(*pmd)) +- goto out; ++ if (page) { ++ VM_WARN_ON_ONCE(!PageLocked(page)); ++ was_locked = true; ++ if (page != pmd_page(*pmd)) ++ goto out; ++ } + ++repeat: + if (pmd_trans_huge(*pmd)) { +- page = pmd_page(*pmd); ++ if (!page) { ++ page = pmd_page(*pmd); ++ if (unlikely(!trylock_page(page))) { ++ get_page(page); ++ _pmd = *pmd; ++ spin_unlock(ptl); ++ lock_page(page); ++ spin_lock(ptl); ++ if (unlikely(!pmd_same(*pmd, _pmd))) { ++ unlock_page(page); ++ put_page(page); ++ page = NULL; ++ goto repeat; ++ } ++ put_page(page); ++ } ++ } + if (PageMlocked(page)) + clear_page_mlock(page); + } else if (!(pmd_devmap(*pmd) || is_pmd_migration_entry(*pmd))) +@@ -2294,6 +2317,8 @@ void __split_huge_pmd(struct vm_area_str + __split_huge_pmd_locked(vma, pmd, haddr, freeze); + out: + spin_unlock(ptl); ++ if (!was_locked && page) ++ unlock_page(page); + /* + * No need to double call mmu_notifier->invalidate_range() callback. + * They are 3 cases to consider inside __split_huge_pmd_locked(): diff --git a/queue-4.19/series b/queue-4.19/series index 6da0ff5c2b1..ae9eb5b5294 100644 --- a/queue-4.19/series +++ b/queue-4.19/series @@ -187,3 +187,5 @@ platform-x86-intel-vbtn-only-blacklist-sw_tablet_mod.patch string.h-fix-incompatibility-between-fortify_source-.patch btrfs-include-non-missing-as-a-qualifier-for-the-latest_bdev.patch btrfs-send-emit-file-capabilities-after-chown.patch +mm-thp-make-the-thp-mapcount-atomic-against-__split_huge_pmd_locked.patch +mm-initialize-deferred-pages-with-interrupts-enabled.patch
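
The retry loop that the mm-thp patch above adds to __split_huge_pmd() is an
instance of a common pattern for taking a sleeping lock (the page lock) while
a spinlock (the pmd lock) is already held: trylock first; if that fails, pin
the object, drop the spinlock, sleep on the lock, re-take the spinlock and
revalidate with pmd_same(), retrying from scratch if anything changed
underneath. What follows is a minimal userspace sketch of that pattern only,
not kernel code: the object struct, the generation counter and every
identifier in it are invented for illustration, with pthread locks standing
in for the kernel primitives.

/* Build with: cc -pthread lock_pattern.c (file name is arbitrary). */
#include <pthread.h>
#include <stdio.h>

struct object {
	pthread_mutex_t lock;		/* stands in for the page lock     */
	int refcount;			/* stands in for the page refcount */
	int generation;			/* stands in for the pmd value     */
};

static pthread_spinlock_t table_lock;	/* stands in for the pmd spinlock  */

static struct object first = {
	.lock = PTHREAD_MUTEX_INITIALIZER,
	.refcount = 1,
};
static struct object *current_obj = &first;	/* like pmd_page(*pmd) */

/*
 * Return with table_lock and obj->lock both held, where obj is still the
 * object current_obj points to -- the same guarantee __split_huge_pmd()
 * wants for the page mapped by the pmd.
 */
static struct object *lock_current_object(void)
{
	struct object *obj;

	pthread_spin_lock(&table_lock);
repeat:
	obj = current_obj;
	if (pthread_mutex_trylock(&obj->lock) != 0) {	/* like trylock_page() */
		int gen = obj->generation;		/* like _pmd = *pmd    */

		obj->refcount++;			/* like get_page()     */
		pthread_spin_unlock(&table_lock);
		pthread_mutex_lock(&obj->lock);		/* may sleep           */
		pthread_spin_lock(&table_lock);
		if (obj != current_obj || obj->generation != gen) {
			/* like !pmd_same(): something changed, start over */
			pthread_mutex_unlock(&obj->lock);
			obj->refcount--;		/* like put_page()     */
			goto repeat;
		}
		obj->refcount--;
	}
	return obj;
}

int main(void)
{
	pthread_spin_init(&table_lock, PTHREAD_PROCESS_PRIVATE);

	struct object *obj = lock_current_object();
	printf("locked object, generation %d, refcount %d\n",
	       obj->generation, obj->refcount);
	pthread_mutex_unlock(&obj->lock);
	pthread_spin_unlock(&table_lock);
	return 0;
}

Dropping the spinlock before blocking preserves the usual ordering rule (no
sleeping while a spinlock is held), and the revalidation after re-acquiring
it is what makes the temporary unlock safe.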