From 292a9b0f83fabd4da4a5318d89cded8c8f5b65b0 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Sun, 2 Oct 2022 12:22:30 +0200 Subject: [PATCH] 5.10-stable patches added patches: mm-page_alloc-fix-race-condition-between-build_all_zonelists-and-page-allocation.patch mm-prevent-page_frag_alloc-from-corrupting-the-memory.patch --- ...ld_all_zonelists-and-page-allocation.patch | 179 ++++++++++++++++++ ...rag_alloc-from-corrupting-the-memory.patch | 54 ++++++ queue-5.10/series | 2 + 3 files changed, 235 insertions(+) create mode 100644 queue-5.10/mm-page_alloc-fix-race-condition-between-build_all_zonelists-and-page-allocation.patch create mode 100644 queue-5.10/mm-prevent-page_frag_alloc-from-corrupting-the-memory.patch diff --git a/queue-5.10/mm-page_alloc-fix-race-condition-between-build_all_zonelists-and-page-allocation.patch b/queue-5.10/mm-page_alloc-fix-race-condition-between-build_all_zonelists-and-page-allocation.patch new file mode 100644 index 00000000000..1b217b25675 --- /dev/null +++ b/queue-5.10/mm-page_alloc-fix-race-condition-between-build_all_zonelists-and-page-allocation.patch @@ -0,0 +1,179 @@ +From 3d36424b3b5850bd92f3e89b953a430d7cfc88ef Mon Sep 17 00:00:00 2001 +From: Mel Gorman +Date: Wed, 24 Aug 2022 12:14:50 +0100 +Subject: mm/page_alloc: fix race condition between build_all_zonelists and page allocation + +From: Mel Gorman + +commit 3d36424b3b5850bd92f3e89b953a430d7cfc88ef upstream. + +Patrick Daly reported the following problem; + + NODE_DATA(nid)->node_zonelists[ZONELIST_FALLBACK] - before offline operation + [0] - ZONE_MOVABLE + [1] - ZONE_NORMAL + [2] - NULL + + For a GFP_KERNEL allocation, alloc_pages_slowpath() will save the + offset of ZONE_NORMAL in ac->preferred_zoneref. If a concurrent + memory_offline operation removes the last page from ZONE_MOVABLE, + build_all_zonelists() & build_zonerefs_node() will update + node_zonelists as shown below. Only populated zones are added. + + NODE_DATA(nid)->node_zonelists[ZONELIST_FALLBACK] - after offline operation + [0] - ZONE_NORMAL + [1] - NULL + [2] - NULL + +The race is simple -- page allocation could be in progress when a memory +hot-remove operation triggers a zonelist rebuild that removes zones. The +allocation request will still have a valid ac->preferred_zoneref that is +now pointing to NULL and triggers an OOM kill. + +This problem probably always existed but may be slightly easier to trigger +due to 6aa303defb74 ("mm, vmscan: only allocate and reclaim from zones +with pages managed by the buddy allocator") which distinguishes between +zones that are completely unpopulated versus zones that have valid pages +not managed by the buddy allocator (e.g. reserved, memblock, ballooning +etc). Memory hotplug had multiple stages with timing considerations +around managed/present page updates, the zonelist rebuild and the zone +span updates. As David Hildenbrand puts it + + memory offlining adjusts managed+present pages of the zone + essentially in one go. If after the adjustments, the zone is no + longer populated (present==0), we rebuild the zone lists. + + Once that's done, we try shrinking the zone (start+spanned + pages) -- which results in zone_start_pfn == 0 if there are no + more pages. That happens *after* rebuilding the zonelists via + remove_pfn_range_from_zone(). + +The only requirement to fix the race is that a page allocation request +identifies when a zonelist rebuild has happened since the allocation +request started and no page has yet been allocated. 
Use a seqlock_t to +track zonelist updates with a lockless read-side of the zonelist and +protecting the rebuild and update of the counter with a spinlock. + +[akpm@linux-foundation.org: make zonelist_update_seq static] +Link: https://lkml.kernel.org/r/20220824110900.vh674ltxmzb3proq@techsingularity.net +Fixes: 6aa303defb74 ("mm, vmscan: only allocate and reclaim from zones with pages managed by the buddy allocator") +Signed-off-by: Mel Gorman +Reported-by: Patrick Daly +Acked-by: Michal Hocko +Reviewed-by: David Hildenbrand +Cc: [4.9+] +Signed-off-by: Andrew Morton +Signed-off-by: Greg Kroah-Hartman +--- + mm/page_alloc.c | 53 +++++++++++++++++++++++++++++++++++++++++++---------- + 1 file changed, 43 insertions(+), 10 deletions(-) + +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -4322,6 +4322,30 @@ void fs_reclaim_release(gfp_t gfp_mask) + EXPORT_SYMBOL_GPL(fs_reclaim_release); + #endif + ++/* ++ * Zonelists may change due to hotplug during allocation. Detect when zonelists ++ * have been rebuilt so allocation retries. Reader side does not lock and ++ * retries the allocation if zonelist changes. Writer side is protected by the ++ * embedded spin_lock. ++ */ ++static DEFINE_SEQLOCK(zonelist_update_seq); ++ ++static unsigned int zonelist_iter_begin(void) ++{ ++ if (IS_ENABLED(CONFIG_MEMORY_HOTREMOVE)) ++ return read_seqbegin(&zonelist_update_seq); ++ ++ return 0; ++} ++ ++static unsigned int check_retry_zonelist(unsigned int seq) ++{ ++ if (IS_ENABLED(CONFIG_MEMORY_HOTREMOVE)) ++ return read_seqretry(&zonelist_update_seq, seq); ++ ++ return seq; ++} ++ + /* Perform direct synchronous page reclaim */ + static unsigned long + __perform_reclaim(gfp_t gfp_mask, unsigned int order, +@@ -4629,6 +4653,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, u + int compaction_retries; + int no_progress_loops; + unsigned int cpuset_mems_cookie; ++ unsigned int zonelist_iter_cookie; + int reserve_flags; + + /* +@@ -4639,11 +4664,12 @@ __alloc_pages_slowpath(gfp_t gfp_mask, u + (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM))) + gfp_mask &= ~__GFP_ATOMIC; + +-retry_cpuset: ++restart: + compaction_retries = 0; + no_progress_loops = 0; + compact_priority = DEF_COMPACT_PRIORITY; + cpuset_mems_cookie = read_mems_allowed_begin(); ++ zonelist_iter_cookie = zonelist_iter_begin(); + + /* + * The fast path uses conservative alloc_flags to succeed only until +@@ -4802,9 +4828,13 @@ retry: + goto retry; + + +- /* Deal with possible cpuset update races before we start OOM killing */ +- if (check_retry_cpuset(cpuset_mems_cookie, ac)) +- goto retry_cpuset; ++ /* ++ * Deal with possible cpuset update races or zonelist updates to avoid ++ * a unnecessary OOM kill. ++ */ ++ if (check_retry_cpuset(cpuset_mems_cookie, ac) || ++ check_retry_zonelist(zonelist_iter_cookie)) ++ goto restart; + + /* Reclaim has failed us, start killing things */ + page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress); +@@ -4824,9 +4854,13 @@ retry: + } + + nopage: +- /* Deal with possible cpuset update races before we fail */ +- if (check_retry_cpuset(cpuset_mems_cookie, ac)) +- goto retry_cpuset; ++ /* ++ * Deal with possible cpuset update races or zonelist updates to avoid ++ * a unnecessary OOM kill. 
++ */ ++ if (check_retry_cpuset(cpuset_mems_cookie, ac) || ++ check_retry_zonelist(zonelist_iter_cookie)) ++ goto restart; + + /* + * Make sure that __GFP_NOFAIL request doesn't leak out and make sure +@@ -5924,9 +5958,8 @@ static void __build_all_zonelists(void * + int nid; + int __maybe_unused cpu; + pg_data_t *self = data; +- static DEFINE_SPINLOCK(lock); + +- spin_lock(&lock); ++ write_seqlock(&zonelist_update_seq); + + #ifdef CONFIG_NUMA + memset(node_load, 0, sizeof(node_load)); +@@ -5959,7 +5992,7 @@ static void __build_all_zonelists(void * + #endif + } + +- spin_unlock(&lock); ++ write_sequnlock(&zonelist_update_seq); + } + + static noinline void __init diff --git a/queue-5.10/mm-prevent-page_frag_alloc-from-corrupting-the-memory.patch b/queue-5.10/mm-prevent-page_frag_alloc-from-corrupting-the-memory.patch new file mode 100644 index 00000000000..6482850ebcd --- /dev/null +++ b/queue-5.10/mm-prevent-page_frag_alloc-from-corrupting-the-memory.patch @@ -0,0 +1,54 @@ +From dac22531bbd4af2426c4e29e05594415ccfa365d Mon Sep 17 00:00:00 2001 +From: Maurizio Lombardi +Date: Fri, 15 Jul 2022 14:50:13 +0200 +Subject: mm: prevent page_frag_alloc() from corrupting the memory + +From: Maurizio Lombardi + +commit dac22531bbd4af2426c4e29e05594415ccfa365d upstream. + +A number of drivers call page_frag_alloc() with a fragment's size > +PAGE_SIZE. + +In low memory conditions, __page_frag_cache_refill() may fail the order +3 cache allocation and fall back to order 0; In this case, the cache +will be smaller than the fragment, causing memory corruptions. + +Prevent this from happening by checking if the newly allocated cache is +large enough for the fragment; if not, the allocation will fail and +page_frag_alloc() will return NULL. + +Link: https://lkml.kernel.org/r/20220715125013.247085-1-mlombard@redhat.com +Fixes: b63ae8ca096d ("mm/net: Rename and move page fragment handling from net/ to mm/") +Signed-off-by: Maurizio Lombardi +Reviewed-by: Alexander Duyck +Cc: Chen Lin +Cc: Jakub Kicinski +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Greg Kroah-Hartman +--- + mm/page_alloc.c | 12 ++++++++++++ + 1 file changed, 12 insertions(+) + +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -5163,6 +5163,18 @@ refill: + /* reset page count bias and offset to start of new frag */ + nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1; + offset = size - fragsz; ++ if (unlikely(offset < 0)) { ++ /* ++ * The caller is trying to allocate a fragment ++ * with fragsz > PAGE_SIZE but the cache isn't big ++ * enough to satisfy the request, this may ++ * happen in low memory conditions. ++ * We don't release the cache page because ++ * it could make memory pressure worse ++ * so we simply return NULL here. ++ */ ++ return NULL; ++ } + } + + nc->pagecnt_bias--; diff --git a/queue-5.10/series b/queue-5.10/series index 01654812b16..1eb97161594 100644 --- a/queue-5.10/series +++ b/queue-5.10/series @@ -20,3 +20,5 @@ powerpc-64s-radix-don-t-need-to-broadcast-ipi-for-radix-pmd-collapse-flush.patch libata-add-ata_horkage_nolpm-for-pioneer-bdr-207m-and-bdr-205.patch mmc-moxart-fix-4-bit-bus-width-and-remove-8-bit-bus-width.patch mmc-hsq-fix-data-stomping-during-mmc-recovery.patch +mm-page_alloc-fix-race-condition-between-build_all_zonelists-and-page-allocation.patch +mm-prevent-page_frag_alloc-from-corrupting-the-memory.patch -- 2.47.3
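
For readers who want to see the retry pattern from the first patch in isolation, the sketch below is a minimal userspace analogue, not kernel code: a C11 atomic counter plus a pthread mutex stand in for the kernel's seqlock_t, the toy zonelist array stands in for node_zonelists[], and the helper names only mirror the zonelist_iter_begin()/check_retry_zonelist() helpers the patch adds.

/*
 * Minimal userspace analogue of the zonelist retry pattern (illustrative
 * only): an atomic sequence counter plus a mutex play the role of the
 * kernel's seqlock_t.  Readers sample the counter, walk the toy zonelist
 * locklessly, and restart if a concurrent rebuild bumped the counter;
 * writers serialize on the mutex and bump the counter around the rebuild.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

#define NR_ZONES 3

static atomic_uint zonelist_update_seq;            /* even: stable, odd: rebuild running */
static pthread_mutex_t rebuild_lock = PTHREAD_MUTEX_INITIALIZER;
static atomic_int zonelist[NR_ZONES];              /* toy stand-in for node_zonelists[] */

static unsigned int zonelist_iter_begin(void)
{
	unsigned int seq;

	/* Wait out an in-flight rebuild, then remember the counter value. */
	while ((seq = atomic_load(&zonelist_update_seq)) & 1)
		;
	return seq;
}

static int check_retry_zonelist(unsigned int seq)
{
	/* Non-zero means a rebuild raced with the reader: restart. */
	return atomic_load(&zonelist_update_seq) != seq;
}

static void rebuild_zonelists(int highest_populated_zone)
{
	pthread_mutex_lock(&rebuild_lock);         /* writers exclude each other */
	atomic_fetch_add(&zonelist_update_seq, 1); /* counter goes odd */

	for (int i = 0; i < NR_ZONES; i++)
		atomic_store(&zonelist[i], i <= highest_populated_zone ? i : -1);

	atomic_fetch_add(&zonelist_update_seq, 1); /* counter goes even again */
	pthread_mutex_unlock(&rebuild_lock);
}

int main(void)
{
	unsigned int seq;
	int preferred;

	rebuild_zonelists(NR_ZONES - 1);           /* populate every toy zone */

	do {                                       /* mirrors the "goto restart" path */
		seq = zonelist_iter_begin();
		preferred = atomic_load(&zonelist[0]); /* the "allocation" reads the list */
	} while (check_retry_zonelist(seq));

	printf("allocating from zone %d\n", preferred);
	return 0;
}

As in the patch, the read side takes no lock and only pays for a retry when a rebuild actually raced with it; in the kernel version the new helpers reduce to constants when CONFIG_MEMORY_HOTREMOVE is disabled, so configurations that can never depopulate a zone take no retry path at all.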
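
The second patch is easiest to follow with concrete numbers. The toy model below is an illustration only (names and structure are simplified; it is not the mm/page_alloc.c implementation): assuming 4 KiB pages and the usual order-3 (32 KiB) fragment cache, a refill that falls back to a single page makes offset = size - fragsz negative for an 8 KiB fragment, which is exactly the case the new "offset < 0" check now fails with NULL instead of handing back an out-of-range fragment.

/*
 * Toy model (not the mm/page_alloc.c code) of the failure the second patch
 * closes: a fragment larger than what the refill managed to allocate.
 */
#include <stdio.h>
#include <stdlib.h>

#define TOY_PAGE_SIZE           4096
#define TOY_FRAG_CACHE_MAX_SIZE (TOY_PAGE_SIZE << 3)  /* the order-3 cache: 32 KiB */

struct toy_frag_cache {
	char *va;
	int size;                 /* how much the last refill actually got */
	int offset;
};

/* Stand-in for __page_frag_cache_refill(): under memory pressure the big
 * allocation "fails" and we fall back to a single page. */
static void toy_refill(struct toy_frag_cache *nc, int under_pressure)
{
	free(nc->va);             /* drop the previous toy page, if any */
	nc->size = under_pressure ? TOY_PAGE_SIZE : TOY_FRAG_CACHE_MAX_SIZE;
	nc->va = malloc(nc->size);
}

static void *toy_frag_alloc(struct toy_frag_cache *nc, int fragsz, int under_pressure)
{
	toy_refill(nc, under_pressure);
	nc->offset = nc->size - fragsz;
	if (nc->offset < 0) {
		/*
		 * The cache is smaller than the fragment: nc->va + offset would
		 * point before the start of the page, and the caller's
		 * fragsz-byte write would corrupt memory it does not own.
		 * Like the patch, fail the request but keep the page cached.
		 */
		return NULL;
	}
	return nc->va + nc->offset;
}

int main(void)
{
	struct toy_frag_cache nc = { 0 };

	printf("32 KiB cache, 8 KiB frag -> %p\n", (void *)toy_frag_alloc(&nc, 8192, 0));
	printf(" 4 KiB cache, 8 KiB frag -> %p\n", (void *)toy_frag_alloc(&nc, 8192, 1));
	free(nc.va);
	return 0;
}

As the patch comment notes, the kernel keeps the too-small page cached rather than freeing it, so a later request with a smaller fragsz can still be served without adding to the memory pressure; the toy keeps the same behaviour on the failure path.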