From d2f3e9a30fbf7b0ee80bb7a0e6ad1050d646d361 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman
Date: Mon, 6 Jan 2025 12:01:16 +0100
Subject: [PATCH] 6.1-stable patches

added patches:
      mm-readahead-fix-large-folio-support-in-async-readahead.patch
      mm-vmscan-account-for-free-pages-to-prevent-infinite-loop-in-throttle_direct_reclaim.patch
---
 ...rge-folio-support-in-async-readahead.patch |  67 ++++++++
 ...nite-loop-in-throttle_direct_reclaim.patch | 144 ++++++++++++++++++
 queue-6.1/series                              |   2 +
 3 files changed, 213 insertions(+)
 create mode 100644 queue-6.1/mm-readahead-fix-large-folio-support-in-async-readahead.patch
 create mode 100644 queue-6.1/mm-vmscan-account-for-free-pages-to-prevent-infinite-loop-in-throttle_direct_reclaim.patch

diff --git a/queue-6.1/mm-readahead-fix-large-folio-support-in-async-readahead.patch b/queue-6.1/mm-readahead-fix-large-folio-support-in-async-readahead.patch
new file mode 100644
index 00000000000..8dc96a93064
--- /dev/null
+++ b/queue-6.1/mm-readahead-fix-large-folio-support-in-async-readahead.patch
@@ -0,0 +1,67 @@
+From 158cdce87c8c172787063998ad5dd3e2f658b963 Mon Sep 17 00:00:00 2001
+From: Yafang Shao
+Date: Fri, 6 Dec 2024 16:30:25 +0800
+Subject: mm/readahead: fix large folio support in async readahead
+
+From: Yafang Shao
+
+commit 158cdce87c8c172787063998ad5dd3e2f658b963 upstream.
+
+When testing large folio support with XFS on our servers, we observed
+that only a few large folios are mapped when reading large files via
+mmap.  After a thorough analysis, I identified it was caused by the
+`/sys/block/*/queue/read_ahead_kb` setting.  On our test servers, this
+parameter is set to 128KB.  After I tuned it to 2MB, the large folio
+worked as expected.  However, I believe the large folio behavior should
+not be dependent on the value of read_ahead_kb.  It would be more
+robust if the kernel could automatically adapt to it.
+
+With /sys/block/*/queue/read_ahead_kb set to 128KB and performing a
+sequential read on a 1GB file using MADV_HUGEPAGE, the differences in
+/proc/meminfo are as follows:
+
+- before this patch
+  FileHugePages:     18432 kB
+  FilePmdMapped:      4096 kB
+
+- after this patch
+  FileHugePages:   1067008 kB
+  FilePmdMapped:   1048576 kB
+
+This shows that after applying the patch, the entire 1GB file is mapped
+to huge pages.  The stable list is CCed, as without this patch, large
+folios don't function optimally in the readahead path.
+
+It's worth noting that if read_ahead_kb is set to a larger value that
+isn't aligned with huge page sizes (e.g., 4MB + 128KB), it may still
+fail to map to hugepages.
+
+Link: https://lkml.kernel.org/r/20241108141710.9721-1-laoar.shao@gmail.com
+Link: https://lkml.kernel.org/r/20241206083025.3478-1-laoar.shao@gmail.com
+Fixes: 4687fdbb805a ("mm/filemap: Support VM_HUGEPAGE for file mappings")
+Signed-off-by: Yafang Shao
+Tested-by: kernel test robot
+Cc: Matthew Wilcox
+Cc: David Hildenbrand
+Cc:
+Signed-off-by: Andrew Morton
+Signed-off-by: Greg Kroah-Hartman
+---
+ mm/readahead.c |    6 +++++-
+ 1 file changed, 5 insertions(+), 1 deletion(-)
+
+--- a/mm/readahead.c
++++ b/mm/readahead.c
+@@ -599,7 +599,11 @@ static void ondemand_readahead(struct re
+ 			1UL << order);
+ 	if (index == expected || index == (ra->start + ra->size)) {
+ 		ra->start += ra->size;
+-		ra->size = get_next_ra_size(ra, max_pages);
++		/*
++		 * In the case of MADV_HUGEPAGE, the actual size might exceed
++		 * the readahead window.
++		 */
++		ra->size = max(ra->size, get_next_ra_size(ra, max_pages));
+ 		ra->async_size = ra->size;
+ 		goto readit;
+ 	}
diff --git a/queue-6.1/mm-vmscan-account-for-free-pages-to-prevent-infinite-loop-in-throttle_direct_reclaim.patch b/queue-6.1/mm-vmscan-account-for-free-pages-to-prevent-infinite-loop-in-throttle_direct_reclaim.patch
new file mode 100644
index 00000000000..8da421d09be
--- /dev/null
+++ b/queue-6.1/mm-vmscan-account-for-free-pages-to-prevent-infinite-loop-in-throttle_direct_reclaim.patch
@@ -0,0 +1,144 @@
+From 6aaced5abd32e2a57cd94fd64f824514d0361da8 Mon Sep 17 00:00:00 2001
+From: Seiji Nishikawa
+Date: Sun, 1 Dec 2024 01:12:34 +0900
+Subject: mm: vmscan: account for free pages to prevent infinite loop in throttle_direct_reclaim()
+
+From: Seiji Nishikawa
+
+commit 6aaced5abd32e2a57cd94fd64f824514d0361da8 upstream.
+
+The task sometimes continues looping in throttle_direct_reclaim()
+because allow_direct_reclaim(pgdat) keeps returning false.
+
+ #0 [ffff80002cb6f8d0] __switch_to at ffff8000080095ac
+ #1 [ffff80002cb6f900] __schedule at ffff800008abbd1c
+ #2 [ffff80002cb6f990] schedule at ffff800008abc50c
+ #3 [ffff80002cb6f9b0] throttle_direct_reclaim at ffff800008273550
+ #4 [ffff80002cb6fa20] try_to_free_pages at ffff800008277b68
+ #5 [ffff80002cb6fae0] __alloc_pages_nodemask at ffff8000082c4660
+ #6 [ffff80002cb6fc50] alloc_pages_vma at ffff8000082e4a98
+ #7 [ffff80002cb6fca0] do_anonymous_page at ffff80000829f5a8
+ #8 [ffff80002cb6fce0] __handle_mm_fault at ffff8000082a5974
+ #9 [ffff80002cb6fd90] handle_mm_fault at ffff8000082a5bd4
+
+At this point, the pgdat contains the following two zones:
+
+  NODE: 4  ZONE: 0  ADDR: ffff00817fffe540  NAME: "DMA32"
+    SIZE: 20480  MIN/LOW/HIGH: 11/28/45
+    VM_STAT:
+      NR_FREE_PAGES: 359
+      NR_ZONE_INACTIVE_ANON: 18813
+      NR_ZONE_ACTIVE_ANON: 0
+      NR_ZONE_INACTIVE_FILE: 50
+      NR_ZONE_ACTIVE_FILE: 0
+      NR_ZONE_UNEVICTABLE: 0
+      NR_ZONE_WRITE_PENDING: 0
+      NR_MLOCK: 0
+      NR_BOUNCE: 0
+      NR_ZSPAGES: 0
+      NR_FREE_CMA_PAGES: 0
+
+  NODE: 4  ZONE: 1  ADDR: ffff00817fffec00  NAME: "Normal"
+    SIZE: 8454144  PRESENT: 98304  MIN/LOW/HIGH: 68/166/264
+    VM_STAT:
+      NR_FREE_PAGES: 146
+      NR_ZONE_INACTIVE_ANON: 94668
+      NR_ZONE_ACTIVE_ANON: 3
+      NR_ZONE_INACTIVE_FILE: 735
+      NR_ZONE_ACTIVE_FILE: 78
+      NR_ZONE_UNEVICTABLE: 0
+      NR_ZONE_WRITE_PENDING: 0
+      NR_MLOCK: 0
+      NR_BOUNCE: 0
+      NR_ZSPAGES: 0
+      NR_FREE_CMA_PAGES: 0
+
+In allow_direct_reclaim(), while processing ZONE_DMA32, the sum of
+inactive/active file-backed pages calculated in zone_reclaimable_pages()
+based on the result of zone_page_state_snapshot() is zero.
+
+Additionally, since this system lacks swap, the calculation of
+inactive/active anonymous pages is skipped.
+
+  crash> p nr_swap_pages
+  nr_swap_pages = $1937 = {
+    counter = 0
+  }
+
+As a result, ZONE_DMA32 is deemed unreclaimable and skipped, moving on
+to the processing of the next zone, ZONE_NORMAL, despite ZONE_DMA32
+having free pages significantly exceeding the high watermark.
+
+The problem is that pgdat->kswapd_failures hasn't been incremented.
+
+  crash> px ((struct pglist_data *) 0xffff00817fffe540)->kswapd_failures
+  $1935 = 0x0
+
+This is because the node is deemed balanced.  The node balancing logic
+in balance_pgdat() evaluates all zones collectively.  If one or more
+zones (e.g., ZONE_DMA32) have enough free pages to meet their
+watermarks, the entire node is deemed balanced.
+This causes balance_pgdat() to exit early before incrementing
+kswapd_failures, as it considers the overall memory state acceptable,
+even though some zones (like ZONE_NORMAL) remain under significant
+pressure.
+
+The patch ensures that zone_reclaimable_pages() includes free pages
+(NR_FREE_PAGES) in its calculation when no other reclaimable pages are
+available (e.g., file-backed or anonymous pages).  This change prevents
+zones like ZONE_DMA32, which have sufficient free pages, from being
+mistakenly deemed unreclaimable.  By doing so, the patch ensures proper
+node balancing, avoids masking pressure on other zones like
+ZONE_NORMAL, and prevents infinite loops in throttle_direct_reclaim()
+caused by allow_direct_reclaim(pgdat) repeatedly returning false.
+
+The kernel hangs due to a task stuck in throttle_direct_reclaim(),
+caused by a node being incorrectly deemed balanced despite pressure in
+certain zones, such as ZONE_NORMAL.  This issue arises from
+zone_reclaimable_pages() returning 0 for zones without reclaimable
+file-backed or anonymous pages, causing zones like ZONE_DMA32 with
+sufficient free pages to be skipped.
+
+The lack of swap or reclaimable pages results in ZONE_DMA32 being
+ignored during reclaim, masking pressure in other zones.  Consequently,
+pgdat->kswapd_failures remains 0 in balance_pgdat(), preventing
+fallback mechanisms in allow_direct_reclaim() from being triggered,
+leading to an infinite loop in throttle_direct_reclaim().
+
+This patch modifies zone_reclaimable_pages() to account for free pages
+(NR_FREE_PAGES) when no other reclaimable pages exist.  This ensures
+zones with sufficient free pages are not skipped, enabling proper
+balancing and reclaim behavior.
+
+[akpm@linux-foundation.org: coding-style cleanups]
+Link: https://lkml.kernel.org/r/20241130164346.436469-1-snishika@redhat.com
+Link: https://lkml.kernel.org/r/20241130161236.433747-2-snishika@redhat.com
+Fixes: 5a1c84b404a7 ("mm: remove reclaim and compaction retry approximations")
+Signed-off-by: Seiji Nishikawa
+Cc: Mel Gorman
+Cc:
+Signed-off-by: Andrew Morton
+Signed-off-by: Greg Kroah-Hartman
+---
+ mm/vmscan.c |    9 ++++++++-
+ 1 file changed, 8 insertions(+), 1 deletion(-)
+
+--- a/mm/vmscan.c
++++ b/mm/vmscan.c
+@@ -588,7 +588,14 @@ unsigned long zone_reclaimable_pages(str
+ 	if (can_reclaim_anon_pages(NULL, zone_to_nid(zone), NULL))
+ 		nr += zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_ANON) +
+ 			zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_ANON);
+-
++	/*
++	 * If there are no reclaimable file-backed or anonymous pages,
++	 * ensure zones with sufficient free pages are not skipped.
++	 * This prevents zones like DMA32 from being ignored in reclaim
++	 * scenarios where they can still help alleviate memory pressure.
++	 */
++	if (nr == 0)
++		nr = zone_page_state_snapshot(zone, NR_FREE_PAGES);
+ 	return nr;
+ }
+ 
diff --git a/queue-6.1/series b/queue-6.1/series
index 31c2bf60609..9b923dfbaec 100644
--- a/queue-6.1/series
+++ b/queue-6.1/series
@@ -74,3 +74,5 @@ sky2-add-device-id-11ab-4373-for-marvell-88e8075.patch
 net-sctp-prevent-autoclose-integer-overflow-in-sctp_association_init.patch
 drm-adv7511-drop-dsi-single-lane-support.patch
 dt-bindings-display-adi-adv7533-drop-single-lane-support.patch
+mm-readahead-fix-large-folio-support-in-async-readahead.patch
+mm-vmscan-account-for-free-pages-to-prevent-infinite-loop-in-throttle_direct_reclaim.patch
-- 
2.47.3
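
Not part of either queued patch: the MADV_HUGEPAGE sequential-read test
described in the readahead changelog can be approximated from userspace
with a sketch along the following lines.  The file path is a
placeholder, and the FileHugePages/FilePmdMapped deltas you see will
depend on the filesystem, read_ahead_kb and THP configuration; compare
/proc/meminfo before and after a run.

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	/* Placeholder path; pass a large file on an XFS (or similar) mount. */
	const char *path = argc > 1 ? argv[1] : "/mnt/xfs/testfile";
	struct stat st;
	volatile unsigned char sum = 0;
	int fd = open(path, O_RDONLY);

	if (fd < 0 || fstat(fd, &st) < 0) {
		perror("open/fstat");
		return 1;
	}

	unsigned char *p = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/* Ask for PMD-sized (typically 2MB) mappings on this file range. */
	if (madvise(p, st.st_size, MADV_HUGEPAGE))
		perror("madvise(MADV_HUGEPAGE)");

	/* Sequential faults drive ondemand_readahead() in the async path. */
	for (off_t off = 0; off < st.st_size; off += 4096)
		sum += p[off];

	printf("read %lld bytes; now check FilePmdMapped in /proc/meminfo\n",
	       (long long)st.st_size);

	munmap(p, st.st_size);
	close(fd);
	return 0;
}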
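
The effect of the zone_reclaimable_pages() change can likewise be
illustrated with a small stand-alone model.  This is not kernel code,
and the struct and function names below are made up for the
illustration; it only replays the arithmetic with the ZONE_DMA32
numbers quoted in the vmscan changelog (file-page snapshots of zero, no
swap, 359 free pages) to show why the zone used to be reported as
having nothing to reclaim.

#include <stdio.h>

struct zone_counters {
	unsigned long inactive_file, active_file;   /* snapshot values */
	unsigned long inactive_anon, active_anon;
	unsigned long free_pages;
	int can_reclaim_anon;                       /* 0 on a swapless system */
};

/* Mirrors the patched logic: fall back to free pages when nr is zero. */
static unsigned long reclaimable_model(const struct zone_counters *z,
				       int count_free_fallback)
{
	unsigned long nr = z->inactive_file + z->active_file;

	if (z->can_reclaim_anon)
		nr += z->inactive_anon + z->active_anon;

	if (count_free_fallback && nr == 0)
		nr = z->free_pages;

	return nr;
}

int main(void)
{
	/*
	 * ZONE_DMA32 as reported above: the per-zone file-page snapshots
	 * came back zero, there is no swap, but 359 pages are free.
	 */
	struct zone_counters dma32 = {
		.inactive_file = 0, .active_file = 0,
		.inactive_anon = 18813, .active_anon = 0,
		.free_pages = 359,
		.can_reclaim_anon = 0,
	};

	/* Before the fix: 0, so the zone is skipped as unreclaimable. */
	printf("before: %lu\n", reclaimable_model(&dma32, 0));
	/* After the fix: 359, so the zone's free pages count toward reclaim. */
	printf("after:  %lu\n", reclaimable_model(&dma32, 1));
	return 0;
}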