git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
5.15-stable patches
author    Greg Kroah-Hartman <gregkh@linuxfoundation.org>
          Mon, 26 May 2025 10:47:42 +0000 (12:47 +0200)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
          Mon, 26 May 2025 10:47:42 +0000 (12:47 +0200)
added patches:
memcg-always-call-cond_resched-after-fn.patch
mm-page_alloc.c-avoid-infinite-retries-caused-by-cpuset-race.patch

queue-5.15/memcg-always-call-cond_resched-after-fn.patch [new file with mode: 0644]
queue-5.15/mm-page_alloc.c-avoid-infinite-retries-caused-by-cpuset-race.patch [new file with mode: 0644]
queue-5.15/series

diff --git a/queue-5.15/memcg-always-call-cond_resched-after-fn.patch b/queue-5.15/memcg-always-call-cond_resched-after-fn.patch
new file mode 100644 (file)
index 0000000..9f303c0
--- /dev/null
@@ -0,0 +1,80 @@
+From 06717a7b6c86514dbd6ab322e8083ffaa4db5712 Mon Sep 17 00:00:00 2001
+From: Breno Leitao <leitao@debian.org>
+Date: Fri, 23 May 2025 10:21:06 -0700
+Subject: memcg: always call cond_resched() after fn()
+
+From: Breno Leitao <leitao@debian.org>
+
+commit 06717a7b6c86514dbd6ab322e8083ffaa4db5712 upstream.
+
+I am seeing soft lockups on certain machine types when a cgroup OOMs.  This
+is happening because killing the processes on certain machines can be very
+slow, which causes the soft lockup and RCU stalls.  This usually happens
+when the cgroup has MANY processes and memory.oom.group is set.
+
+Example I am seeing in real production:
+
+       [462012.244552] Memory cgroup out of memory: Killed process 3370438 (crosvm) ....
+       ....
+       [462037.318059] Memory cgroup out of memory: Killed process 4171372 (adb) ....
+       [462037.348314] watchdog: BUG: soft lockup - CPU#64 stuck for 26s! [stat_manager-ag:1618982]
+       ....
+
+A quick look at why this is so slow suggests it is related to the serial
+console flush on certain machine types.  For all the crashes I saw, the
+target CPU was at console_flush_all().
+
+In the case above, there are thousands of processes in the cgroup, and it
+soft locks up before it reaches the 1024-iteration limit in the code (which
+is what would call cond_resched()).  So, calling cond_resched() only once
+per 1024 iterations is not sufficient.
+
+Remove the counter-based conditional rescheduling logic and call
+cond_resched() unconditionally after each task iteration, after fn() is
+called.  This avoids the lockup independently of how slow fn() is.
+
+Link: https://lkml.kernel.org/r/20250523-memcg_fix-v1-1-ad3eafb60477@debian.org
+Fixes: ade81479c7dd ("memcg: fix soft lockup in the OOM process")
+Signed-off-by: Breno Leitao <leitao@debian.org>
+Suggested-by: Rik van Riel <riel@surriel.com>
+Acked-by: Shakeel Butt <shakeel.butt@linux.dev>
+Cc: Michael van der Westhuizen <rmikey@meta.com>
+Cc: Usama Arif <usamaarif642@gmail.com>
+Cc: Pavel Begunkov <asml.silence@gmail.com>
+Cc: Chen Ridong <chenridong@huawei.com>
+Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+Cc: Johannes Weiner <hannes@cmpxchg.org>
+Cc: Michal Hocko <mhocko@kernel.org>
+Cc: Michal Hocko <mhocko@suse.com>
+Cc: Muchun Song <muchun.song@linux.dev>
+Cc: Roman Gushchin <roman.gushchin@linux.dev>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/memcontrol.c |    6 ++----
+ 1 file changed, 2 insertions(+), 4 deletions(-)
+
+--- a/mm/memcontrol.c
++++ b/mm/memcontrol.c
+@@ -1194,7 +1194,6 @@ int mem_cgroup_scan_tasks(struct mem_cgr
+ {
+       struct mem_cgroup *iter;
+       int ret = 0;
+-      int i = 0;
+       BUG_ON(memcg == root_mem_cgroup);
+@@ -1204,10 +1203,9 @@ int mem_cgroup_scan_tasks(struct mem_cgr
+               css_task_iter_start(&iter->css, CSS_TASK_ITER_PROCS, &it);
+               while (!ret && (task = css_task_iter_next(&it))) {
+-                      /* Avoid potential softlockup warning */
+-                      if ((++i & 1023) == 0)
+-                              cond_resched();
+                       ret = fn(task, arg);
++                      /* Avoid potential softlockup warning */
++                      cond_resched();
+               }
+               css_task_iter_end(&it);
+               if (ret) {
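The diff above removes the 1024-iteration counter and yields after every callback. As a rough, non-kernel illustration of the resulting loop shape, here is a minimal userspace sketch with hypothetical names: sched_yield() stands in for the kernel's cond_resched(), and a fixed array stands in for the css task iterator.

#include <sched.h>
#include <stdio.h>

/* Hypothetical stand-ins for the kernel's task iterator. */
struct task { int pid; };

static struct task tasks[] = { { 1 }, { 2 }, { 3 } };
static size_t pos;

static struct task *task_iter_next(void)
{
        return pos < sizeof(tasks) / sizeof(tasks[0]) ? &tasks[pos++] : NULL;
}

/* Scan every task, stopping early once fn() returns non-zero. */
static int scan_tasks(int (*fn)(struct task *, void *), void *arg)
{
        struct task *task;
        int ret = 0;

        while (!ret && (task = task_iter_next())) {
                ret = fn(task, arg);
                /* Yield after every callback, not every 1024th one, so a
                 * slow fn() can no longer hold the CPU for thousands of
                 * iterations. */
                sched_yield();
        }
        return ret;
}

static int print_task(struct task *t, void *arg)
{
        (void)arg;
        printf("visited task %d\n", t->pid);
        return 0;
}

int main(void)
{
        return scan_tasks(print_task, NULL);
}

The only point of the sketch is the placement of the yield: it now follows every fn() invocation, which is what prevents the soft lockup regardless of how slow the callback is.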
diff --git a/queue-5.15/mm-page_alloc.c-avoid-infinite-retries-caused-by-cpuset-race.patch b/queue-5.15/mm-page_alloc.c-avoid-infinite-retries-caused-by-cpuset-race.patch
new file mode 100644 (file)
index 0000000..e01e52c
--- /dev/null
@@ -0,0 +1,79 @@
+From e05741fb10c38d70bbd7ec12b23c197b6355d519 Mon Sep 17 00:00:00 2001
+From: Tianyang Zhang <zhangtianyang@loongson.cn>
+Date: Wed, 16 Apr 2025 16:24:05 +0800
+Subject: mm/page_alloc.c: avoid infinite retries caused by cpuset race
+
+From: Tianyang Zhang <zhangtianyang@loongson.cn>
+
+commit e05741fb10c38d70bbd7ec12b23c197b6355d519 upstream.
+
+__alloc_pages_slowpath() has no change detection for ac->nodemask in its
+retry path, while cpuset can modify it in parallel.  For processes whose
+mempolicy is MPOL_BIND, this results in ac->nodemask changing under the
+allocator: should_reclaim_retry() then judges based on the latest nodemask
+and jumps back to retry, while get_page_from_freelist() still only
+traverses the zonelist from ac->preferred_zoneref, which was selected with
+the now-stale nodemask.  This can cause infinite retries in some cases.
+cpu 64:
+__alloc_pages_slowpath {
+        /* ..... */
+retry:
+        /* ac->nodemask = 0x1, ac->preferred->zone->nid = 1 */
+        if (alloc_flags & ALLOC_KSWAPD)
+                wake_all_kswapds(order, gfp_mask, ac);
+        /* cpu 1:
+        cpuset_write_resmask
+            update_nodemask
+                update_nodemasks_hier
+                    update_tasks_nodemask
+                        mpol_rebind_task
+                         mpol_rebind_policy
+                          mpol_rebind_nodemask
+               // mempolicy->nodes has been modified,
+               // which ac->nodemask point to
+
+        */
+        /* ac->nodemask = 0x3, ac->preferred->zone->nid = 1 */
+        if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags,
+                                 did_some_progress > 0, &no_progress_loops))
+                goto retry;
+}
+
+Simultaneously starting multiple cpuset01 instances from LTP can quickly
+reproduce this issue on a multi-node server once maximum memory pressure
+is reached and swap is enabled.
+
+Link: https://lkml.kernel.org/r/20250416082405.20988-1-zhangtianyang@loongson.cn
+Fixes: c33d6c06f60f ("mm, page_alloc: avoid looking up the first zone in a zonelist twice")
+Signed-off-by: Tianyang Zhang <zhangtianyang@loongson.cn>
+Reviewed-by: Suren Baghdasaryan <surenb@google.com>
+Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
+Cc: Michal Hocko <mhocko@suse.com>
+Cc: Brendan Jackman <jackmanb@google.com>
+Cc: Johannes Weiner <hannes@cmpxchg.org>
+Cc: Zi Yan <ziy@nvidia.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/page_alloc.c |    8 ++++++++
+ 1 file changed, 8 insertions(+)
+
+--- a/mm/page_alloc.c
++++ b/mm/page_alloc.c
+@@ -5057,6 +5057,14 @@ restart:
+       }
+ retry:
++      /*
++       * Deal with possible cpuset update races or zonelist updates to avoid
++       * infinite retries.
++       */
++      if (check_retry_cpuset(cpuset_mems_cookie, ac) ||
++          check_retry_zonelist(zonelist_iter_cookie))
++              goto restart;
++
+       /* Ensure kswapd doesn't accidentally go to sleep as long as we loop */
+       if (alloc_flags & ALLOC_KSWAPD)
+               wake_all_kswapds(order, gfp_mask, ac);
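The check_retry_cpuset()/check_retry_zonelist() helpers called here compare cookies captured at the restart label against the current cpuset and zonelist generations, and force a jump back to restart, where the allocation context (including ac->preferred_zoneref) is recomputed from the current nodemask, whenever either has changed. As a rough, non-kernel illustration of that cookie-and-restart pattern, here is a minimal sketch with hypothetical names and a plain counter in place of the kernel's sequence counters.

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

/* Hypothetical stand-in for the cpuset mems "cookie": a generation counter
 * bumped whenever the allowed nodemask is rewritten. */
static unsigned int mems_generation;

static unsigned int read_mems_cookie(void)
{
        return mems_generation;
}

static bool check_retry_cpuset(unsigned int cookie)
{
        /* True when the nodemask changed since the cookie was taken. */
        return cookie != mems_generation;
}

/* Placeholder allocation attempt: fails a few times, and a simulated
 * concurrent cpuset update bumps the generation once along the way. */
static void *try_alloc(int *attempts)
{
        if (++(*attempts) == 2)
                mems_generation++;      /* simulate the racing update */
        return *attempts < 4 ? NULL : malloc(64);
}

static void *slow_path_alloc(void)
{
        unsigned int cookie;
        int attempts = 0;
        void *page;

restart:
        /* In the real slow path this is where ac->preferred_zoneref would
         * be recomputed from the current nodemask. */
        cookie = read_mems_cookie();

retry:
        /* Detect a racing cpuset update before looping again; retrying
         * with a stale preferred zone is what caused infinite retries. */
        if (check_retry_cpuset(cookie)) {
                printf("cookie changed, restarting\n");
                goto restart;
        }

        page = try_alloc(&attempts);
        if (!page)
                goto retry;

        return page;
}

int main(void)
{
        void *p = slow_path_alloc();
        printf("allocated %p\n", p);
        free(p);
        return 0;
}

Once the generation changes, the stale state is discarded at restart instead of being retried forever, which is exactly the failure mode described in the commit message.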
diff --git a/queue-5.15/series b/queue-5.15/series
index b9e434f846f30e1a9bd6efeed5a187d22da5b2eb..42773c2fe862ae34414cc49fb6e360b4850e7172 100644 (file)
--- a/queue-5.15/series
+++ b/queue-5.15/series
@@ -178,3 +178,5 @@ llc-fix-data-loss-when-reading-from-a-socket-in-llc_ui_recvmsg.patch
 platform-x86-dell-wmi-sysman-avoid-buffer-overflow-in-current_password_store.patch
 drm-edid-fixed-the-bug-that-hdr-metadata-was-not-reset.patch
 revert-drm-amd-keep-display-off-while-going-into-s4.patch
+memcg-always-call-cond_resched-after-fn.patch
+mm-page_alloc.c-avoid-infinite-retries-caused-by-cpuset-race.patch