]> git.ipfire.org Git - thirdparty/kernel/linux.git/commitdiff
mm: fix OOM killer inaccuracy on large many-core systems
authorMathieu Desnoyers <mathieu.desnoyers@efficios.com>
Wed, 14 Jan 2026 14:36:42 +0000 (09:36 -0500)
committerAndrew Morton <akpm@linux-foundation.org>
Sat, 31 Jan 2026 22:22:37 +0000 (14:22 -0800)
Use the precise, albeit slower, precise RSS counter sums for the OOM
killer task selection and console dumps.  The approximated value is too
imprecise on large many-core systems.

The following rss tracking issues were noted by Sweet Tea Dorminy [1],
which lead to picking wrong tasks as OOM kill target:

  Recently, several internal services had an RSS usage regression as part of a
  kernel upgrade. Previously, they were on a pre-6.2 kernel and were able to
  read RSS statistics in a backup watchdog process to monitor and decide if
  they'd overrun their memory budget. Now, however, a representative service
  with five threads, expected to use about a hundred MB of memory, on a 250-cpu
  machine had memory usage tens of megabytes different from the expected amount
  -- this constituted a significant percentage of inaccuracy, causing the
  watchdog to act.

  This was a result of commit f1a7941243c1 ("mm: convert mm's rss stats
  into percpu_counter") [1].  Previously, the memory error was bounded by
  64*nr_threads pages, a very livable megabyte. Now, however, as a result of
  scheduler decisions moving the threads around the CPUs, the memory error could
  be as large as a gigabyte.

  This is a really tremendous inaccuracy for any few-threaded program on a
  large machine and impedes monitoring significantly. These stat counters are
  also used to make OOM killing decisions, so this additional inaccuracy could
  make a big difference in OOM situations -- either resulting in the wrong
  process being killed, or in less memory being returned from an OOM-kill than
  expected.

Here is a (possibly incomplete) list of the prior approaches that were
used or proposed, along with their downside:

1) Per-thread rss tracking: large error on many-thread processes.

2) Per-CPU counters: up to 12% slower for short-lived processes and 9%
   increased system time in make test workloads [1]. Moreover, the
   inaccuracy increases with O(n^2) with the number of CPUs.

3) Per-NUMA-node counters: requires atomics on fast-path (overhead),
   error is high with systems that have lots of NUMA nodes (32 times
   the number of NUMA nodes).

commit 82241a83cd15 ("mm: fix the inaccurate memory statistics issue for
users") introduced get_mm_counter_sum() for precise proc memory status
queries for some proc files.

The simple fix proposed here is to do the precise per-cpu counters sum
every time a counter value needs to be read.  This applies to the OOM
killer task selection, oom task console dumps (printk).

This change increases the latency introduced when the OOM killer executes
in favor of doing a more precise OOM target task selection.  Effectively,
the OOM killer iterates on all tasks, for all relevant page types, for
which the precise sum iterates on all possible CPUs.

As a reference, here is the execution time of the OOM killer before/after
the change:

AMD EPYC 9654 96-Core (2 sockets)
Within a KVM, configured with 256 logical cpus.

                                  |  before  |  after   |
----------------------------------|----------|----------|
nr_processes=40                   |  0.3 ms  |   0.5 ms |
nr_processes=10000                |  3.0 ms  |  80.0 ms |

Link: https://lkml.kernel.org/r/20260114143642.47333-1-mathieu.desnoyers@efficios.com
Fixes: f1a7941243c1 ("mm: convert mm's rss stats into percpu_counter")
Link: https://lore.kernel.org/lkml/20250331223516.7810-2-sweettea-kernel@dorminy.me/
Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Suggested-by: Michal Hocko <mhocko@suse.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: "Paul E. McKenney" <paulmck@kernel.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Dennis Zhou <dennis@kernel.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Christoph Lameter <cl@linux.com>
Cc: Martin Liu <liumartin@google.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Shakeel Butt <shakeel.butt@linux.dev>
Cc: SeongJae Park <sj@kernel.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Sweet Tea Dorminy <sweettea-kernel@dorminy.me>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: "Liam R . Howlett" <liam.howlett@oracle.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Wei Yang <richard.weiyang@gmail.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Yu Zhao <yuzhao@google.com>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Mateusz Guzik <mjguzik@gmail.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Aboorva Devarajan <aboorvad@linux.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
include/linux/mm.h
mm/oom_kill.c

index aacabf8a0b58e69d2325a6bdb45e0ace59e4c75b..aa90719234f1051a1211d35923615ded3ce9c74f 100644 (file)
@@ -2906,6 +2906,13 @@ static inline unsigned long get_mm_rss(struct mm_struct *mm)
                get_mm_counter(mm, MM_SHMEMPAGES);
 }
 
+static inline unsigned long get_mm_rss_sum(struct mm_struct *mm)
+{
+       return get_mm_counter_sum(mm, MM_FILEPAGES) +
+               get_mm_counter_sum(mm, MM_ANONPAGES) +
+               get_mm_counter_sum(mm, MM_SHMEMPAGES);
+}
+
 static inline unsigned long get_mm_hiwater_rss(struct mm_struct *mm)
 {
        return max(mm->hiwater_rss, get_mm_rss(mm));
index 94066316e3ecb2bbb0452b89a5e41c44fa19f4a4..5c6c95c169ee8bbb1e90fab5be1cb39704f0f136 100644 (file)
@@ -228,7 +228,7 @@ long oom_badness(struct task_struct *p, unsigned long totalpages)
         * The baseline for the badness score is the proportion of RAM that each
         * task's rss, pagetable and swap space use.
         */
-       points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS) +
+       points = get_mm_rss_sum(p->mm) + get_mm_counter_sum(p->mm, MM_SWAPENTS) +
                mm_pgtables_bytes(p->mm) / PAGE_SIZE;
        task_unlock(p);
 
@@ -402,10 +402,10 @@ static int dump_task(struct task_struct *p, void *arg)
 
        pr_info("[%7d] %5d %5d %8lu %8lu %8lu %8lu %9lu %8ld %8lu         %5hd %s\n",
                task->pid, from_kuid(&init_user_ns, task_uid(task)),
-               task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
-               get_mm_counter(task->mm, MM_ANONPAGES), get_mm_counter(task->mm, MM_FILEPAGES),
-               get_mm_counter(task->mm, MM_SHMEMPAGES), mm_pgtables_bytes(task->mm),
-               get_mm_counter(task->mm, MM_SWAPENTS),
+               task->tgid, task->mm->total_vm, get_mm_rss_sum(task->mm),
+               get_mm_counter_sum(task->mm, MM_ANONPAGES), get_mm_counter_sum(task->mm, MM_FILEPAGES),
+               get_mm_counter_sum(task->mm, MM_SHMEMPAGES), mm_pgtables_bytes(task->mm),
+               get_mm_counter_sum(task->mm, MM_SWAPENTS),
                task->signal->oom_score_adj, task->comm);
        task_unlock(task);
 
@@ -604,9 +604,9 @@ static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
 
        pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
                        task_pid_nr(tsk), tsk->comm,
-                       K(get_mm_counter(mm, MM_ANONPAGES)),
-                       K(get_mm_counter(mm, MM_FILEPAGES)),
-                       K(get_mm_counter(mm, MM_SHMEMPAGES)));
+                       K(get_mm_counter_sum(mm, MM_ANONPAGES)),
+                       K(get_mm_counter_sum(mm, MM_FILEPAGES)),
+                       K(get_mm_counter_sum(mm, MM_SHMEMPAGES)));
 out_finish:
        trace_finish_task_reaping(tsk->pid);
 out_unlock:
@@ -960,9 +960,9 @@ static void __oom_kill_process(struct task_struct *victim, const char *message)
        mark_oom_victim(victim);
        pr_err("%s: Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB, UID:%u pgtables:%lukB oom_score_adj:%d\n",
                message, task_pid_nr(victim), victim->comm, K(mm->total_vm),
-               K(get_mm_counter(mm, MM_ANONPAGES)),
-               K(get_mm_counter(mm, MM_FILEPAGES)),
-               K(get_mm_counter(mm, MM_SHMEMPAGES)),
+               K(get_mm_counter_sum(mm, MM_ANONPAGES)),
+               K(get_mm_counter_sum(mm, MM_FILEPAGES)),
+               K(get_mm_counter_sum(mm, MM_SHMEMPAGES)),
                from_kuid(&init_user_ns, task_uid(victim)),
                mm_pgtables_bytes(mm) >> 10, victim->signal->oom_score_adj);
        task_unlock(victim);