git.ipfire.org Git - thirdparty/linux.git/commitdiff
sched/cache: Avoid cache-aware scheduling for memory-heavy processes
author: Chen Yu <yu.c.chen@intel.com>
Wed, 13 May 2026 20:39:16 +0000 (13:39 -0700)
committer: Peter Zijlstra <peterz@infradead.org>
Mon, 18 May 2026 19:33:15 +0000 (21:33 +0200)
Prateek and Tingyin reported that memory-intensive workloads (such as
stream) can saturate memory bandwidth and caches on the preferred LLC
when sched_cache aggregates too many threads.

To mitigate this, estimate a process's memory footprint from its NUMA
balancing fault statistics and compare it to the size of the LLC. If the
footprint exceeds the LLC size, skip cache-aware scheduling.

Note that this estimate is only an approximation of the real memory
footprint, since the kernel lacks suitable metrics to measure the actual
working set. A user-provided hint would be more accurate; a later patch
will allow users to provide such a hint to adjust this threshold.

Suggested-by: K Prateek Nayak <kprateek.nayak@amd.com>
Suggested-by: Vern Hao <vernhao@tencent.com>
Signed-off-by: Chen Yu <yu.c.chen@intel.com>
Co-developed-by: Tim Chen <tim.c.chen@linux.intel.com>
Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Tingyin Duan <tingyin.duan@gmail.com>
Link: https://patch.msgid.link/95cf64a385bcc12f18dcebe9d59e8d3ba8bb318f.1778703694.git.tim.c.chen@linux.intel.com
include/linux/sched.h
kernel/exit.c
kernel/sched/fair.c

index 6701911eaaf73557d0f26cf466ca72c31941127a..95729670929cda28d6e477c2543d1f36bbd989a7 100644 (file)
@@ -2425,6 +2425,7 @@ struct sched_cache_stat {
        unsigned long epoch;
        u64 nr_running_avg;
        unsigned long next_scan;
+       unsigned long footprint;
        int cpu;
 } ____cacheline_aligned_in_smp;
 
index ede3117fa7d413a40105af85be6da21583cbaa88..77275c26a2a17145ef5d342f3ab1c4cac8b24ae8 100644 (file)
@@ -543,6 +543,32 @@ void mm_update_next_owner(struct mm_struct *mm)
 }
 #endif /* CONFIG_MEMCG */
 
+#if defined(CONFIG_SCHED_CACHE) && defined(CONFIG_NUMA_BALANCING)
+/*
+ * Remove the exiting task's contribution (current->total_numa_faults)
+ * from mm->sc_stat.footprint.
+ */
+static void exit_mm_sched_cache(struct mm_struct *mm)
+{
+       unsigned long total, drop;
+
+       if (current->total_numa_faults == 0)
+               return;
+
+       /*
+        * The update is deliberately lockless for performance reasons.
+        * Clamp the amount removed to the value currently stored so
+        * that the unsigned mm->sc_stat.footprint cannot wrap below
+        * zero.
+        */
+       total = READ_ONCE(mm->sc_stat.footprint);
+       drop = min(total, current->total_numa_faults);
+       WRITE_ONCE(mm->sc_stat.footprint, total - drop);
+}
+#else
+/* No-op unless both CONFIG_SCHED_CACHE and CONFIG_NUMA_BALANCING are set. */
+static inline void exit_mm_sched_cache(struct mm_struct *mm)
+{
+}
+#endif /* CONFIG_SCHED_CACHE && CONFIG_NUMA_BALANCING */
+
 /*
  * Turn us into a lazy TLB process if we
  * aren't already..
@@ -554,6 +580,9 @@ static void exit_mm(void)
        exit_mm_release(current, mm);
        if (!mm)
                return;
+
+       exit_mm_sched_cache(mm);
+
        mmap_read_lock(mm);
        mmgrab_lazy_tlb(mm);
        BUG_ON(mm != current->active_mm);
index df21366ba1cab171754f245f9621560c374e6015..a10116ffe0d1fe0b611c5ecd9243e8c7b0e4a76e 100644 (file)
@@ -1384,6 +1384,32 @@ static int llc_id(int cpu)
        return per_cpu(sd_llc_id, cpu);
 }
 
+/*
+ * Decide whether cache-aware scheduling should be skipped for @mm
+ * because its estimated footprint does not fit in the LLC of @cpu.
+ *
+ * mm->sc_stat.footprint is treated as a page count and compared with
+ * sd->llc_bytes of the sched domain attached to @cpu's runqueue.
+ *
+ * Returns true if the footprint exceeds the LLC size, or if @cpu has
+ * no sched domain attached. Returns false if the footprint fits, or
+ * if NUMA balancing is compiled out or disabled at runtime (no
+ * footprint estimate is available in that case).
+ */
+static bool exceed_llc_capacity(struct mm_struct *mm, int cpu)
+{
+#ifdef CONFIG_NUMA_BALANCING
+       unsigned long llc, footprint;
+       struct sched_domain *sd;
+
+       guard(rcu)();
+
+       sd = rcu_dereference_sched_domain(cpu_rq(cpu)->sd);
+       if (!sd)
+               return true;
+
+       if (static_branch_likely(&sched_numa_balancing)) {
+               /*
+                * TBD: RDT exclusive LLC ways reserved should be
+                * excluded.
+                */
+               llc = sd->llc_bytes;
+               footprint = READ_ONCE(mm->sc_stat.footprint);
+
+               /*
+                * Compare in units of pages rather than bytes:
+                * footprint * PAGE_SIZE could wrap an unsigned long
+                * and falsely report that the footprint fits. For an
+                * integer footprint this is equivalent to
+                * llc < footprint * PAGE_SIZE.
+                */
+               return footprint > llc / PAGE_SIZE;
+       }
+#endif
+       return false;
+}
+
 static bool invalid_llc_nr(struct mm_struct *mm, struct task_struct *p,
                           int cpu)
 {
@@ -1463,6 +1489,7 @@ void mm_init_sched(struct mm_struct *mm,
        mm->sc_stat.cpu = -1;
        mm->sc_stat.next_scan = jiffies;
        mm->sc_stat.nr_running_avg = 0;
+       mm->sc_stat.footprint = 0;
        /*
         * The update to mm->sc_stat should not be reordered
         * before initialization to mm's other fields, in case
@@ -1585,7 +1612,8 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec)
         * its preferred state.
         */
        if (epoch - READ_ONCE(mm->sc_stat.epoch) > EPOCH_LLC_AFFINITY_TIMEOUT ||
-           invalid_llc_nr(mm, p, cpu_of(rq))) {
+           invalid_llc_nr(mm, p, cpu_of(rq)) ||
+           exceed_llc_capacity(mm, cpu_of(rq))) {
                if (mm->sc_stat.cpu != -1)
                        mm->sc_stat.cpu = -1;
        }
@@ -1716,7 +1744,8 @@ static void task_cache_work(struct callback_head *work)
                return;
 
        curr_cpu = task_cpu(p);
-       if (invalid_llc_nr(mm, p, curr_cpu)) {
+       if (invalid_llc_nr(mm, p, curr_cpu) ||
+           exceed_llc_capacity(mm, curr_cpu)) {
                if (mm->sc_stat.cpu != -1)
                        mm->sc_stat.cpu = -1;
 
@@ -3515,6 +3544,7 @@ static void task_numa_placement(struct task_struct *p)
        unsigned long total_faults;
        u64 runtime, period;
        spinlock_t *group_lock = NULL;
+       long __maybe_unused new_fp;
        struct numa_group *ng;
 
        /*
@@ -3589,6 +3619,31 @@ static void task_numa_placement(struct task_struct *p)
                                ng->total_faults += diff;
                                group_faults += ng->faults[mem_idx];
                        }
+#ifdef CONFIG_SCHED_CACHE
+                       /*
+                        * Per task p->numa_faults[mem_idx] converges,
+                        * so the accumulation of each task's faults
+                        * converges too - Given the number of threads,
+                        * it cannot overflow an unsigned long.
+                        * Racy with concurrent updates from other threads
+                        * sharing this mm. Acceptable since footprint is a
+                        * heuristic and occasional lost updates are tolerable.
+                        *
+                        * If a task exits, its corresponding footprint must
+                        * be subtracted from the mm->sc_stat.footprint, otherwise
+                        * the mm->sc_stat.footprint will not converge:
+                        * the exiting thread's footprint remains unchanged/undecayed
+                        * in mm->sc_stat.footprint. See exit_mm().
+                        *
+                        * Lost updates and unsynchronized subtraction
+                        * in exit_mm() can cause footprint + diff to
+                        * go negative. Clamp to zero to prevent the
+                        * unsigned footprint from wrapping.
+                        */
+                       new_fp = (long)READ_ONCE(p->mm->sc_stat.footprint) + diff;
+                       WRITE_ONCE(p->mm->sc_stat.footprint,
+                                  max(new_fp, 0L));
+#endif
                }
 
                if (!ng) {
@@ -10338,7 +10393,8 @@ static enum llc_mig can_migrate_llc_task(int src_cpu, int dst_cpu,
                return mig_unrestricted;
 
        /* skip cache aware load balance for too many threads */
-       if (invalid_llc_nr(mm, p, dst_cpu)) {
+       if (invalid_llc_nr(mm, p, dst_cpu) ||
+           exceed_llc_capacity(mm, dst_cpu)) {
                if (mm->sc_stat.cpu != -1)
                        mm->sc_stat.cpu = -1;
                return mig_unrestricted;