From: Chen Yu Date: Wed, 13 May 2026 20:39:16 +0000 (-0700) Subject: sched/cache: Avoid cache-aware scheduling for memory-heavy processes X-Git-Url: http://git.ipfire.org/gitweb/index.cgi?a=commitdiff_plain;h=808915f982c2a52f5d148510ecfab52284de67cf;p=thirdparty%2Flinux.git sched/cache: Avoid cache-aware scheduling for memory-heavy processes Prateek and Tingyin reported that memory-intensive workloads (such as stream) can saturate memory bandwidth and caches on the preferred LLC when sched_cache aggregates too many threads. To mitigate this, estimate a process's memory footprint by comparing its NUMA balancing fault statistics to the size of the LLC. If the footprint exceeds the LLC size, skip cache-aware scheduling. Note that footprint is only an approximation of the memory footprint, since the kernel lacks suitable metrics to estimate the real working set. If a user-provided hint is available in the future, it would be more accurate. A later patch will allow users to provide a hint to adjust this threshold. Suggested-by: K Prateek Nayak Suggested-by: Vern Hao Signed-off-by: Chen Yu Co-developed-by: Tim Chen Signed-off-by: Tim Chen Signed-off-by: Peter Zijlstra (Intel) Tested-by: Tingyin Duan Link: https://patch.msgid.link/95cf64a385bcc12f18dcebe9d59e8d3ba8bb318f.1778703694.git.tim.c.chen@linux.intel.com --- diff --git a/include/linux/sched.h b/include/linux/sched.h index 6701911eaaf73..95729670929cd 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2425,6 +2425,7 @@ struct sched_cache_stat { unsigned long epoch; u64 nr_running_avg; unsigned long next_scan; + unsigned long footprint; int cpu; } ____cacheline_aligned_in_smp; diff --git a/kernel/exit.c b/kernel/exit.c index ede3117fa7d41..77275c26a2a17 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -543,6 +543,32 @@ void mm_update_next_owner(struct mm_struct *mm) } #endif /* CONFIG_MEMCG */ +#if defined(CONFIG_SCHED_CACHE) && defined(CONFIG_NUMA_BALANCING) +/* + * Subtract the memory footprint of the current task from + * mm. + */ +static void exit_mm_sched_cache(struct mm_struct *mm) +{ + unsigned long fp, sub; + + if (!current->total_numa_faults) + return; + /* + * No lock protection due to performance considerations. + * Make sure mm->sc_stat.footprint does not become + * negative. + */ + fp = READ_ONCE(mm->sc_stat.footprint); + sub = min(fp, current->total_numa_faults); + WRITE_ONCE(mm->sc_stat.footprint, fp - sub); +} +#else +static inline void exit_mm_sched_cache(struct mm_struct *mm) +{ +} +#endif /* CONFIG_SCHED_CACHE CONFIG_NUMA_BALANCING */ + /* * Turn us into a lazy TLB process if we * aren't already.. @@ -554,6 +580,9 @@ static void exit_mm(void) exit_mm_release(current, mm); if (!mm) return; + + exit_mm_sched_cache(mm); + mmap_read_lock(mm); mmgrab_lazy_tlb(mm); BUG_ON(mm != current->active_mm); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index df21366ba1cab..a10116ffe0d1f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1384,6 +1384,32 @@ static int llc_id(int cpu) return per_cpu(sd_llc_id, cpu); } +static bool exceed_llc_capacity(struct mm_struct *mm, int cpu) +{ +#ifdef CONFIG_NUMA_BALANCING + unsigned long llc, footprint; + struct sched_domain *sd; + + guard(rcu)(); + + sd = rcu_dereference_sched_domain(cpu_rq(cpu)->sd); + if (!sd) + return true; + + if (static_branch_likely(&sched_numa_balancing)) { + /* + * TBD: RDT exclusive LLC ways reserved should be + * excluded. + */ + llc = sd->llc_bytes; + footprint = READ_ONCE(mm->sc_stat.footprint); + + return (llc < (footprint * PAGE_SIZE)); + } +#endif + return false; +} + static bool invalid_llc_nr(struct mm_struct *mm, struct task_struct *p, int cpu) { @@ -1463,6 +1489,7 @@ void mm_init_sched(struct mm_struct *mm, mm->sc_stat.cpu = -1; mm->sc_stat.next_scan = jiffies; mm->sc_stat.nr_running_avg = 0; + mm->sc_stat.footprint = 0; /* * The update to mm->sc_stat should not be reordered * before initialization to mm's other fields, in case @@ -1585,7 +1612,8 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec) * its preferred state. */ if (epoch - READ_ONCE(mm->sc_stat.epoch) > EPOCH_LLC_AFFINITY_TIMEOUT || - invalid_llc_nr(mm, p, cpu_of(rq))) { + invalid_llc_nr(mm, p, cpu_of(rq)) || + exceed_llc_capacity(mm, cpu_of(rq))) { if (mm->sc_stat.cpu != -1) mm->sc_stat.cpu = -1; } @@ -1716,7 +1744,8 @@ static void task_cache_work(struct callback_head *work) return; curr_cpu = task_cpu(p); - if (invalid_llc_nr(mm, p, curr_cpu)) { + if (invalid_llc_nr(mm, p, curr_cpu) || + exceed_llc_capacity(mm, curr_cpu)) { if (mm->sc_stat.cpu != -1) mm->sc_stat.cpu = -1; @@ -3515,6 +3544,7 @@ static void task_numa_placement(struct task_struct *p) unsigned long total_faults; u64 runtime, period; spinlock_t *group_lock = NULL; + long __maybe_unused new_fp; struct numa_group *ng; /* @@ -3589,6 +3619,31 @@ static void task_numa_placement(struct task_struct *p) ng->total_faults += diff; group_faults += ng->faults[mem_idx]; } +#ifdef CONFIG_SCHED_CACHE + /* + * Per task p->numa_faults[mem_idx] converges, + * so the accumulation of each task's faults + * converges too - Given the number of threads, + * it cannot overflow an unsigned long. + * Racy with concurrent updates from other threads + * sharing this mm. Acceptable since footprint is a + * heuristic and occasional lost updates are tolerable. + * + * If a task exits, its corresponding footprint must + * be subtracted from the mm->sc_stat.footprint, otherwise + * the mm->sc_stat.footprint will not converge: + * the exiting thread's footprint remains unchanged/undecayed + * in mm->sc_stat.footprint. See exit_mm(). + * + * Lost updates and unsynchronized subtraction + * in exit_mm() can cause footprint + diff to + * go negative. Clamp to zero to prevent the + * unsigned footprint from wrapping. + */ + new_fp = (long)READ_ONCE(p->mm->sc_stat.footprint) + diff; + WRITE_ONCE(p->mm->sc_stat.footprint, + max(new_fp, 0L)); +#endif } if (!ng) { @@ -10338,7 +10393,8 @@ static enum llc_mig can_migrate_llc_task(int src_cpu, int dst_cpu, return mig_unrestricted; /* skip cache aware load balance for too many threads */ - if (invalid_llc_nr(mm, p, dst_cpu)) { + if (invalid_llc_nr(mm, p, dst_cpu) || + exceed_llc_capacity(mm, dst_cpu)) { if (mm->sc_stat.cpu != -1) mm->sc_stat.cpu = -1; return mig_unrestricted;