sched/cache: Avoid cache-aware scheduling for memory-heavy processes

author Chen Yu <yu.c.chen@intel.com>

Wed, 13 May 2026 20:39:16 +0000 (13:39 -0700)

committer Peter Zijlstra <peterz@infradead.org>

Mon, 18 May 2026 19:33:15 +0000 (21:33 +0200)
author Chen Yu <yu.c.chen@intel.com>
Wed, 13 May 2026 20:39:16 +0000 (13:39 -0700)
committer Peter Zijlstra <peterz@infradead.org>
Mon, 18 May 2026 19:33:15 +0000 (21:33 +0200)
diff --git a/include/linux/sched.h b/include/linux/sched.h

index 6701911eaaf73557d0f26cf466ca72c31941127a..95729670929cda28d6e477c2543d1f36bbd989a7 100644 (file)
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2425,6 +2425,7 @@ struct sched_cache_stat {
         unsigned long epoch;
         u64 nr_running_avg;
         unsigned long next_scan;
+       unsigned long footprint;
         int cpu;
  } ____cacheline_aligned_in_smp;
  
diff --git a/kernel/exit.c b/kernel/exit.c

index ede3117fa7d413a40105af85be6da21583cbaa88..77275c26a2a17145ef5d342f3ab1c4cac8b24ae8 100644 (file)
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -543,6 +543,32 @@ void mm_update_next_owner(struct mm_struct *mm)
  }
  #endif /* CONFIG_MEMCG */
  
+#if defined(CONFIG_SCHED_CACHE) && defined(CONFIG_NUMA_BALANCING)
+/*
+ * Remove the exiting task's share (current->total_numa_faults) from
+ * mm->sc_stat.footprint. Done without a lock for performance, so
+ * saturate at zero instead of letting the unsigned counter wrap.
+ */
+static void exit_mm_sched_cache(struct mm_struct *mm)
+{
+       unsigned long task_fp = current->total_numa_faults;
+       unsigned long mm_fp;
+
+       /* Zero faults recorded for this task: nothing to subtract. */
+       if (!task_fp)
+               return;
+
+       mm_fp = READ_ONCE(mm->sc_stat.footprint);
+       mm_fp = mm_fp > task_fp ? mm_fp - task_fp : 0;
+       WRITE_ONCE(mm->sc_stat.footprint, mm_fp);
+}
+#else
+/* Nothing to subtract unless both options above are enabled. */
+static inline void exit_mm_sched_cache(struct mm_struct *mm)
+{
+}
+#endif /* CONFIG_SCHED_CACHE && CONFIG_NUMA_BALANCING */
+
  /*
   * Turn us into a lazy TLB process if we
   * aren't already..
@@ -554,6 +580,9 @@ static void exit_mm(void)
         exit_mm_release(current, mm);
         if (!mm)
                 return;
+
+       exit_mm_sched_cache(mm);
+
         mmap_read_lock(mm);
         mmgrab_lazy_tlb(mm);
         BUG_ON(mm != current->active_mm);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c

index df21366ba1cab171754f245f9621560c374e6015..a10116ffe0d1fe0b611c5ecd9243e8c7b0e4a76e 100644 (file)
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1384,6 +1384,32 @@ static int llc_id(int cpu)
         return per_cpu(sd_llc_id, cpu);
  }
  
+/*
+ * True if @cpu has no sched domain, or if @mm's footprint (in pages)
+ * exceeds that domain's llc_bytes while NUMA balancing is enabled.
+ */
+static bool exceed_llc_capacity(struct mm_struct *mm, int cpu)
+{
+#ifdef CONFIG_NUMA_BALANCING
+       unsigned long llc, footprint;
+       struct sched_domain *sd;
+
+       guard(rcu)();
+       sd = rcu_dereference_sched_domain(cpu_rq(cpu)->sd);
+       if (!sd)
+               return true;
+
+       if (static_branch_likely(&sched_numa_balancing)) {
+               /* TBD: RDT exclusive LLC ways reserved should be excluded. */
+               llc = sd->llc_bytes;
+               footprint = READ_ONCE(mm->sc_stat.footprint);
+               /* Compare in pages: footprint * PAGE_SIZE may overflow. */
+               return footprint > llc / PAGE_SIZE;
+       }
+#endif
+       return false;
+}
+
  static bool invalid_llc_nr(struct mm_struct *mm, struct task_struct *p,
                            int cpu)
  {
@@ -1463,6 +1489,7 @@ void mm_init_sched(struct mm_struct *mm,
         mm->sc_stat.cpu = -1;
         mm->sc_stat.next_scan = jiffies;
         mm->sc_stat.nr_running_avg = 0;
+       mm->sc_stat.footprint = 0;
         /*
          * The update to mm->sc_stat should not be reordered
          * before initialization to mm's other fields, in case
@@ -1585,7 +1612,8 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec)
          * its preferred state.
          */
         if (epoch - READ_ONCE(mm->sc_stat.epoch) > EPOCH_LLC_AFFINITY_TIMEOUT ||
-           invalid_llc_nr(mm, p, cpu_of(rq))) {
+           invalid_llc_nr(mm, p, cpu_of(rq)) ||
+           exceed_llc_capacity(mm, cpu_of(rq))) {
                 if (mm->sc_stat.cpu != -1)
                         mm->sc_stat.cpu = -1;
         }
@@ -1716,7 +1744,8 @@ static void task_cache_work(struct callback_head *work)
                 return;
  
         curr_cpu = task_cpu(p);
-       if (invalid_llc_nr(mm, p, curr_cpu)) {
+       if (invalid_llc_nr(mm, p, curr_cpu) ||
+           exceed_llc_capacity(mm, curr_cpu)) {
                 if (mm->sc_stat.cpu != -1)
                         mm->sc_stat.cpu = -1;
  
@@ -3515,6 +3544,7 @@ static void task_numa_placement(struct task_struct *p)
         unsigned long total_faults;
         u64 runtime, period;
         spinlock_t *group_lock = NULL;
+       long __maybe_unused new_fp;
         struct numa_group *ng;
  
         /*
@@ -3589,6 +3619,31 @@ static void task_numa_placement(struct task_struct *p)
                                 ng->total_faults += diff;
                                 group_faults += ng->faults[mem_idx];
                         }
+#ifdef CONFIG_SCHED_CACHE
+                       /*
+                        * Each task's p->numa_faults[mem_idx] converges, so
+                        * the sum over all tasks sharing the mm converges
+                        * too; given the number of threads, it cannot
+                        * overflow an unsigned long.
+                        *
+                        * The update is racy with concurrent updates from
+                        * other threads sharing this mm. That is acceptable
+                        * because footprint is a heuristic and occasional
+                        * lost updates are tolerable.
+                        *
+                        * When a task exits, its footprint must be subtracted
+                        * from mm->sc_stat.footprint. Otherwise the exiting
+                        * thread's share stays in the sum, never decays, and
+                        * mm->sc_stat.footprint does not converge. See exit_mm().
+                        *
+                        * Lost updates and the unsynchronized subtraction in
+                        * exit_mm() can make footprint + diff go negative.
+                        * Clamp to zero so the unsigned footprint cannot wrap.
+                        */
+                       new_fp = (long)READ_ONCE(p->mm->sc_stat.footprint) + diff;
+                       WRITE_ONCE(p->mm->sc_stat.footprint,
+                                  max(new_fp, 0L));
+#endif
                 }
  
                 if (!ng) {
@@ -10338,7 +10393,8 @@ static enum llc_mig can_migrate_llc_task(int src_cpu, int dst_cpu,
                 return mig_unrestricted;
  
         /* skip cache aware load balance for too many threads */
-       if (invalid_llc_nr(mm, p, dst_cpu)) {
+       if (invalid_llc_nr(mm, p, dst_cpu) ||
+           exceed_llc_capacity(mm, dst_cpu)) {
                 if (mm->sc_stat.cpu != -1)
                         mm->sc_stat.cpu = -1;
                 return mig_unrestricted;
author	Chen Yu <yu.c.chen@intel.com>
	Wed, 13 May 2026 20:39:16 +0000 (13:39 -0700)
committer	Peter Zijlstra <peterz@infradead.org>
	Mon, 18 May 2026 19:33:15 +0000 (21:33 +0200)
include/linux/sched.h		patch \| blob \| blame \| history
kernel/exit.c		patch \| blob \| blame \| history
kernel/sched/fair.c		patch \| blob \| blame \| history