sched/cache: Disable cache aware scheduling for processes with high thread counts

author Chen Yu <yu.c.chen@intel.com>

Wed, 13 May 2026 20:39:13 +0000 (13:39 -0700)

committer Peter Zijlstra <peterz@infradead.org>

Mon, 18 May 2026 19:33:14 +0000 (21:33 +0200)
author Chen Yu <yu.c.chen@intel.com>
Wed, 13 May 2026 20:39:13 +0000 (13:39 -0700)
committer Peter Zijlstra <peterz@infradead.org>
Mon, 18 May 2026 19:33:14 +0000 (21:33 +0200)
diff --git a/include/linux/sched.h b/include/linux/sched.h

index 6d883f109ba378f7a1076c2b8a4188a1632072bd..6701911eaaf73557d0f26cf466ca72c31941127a 100644 (file)
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2423,6 +2423,7 @@ struct sched_cache_stat {
         struct sched_cache_time __percpu *pcpu_sched;
         raw_spinlock_t lock;
         unsigned long epoch;
+       u64 nr_running_avg;
         unsigned long next_scan;
         int cpu;
  } ____cacheline_aligned_in_smp;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c

index a759ea669d74d257573af727d0c3370b47e4a79b..808f614fc2d2646a66180bfb586e99abf388b39d 100644 (file)
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1384,6 +1384,12 @@ static int llc_id(int cpu)
         return per_cpu(sd_llc_id, cpu);
  }
  
+static bool invalid_llc_nr(struct mm_struct *mm, int cpu)
+{
+       return !fits_capacity((mm->sc_stat.nr_running_avg * cpu_smt_num_threads),
+                       per_cpu(sd_llc_size, cpu));
+}
+
  static void account_llc_enqueue(struct rq *rq, struct task_struct *p)
  {
         struct sched_domain *sd;
@@ -1452,7 +1458,7 @@ void mm_init_sched(struct mm_struct *mm,
         mm->sc_stat.epoch = epoch;
         mm->sc_stat.cpu = -1;
         mm->sc_stat.next_scan = jiffies;
-
+       mm->sc_stat.nr_running_avg = 0;
         /*
          * The update to mm->sc_stat should not be reordered
          * before initialization to mm's other fields, in case
@@ -1574,7 +1580,8 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec)
          * If this process hasn't hit task_cache_work() for a while invalidate
          * its preferred state.
          */
-       if (epoch - READ_ONCE(mm->sc_stat.epoch) > EPOCH_LLC_AFFINITY_TIMEOUT) {
+       if (epoch - READ_ONCE(mm->sc_stat.epoch) > EPOCH_LLC_AFFINITY_TIMEOUT ||
+           invalid_llc_nr(mm, cpu_of(rq))) {
                 if (mm->sc_stat.cpu != -1)
                         mm->sc_stat.cpu = -1;
         }
@@ -1660,14 +1667,32 @@ out:
         cpumask_copy(cpus, cpu_online_mask);
  }
  
+static inline void update_avg_scale(u64 *avg, u64 sample)
+{
+       int factor = per_cpu(sd_llc_size, raw_smp_processor_id());
+       s64 diff = sample - *avg;
+       u32 divisor;
+
+       /*
+        * Scale the divisor based on the number of CPUs contained
+        * in the LLC. This scaling ensures smaller LLC domains use
+        * a smaller divisor to achieve more precise sensitivity to
+        * changes in nr_running, while larger LLC domains are capped
+        * at a maximum divisor of 8 which is the default smoothing
+        * factor of EWMA in update_avg().
+        */
+       divisor = clamp_t(u32, (factor >> 2), 2, 8);
+       *avg += div64_s64(diff, divisor);
+}
+
  static void task_cache_work(struct callback_head *work)
  {
         unsigned long next_scan, now = jiffies;
-       struct task_struct *p = current;
+       struct task_struct *p = current, *cur;
+       int cpu, m_a_cpu = -1, nr_running = 0;
+       unsigned long curr_m_a_occ = 0;
         struct mm_struct *mm = p->mm;
         unsigned long m_a_occ = 0;
-       unsigned long curr_m_a_occ = 0;
-       int cpu, m_a_cpu = -1;
         cpumask_var_t cpus;
  
         WARN_ON_ONCE(work != &p->cache_work);
@@ -1711,6 +1736,11 @@ static void task_cache_work(struct callback_head *work)
                                         m_occ = occ;
                                         m_cpu = i;
                                 }
+
+                               cur = rcu_dereference_all(cpu_rq(i)->curr);
+                               if (cur && !(cur->flags & (PF_EXITING | PF_KTHREAD)) &&
+                                   cur->mm == mm)
+                                       nr_running++;
                         }
  
                         /*
@@ -1754,6 +1784,7 @@ static void task_cache_work(struct callback_head *work)
                 mm->sc_stat.cpu = m_a_cpu;
         }
  
+       update_avg_scale(&mm->sc_stat.nr_running_avg, nr_running);
         free_cpumask_var(cpus);
  }
  
@@ -10294,6 +10325,13 @@ static enum llc_mig can_migrate_llc_task(int src_cpu, int dst_cpu,
         if (cpu < 0 || cpus_share_cache(src_cpu, dst_cpu))
                 return mig_unrestricted;
  
+       /* skip cache aware load balance for too many threads */
+       if (invalid_llc_nr(mm, dst_cpu)) {
+               if (mm->sc_stat.cpu != -1)
+                       mm->sc_stat.cpu = -1;
+               return mig_unrestricted;
+       }
+
         if (cpus_share_cache(dst_cpu, cpu))
                 to_pref = true;
         else if (cpus_share_cache(src_cpu, cpu))
author	Chen Yu <yu.c.chen@intel.com>
	Wed, 13 May 2026 20:39:13 +0000 (13:39 -0700)
committer	Peter Zijlstra <peterz@infradead.org>
	Mon, 18 May 2026 19:33:14 +0000 (21:33 +0200)
include/linux/sched.h		patch \| blob \| blame \| history
kernel/sched/fair.c		patch \| blob \| blame \| history