}
#endif /* CONFIG_MEMCG */
+#if defined(CONFIG_SCHED_CACHE) && defined(CONFIG_NUMA_BALANCING)
+/*
+ * Subtract the memory footprint of the current task from
+ * mm.
+ */
+static void exit_mm_sched_cache(struct mm_struct *mm)
+{
+ unsigned long fp, sub;
+
+ if (!current->total_numa_faults)
+ return;
+ /*
+ * No lock protection due to performance considerations.
+ * Make sure mm->sc_stat.footprint does not become
+ * negative.
+ */
+ fp = READ_ONCE(mm->sc_stat.footprint);
+ sub = min(fp, current->total_numa_faults);
+ WRITE_ONCE(mm->sc_stat.footprint, fp - sub);
+}
+#else
+static inline void exit_mm_sched_cache(struct mm_struct *mm)
+{
+}
+#endif /* CONFIG_SCHED_CACHE CONFIG_NUMA_BALANCING */
+
/*
* Turn us into a lazy TLB process if we
* aren't already..
exit_mm_release(current, mm);
if (!mm)
return;
+
+ exit_mm_sched_cache(mm);
+
mmap_read_lock(mm);
mmgrab_lazy_tlb(mm);
BUG_ON(mm != current->active_mm);
return per_cpu(sd_llc_id, cpu);
}
+static bool exceed_llc_capacity(struct mm_struct *mm, int cpu)
+{
+#ifdef CONFIG_NUMA_BALANCING
+ unsigned long llc, footprint;
+ struct sched_domain *sd;
+
+ guard(rcu)();
+
+ sd = rcu_dereference_sched_domain(cpu_rq(cpu)->sd);
+ if (!sd)
+ return true;
+
+ if (static_branch_likely(&sched_numa_balancing)) {
+ /*
+ * TBD: RDT exclusive LLC ways reserved should be
+ * excluded.
+ */
+ llc = sd->llc_bytes;
+ footprint = READ_ONCE(mm->sc_stat.footprint);
+
+ return (llc < (footprint * PAGE_SIZE));
+ }
+#endif
+ return false;
+}
+
static bool invalid_llc_nr(struct mm_struct *mm, struct task_struct *p,
int cpu)
{
mm->sc_stat.cpu = -1;
mm->sc_stat.next_scan = jiffies;
mm->sc_stat.nr_running_avg = 0;
+ mm->sc_stat.footprint = 0;
/*
* The update to mm->sc_stat should not be reordered
* before initialization to mm's other fields, in case
* its preferred state.
*/
if (epoch - READ_ONCE(mm->sc_stat.epoch) > EPOCH_LLC_AFFINITY_TIMEOUT ||
- invalid_llc_nr(mm, p, cpu_of(rq))) {
+ invalid_llc_nr(mm, p, cpu_of(rq)) ||
+ exceed_llc_capacity(mm, cpu_of(rq))) {
if (mm->sc_stat.cpu != -1)
mm->sc_stat.cpu = -1;
}
return;
curr_cpu = task_cpu(p);
- if (invalid_llc_nr(mm, p, curr_cpu)) {
+ if (invalid_llc_nr(mm, p, curr_cpu) ||
+ exceed_llc_capacity(mm, curr_cpu)) {
if (mm->sc_stat.cpu != -1)
mm->sc_stat.cpu = -1;
unsigned long total_faults;
u64 runtime, period;
spinlock_t *group_lock = NULL;
+ long __maybe_unused new_fp;
struct numa_group *ng;
/*
ng->total_faults += diff;
group_faults += ng->faults[mem_idx];
}
+#ifdef CONFIG_SCHED_CACHE
+ /*
+ * Per task p->numa_faults[mem_idx] converges,
+ * so the accumulation of each task's faults
+ * converges too - Given the number of threads,
+ * it cannot overflow an unsigned long.
+ * Racy with concurrent updates from other threads
+ * sharing this mm. Acceptable since footprint is a
+ * heuristic and occasional lost updates are tolerable.
+ *
+ * If a task exits, its corresponding footprint must
+ * be subtracted from the mm->sc_stat.footprint, otherwise
+ * the mm->sc_stat.footprint will not converge:
+ * the exiting thread's footprint remains unchanged/undecayed
+ * in mm->sc_stat.footprint. See exit_mm().
+ *
+ * Lost updates and unsynchronized subtraction
+ * in exit_mm() can cause footprint + diff to
+ * go negative. Clamp to zero to prevent the
+ * unsigned footprint from wrapping.
+ */
+ new_fp = (long)READ_ONCE(p->mm->sc_stat.footprint) + diff;
+ WRITE_ONCE(p->mm->sc_stat.footprint,
+ max(new_fp, 0L));
+#endif
}
if (!ng) {
return mig_unrestricted;
/* skip cache aware load balance for too many threads */
- if (invalid_llc_nr(mm, p, dst_cpu)) {
+ if (invalid_llc_nr(mm, p, dst_cpu) ||
+ exceed_llc_capacity(mm, dst_cpu)) {
if (mm->sc_stat.cpu != -1)
mm->sc_stat.cpu = -1;
return mig_unrestricted;