llc = debugfs_create_dir("llc_balancing", debugfs_sched);
debugfs_create_file("enabled", 0644, llc, NULL,
&sched_cache_enable_fops);
+ debugfs_create_u32("aggr_tolerance", 0644, llc,
+ &llc_aggr_tolerance);
+ debugfs_create_u32("epoch_period", 0644, llc,
+ &llc_epoch_period);
+ debugfs_create_u32("epoch_affinity_timeout", 0644, llc,
+ &llc_epoch_affinity_timeout);
+ debugfs_create_u32("overaggr_pct", 0644, llc,
+ &llc_overaggr_pct);
+ debugfs_create_u32("imb_pct", 0644, llc,
+ &llc_imb_pct);
#endif
debugfs_create_file("debug", 0444, debugfs_sched, NULL, &sched_debug_fops);
*/
#define EPOCH_PERIOD (HZ / 100) /* 10 ms */
#define EPOCH_LLC_AFFINITY_TIMEOUT 5 /* 50 ms */
+__read_mostly unsigned int llc_aggr_tolerance = 1;
+__read_mostly unsigned int llc_epoch_period = EPOCH_PERIOD;
+__read_mostly unsigned int llc_epoch_affinity_timeout = EPOCH_LLC_AFFINITY_TIMEOUT;
+__read_mostly unsigned int llc_imb_pct = 20;
+__read_mostly unsigned int llc_overaggr_pct = 50;
static int llc_id(int cpu)
{
return per_cpu(sd_llc_id, cpu);
}
+static inline int get_sched_cache_scale(int mul)
+{
+ unsigned int tol = READ_ONCE(llc_aggr_tolerance);
+
+ if (!tol)
+ return 0;
+
+ if (tol >= 100)
+ return INT_MAX;
+
+ return (1 + (tol - 1) * mul);
+}
+
static bool exceed_llc_capacity(struct mm_struct *mm, int cpu)
{
#ifdef CONFIG_NUMA_BALANCING
unsigned long llc, footprint;
struct sched_domain *sd;
+ int scale;
guard(rcu)();
llc = sd->llc_bytes;
footprint = READ_ONCE(mm->sc_stat.footprint);
- return (llc < (footprint * PAGE_SIZE));
+ /*
+ * Scale the LLC size by 256*llc_aggr_tolerance
+ * and compare it to the task's footprint.
+ *
+ * Suppose the L3 size is 32MB. If the
+ * llc_aggr_tolerance is 1:
+ * When the footprint is larger than 32MB, the
+ * process is regarded as exceeding the LLC
+ * capacity. If the llc_aggr_tolerance is 99:
+ * When the footprint is larger than 784GB, the
+ * process is regarded as exceeding the LLC
+ * capacity:
+ * 784GB = (1 + (99 - 1) * 256) * 32MB
+ * If the llc_aggr_tolerance is 100:
+ * ignore the footprint and do the aggregation
+ * anyway.
+ */
+ scale = get_sched_cache_scale(256);
+ if (scale == INT_MAX)
+ return false;
+
+ return ((llc * (u64)scale) < (footprint * PAGE_SIZE));
}
#endif
return false;
static bool invalid_llc_nr(struct mm_struct *mm, struct task_struct *p,
int cpu)
{
+ int scale;
+
if (get_nr_threads(p) <= 1)
return true;
+ /*
+ * Scale the number of 'cores' in a LLC by llc_aggr_tolerance
+ * and compare it to the task's active threads.
+ */
+ scale = get_sched_cache_scale(1);
+ if (scale == INT_MAX)
+ return false;
+
return !fits_capacity((mm->sc_stat.nr_running_avg * cpu_smt_num_threads),
- per_cpu(sd_llc_size, cpu));
+ (scale * per_cpu(sd_llc_size, cpu)));
}
static void account_llc_enqueue(struct rq *rq, struct task_struct *p)
{
lockdep_assert_held(&rq->cpu_epoch_lock);
+ unsigned int period = max(READ_ONCE(llc_epoch_period), 1U);
unsigned long n, now = jiffies;
long delta = now - rq->cpu_epoch_next;
if (delta > 0) {
- n = (delta + EPOCH_PERIOD - 1) / EPOCH_PERIOD;
+ n = (delta + period - 1) / period;
rq->cpu_epoch += n;
- rq->cpu_epoch_next += n * EPOCH_PERIOD;
+ rq->cpu_epoch_next += n * period;
__shr_u64(&rq->cpu_runtime, n);
}
* If this process hasn't hit task_cache_work() for a while invalidate
* its preferred state.
*/
- if (epoch - READ_ONCE(mm->sc_stat.epoch) > EPOCH_LLC_AFFINITY_TIMEOUT ||
+ if ((long)(epoch - READ_ONCE(mm->sc_stat.epoch)) > llc_epoch_affinity_timeout ||
invalid_llc_nr(mm, p, cpu_of(rq)) ||
exceed_llc_capacity(mm, cpu_of(rq))) {
if (mm->sc_stat.cpu != -1)
/* only 1 thread is allowed to scan */
if (!try_cmpxchg(&mm->sc_stat.next_scan, &next_scan,
- now + EPOCH_PERIOD))
+ now + max_t(unsigned long,
+ READ_ONCE(llc_epoch_period), 1)))
return;
curr_cpu = task_cpu(p);
*/
static bool fits_llc_capacity(unsigned long util, unsigned long max)
{
- u32 aggr_pct = 50;
+ u32 aggr_pct = llc_overaggr_pct;
/*
* For single core systems, raise the aggregation
*/
/* Allows dst util to be bigger than src util by up to bias percent */
#define util_greater(util1, util2) \
- ((util1) * 100 > (util2) * 120)
+ ((util1) * 100 > (util2) * (100 + llc_imb_pct))
static __maybe_unused bool get_llc_stats(int cpu, unsigned long *util,
unsigned long *cap)