From: John Stultz Date: Sat, 12 Jul 2025 03:33:46 +0000 (+0000) Subject: sched: Fix runtime accounting w/ split exec & sched contexts X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=aa4f74dfd42ba4399f785fb9c460a11bd1756f0a;p=thirdparty%2Flinux.git sched: Fix runtime accounting w/ split exec & sched contexts Without proxy-exec, we normally charge the "current" task for both its vruntime as well as its sum_exec_runtime. With proxy, however, we have two "current" contexts: the scheduler context and the execution context. We want to charge the execution context rq->curr (ie: proxy/lock holder) execution time to its sum_exec_runtime (so it's clear to userland the rq->curr task *is* running), as well as its thread group. However the rest of the time accounting (such a vruntime and cgroup accounting), we charge against the scheduler context (rq->donor) task, because it is from that task that the time is being "donated". If the donor and curr tasks are the same, then it's the same as without proxy. Signed-off-by: John Stultz Signed-off-by: Peter Zijlstra (Intel) Tested-by: K Prateek Nayak Link: https://lkml.kernel.org/r/20250712033407.2383110-6-jstultz@google.com --- diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8334580ed3a30..97176458f2be5 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1152,30 +1152,40 @@ void post_init_entity_util_avg(struct task_struct *p) sa->runnable_avg = sa->util_avg; } -static s64 update_curr_se(struct rq *rq, struct sched_entity *curr) +static s64 update_se(struct rq *rq, struct sched_entity *se) { u64 now = rq_clock_task(rq); s64 delta_exec; - delta_exec = now - curr->exec_start; + delta_exec = now - se->exec_start; if (unlikely(delta_exec <= 0)) return delta_exec; - curr->exec_start = now; - curr->sum_exec_runtime += delta_exec; + se->exec_start = now; + if (entity_is_task(se)) { + struct task_struct *donor = task_of(se); + struct task_struct *running = rq->curr; + /* + * If se is a task, we account the time against the running + * task, as w/ proxy-exec they may not be the same. + */ + running->se.exec_start = now; + running->se.sum_exec_runtime += delta_exec; - if (entity_is_task(curr)) { - struct task_struct *p = task_of(curr); + trace_sched_stat_runtime(running, delta_exec); + account_group_exec_runtime(running, delta_exec); - trace_sched_stat_runtime(p, delta_exec); - account_group_exec_runtime(p, delta_exec); - cgroup_account_cputime(p, delta_exec); + /* cgroup time is always accounted against the donor */ + cgroup_account_cputime(donor, delta_exec); + } else { + /* If not task, account the time against donor se */ + se->sum_exec_runtime += delta_exec; } if (schedstat_enabled()) { struct sched_statistics *stats; - stats = __schedstats_from_se(curr); + stats = __schedstats_from_se(se); __schedstat_set(stats->exec_max, max(delta_exec, stats->exec_max)); } @@ -1188,9 +1198,7 @@ static s64 update_curr_se(struct rq *rq, struct sched_entity *curr) */ s64 update_curr_common(struct rq *rq) { - struct task_struct *donor = rq->donor; - - return update_curr_se(rq, &donor->se); + return update_se(rq, &rq->donor->se); } /* @@ -1198,6 +1206,12 @@ s64 update_curr_common(struct rq *rq) */ static void update_curr(struct cfs_rq *cfs_rq) { + /* + * Note: cfs_rq->curr corresponds to the task picked to + * run (ie: rq->donor.se) which due to proxy-exec may + * not necessarily be the actual task running + * (rq->curr.se). This is easy to confuse! + */ struct sched_entity *curr = cfs_rq->curr; struct rq *rq = rq_of(cfs_rq); s64 delta_exec; @@ -1206,7 +1220,7 @@ static void update_curr(struct cfs_rq *cfs_rq) if (unlikely(!curr)) return; - delta_exec = update_curr_se(rq, curr); + delta_exec = update_se(rq, curr); if (unlikely(delta_exec <= 0)) return;