From: Peter Zijlstra Date: Tue, 2 Jun 2026 07:10:05 +0000 (+0000) Subject: sched/fair: Unify cfs_rq throttling via account_cfs_rq_runtime() X-Git-Url: http://git.ipfire.org/gitweb/?a=commitdiff_plain;h=f666241e6bd5d9a494beca982e1953208dce531c;p=thirdparty%2Fkernel%2Flinux.git sched/fair: Unify cfs_rq throttling via account_cfs_rq_runtime() assign_cfs_rq_runtime() during update_curr() sets the resched indicator and relies on check_cfs_rq_runtime() during pick_next_task() / put_prev_entity() to throttle the hierarchy once current task is preempted / blocks. Per-task throttle, on the other hand, uses throttle_cfs_rq() to simply propagate the throttle signals, and then relies on task work to individually throttle the runnable tasks on their way out to the userspace. Remove check_cfs_rq_runtime() and unify throttling into account_cfs_rq_runtime() which only sets the cfs_rq->throttled, cfs_rq->throttle_count indicators via throttle_cfs_rq() and optionally adds the task work to the current task (donor) it is on the throttled hierarchy. throttle_cfs_rq() requests for sched_cfs_bandwidth_slice() worth of bandwidth for the current hierarchy that enable it to continue running uninterrupted when selected. For the rest, it requests a bare minimum of "1" to ensure some bandwidth is available and pass the "runtime_remaining > 0" checks once selected. For SCHED_PROXY_EXEC, a mutex holder cannot exit to userspace without dropping it first and the mutex_unlock() ensures proxy is stopped before the mutex handoff which preserves the current semantics for running a throttled task until it exits to the userspace even if it acts as a donor. [ prateek: rebased on tip, comments, commit message. ] Reviewed-By: Benjamin Segall Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: K Prateek Nayak Signed-off-by: Peter Zijlstra (Intel) Tested-by: Aaron Lu Link: https://patch.msgid.link/20260602071005.11942-1-kprateek.nayak@amd.com --- diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3f3f09a021db..f4ed841f766f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -525,7 +525,7 @@ static int se_is_idle(struct sched_entity *se) #endif /* !CONFIG_FAIR_GROUP_SCHED */ static __always_inline -void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec); +bool account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec); /************************************************************** * Scheduling class tree data structure manipulation methods: @@ -6388,8 +6388,6 @@ pick_next_entity(struct rq *rq, struct cfs_rq *cfs_rq, bool protect) return se; } -static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq); - static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) { /* @@ -6399,9 +6397,6 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) if (prev->on_rq) update_curr(cfs_rq); - /* throttle cfs_rqs exceeding runtime */ - check_cfs_rq_runtime(cfs_rq); - if (prev->on_rq) { update_stats_wait_start_fair(cfs_rq, prev); /* Put 'current' back into the tree. */ @@ -6536,41 +6531,32 @@ static int __assign_cfs_rq_runtime(struct cfs_bandwidth *cfs_b, return cfs_rq->runtime_remaining > 0; } -/* returns 0 on failure to allocate runtime */ -static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) -{ - struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); - - guard(raw_spinlock)(&cfs_b->lock); +static bool throttle_cfs_rq(struct cfs_rq *cfs_rq); - return __assign_cfs_rq_runtime(cfs_b, cfs_rq, sched_cfs_bandwidth_slice()); -} - -static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) +static bool __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) { /* dock delta_exec before expiring quota (as it could span periods) */ cfs_rq->runtime_remaining -= delta_exec; if (likely(cfs_rq->runtime_remaining > 0)) - return; + return false; if (cfs_rq->throttled) - return; + return true; /* - * if we're unable to extend our runtime we resched so that the active - * hierarchy can be throttled + * throttle_cfs_rq() will try to extend the runtime first + * before throttling the hierarchy. */ - if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr)) - resched_curr(rq_of(cfs_rq)); + return throttle_cfs_rq(cfs_rq); } static __always_inline -void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) +bool account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) { if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled) - return; + return false; - __account_cfs_rq_runtime(cfs_rq, delta_exec); + return __account_cfs_rq_runtime(cfs_rq, delta_exec); } static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) @@ -6858,10 +6844,24 @@ static int tg_throttle_down(struct task_group *tg, void *data) static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) { - struct rq *rq = rq_of(cfs_rq); struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); + struct sched_entity *curr = cfs_rq->curr; + struct rq *rq = rq_of(cfs_rq); scoped_guard(raw_spinlock, &cfs_b->lock) { + u64 target_runtime = 1; + + /* + * If cfs_rq->curr is still runnable, we are here from an + * update_curr(). Request sysctl_sched_cfs_bandwidth_slice + * worth of bandwidth to continue running. + * + * If the curr is not runnable, just request enough bandwidth + * to be runnable next time the pick selects this cfs_rq. + */ + if (curr && curr->on_rq) + target_runtime = sched_cfs_bandwidth_slice(); + /* * Check if We have raced with bandwidth becoming available. If * we actually throttled the timer might not unthrottle us for @@ -6872,7 +6872,7 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) * * This will start the period timer if necessary. */ - if (__assign_cfs_rq_runtime(cfs_b, cfs_rq, 1)) + if (__assign_cfs_rq_runtime(cfs_b, cfs_rq, target_runtime)) return false; /* @@ -6893,6 +6893,17 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) */ cfs_rq->throttled = 1; WARN_ON_ONCE(cfs_rq->throttled_clock); + + /* + * If current hierarchy was throttled, add throttle work to the + * current donor. In case of proxy-execution, the execution + * context cannot exit to the userspace while holding a mutex + * and the rule of throttle deferral to only throttle the + * throttled context at exit to userspace is still preserved. + */ + if (curr && curr->on_rq) + task_throttle_setup_work(rq->donor); + return true; } @@ -7283,7 +7294,7 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq) if (!cfs_bandwidth_used()) return; - /* an active group must be handled by the update_curr()->put() path */ + /* an active group must be handled by the update_curr() path */ if (!cfs_rq->runtime_enabled || cfs_rq->curr) return; @@ -7293,8 +7304,6 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq) /* update runtime allocation */ account_cfs_rq_runtime(cfs_rq, 0); - if (cfs_rq->runtime_remaining <= 0) - throttle_cfs_rq(cfs_rq); } static void sync_throttle(struct task_group *tg, int cpu) @@ -7324,25 +7333,6 @@ static void sync_throttle(struct task_group *tg, int cpu) cfs_rq->pelt_clock_throttled = 1; } -/* conditionally throttle active cfs_rq's from put_prev_entity() */ -static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) -{ - if (!cfs_bandwidth_used()) - return false; - - if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0)) - return false; - - /* - * it's possible for a throttled entity to be forced into a running - * state (e.g. set_curr_task), in this case we're finished. - */ - if (cfs_rq_throttled(cfs_rq)) - return true; - - return throttle_cfs_rq(cfs_rq); -} - static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer) { struct cfs_bandwidth *cfs_b = @@ -7596,8 +7586,7 @@ static void sched_fair_update_stop_tick(struct rq *rq, struct task_struct *p) #else /* !CONFIG_CFS_BANDWIDTH: */ -static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {} -static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; } +static bool account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) { return false; } static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} static inline void sync_throttle(struct task_group *tg, int cpu) {} static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} @@ -9934,8 +9923,6 @@ again: if (cfs_rq->curr && cfs_rq->curr->on_rq) update_curr(cfs_rq); - throttled |= check_cfs_rq_runtime(cfs_rq); - se = pick_next_entity(rq, cfs_rq, true); if (!se) goto again; @@ -14853,8 +14840,8 @@ static inline void task_tick_core(struct rq *rq, struct task_struct *curr) {} */ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) { - struct cfs_rq *cfs_rq; struct sched_entity *se = &curr->se; + struct cfs_rq *cfs_rq; for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); @@ -15036,6 +15023,7 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p) static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first) { struct sched_entity *se = &p->se; + bool throttled = false; for_each_sched_entity(se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); @@ -15046,9 +15034,12 @@ static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first) set_next_entity(cfs_rq, se, first); /* ensure bandwidth has been allocated on our new cfs_rq */ - account_cfs_rq_runtime(cfs_rq, 0); + throttled |= account_cfs_rq_runtime(cfs_rq, 0); } + if (throttled) + task_throttle_setup_work(p); + se = &p->se; if (task_on_rq_queued(p)) {