From: Peter Zijlstra Date: Mon, 11 May 2026 11:31:13 +0000 (+0200) Subject: sched: Remove sched_class::pick_next_task() X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=5ad278dd20bdf59714443894d7b3044471af97d0;p=thirdparty%2Fkernel%2Flinux.git sched: Remove sched_class::pick_next_task() The reason for pick_next_task_fair() is the put/set optimization that avoids touching the common ancestors. However, it is possible to implement this in the put_prev_task() and set_next_task() calls as used in put_prev_set_next_task(). Notably, put_prev_set_next_task() is the only site that: - calls put_prev_task() with a .next argument; - calls set_next_task() with .first = true. This means that put_prev_task() can determine the common hierarchy and stop there, and then set_next_task() can terminate where put_prev_task stopped. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Link: https://patch.msgid.link/20260511120628.057634261@infradead.org --- diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 83202f090f0c4..3c8bfd697e2c1 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6030,16 +6030,15 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) if (likely(!sched_class_above(prev->sched_class, &fair_sched_class) && rq->nr_running == rq->cfs.h_nr_queued)) { - p = pick_next_task_fair(rq, prev, rf); + p = pick_task_fair(rq, rf); if (unlikely(p == RETRY_TASK)) goto restart; /* Assume the next prioritized class is idle_sched_class */ - if (!p) { + if (!p) p = pick_task_idle(rq, rf); - put_prev_set_next_task(rq, prev, p); - } + put_prev_set_next_task(rq, prev, p); return p; } @@ -6047,20 +6046,12 @@ restart: prev_balance(rq, prev, rf); for_each_active_class(class) { - if (class->pick_next_task) { - p = class->pick_next_task(rq, prev, rf); - if (unlikely(p == RETRY_TASK)) - goto restart; - if (p) - return p; - } else { - p = class->pick_task(rq, rf); - if (unlikely(p == RETRY_TASK)) - goto restart; - if (p) { - put_prev_set_next_task(rq, prev, p); - return p; - } + p = class->pick_task(rq, rf); + if (unlikely(p == RETRY_TASK)) + goto restart; + if (p) { + put_prev_set_next_task(rq, prev, p); + return p; } } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5f48af700fd44..62a2dcb0d03e6 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9862,7 +9862,7 @@ preempt: resched_curr_lazy(rq); } -static struct task_struct *pick_task_fair(struct rq *rq, struct rq_flags *rf) +struct task_struct *pick_task_fair(struct rq *rq, struct rq_flags *rf) __must_hold(__rq_lockp(rq)) { struct sched_entity *se; @@ -9905,72 +9905,6 @@ idle: return NULL; } -static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool first); -static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first); - -struct task_struct * -pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) - __must_hold(__rq_lockp(rq)) -{ - struct sched_entity *se; - struct task_struct *p; - - p = pick_task_fair(rq, rf); - if (unlikely(p == RETRY_TASK)) - return p; - if (!p) - return p; - se = &p->se; - -#ifdef CONFIG_FAIR_GROUP_SCHED - if (prev->sched_class != &fair_sched_class) - goto simple; - - __put_prev_set_next_dl_server(rq, prev, p); - - /* - * Because of the set_next_buddy() in dequeue_task_fair() it is rather - * likely that a next task is from the same cgroup as the current. - * - * Therefore attempt to avoid putting and setting the entire cgroup - * hierarchy, only change the part that actually changes. - * - * Since we haven't yet done put_prev_entity and if the selected task - * is a different task than we started out with, try and touch the - * least amount of cfs_rqs. - */ - if (prev != p) { - struct sched_entity *pse = &prev->se; - struct cfs_rq *cfs_rq; - - while (!(cfs_rq = is_same_group(se, pse))) { - int se_depth = se->depth; - int pse_depth = pse->depth; - - if (se_depth <= pse_depth) { - put_prev_entity(cfs_rq_of(pse), pse); - pse = parent_entity(pse); - } - if (se_depth >= pse_depth) { - set_next_entity(cfs_rq_of(se), se, true); - se = parent_entity(se); - } - } - - put_prev_entity(cfs_rq, pse); - set_next_entity(cfs_rq, se, true); - - __set_next_task_fair(rq, p, true); - } - - return p; - -simple: -#endif /* CONFIG_FAIR_GROUP_SCHED */ - put_prev_set_next_task(rq, prev, p); - return p; -} - static struct task_struct * fair_server_pick_task(struct sched_dl_entity *dl_se, struct rq_flags *rf) __must_hold(__rq_lockp(dl_se->rq)) @@ -9994,10 +9928,33 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev, struct t { struct sched_entity *se = &prev->se; struct cfs_rq *cfs_rq; + struct sched_entity *nse = NULL; - for_each_sched_entity(se) { +#ifdef CONFIG_FAIR_GROUP_SCHED + if (next && next->sched_class == &fair_sched_class) + nse = &next->se; +#endif + + while (se) { cfs_rq = cfs_rq_of(se); - put_prev_entity(cfs_rq, se); + if (!nse || cfs_rq->curr) + put_prev_entity(cfs_rq, se); +#ifdef CONFIG_FAIR_GROUP_SCHED + if (nse) { + if (is_same_group(se, nse)) + break; + + int d = nse->depth - se->depth; + if (d >= 0) { + /* nse has equal or greater depth, ascend */ + nse = parent_entity(nse); + /* if nse is the deeper, do not ascend se */ + if (d > 0) + continue; + } + } +#endif + se = parent_entity(se); } } @@ -15021,10 +14978,30 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p) } } -static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool first) +/* + * Account for a task changing its policy or group. + * + * This routine is mostly called to set cfs_rq->curr field when a task + * migrates between groups/classes. + */ +static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first) { struct sched_entity *se = &p->se; + for_each_sched_entity(se) { + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + if (IS_ENABLED(CONFIG_FAIR_GROUP_SCHED) && + first && cfs_rq->curr) + break; + + set_next_entity(cfs_rq, se, first); + /* ensure bandwidth has been allocated on our new cfs_rq */ + account_cfs_rq_runtime(cfs_rq, 0); + } + + se = &p->se; + if (task_on_rq_queued(p)) { /* * Move the next running task to the front of the list, so our @@ -15044,27 +15021,6 @@ static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool firs sched_fair_update_stop_tick(rq, p); } -/* - * Account for a task changing its policy or group. - * - * This routine is mostly called to set cfs_rq->curr field when a task - * migrates between groups/classes. - */ -static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first) -{ - struct sched_entity *se = &p->se; - - for_each_sched_entity(se) { - struct cfs_rq *cfs_rq = cfs_rq_of(se); - - set_next_entity(cfs_rq, se, first); - /* ensure bandwidth has been allocated on our new cfs_rq */ - account_cfs_rq_runtime(cfs_rq, 0); - } - - __set_next_task_fair(rq, p, first); -} - void init_cfs_rq(struct cfs_rq *cfs_rq) { cfs_rq->tasks_timeline = RB_ROOT_CACHED; @@ -15376,7 +15332,6 @@ DEFINE_SCHED_CLASS(fair) = { .wakeup_preempt = wakeup_preempt_fair, .pick_task = pick_task_fair, - .pick_next_task = pick_next_task_fair, .put_prev_task = put_prev_task_fair, .set_next_task = set_next_task_fair, diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 8eb8f83db6b04..6b48bb3074fe2 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2589,17 +2589,6 @@ struct sched_class { * schedule/pick_next_task: rq->lock */ struct task_struct *(*pick_task)(struct rq *rq, struct rq_flags *rf); - /* - * Optional! When implemented pick_next_task() should be equivalent to: - * - * next = pick_task(); - * if (next) { - * put_prev_task(prev); - * set_next_task_first(next); - * } - */ - struct task_struct *(*pick_next_task)(struct rq *rq, struct task_struct *prev, - struct rq_flags *rf); /* * sched_change: @@ -2823,8 +2812,7 @@ static inline bool sched_fair_runnable(struct rq *rq) return rq->cfs.nr_queued > 0; } -extern struct task_struct *pick_next_task_fair(struct rq *rq, struct task_struct *prev, - struct rq_flags *rf); +extern struct task_struct *pick_task_fair(struct rq *rq, struct rq_flags *rf); extern struct task_struct *pick_task_idle(struct rq *rq, struct rq_flags *rf); #define SCA_CHECK 0x01