sched: Employ sched_change guards
author Peter Zijlstra <peterz@infradead.org>
Wed, 30 Oct 2024 12:43:43 +0000 (13:43 +0100)
committer Peter Zijlstra <peterz@infradead.org>
Thu, 16 Oct 2025 09:13:50 +0000 (11:13 +0200)
As proposed a long while ago -- and half done by scx -- wrap the
scheduler's 'change' pattern in a guard helper.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Juri Lelli <juri.lelli@redhat.com>
Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: Vincent Guittot <vincent.guittot@linaro.org>
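
In sketch form, the transformation the hunks below apply at each call site: the
open-coded dequeue/modify/enqueue dance

	queued  = task_on_rq_queued(p);
	running = task_current_donor(rq, p);
	if (queued)
		dequeue_task(rq, p, queue_flags);
	if (running)
		put_prev_task(rq, p);

	/* ... modify p's scheduling properties ... */

	if (queued)
		enqueue_task(rq, p, queue_flags);
	if (running)
		set_next_task(rq, p);

becomes

	scoped_guard (sched_change, p, queue_flags) {
		/* ... modify p's scheduling properties ... */
	}

with the queued/running bookkeeping carried in a struct sched_change_ctx by
sched_change_begin() and sched_change_end() (see the kernel/sched/sched.h and
kernel/sched/core.c hunks).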
include/linux/cleanup.h
kernel/sched/core.c
kernel/sched/ext.c
kernel/sched/sched.h
kernel/sched/syscalls.c

diff --git a/include/linux/cleanup.h b/include/linux/cleanup.h
index 2573585b7f068abe992af1ac05f478fef7b34306..ae381675455db1c0832c9480f57bf0700e710cbe 100644
--- a/include/linux/cleanup.h
+++ b/include/linux/cleanup.h
@@ -340,6 +340,11 @@ _label:                                                         \
 #define __DEFINE_CLASS_IS_CONDITIONAL(_name, _is_cond) \
 static __maybe_unused const bool class_##_name##_is_conditional = _is_cond
 
+#define DEFINE_CLASS_IS_UNCONDITIONAL(_name)           \
+       __DEFINE_CLASS_IS_CONDITIONAL(_name, false);    \
+       static inline void * class_##_name##_lock_ptr(class_##_name##_t *_T) \
+       { return (void *)1; }
+
 #define __GUARD_IS_ERR(_ptr)                                       \
        ({                                                         \
                unsigned long _rc = (__force unsigned long)(_ptr); \
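
DEFINE_CLASS_IS_UNCONDITIONAL() marks a class whose constructor cannot fail: it
pairs __DEFINE_CLASS_IS_CONDITIONAL(name, false) with a dummy
class_##name##_lock_ptr() that always returns (void *)1. A rough picture of
what that buys scoped_guard() users -- simplified, not the literal macro text
in cleanup.h:

	/* scoped_guard(sched_change, p, flags) { ... } behaves roughly like: */
	{
		class_sched_change_t scope = class_sched_change_constructor(p, flags);

		/*
		 * For a conditional class this test decides whether the body
		 * runs at all; for an unconditional one both operands are
		 * constant true (lock_ptr() returns (void *)1, is_conditional
		 * is false), so the body runs exactly once and no failure
		 * branch is generated.
		 */
		if (class_sched_change_lock_ptr(&scope) || !class_sched_change_is_conditional) {
			/* ... guarded body ... */
		}

		class_sched_change_destructor(&scope);	/* i.e. sched_change_end() */
	}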
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 198d2dd45f59cbcc0e6e7689eb7a9811969241d0..eca40df4b6d33688d825eac44cff937e4f5046a7 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7326,7 +7326,7 @@ void rt_mutex_post_schedule(void)
  */
 void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
 {
-       int prio, oldprio, queued, running, queue_flag =
+       int prio, oldprio, queue_flag =
                DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
        const struct sched_class *prev_class, *next_class;
        struct rq_flags rf;
@@ -7391,52 +7391,42 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
        if (prev_class != next_class && p->se.sched_delayed)
                dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
 
-       queued = task_on_rq_queued(p);
-       running = task_current_donor(rq, p);
-       if (queued)
-               dequeue_task(rq, p, queue_flag);
-       if (running)
-               put_prev_task(rq, p);
-
-       /*
-        * Boosting condition are:
-        * 1. -rt task is running and holds mutex A
-        *      --> -dl task blocks on mutex A
-        *
-        * 2. -dl task is running and holds mutex A
-        *      --> -dl task blocks on mutex A and could preempt the
-        *          running task
-        */
-       if (dl_prio(prio)) {
-               if (!dl_prio(p->normal_prio) ||
-                   (pi_task && dl_prio(pi_task->prio) &&
-                    dl_entity_preempt(&pi_task->dl, &p->dl))) {
-                       p->dl.pi_se = pi_task->dl.pi_se;
-                       queue_flag |= ENQUEUE_REPLENISH;
+       scoped_guard (sched_change, p, queue_flag) {
+               /*
+                * Boosting condition are:
+                * 1. -rt task is running and holds mutex A
+                *      --> -dl task blocks on mutex A
+                *
+                * 2. -dl task is running and holds mutex A
+                *      --> -dl task blocks on mutex A and could preempt the
+                *          running task
+                */
+               if (dl_prio(prio)) {
+                       if (!dl_prio(p->normal_prio) ||
+                           (pi_task && dl_prio(pi_task->prio) &&
+                            dl_entity_preempt(&pi_task->dl, &p->dl))) {
+                               p->dl.pi_se = pi_task->dl.pi_se;
+                               scope->flags |= ENQUEUE_REPLENISH;
+                       } else {
+                               p->dl.pi_se = &p->dl;
+                       }
+               } else if (rt_prio(prio)) {
+                       if (dl_prio(oldprio))
+                               p->dl.pi_se = &p->dl;
+                       if (oldprio < prio)
+                               scope->flags |= ENQUEUE_HEAD;
                } else {
-                       p->dl.pi_se = &p->dl;
+                       if (dl_prio(oldprio))
+                               p->dl.pi_se = &p->dl;
+                       if (rt_prio(oldprio))
+                               p->rt.timeout = 0;
                }
-       } else if (rt_prio(prio)) {
-               if (dl_prio(oldprio))
-                       p->dl.pi_se = &p->dl;
-               if (oldprio < prio)
-                       queue_flag |= ENQUEUE_HEAD;
-       } else {
-               if (dl_prio(oldprio))
-                       p->dl.pi_se = &p->dl;
-               if (rt_prio(oldprio))
-                       p->rt.timeout = 0;
-       }
 
-       p->sched_class = next_class;
-       p->prio = prio;
+               p->sched_class = next_class;
+               p->prio = prio;
 
-       check_class_changing(rq, p, prev_class);
-
-       if (queued)
-               enqueue_task(rq, p, queue_flag);
-       if (running)
-               set_next_task(rq, p);
+               check_class_changing(rq, p, prev_class);
+       }
 
        check_class_changed(rq, p, prev_class, oldprio);
 out_unlock:
@@ -8084,26 +8074,9 @@ int migrate_task_to(struct task_struct *p, int target_cpu)
  */
 void sched_setnuma(struct task_struct *p, int nid)
 {
-       bool queued, running;
-       struct rq_flags rf;
-       struct rq *rq;
-
-       rq = task_rq_lock(p, &rf);
-       queued = task_on_rq_queued(p);
-       running = task_current_donor(rq, p);
-
-       if (queued)
-               dequeue_task(rq, p, DEQUEUE_SAVE);
-       if (running)
-               put_prev_task(rq, p);
-
-       p->numa_preferred_nid = nid;
-
-       if (queued)
-               enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
-       if (running)
-               set_next_task(rq, p);
-       task_rq_unlock(rq, p, &rf);
+       guard(task_rq_lock)(p);
+       scoped_guard (sched_change, p, DEQUEUE_SAVE)
+               p->numa_preferred_nid = nid;
 }
 #endif /* CONFIG_NUMA_BALANCING */
 
@@ -9205,8 +9178,9 @@ static void sched_change_group(struct task_struct *tsk)
  */
 void sched_move_task(struct task_struct *tsk, bool for_autogroup)
 {
-       int queued, running, queue_flags =
+       unsigned int queue_flags =
                DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
+       bool resched = false;
        struct rq *rq;
 
        CLASS(task_rq_lock, rq_guard)(tsk);
@@ -9214,29 +9188,16 @@ void sched_move_task(struct task_struct *tsk, bool for_autogroup)
 
        update_rq_clock(rq);
 
-       running = task_current_donor(rq, tsk);
-       queued = task_on_rq_queued(tsk);
-
-       if (queued)
-               dequeue_task(rq, tsk, queue_flags);
-       if (running)
-               put_prev_task(rq, tsk);
-
-       sched_change_group(tsk);
-       if (!for_autogroup)
-               scx_cgroup_move_task(tsk);
+       scoped_guard (sched_change, tsk, queue_flags) {
+               sched_change_group(tsk);
+               if (!for_autogroup)
+                       scx_cgroup_move_task(tsk);
+               if (scope->running)
+                       resched = true;
+       }
 
-       if (queued)
-               enqueue_task(rq, tsk, queue_flags);
-       if (running) {
-               set_next_task(rq, tsk);
-               /*
-                * After changing group, the running task may have joined a
-                * throttled one but it's still the running task. Trigger a
-                * resched to make sure that task can still run.
-                */
+       if (resched)
                resched_curr(rq);
-       }
 }
 
 static struct cgroup_subsys_state *
@@ -10892,37 +10853,39 @@ void sched_mm_cid_fork(struct task_struct *t)
 }
 #endif /* CONFIG_SCHED_MM_CID */
 
-#ifdef CONFIG_SCHED_CLASS_EXT
-void sched_deq_and_put_task(struct task_struct *p, int queue_flags,
-                           struct sched_enq_and_set_ctx *ctx)
+static DEFINE_PER_CPU(struct sched_change_ctx, sched_change_ctx);
+
+struct sched_change_ctx *sched_change_begin(struct task_struct *p, unsigned int flags)
 {
+       struct sched_change_ctx *ctx = this_cpu_ptr(&sched_change_ctx);
        struct rq *rq = task_rq(p);
 
        lockdep_assert_rq_held(rq);
 
-       *ctx = (struct sched_enq_and_set_ctx){
+       *ctx = (struct sched_change_ctx){
                .p = p,
-               .queue_flags = queue_flags,
+               .flags = flags,
                .queued = task_on_rq_queued(p),
-               .running = task_current(rq, p),
+               .running = task_current_donor(rq, p),
        };
 
-       update_rq_clock(rq);
        if (ctx->queued)
-               dequeue_task(rq, p, queue_flags | DEQUEUE_NOCLOCK);
+               dequeue_task(rq, p, flags);
        if (ctx->running)
                put_prev_task(rq, p);
+
+       return ctx;
 }
 
-void sched_enq_and_set_task(struct sched_enq_and_set_ctx *ctx)
+void sched_change_end(struct sched_change_ctx *ctx)
 {
-       struct rq *rq = task_rq(ctx->p);
+       struct task_struct *p = ctx->p;
+       struct rq *rq = task_rq(p);
 
        lockdep_assert_rq_held(rq);
 
        if (ctx->queued)
-               enqueue_task(rq, ctx->p, ctx->queue_flags | ENQUEUE_NOCLOCK);
+               enqueue_task(rq, p, ctx->flags | ENQUEUE_NOCLOCK);
        if (ctx->running)
-               set_next_task(rq, ctx->p);
+               set_next_task(rq, p);
 }
-#endif /* CONFIG_SCHED_CLASS_EXT */
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 2b0e88206d0768634ab7014fb037f7860c04ca5b..4566a7c813603375524e919c0acebe8dac8355f5 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -3780,11 +3780,10 @@ static void scx_bypass(bool bypass)
                 */
                list_for_each_entry_safe_reverse(p, n, &rq->scx.runnable_list,
                                                 scx.runnable_node) {
-                       struct sched_enq_and_set_ctx ctx;
-
                        /* cycling deq/enq is enough, see the function comment */
-                       sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx);
-                       sched_enq_and_set_task(&ctx);
+                       scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) {
+                               /* nothing */ ;
+                       }
                }
 
                /* resched to restore ticks and idle state */
@@ -3916,17 +3915,16 @@ static void scx_disable_workfn(struct kthread_work *work)
                const struct sched_class *old_class = p->sched_class;
                const struct sched_class *new_class =
                        __setscheduler_class(p->policy, p->prio);
-               struct sched_enq_and_set_ctx ctx;
 
-               if (old_class != new_class && p->se.sched_delayed)
-                       dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
+               update_rq_clock(task_rq(p));
 
-               sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx);
-
-               p->sched_class = new_class;
-               check_class_changing(task_rq(p), p, old_class);
+               if (old_class != new_class && p->se.sched_delayed)
+                       dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
 
-               sched_enq_and_set_task(&ctx);
+               scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK) {
+                       p->sched_class = new_class;
+                       check_class_changing(task_rq(p), p, old_class);
+               }
 
                check_class_changed(task_rq(p), p, old_class, p->prio);
                scx_exit_task(p);
@@ -4660,21 +4658,20 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
                const struct sched_class *old_class = p->sched_class;
                const struct sched_class *new_class =
                        __setscheduler_class(p->policy, p->prio);
-               struct sched_enq_and_set_ctx ctx;
 
                if (!tryget_task_struct(p))
                        continue;
 
-               if (old_class != new_class && p->se.sched_delayed)
-                       dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
-
-               sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx);
+               update_rq_clock(task_rq(p));
 
-               p->scx.slice = SCX_SLICE_DFL;
-               p->sched_class = new_class;
-               check_class_changing(task_rq(p), p, old_class);
+               if (old_class != new_class && p->se.sched_delayed)
+                       dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
 
-               sched_enq_and_set_task(&ctx);
+               scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK) {
+                       p->scx.slice = SCX_SLICE_DFL;
+                       p->sched_class = new_class;
+                       check_class_changing(task_rq(p), p, old_class);
+               }
 
                check_class_changed(task_rq(p), p, old_class, p->prio);
                put_task_struct(p);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 1f5d07067f60a3195c11b0bfe203e027ebc3e3c5..6546849aa075f2ea9f54b33501b66e995751f43a 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -3885,23 +3885,38 @@ extern void check_class_changed(struct rq *rq, struct task_struct *p,
 extern struct balance_callback *splice_balance_callbacks(struct rq *rq);
 extern void balance_callbacks(struct rq *rq, struct balance_callback *head);
 
-#ifdef CONFIG_SCHED_CLASS_EXT
 /*
- * Used by SCX in the enable/disable paths to move tasks between sched_classes
- * and establish invariants.
+ * The 'sched_change' pattern is the safe, easy and slow way of changing a
+ * task's scheduling properties. It dequeues a task, such that the scheduler
+ * is fully unaware of it; at which point its properties can be modified;
+ * after which it is enqueued again.
+ *
+ * Typically this must be called while holding task_rq_lock, since most/all
+ * properties are serialized under those locks. There is currently one
+ * exception to this rule in sched/ext which only holds rq->lock.
+ */
+
+/*
+ * This structure is a temporary, used to preserve/convey the queueing state
+ * of the task between sched_change_begin() and sched_change_end(). Ensuring
+ * the task's queueing state is idempotent across the operation.
  */
-struct sched_enq_and_set_ctx {
+struct sched_change_ctx {
        struct task_struct      *p;
-       int                     queue_flags;
+       int                     flags;
        bool                    queued;
        bool                    running;
 };
 
-void sched_deq_and_put_task(struct task_struct *p, int queue_flags,
-                           struct sched_enq_and_set_ctx *ctx);
-void sched_enq_and_set_task(struct sched_enq_and_set_ctx *ctx);
+struct sched_change_ctx *sched_change_begin(struct task_struct *p, unsigned int flags);
+void sched_change_end(struct sched_change_ctx *ctx);
 
-#endif /* CONFIG_SCHED_CLASS_EXT */
+DEFINE_CLASS(sched_change, struct sched_change_ctx *,
+            sched_change_end(_T),
+            sched_change_begin(p, flags),
+            struct task_struct *p, unsigned int flags)
+
+DEFINE_CLASS_IS_UNCONDITIONAL(sched_change)
 
 #include "ext.h"
 
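Because class_sched_change_t is a struct sched_change_ctx pointer, the scope
variable scoped_guard() provides gives the guarded body direct access to the
saved queueing state; that is how the call sites in this patch tweak the
re-enqueue flags (scope->flags |= ENQUEUE_HEAD) or test scope->queued and
scope->running. A minimal usage sketch, modelled on sched_setnuma() and
sched_move_task() above (nid and resched stand in for the caller's own state):

	bool resched = false;

	guard(task_rq_lock)(p);
	scoped_guard (sched_change, p, DEQUEUE_SAVE) {
		/* here p is dequeued and is no longer the rq's current task */
		p->numa_preferred_nid = nid;	/* modify scheduling properties */
		if (scope->running)
			resched = true;		/* queueing state is visible via scope */
	}
	/*
	 * sched_change_end() re-enqueues p (if it was queued) and restores it
	 * as the current task (if it was running).
	 */
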
diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c
index 77ae87f36e841296c13266910c34a1d9a79659de..09ffe91410b1b49108171af311d3cd3386dcffb2 100644
--- a/kernel/sched/syscalls.c
+++ b/kernel/sched/syscalls.c
@@ -64,7 +64,6 @@ static int effective_prio(struct task_struct *p)
 
 void set_user_nice(struct task_struct *p, long nice)
 {
-       bool queued, running;
        struct rq *rq;
        int old_prio;
 
@@ -90,22 +89,12 @@ void set_user_nice(struct task_struct *p, long nice)
                return;
        }
 
-       queued = task_on_rq_queued(p);
-       running = task_current_donor(rq, p);
-       if (queued)
-               dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
-       if (running)
-               put_prev_task(rq, p);
-
-       p->static_prio = NICE_TO_PRIO(nice);
-       set_load_weight(p, true);
-       old_prio = p->prio;
-       p->prio = effective_prio(p);
-
-       if (queued)
-               enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
-       if (running)
-               set_next_task(rq, p);
+       scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK) {
+               p->static_prio = NICE_TO_PRIO(nice);
+               set_load_weight(p, true);
+               old_prio = p->prio;
+               p->prio = effective_prio(p);
+       }
 
        /*
         * If the task increased its priority or is running and
@@ -515,7 +504,7 @@ int __sched_setscheduler(struct task_struct *p,
                         bool user, bool pi)
 {
        int oldpolicy = -1, policy = attr->sched_policy;
-       int retval, oldprio, newprio, queued, running;
+       int retval, oldprio, newprio;
        const struct sched_class *prev_class, *next_class;
        struct balance_callback *head;
        struct rq_flags rf;
@@ -698,33 +687,25 @@ change:
        if (prev_class != next_class && p->se.sched_delayed)
                dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
 
-       queued = task_on_rq_queued(p);
-       running = task_current_donor(rq, p);
-       if (queued)
-               dequeue_task(rq, p, queue_flags);
-       if (running)
-               put_prev_task(rq, p);
-
-       if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) {
-               __setscheduler_params(p, attr);
-               p->sched_class = next_class;
-               p->prio = newprio;
-       }
-       __setscheduler_uclamp(p, attr);
-       check_class_changing(rq, p, prev_class);
+       scoped_guard (sched_change, p, queue_flags) {
 
-       if (queued) {
-               /*
-                * We enqueue to tail when the priority of a task is
-                * increased (user space view).
-                */
-               if (oldprio < p->prio)
-                       queue_flags |= ENQUEUE_HEAD;
+               if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) {
+                       __setscheduler_params(p, attr);
+                       p->sched_class = next_class;
+                       p->prio = newprio;
+               }
+               __setscheduler_uclamp(p, attr);
+               check_class_changing(rq, p, prev_class);
 
-               enqueue_task(rq, p, queue_flags);
+               if (scope->queued) {
+                       /*
+                        * We enqueue to tail when the priority of a task is
+                        * increased (user space view).
+                        */
+                       if (oldprio < p->prio)
+                               scope->flags |= ENQUEUE_HEAD;
+               }
        }
-       if (running)
-               set_next_task(rq, p);
 
        check_class_changed(rq, p, prev_class, oldprio);