sched/core: Rework sched_class::wakeup_preempt() and rq_modified_*()

author Peter Zijlstra <peterz@infradead.org>
Wed, 10 Dec 2025 08:06:50 +0000 (09:06 +0100)
committer Peter Zijlstra <peterz@infradead.org>
Wed, 17 Dec 2025 09:53:25 +0000 (10:53 +0100)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c

index 4479f7d1fdfb60742705ba4001debeca1e7a09de..7d0a862a8c75ca03b483c0edacb980b013d32d39 100644 (file)
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2090,7 +2090,6 @@ void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
          */
         uclamp_rq_inc(rq, p, flags);
  
-       rq->queue_mask |= p->sched_class->queue_mask;
         p->sched_class->enqueue_task(rq, p, flags);
  
         psi_enqueue(p, flags);
@@ -2123,7 +2122,6 @@ inline bool dequeue_task(struct rq *rq, struct task_struct *p, int flags)
          * and mark the task ->sched_delayed.
          */
         uclamp_rq_dec(rq, p);
-       rq->queue_mask |= p->sched_class->queue_mask;
         return p->sched_class->dequeue_task(rq, p, flags);
  }
  
@@ -2174,10 +2172,14 @@ void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags)
  {
         struct task_struct *donor = rq->donor;
  
-       if (p->sched_class == donor->sched_class)
-               donor->sched_class->wakeup_preempt(rq, p, flags);
-       else if (sched_class_above(p->sched_class, donor->sched_class))
+       if (p->sched_class == rq->next_class) {
+               rq->next_class->wakeup_preempt(rq, p, flags);
+
+       } else if (sched_class_above(p->sched_class, rq->next_class)) {
+               rq->next_class->wakeup_preempt(rq, p, flags);
                 resched_curr(rq);
+               rq->next_class = p->sched_class;
+       }
  
         /*
          * A queue event has occurred, and we're going to schedule.  In
@@ -6804,6 +6806,7 @@ static void __sched notrace __schedule(int sched_mode)
  pick_again:
         next = pick_next_task(rq, rq->donor, &rf);
         rq_set_donor(rq, next);
+       rq->next_class = next->sched_class;
         if (unlikely(task_is_blocked(next))) {
                 next = find_proxy_task(rq, next, &rf);
                 if (!next)
@@ -8650,6 +8653,8 @@ void __init sched_init(void)
                 rq->rt.rt_runtime = global_rt_runtime();
                 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
  #endif
+               rq->next_class = &idle_sched_class;
+
                 rq->sd = NULL;
                 rq->rd = NULL;
                 rq->cpu_capacity = SCHED_CAPACITY_SCALE;
@@ -10775,10 +10780,8 @@ struct sched_change_ctx *sched_change_begin(struct task_struct *p, unsigned int
                 flags |= DEQUEUE_NOCLOCK;
         }
  
-       if (flags & DEQUEUE_CLASS) {
-               if (p->sched_class->switching_from)
-                       p->sched_class->switching_from(rq, p);
-       }
+       if ((flags & DEQUEUE_CLASS) && p->sched_class->switching_from)
+               p->sched_class->switching_from(rq, p);
  
         *ctx = (struct sched_change_ctx){
                 .p = p,
@@ -10830,6 +10833,17 @@ void sched_change_end(struct sched_change_ctx *ctx)
                 if (p->sched_class->switched_to)
                         p->sched_class->switched_to(rq, p);
  
+               /*
+                * If this was a class promotion; let the old class know it
+                * got preempted. Note that none of the switch*_from() methods
+                * know the new class and none of the switch*_to() methods
+                * know the old class.
+                */
+               if (ctx->running && sched_class_above(p->sched_class, ctx->class)) {
+                       rq->next_class->wakeup_preempt(rq, p, 0);
+                       rq->next_class = p->sched_class;
+               }
+
                 /*
                  * If this was a degradation in class someone should have set
                  * need_resched by now.
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c

index 319439fe18702687ee9b8ea5a927ddf073763cf2..80c9559a3e30ea2e24c000e810f2bc950eec862b 100644 (file)
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -2499,9 +2499,16 @@ static int balance_dl(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
   * Only called when both the current and waking task are -deadline
   * tasks.
   */
-static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p,
-                                 int flags)
+static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p, int flags)
  {
+       /*
+        * Can only get preempted by stop-class, and those should be
+        * few and short lived, doesn't really make sense to push
+        * anything away for that.
+        */
+       if (p->sched_class != &dl_sched_class)
+               return;
+
         if (dl_entity_preempt(&p->dl, &rq->donor->dl)) {
                 resched_curr(rq);
                 return;
@@ -3346,9 +3353,6 @@ static int task_is_throttled_dl(struct task_struct *p, int cpu)
  #endif
  
  DEFINE_SCHED_CLASS(dl) = {
-
-       .queue_mask             = 8,
-
         .enqueue_task           = enqueue_task_dl,
         .dequeue_task           = dequeue_task_dl,
         .yield_task             = yield_task_dl,
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c

index 05f5a49e9649a98e876da02f98d503e0376b262c..3b32e641b7ee2a3d1b06df6e7087742522ffbe59 100644 (file)
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -2431,7 +2431,7 @@ do_pick_task_scx(struct rq *rq, struct rq_flags *rf, bool force_scx)
         /* see kick_cpus_irq_workfn() */
         smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1);
  
-       rq_modified_clear(rq);
+       rq->next_class = &ext_sched_class;
  
         rq_unpin_lock(rq, rf);
         balance_one(rq, prev);
@@ -2446,7 +2446,7 @@ do_pick_task_scx(struct rq *rq, struct rq_flags *rf, bool force_scx)
          * If @force_scx is true, always try to pick a SCHED_EXT task,
          * regardless of any higher-priority sched classes activity.
          */
-       if (!force_scx && rq_modified_above(rq, &ext_sched_class))
+       if (!force_scx && sched_class_above(rq->next_class, &ext_sched_class))
                 return RETRY_TASK;
  
         keep_prev = rq->scx.flags & SCX_RQ_BAL_KEEP;
@@ -3075,7 +3075,8 @@ static void switched_from_scx(struct rq *rq, struct task_struct *p)
         scx_disable_task(p);
  }
  
-static void wakeup_preempt_scx(struct rq *rq, struct task_struct *p,int wake_flags) {}
+static void wakeup_preempt_scx(struct rq *rq, struct task_struct *p, int wake_flags) {}
+
  static void switched_to_scx(struct rq *rq, struct task_struct *p) {}
  
  int scx_check_setscheduler(struct task_struct *p, int policy)
@@ -3336,8 +3337,6 @@ static void scx_cgroup_unlock(void) {}
   *   their current sched_class. Call them directly from sched core instead.
   */
  DEFINE_SCHED_CLASS(ext) = {
-       .queue_mask             = 1,
-
         .enqueue_task           = enqueue_task_scx,
         .dequeue_task           = dequeue_task_scx,
         .yield_task             = yield_task_scx,
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c

index d588eb871657f1b6ed3557279b2f6336def91d19..76f5e4b78b3069909169ac683cd8c181944ec003 100644 (file)
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8736,7 +8736,7 @@ preempt_sync(struct rq *rq, int wake_flags,
  /*
   * Preempt the current task with a newly woken task if needed:
   */
-static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int wake_flags)
+static void wakeup_preempt_fair(struct rq *rq, struct task_struct *p, int wake_flags)
  {
         enum preempt_wakeup_action preempt_action = PREEMPT_WAKEUP_PICK;
         struct task_struct *donor = rq->donor;
@@ -8744,6 +8744,12 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int
         struct cfs_rq *cfs_rq = task_cfs_rq(donor);
         int cse_is_idle, pse_is_idle;
  
+       /*
+        * XXX Getting preempted by higher class, try and find idle CPU?
+        */
+       if (p->sched_class != &fair_sched_class)
+               return;
+
         if (unlikely(se == pse))
                 return;
  
@@ -12911,7 +12917,7 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
         t0 = sched_clock_cpu(this_cpu);
         __sched_balance_update_blocked_averages(this_rq);
  
-       rq_modified_clear(this_rq);
+       this_rq->next_class = &fair_sched_class;
         raw_spin_rq_unlock(this_rq);
  
         for_each_domain(this_cpu, sd) {
@@ -12978,7 +12984,7 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
                 pulled_task = 1;
  
         /* If a higher prio class was modified, restart the pick */
-       if (rq_modified_above(this_rq, &fair_sched_class))
+       if (sched_class_above(this_rq->next_class, &fair_sched_class))
                 pulled_task = -1;
  
  out:
@@ -13882,15 +13888,12 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task
   * All the scheduling class methods:
   */
  DEFINE_SCHED_CLASS(fair) = {
-
-       .queue_mask             = 2,
-
         .enqueue_task           = enqueue_task_fair,
         .dequeue_task           = dequeue_task_fair,
         .yield_task             = yield_task_fair,
         .yield_to_task          = yield_to_task_fair,
  
-       .wakeup_preempt         = check_preempt_wakeup_fair,
+       .wakeup_preempt         = wakeup_preempt_fair,
  
         .pick_task              = pick_task_fair,
         .pick_next_task         = pick_next_task_fair,
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c

index c174afe1dd177a22535417be0de1fc1b690c0368..65eb8f8c1a5d3a05dcdc2945c57fd58420697f68 100644 (file)
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -536,9 +536,6 @@ static void update_curr_idle(struct rq *rq)
   * Simple, special scheduling class for the per-CPU idle tasks:
   */
  DEFINE_SCHED_CLASS(idle) = {
-
-       .queue_mask             = 0,
-
         /* no enqueue/yield_task for idle tasks */
  
         /* dequeue is not valid, we print a debug message there: */
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c

index f1867fe8e5c5353167b8cfa29cf0650956fd8a75..0a9b2cd6da7208d7837e05a0c8d6d29e5653cc29 100644 (file)
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1615,6 +1615,12 @@ static void wakeup_preempt_rt(struct rq *rq, struct task_struct *p, int flags)
  {
         struct task_struct *donor = rq->donor;
  
+       /*
+        * XXX If we're preempted by DL, queue a push?
+        */
+       if (p->sched_class != &rt_sched_class)
+               return;
+
         if (p->prio < donor->prio) {
                 resched_curr(rq);
                 return;
@@ -2568,9 +2574,6 @@ static int task_is_throttled_rt(struct task_struct *p, int cpu)
  #endif /* CONFIG_SCHED_CORE */
  
  DEFINE_SCHED_CLASS(rt) = {
-
-       .queue_mask             = 4,
-
         .enqueue_task           = enqueue_task_rt,
         .dequeue_task           = dequeue_task_rt,
         .yield_task             = yield_task_rt,
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h

index ab1bfa05e8941c3e58d7114cb7d91c0310360c1a..3ceaa9dc9a9e8a53aaf1ac4fa3f0fb6d55a99518 100644 (file)
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1118,8 +1118,6 @@ struct rq {
         /* runqueue lock: */
         raw_spinlock_t          __lock;
  
-       /* Per class runqueue modification mask; bits in class order. */
-       unsigned int            queue_mask;
         unsigned int            nr_running;
  #ifdef CONFIG_NUMA_BALANCING
         unsigned int            nr_numa_running;
@@ -1179,6 +1177,7 @@ struct rq {
         struct sched_dl_entity  *dl_server;
         struct task_struct      *idle;
         struct task_struct      *stop;
+       const struct sched_class *next_class;
         unsigned long           next_balance;
         struct mm_struct        *prev_mm;
  
@@ -2426,15 +2425,6 @@ struct sched_class {
  #ifdef CONFIG_UCLAMP_TASK
         int uclamp_enabled;
  #endif
-       /*
-        * idle:  0
-        * ext:   1
-        * fair:  2
-        * rt:    4
-        * dl:    8
-        * stop: 16
-        */
-       unsigned int queue_mask;
  
         /*
          * move_queued_task/activate_task/enqueue_task: rq->lock
@@ -2593,20 +2583,6 @@ struct sched_class {
  #endif
  };
  
-/*
- * Does not nest; only used around sched_class::pick_task() rq-lock-breaks.
- */
-static inline void rq_modified_clear(struct rq *rq)
-{
-       rq->queue_mask = 0;
-}
-
-static inline bool rq_modified_above(struct rq *rq, const struct sched_class * class)
-{
-       unsigned int mask = class->queue_mask;
-       return rq->queue_mask & ~((mask << 1) - 1);
-}
-
  static inline void put_prev_task(struct rq *rq, struct task_struct *prev)
  {
         WARN_ON_ONCE(rq->donor != prev);
@@ -3899,6 +3875,7 @@ void move_queued_task_locked(struct rq *src_rq, struct rq *dst_rq, struct task_s
         deactivate_task(src_rq, task, 0);
         set_task_cpu(task, dst_rq->cpu);
         activate_task(dst_rq, task, 0);
+       wakeup_preempt(dst_rq, task, 0);
  }
  
  static inline
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c

index 4f9192be4b5b0a2a81072c4349c25023a736be1d..f95798baddebbd9b2b721b3f828a69da3ea615f6 100644 (file)
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -97,9 +97,6 @@ static void update_curr_stop(struct rq *rq)
   * Simple, special scheduling class for the per-CPU stop tasks:
   */
  DEFINE_SCHED_CLASS(stop) = {
-
-       .queue_mask             = 16,
-
         .enqueue_task           = enqueue_task_stop,
         .dequeue_task           = dequeue_task_stop,
         .yield_task             = yield_task_stop,
author	Peter Zijlstra <peterz@infradead.org>
	Wed, 10 Dec 2025 08:06:50 +0000 (09:06 +0100)
committer	Peter Zijlstra <peterz@infradead.org>
	Wed, 17 Dec 2025 09:53:25 +0000 (10:53 +0100)
kernel/sched/core.c		patch | blob | blame | history
kernel/sched/deadline.c		patch | blob | blame | history
kernel/sched/ext.c		patch | blob | blame | history
kernel/sched/fair.c		patch | blob | blame | history
kernel/sched/idle.c		patch | blob | blame | history
kernel/sched/rt.c		patch | blob | blame | history
kernel/sched/sched.h		patch | blob | blame | history
kernel/sched/stop_task.c		patch | blob | blame | history