ttwu_queue(p, cpu, wake_flags);
}
out:
- /*
- * For now, if we've been woken up, clear the task->blocked_on
- * regardless if it was set to a mutex or PROXY_WAKING so the
- * task can run. We will need to be more careful later when
- * properly handling proxy migration
- */
- clear_task_blocked_on(p, NULL);
if (success)
ttwu_stat(p, task_cpu(p), wake_flags);
if (signal_pending_state(task_state, p)) {
WRITE_ONCE(p->__state, TASK_RUNNING);
*task_state_p = TASK_RUNNING;
+ set_task_blocked_on_waking(p, NULL);
+
return false;
}
}
#ifdef CONFIG_SCHED_PROXY_EXEC
+/* Like __set_task_cpu(), but preserves p->wake_cpu across the move. */
+static inline void proxy_set_task_cpu(struct task_struct *p, int cpu)
+{
+ unsigned int wake_cpu;
+
+ /*
+ * Since we are enqueuing a blocked task on a cpu it may
+ * not be able to run on, preserve wake_cpu when we
+ * __set_task_cpu so we can return the task to where it
+ * was previously runnable.
+ */
+ wake_cpu = p->wake_cpu;
+ __set_task_cpu(p, cpu);
+ p->wake_cpu = wake_cpu;
+}
+
+/* Hand the rq over to idle (dropping rq->donor references) and return idle. */
static inline struct task_struct *proxy_resched_idle(struct rq *rq)
{
put_prev_set_next_task(rq, rq->donor, rq->idle);
return rq->idle;
}
-static bool __proxy_deactivate(struct rq *rq, struct task_struct *donor)
+/* Try to block (deactivate) @donor; returns true on success. */
+static bool proxy_deactivate(struct rq *rq, struct task_struct *donor)
{
unsigned long state = READ_ONCE(donor->__state);
return try_to_block_task(rq, donor, &state, true);
}
-static struct task_struct *proxy_deactivate(struct rq *rq, struct task_struct *donor)
+static inline void proxy_release_rq_lock(struct rq *rq, struct rq_flags *rf)
+ __releases(__rq_lockp(rq))
+{
+ /*
+ * The class scheduler may have queued a balance callback
+ * from pick_next_task() called earlier.
+ *
+ * So here we have to zap callbacks before unlocking the rq
+ * as another CPU may jump in and call sched_balance_rq
+ * which can trip the warning in rq_pin_lock() if we
+ * leave callbacks set.
+ *
+ * After we later reaquire the rq lock, we will force __schedule()
+ * to pick_again, so the callbacks will get re-established.
+ */
+ zap_balance_callbacks(rq);
+ rq_unpin_lock(rq, rf);
+ raw_spin_rq_unlock(rq);
+}
+
+/* Retake and re-pin the rq lock, then refresh the rq clock. */
+static inline void proxy_reacquire_rq_lock(struct rq *rq, struct rq_flags *rf)
+ __acquires(__rq_lockp(rq))
+{
+ raw_spin_rq_lock(rq);
+ rq_repin_lock(rq, rf);
+ update_rq_clock(rq);
+}
+
+/*
+ * If the blocked-on relationship crosses CPUs, migrate @p to the
+ * owner's CPU.
+ *
+ * This is because we must respect the CPU affinity of execution
+ * contexts (owner) but we can ignore affinity for scheduling
+ * contexts (@p). So we have to move scheduling contexts towards
+ * potential execution contexts.
+ *
+ * Note: The owner can disappear, but simply migrate to @target_cpu
+ * and leave that CPU to sort things out.
+ */
+static void proxy_migrate_task(struct rq *rq, struct rq_flags *rf,
+ struct task_struct *p, int target_cpu)
+ __must_hold(__rq_lockp(rq))
+{
+ struct rq *target_rq = cpu_rq(target_cpu);
+
+ lockdep_assert_rq_held(rq);
+ WARN_ON(p == rq->curr);
+ /*
+ * Since we are migrating a blocked donor, it could be rq->donor,
+ * and we want to make sure there aren't any references from this
+ * rq to it before we drop the lock. This avoids another cpu
+ * jumping in and grabbing the rq lock and referencing rq->donor
+ * or cfs_rq->curr, etc after we have migrated it to another cpu,
+ * and before we pick_again in __schedule.
+ *
+ * So call proxy_resched_idle() to drop the rq->donor references
+ * before we release the lock.
+ */
+ proxy_resched_idle(rq);
+
+ /* Dequeue here; attach_one_task() re-enqueues @p on @target_rq below. */
+ deactivate_task(rq, p, DEQUEUE_NOCLOCK);
+ proxy_set_task_cpu(p, target_cpu);
+
+ proxy_release_rq_lock(rq, rf);
+
+ attach_one_task(target_rq, p);
+
+ proxy_reacquire_rq_lock(rq, rf);
+}
+
+static void proxy_force_return(struct rq *rq, struct rq_flags *rf,
+ struct task_struct *p)
+ __must_hold(__rq_lockp(rq))
{
- if (!__proxy_deactivate(rq, donor)) {
+ struct rq *task_rq, *target_rq = NULL;
+ int cpu, wake_flag = WF_TTWU;
+
+ lockdep_assert_rq_held(rq);
+ WARN_ON(p == rq->curr);
+
+ if (p == rq->donor)
+ proxy_resched_idle(rq);
+
+ proxy_release_rq_lock(rq, rf);
+ /*
+ * We drop the rq lock, and re-grab task_rq_lock to get
+ * the pi_lock (needed for select_task_rq) as well.
+ */
+ scoped_guard (task_rq_lock, p) {
+ task_rq = scope.rq;
+
/*
- * XXX: For now, if deactivation failed, set donor
- * as unblocked, as we aren't doing proxy-migrations
- * yet (more logic will be needed then).
+ * Since we let go of the rq lock, the task may have been
+ * woken or migrated to another rq before we got the
+ * task_rq_lock. So re-check we're on the same RQ. If
+ * not, the task has already been migrated and that CPU
+ * will handle any further migrations.
*/
- clear_task_blocked_on(donor, NULL);
+ if (task_rq != rq)
+ break;
+
+ /*
+ * Similarly, if we've been dequeued, someone else will
+ * wake us.
+ */
+ if (!task_on_rq_queued(p))
+ break;
+
+ /*
+ * Since we should only be calling here from __schedule()
+ * -> find_proxy_task(), no one else should have
+ * assigned current out from under us. But check and warn
+ * if we see this, then bail.
+ */
+ if (task_current(task_rq, p) || task_on_cpu(task_rq, p)) {
+ WARN_ONCE(1, "%s rq: %i current/on_cpu task %s %d on_cpu: %i\n",
+ __func__, cpu_of(task_rq),
+ p->comm, p->pid, p->on_cpu);
+ break;
+ }
+
+ update_rq_clock(task_rq);
+ deactivate_task(task_rq, p, DEQUEUE_NOCLOCK);
+ cpu = select_task_rq(p, p->wake_cpu, &wake_flag);
+ set_task_cpu(p, cpu);
+ target_rq = cpu_rq(cpu);
+ clear_task_blocked_on(p, NULL);
}
- return NULL;
+
+ if (target_rq)
+ attach_one_task(target_rq, p);
+
+ proxy_reacquire_rq_lock(rq, rf);
}
/*
*/
static struct task_struct *
find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf)
+ __must_hold(__rq_lockp(rq))
{
- enum { FOUND, DEACTIVATE_DONOR } action = FOUND;
struct task_struct *owner = NULL;
+ bool curr_in_chain = false;
int this_cpu = cpu_of(rq);
struct task_struct *p;
struct mutex *mutex;
+ int owner_cpu;
/* Follow blocked_on chain. */
for (p = donor; (mutex = p->blocked_on); p = owner) {
- /* if its PROXY_WAKING, resched_idle so ttwu can complete */
- if (mutex == PROXY_WAKING)
- return proxy_resched_idle(rq);
+ /* if it's PROXY_WAKING, do return migration or run if current */
+ if (mutex == PROXY_WAKING) {
+ if (task_current(rq, p)) {
+ clear_task_blocked_on(p, PROXY_WAKING);
+ return p;
+ }
+ goto force_return;
+ }
/*
* By taking mutex->wait_lock we hold off concurrent mutex_unlock()
return NULL;
}
+ if (task_current(rq, p))
+ curr_in_chain = true;
+
owner = __mutex_owner(mutex);
if (!owner) {
/*
- * If there is no owner, clear blocked_on
- * and return p so it can run and try to
- * acquire the lock
+ * If there is no owner, either clear blocked_on
+ * and return p (if it is current and safe to
+ * just run on this rq), or return-migrate the task.
*/
- __clear_task_blocked_on(p, mutex);
- return p;
+ if (task_current(rq, p)) {
+ __clear_task_blocked_on(p, NULL);
+ return p;
+ }
+ goto force_return;
}
if (!READ_ONCE(owner->on_rq) || owner->se.sched_delayed) {
/* XXX Don't handle blocked owners/delayed dequeue yet */
- action = DEACTIVATE_DONOR;
- break;
+ if (curr_in_chain)
+ return proxy_resched_idle(rq);
+ goto deactivate;
}
- if (task_cpu(owner) != this_cpu) {
- /* XXX Don't handle migrations yet */
- action = DEACTIVATE_DONOR;
- break;
+ owner_cpu = task_cpu(owner);
+ if (owner_cpu != this_cpu) {
+ /*
+ * @owner can disappear, simply migrate to @owner_cpu
+ * and leave that CPU to sort things out.
+ */
+ if (curr_in_chain)
+ return proxy_resched_idle(rq);
+ goto migrate_task;
}
if (task_on_rq_migrating(owner)) {
* guarantee its existence, as per ttwu_remote().
*/
}
-
- /* Handle actions we need to do outside of the guard() scope */
- switch (action) {
- case DEACTIVATE_DONOR:
- return proxy_deactivate(rq, donor);
- case FOUND:
- /* fallthrough */;
- }
WARN_ON_ONCE(owner && !owner->on_rq);
return owner;
+
+deactivate:
+ if (proxy_deactivate(rq, donor))
+ return NULL;
+ /* If deactivate fails, force return */
+ p = donor;
+force_return:
+ proxy_force_return(rq, rf, p);
+ return NULL;
+migrate_task:
+ proxy_migrate_task(rq, rf, p, owner_cpu);
+ return NULL;
}
#else /* SCHED_PROXY_EXEC */
static struct task_struct *