#include "../workqueue_internal.h"
#include "../../io_uring/io-wq.h"
#include "../smpboot.h"
+#include "../locking/mutex.h"
EXPORT_TRACEPOINT_SYMBOL_GPL(ipi_send_cpu);
EXPORT_TRACEPOINT_SYMBOL_GPL(ipi_send_cpumask);
struct set_affinity_pending my_pending = { }, *pending = NULL;
bool stop_pending, complete = false;
- /* Can the task run on the task's current CPU? If so, we're done */
- if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) {
+ /*
+ * Can the task run on the task's current CPU? If so, we're done
+ *
+ * We are also done if the task is the current donor boosting a
+ * lock-holding proxy (and so may have been migrated outside its
+ * current or previous affinity mask).
+ */
+ if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask) ||
+ (task_current_donor(rq, p) && !task_current(rq, p))) {
struct task_struct *push_task = NULL;
if ((flags & SCA_MIGRATE_ENABLE) &&
}
#ifdef CONFIG_SCHED_PROXY_EXEC
-static inline void proxy_resched_idle(struct rq *rq)
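+/*
+ * Switch @rq over to running idle: put the current donor, make rq->idle
+ * the next task and donor, and flag it for resched. Returns rq->idle so
+ * callers can hand it back as this cycle's pick.
+ */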
+static inline struct task_struct *proxy_resched_idle(struct rq *rq)
{
put_prev_set_next_task(rq, rq->donor, rq->idle);
rq_set_donor(rq, rq->idle);
set_tsk_need_resched(rq->idle);
+ return rq->idle;
}
static bool __proxy_deactivate(struct rq *rq, struct task_struct *donor)
}
/*
- * Initial simple sketch that just deactivates the blocked task
- * chosen by pick_next_task() so we can then pick something that
- * isn't blocked.
+ * Find runnable lock owner to proxy for mutex blocked donor
+ *
+ * Follow the blocked-on relation:
+ *   task->blocked_on -> mutex->owner -> task...
+ *
+ * Lock order:
+ *
+ *   p->pi_lock
+ *     rq->lock
+ *       mutex->wait_lock
+ *
+ * Returns the task that is going to be used as execution context (the one
+ * that is actually going to be run on cpu_of(rq)).
*/
static struct task_struct *
find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf)
{
+ struct task_struct *owner = NULL;
+ int this_cpu = cpu_of(rq);
+ struct task_struct *p;
struct mutex *mutex;
- mutex = donor->blocked_on;
- /* Something changed in the chain, so pick again */
- if (!mutex)
- return NULL;
- /*
- * By taking mutex->wait_lock we hold off concurrent mutex_unlock()
- * and ensure @owner sticks around.
- */
- guard(raw_spinlock)(&mutex->wait_lock);
+ /* Follow blocked_on chain. */
+ for (p = donor; task_is_blocked(p); p = owner) {
+ mutex = p->blocked_on;
+ /* Something changed in the chain, so pick again */
+ if (!mutex)
+ return NULL;
+ /*
+ * By taking mutex->wait_lock we hold off concurrent mutex_unlock()
+ * and ensure @owner sticks around.
+ */
+ guard(raw_spinlock)(&mutex->wait_lock);
- /* Check again that donor is blocked with blocked_lock held */
- if (!task_is_blocked(donor) || mutex != __get_task_blocked_on(donor)) {
+ /* Check again that p is blocked with wait_lock held */
+ if (mutex != __get_task_blocked_on(p)) {
+ /*
+ * Something changed in the blocked_on chain and
+ * we don't know if only at this level. So, let's
+ * just bail out completely and let __schedule()
+ * figure things out (pick_again loop).
+ */
+ return NULL;
+ }
+
+ owner = __mutex_owner(mutex);
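+ /*
+ * If the mutex has no owner it was just released; clear the
+ * blocked_on relation and pick p so it can retry acquiring the lock.
+ */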
+ if (!owner) {
+ __clear_task_blocked_on(p, mutex);
+ return p;
+ }
+
+ if (!READ_ONCE(owner->on_rq) || owner->se.sched_delayed) {
+ /* XXX Don't handle blocked owners/delayed dequeue yet */
+ return proxy_deactivate(rq, donor);
+ }
+
+ if (task_cpu(owner) != this_cpu) {
+ /* XXX Don't handle migrations yet */
+ return proxy_deactivate(rq, donor);
+ }
+
+ if (task_on_rq_migrating(owner)) {
+ /*
+ * One of the mutex owners in the blocked_on chain is currently
+ * migrating to this CPU, but has not yet been enqueued because
+ * we are holding the rq lock. As a simple solution, just schedule
+ * rq->idle to give the migration a chance to complete. Much like
+ * the migrate_task case, we should end up back in find_proxy_task(),
+ * hopefully this time with all relevant tasks already enqueued.
+ */
+ return proxy_resched_idle(rq);
+ }
+
+ /*
+ * It's possible to race such that, after we check owner->on_rq
+ * but before we check (task_cpu(owner) != this_cpu), the task
+ * on another cpu is migrated back to this cpu. In that case it
+ * could slip by our checks. So double check that the owner is
+ * still queued on this cpu and not migrating. If we get
+ * inconsistent results, try again.
+ */
+ if (!task_on_rq_queued(owner) || task_cpu(owner) != this_cpu)
+ return NULL;
+
+ if (owner == p) {
+ /*
+ * It's possible we interleave with mutex_unlock like:
+ *
+ *                              lock(&rq->lock);
+ *                                find_proxy_task()
+ * mutex_unlock()
+ *   lock(&wait_lock);
+ *   donor(owner) = current->blocked_donor;
+ *   unlock(&wait_lock);
+ *
+ *   wake_up_q();
+ *     ...
+ *       ttwu_runnable()
+ *         __task_rq_lock()
+ *                              lock(&wait_lock);
+ *                              owner == p
+ *
+ * Which leaves us to finish the ttwu_runnable() and make it go.
+ *
+ * So schedule rq->idle so that ttwu_runnable() can get the rq
+ * lock and mark owner as running.
+ */
+ return proxy_resched_idle(rq);
+ }
/*
- * Something changed in the blocked_on chain and
- * we don't know if only at this level. So, let's
- * just bail out completely and let __schedule()
- * figure things out (pick_again loop).
+ * OK, now we're absolutely sure @owner is on this
+ * rq, therefore holding @rq->lock is sufficient to
+ * guarantee its existence, as per ttwu_remote().
*/
- return NULL; /* do pick_next_task() again */
}
- return proxy_deactivate(rq, donor);
+
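+ /*
+ * When the loop ends, @owner (if set) is not blocked and is queued
+ * on this rq, so return it as the execution context to run while
+ * @donor stays blocked.
+ */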
+ WARN_ON_ONCE(owner && !owner->on_rq);
+ return owner;
}
#else /* SCHED_PROXY_EXEC */
static struct task_struct *
next = find_proxy_task(rq, next, &rf);
if (!next)
goto pick_again;
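+ /*
+ * find_proxy_task() handed back rq->idle as a stand-in pick; keep
+ * the resched state set so we come back through __schedule() and
+ * pick again as soon as possible.
+ */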
+ if (next == rq->idle)
+ goto keep_resched;
}
picked:
clear_tsk_need_resched(prev);
clear_preempt_need_resched();
+keep_resched:
rq->last_seen_need_resched_ns = 0;
is_switch = prev != next;
* 2) throttled_lb_pair, or
* 3) cannot be migrated to this CPU due to cpus_ptr, or
* 4) running (obviously), or
- * 5) are cache-hot on their current CPU.
+ * 5) are cache-hot on their current CPU, or
+ * 6) are blocked on mutexes (if SCHED_PROXY_EXEC is enabled).
*/
if ((p->se.sched_delayed) && (env->migration_type != migrate_load))
return 0;
if (kthread_is_per_cpu(p))
return 0;
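+ /*
+ * With proxy execution, mutex-blocked tasks remain on the runqueue
+ * as potential donors; don't let the load balancer migrate them.
+ */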
+ if (task_is_blocked(p))
+ return 0;
+
if (!cpumask_test_cpu(env->dst_cpu, p->cpus_ptr)) {
int cpu;
/* Record that we found at least one task that could run on dst_cpu */
env->flags &= ~LBF_ALL_PINNED;
- if (task_on_cpu(env->src_rq, p)) {
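+ /*
+ * Also skip the current donor: its scheduler context is in use
+ * by the proxy running on its behalf.
+ */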
+ if (task_on_cpu(env->src_rq, p) ||
+ task_current_donor(env->src_rq, p)) {
schedstat_inc(p->stats.nr_failed_migrations_running);
return 0;
}
schedstat_inc(p->stats.nr_forced_migrations);
}
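+ /* Never detach the running task or the current donor. */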
+ WARN_ON(task_current(env->src_rq, p));
+ WARN_ON(task_current_donor(env->src_rq, p));
+
deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK);
set_task_cpu(p, env->dst_cpu);
}