git.ipfire.org Git - thirdparty/linux.git/commitdiff
sched_ext: Make watchdog sub-sched aware
author Tejun Heo <tj@kernel.org>
Fri, 6 Mar 2026 17:58:04 +0000 (07:58 -1000)
committer Tejun Heo <tj@kernel.org>
Fri, 6 Mar 2026 17:58:04 +0000 (07:58 -1000)
Currently, the watchdog checks all tasks as if they are all on scx_root.
Move scx_watchdog_timeout inside scx_sched and make check_rq_for_timeouts()
use the timeout from the scx_sched associated with each task.
refresh_watchdog() is added, which determines the timer interval as half of
the shortest watchdog timeout among all scheds and arms or disarms the timer
as necessary. Every scx_sched instance has equivalent or better detection
latency while sharing the same timer.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
kernel/sched/ext.c
kernel/sched/ext_internal.h

index 43569d138bd23211346b29f28069d58046e574ec..1452c20b6483e2a76b3fd74962379e96459328d1 100644 (file)
@@ -59,11 +59,10 @@ static atomic_long_t scx_hotplug_seq = ATOMIC_LONG_INIT(0);
 static atomic_long_t scx_enable_seq = ATOMIC_LONG_INIT(0);
 
 /*
- * The maximum amount of time in jiffies that a task may be runnable without
- * being scheduled on a CPU. If this timeout is exceeded, it will trigger
- * scx_error().
+ * Watchdog interval. All scx_sched's share a single watchdog timer and the
+ * interval is half of the shortest sch->watchdog_timeout.
  */
-static unsigned long scx_watchdog_timeout;
+static unsigned long scx_watchdog_interval;
 
 /*
  * The last time the delayed work was run. This delayed work relies on
@@ -3038,10 +3037,11 @@ static bool check_rq_for_timeouts(struct rq *rq)
                goto out_unlock;
 
        list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node) {
+               struct scx_sched *sch = scx_task_sched(p);
                unsigned long last_runnable = p->scx.runnable_at;
 
                if (unlikely(time_after(jiffies,
-                                       last_runnable + READ_ONCE(scx_watchdog_timeout)))) {
+                                       last_runnable + READ_ONCE(sch->watchdog_timeout)))) {
                        u32 dur_ms = jiffies_to_msecs(jiffies - last_runnable);
 
                        scx_exit(sch, SCX_EXIT_ERROR_STALL, 0,
@@ -3058,6 +3058,7 @@ out_unlock:
 
 static void scx_watchdog_workfn(struct work_struct *work)
 {
+       unsigned long intv;
        int cpu;
 
        WRITE_ONCE(scx_watchdog_timestamp, jiffies);
@@ -3068,28 +3069,31 @@ static void scx_watchdog_workfn(struct work_struct *work)
 
                cond_resched();
        }
-       queue_delayed_work(system_unbound_wq, to_delayed_work(work),
-                          READ_ONCE(scx_watchdog_timeout) / 2);
+
+       intv = READ_ONCE(scx_watchdog_interval);
+       if (intv < ULONG_MAX)
+               queue_delayed_work(system_unbound_wq, to_delayed_work(work),
+                                  intv);
 }
 
 void scx_tick(struct rq *rq)
 {
-       struct scx_sched *sch;
+       struct scx_sched *root;
        unsigned long last_check;
 
        if (!scx_enabled())
                return;
 
-       sch = rcu_dereference_bh(scx_root);
-       if (unlikely(!sch))
+       root = rcu_dereference_bh(scx_root);
+       if (unlikely(!root))
                return;
 
        last_check = READ_ONCE(scx_watchdog_timestamp);
        if (unlikely(time_after(jiffies,
-                               last_check + READ_ONCE(scx_watchdog_timeout)))) {
+                               last_check + READ_ONCE(root->watchdog_timeout)))) {
                u32 dur_ms = jiffies_to_msecs(jiffies - last_check);
 
-               scx_exit(sch, SCX_EXIT_ERROR_STALL, 0,
+               scx_exit(root, SCX_EXIT_ERROR_STALL, 0,
                         "watchdog failed to check in for %u.%03us",
                         dur_ms / 1000, dur_ms % 1000);
        }
@@ -4760,6 +4764,26 @@ static void free_kick_syncs(void)
        }
 }
 
+static void refresh_watchdog(void)
+{
+       struct scx_sched *sch;
+       unsigned long intv = ULONG_MAX;
+
+       /* take the shortest timeout and use its half for watchdog interval */
+       rcu_read_lock();
+       list_for_each_entry_rcu(sch, &scx_sched_all, all)
+               intv = max(min(intv, sch->watchdog_timeout / 2), 1);
+       rcu_read_unlock();
+
+       WRITE_ONCE(scx_watchdog_timestamp, jiffies);
+       WRITE_ONCE(scx_watchdog_interval, intv);
+
+       if (intv < ULONG_MAX)
+               mod_delayed_work(system_unbound_wq, &scx_watchdog_work, intv);
+       else
+               cancel_delayed_work_sync(&scx_watchdog_work);
+}
+
 #ifdef CONFIG_EXT_SUB_SCHED
 static DECLARE_WAIT_QUEUE_HEAD(scx_unlink_waitq);
 
@@ -4798,6 +4822,8 @@ static void scx_sub_disable(struct scx_sched *sch)
        list_del_rcu(&sch->all);
        raw_spin_unlock_irq(&scx_sched_lock);
 
+       refresh_watchdog();
+
        mutex_unlock(&scx_enable_mutex);
 
        /*
@@ -4932,12 +4958,12 @@ static void scx_root_disable(struct scx_sched *sch)
        if (sch->ops.exit)
                SCX_CALL_OP(sch, SCX_KF_UNLOCKED, exit, NULL, ei);
 
-       cancel_delayed_work_sync(&scx_watchdog_work);
-
        raw_spin_lock_irq(&scx_sched_lock);
        list_del_rcu(&sch->all);
        raw_spin_unlock_irq(&scx_sched_lock);
 
+       refresh_watchdog();
+
        /*
         * scx_root clearing must be inside cpus_read_lock(). See
         * handle_hotplug().
@@ -5473,6 +5499,11 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops,
        sch->ancestors[level] = sch;
        sch->level = level;
 
+       if (ops->timeout_ms)
+               sch->watchdog_timeout = msecs_to_jiffies(ops->timeout_ms);
+       else
+               sch->watchdog_timeout = SCX_WATCHDOG_MAX_TIMEOUT;
+
        sch->slice_dfl = SCX_SLICE_DFL;
        atomic_set(&sch->exit_kind, SCX_EXIT_NONE);
        init_irq_work(&sch->error_irq_work, scx_error_irq_workfn);
@@ -5615,7 +5646,6 @@ static void scx_root_enable_workfn(struct kthread_work *work)
        struct scx_sched *sch;
        struct scx_task_iter sti;
        struct task_struct *p;
-       unsigned long timeout;
        int i, cpu, ret;
 
        mutex_lock(&scx_enable_mutex);
@@ -5667,6 +5697,8 @@ static void scx_root_enable_workfn(struct kthread_work *work)
        list_add_tail_rcu(&sch->all, &scx_sched_all);
        raw_spin_unlock_irq(&scx_sched_lock);
 
+       refresh_watchdog();
+
        scx_idle_enable(ops);
 
        if (sch->ops.init) {
@@ -5697,16 +5729,6 @@ static void scx_root_enable_workfn(struct kthread_work *work)
        if (ret)
                goto err_disable;
 
-       if (ops->timeout_ms)
-               timeout = msecs_to_jiffies(ops->timeout_ms);
-       else
-               timeout = SCX_WATCHDOG_MAX_TIMEOUT;
-
-       WRITE_ONCE(scx_watchdog_timeout, timeout);
-       WRITE_ONCE(scx_watchdog_timestamp, jiffies);
-       queue_delayed_work(system_unbound_wq, &scx_watchdog_work,
-                          READ_ONCE(scx_watchdog_timeout) / 2);
-
        /*
         * Once __scx_enabled is set, %current can be switched to SCX anytime.
         * This can lead to stalls as some BPF schedulers (e.g. userspace
@@ -5928,6 +5950,8 @@ static void scx_sub_enable_workfn(struct kthread_work *work)
        list_add_tail_rcu(&sch->all, &scx_sched_all);
        raw_spin_unlock_irq(&scx_sched_lock);
 
+       refresh_watchdog();
+
        if (sch->level >= SCX_SUB_MAX_DEPTH) {
                scx_error(sch, "max nesting depth %d violated",
                          SCX_SUB_MAX_DEPTH);
index dba8c2ce8ed9bc4d3aa561bd597b406d476bba9b..9268df30be7f7208b842998b7d349734940a81fe 100644 (file)
@@ -1019,6 +1019,13 @@ struct scx_sched {
        bool                    sub_attached;
 #endif /* CONFIG_EXT_SUB_SCHED */
 
+       /*
+        * The maximum amount of time in jiffies that a task may be runnable
+        * without being scheduled on a CPU. If this timeout is exceeded, it
+        * will trigger scx_error().
+        */
+       unsigned long           watchdog_timeout;
+
        atomic_t                exit_kind;
        struct scx_exit_info    *exit_info;