static atomic_long_t scx_enable_seq = ATOMIC_LONG_INIT(0);
/*
- * The maximum amount of time in jiffies that a task may be runnable without
- * being scheduled on a CPU. If this timeout is exceeded, it will trigger
- * scx_error().
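+ * Watchdog interval. All scx_sched instances share a single watchdog timer
+ * and the interval is half of the shortest sch->watchdog_timeout. It is set
+ * to ULONG_MAX when no scheduler is on scx_sched_all, in which case the
+ * watchdog work is not requeued.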
*/
-static unsigned long scx_watchdog_timeout;
+static unsigned long scx_watchdog_interval;
/*
* The last time the delayed work was run. This delayed work relies on
goto out_unlock;
list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node) {
+ struct scx_sched *sch = scx_task_sched(p);
unsigned long last_runnable = p->scx.runnable_at;
if (unlikely(time_after(jiffies,
- last_runnable + READ_ONCE(scx_watchdog_timeout)))) {
+ last_runnable + READ_ONCE(sch->watchdog_timeout)))) {
u32 dur_ms = jiffies_to_msecs(jiffies - last_runnable);
scx_exit(sch, SCX_EXIT_ERROR_STALL, 0,
static void scx_watchdog_workfn(struct work_struct *work)
{
+ unsigned long intv;
int cpu;
WRITE_ONCE(scx_watchdog_timestamp, jiffies);
cond_resched();
}
- queue_delayed_work(system_unbound_wq, to_delayed_work(work),
- READ_ONCE(scx_watchdog_timeout) / 2);
+
+ intv = READ_ONCE(scx_watchdog_interval);
+ if (intv < ULONG_MAX)
+ queue_delayed_work(system_unbound_wq, to_delayed_work(work),
+ intv);
}
void scx_tick(struct rq *rq)
{
- struct scx_sched *sch;
+ struct scx_sched *root;
unsigned long last_check;
if (!scx_enabled())
return;
- sch = rcu_dereference_bh(scx_root);
- if (unlikely(!sch))
+ root = rcu_dereference_bh(scx_root);
+ if (unlikely(!root))
return;
last_check = READ_ONCE(scx_watchdog_timestamp);
if (unlikely(time_after(jiffies,
- last_check + READ_ONCE(scx_watchdog_timeout)))) {
+ last_check + READ_ONCE(root->watchdog_timeout)))) {
u32 dur_ms = jiffies_to_msecs(jiffies - last_check);
- scx_exit(sch, SCX_EXIT_ERROR_STALL, 0,
+ scx_exit(root, SCX_EXIT_ERROR_STALL, 0,
"watchdog failed to check in for %u.%03us",
dur_ms / 1000, dur_ms % 1000);
}
}
}
+static void refresh_watchdog(void)
+{
+ struct scx_sched *sch;
+ unsigned long intv = ULONG_MAX;
+
+ /*
+ * Recompute the interval of the shared watchdog from every scheduler on
+ * scx_sched_all, then re-arm or stop scx_watchdog_work to match.
+ *
+ * Take the shortest timeout and use its half for watchdog interval. The
+ * result is clamped to at least one jiffy, so a timeout shorter than two
+ * jiffies cannot produce a zero interval. intv is still ULONG_MAX after
+ * the walk only when the list is empty; in that case the work is
+ * cancelled, and scx_watchdog_workfn() also declines to requeue itself
+ * when it reads ULONG_MAX.
+ *
+ * The timestamp is reset to the current jiffies along with the interval,
+ * presumably so that scx_tick() does not judge the newly armed timer
+ * against a stale check-in time. NOTE(review): confirm.
+ */
+ rcu_read_lock();
+ list_for_each_entry_rcu(sch, &scx_sched_all, all)
+ intv = max(min(intv, sch->watchdog_timeout / 2), 1);
+ rcu_read_unlock();
+
+ WRITE_ONCE(scx_watchdog_timestamp, jiffies);
+ WRITE_ONCE(scx_watchdog_interval, intv);
+
+ if (intv < ULONG_MAX)
+ mod_delayed_work(system_unbound_wq, &scx_watchdog_work, intv);
+ else
+ cancel_delayed_work_sync(&scx_watchdog_work);
+}
+
#ifdef CONFIG_EXT_SUB_SCHED
static DECLARE_WAIT_QUEUE_HEAD(scx_unlink_waitq);
list_del_rcu(&sch->all);
raw_spin_unlock_irq(&scx_sched_lock);
+ refresh_watchdog();
+
mutex_unlock(&scx_enable_mutex);
/*
if (sch->ops.exit)
SCX_CALL_OP(sch, SCX_KF_UNLOCKED, exit, NULL, ei);
- cancel_delayed_work_sync(&scx_watchdog_work);
-
raw_spin_lock_irq(&scx_sched_lock);
list_del_rcu(&sch->all);
raw_spin_unlock_irq(&scx_sched_lock);
+ refresh_watchdog();
+
/*
* scx_root clearing must be inside cpus_read_lock(). See
* handle_hotplug().
sch->ancestors[level] = sch;
sch->level = level;
+ if (ops->timeout_ms)
+ sch->watchdog_timeout = msecs_to_jiffies(ops->timeout_ms);
+ else
+ sch->watchdog_timeout = SCX_WATCHDOG_MAX_TIMEOUT;
+
sch->slice_dfl = SCX_SLICE_DFL;
atomic_set(&sch->exit_kind, SCX_EXIT_NONE);
init_irq_work(&sch->error_irq_work, scx_error_irq_workfn);
struct scx_sched *sch;
struct scx_task_iter sti;
struct task_struct *p;
- unsigned long timeout;
int i, cpu, ret;
mutex_lock(&scx_enable_mutex);
list_add_tail_rcu(&sch->all, &scx_sched_all);
raw_spin_unlock_irq(&scx_sched_lock);
+ refresh_watchdog();
+
scx_idle_enable(ops);
if (sch->ops.init) {
if (ret)
goto err_disable;
- if (ops->timeout_ms)
- timeout = msecs_to_jiffies(ops->timeout_ms);
- else
- timeout = SCX_WATCHDOG_MAX_TIMEOUT;
-
- WRITE_ONCE(scx_watchdog_timeout, timeout);
- WRITE_ONCE(scx_watchdog_timestamp, jiffies);
- queue_delayed_work(system_unbound_wq, &scx_watchdog_work,
- READ_ONCE(scx_watchdog_timeout) / 2);
-
/*
* Once __scx_enabled is set, %current can be switched to SCX anytime.
* This can lead to stalls as some BPF schedulers (e.g. userspace
list_add_tail_rcu(&sch->all, &scx_sched_all);
raw_spin_unlock_irq(&scx_sched_lock);
+ refresh_watchdog();
+
if (sch->level >= SCX_SUB_MAX_DEPTH) {
scx_error(sch, "max nesting depth %d violated",
SCX_SUB_MAX_DEPTH);