cond_resched();
}
- queue_delayed_work(system_dfl_wq, to_delayed_work(work),
- READ_ONCE(scx_watchdog_timeout) / 2);
+
+ intv = READ_ONCE(scx_watchdog_interval);
+ if (intv < ULONG_MAX)
- queue_delayed_work(system_unbound_wq, to_delayed_work(work),
- intv);
++ queue_delayed_work(system_dfl_wq, to_delayed_work(work), intv);
}
void scx_tick(struct rq *rq)
}
}
-static void scx_disable_workfn(struct kthread_work *work)
+static void refresh_watchdog(void)
{
- struct scx_sched *sch = container_of(work, struct scx_sched, disable_work);
- struct scx_exit_info *ei = sch->exit_info;
+ struct scx_sched *sch;
+ unsigned long intv = ULONG_MAX;
+
+ /* take the shortest timeout and use its half for watchdog interval */
+ rcu_read_lock();
+ list_for_each_entry_rcu(sch, &scx_sched_all, all)
+ intv = max(min(intv, sch->watchdog_timeout / 2), 1);
+ rcu_read_unlock();
+
+ WRITE_ONCE(scx_watchdog_timestamp, jiffies);
+ WRITE_ONCE(scx_watchdog_interval, intv);
+
+ if (intv < ULONG_MAX)
- mod_delayed_work(system_unbound_wq, &scx_watchdog_work, intv);
++ mod_delayed_work(system_dfl_wq, &scx_watchdog_work, intv);
+ else
+ cancel_delayed_work_sync(&scx_watchdog_work);
+}
+
+static s32 scx_link_sched(struct scx_sched *sch)
+{
+ scoped_guard(raw_spinlock_irq, &scx_sched_lock) {
+#ifdef CONFIG_EXT_SUB_SCHED
+ struct scx_sched *parent = scx_parent(sch);
+ s32 ret;
+
+ if (parent) {
+ ret = rhashtable_lookup_insert_fast(&scx_sched_hash,
+ &sch->hash_node, scx_sched_hash_params);
+ if (ret) {
+ scx_error(sch, "failed to insert into scx_sched_hash (%d)", ret);
+ return ret;
+ }
+
+ list_add_tail(&sch->sibling, &parent->children);
+ }
+#endif /* CONFIG_EXT_SUB_SCHED */
+
+ list_add_tail_rcu(&sch->all, &scx_sched_all);
+ }
+
+ refresh_watchdog();
+ return 0;
+}
+
+static void scx_unlink_sched(struct scx_sched *sch)
+{
+ scoped_guard(raw_spinlock_irq, &scx_sched_lock) {
+#ifdef CONFIG_EXT_SUB_SCHED
+ if (scx_parent(sch)) {
+ rhashtable_remove_fast(&scx_sched_hash, &sch->hash_node,
+ scx_sched_hash_params);
+ list_del_init(&sch->sibling);
+ }
+#endif /* CONFIG_EXT_SUB_SCHED */
+ list_del_rcu(&sch->all);
+ }
+
+ refresh_watchdog();
+}
+
+#ifdef CONFIG_EXT_SUB_SCHED
+static DECLARE_WAIT_QUEUE_HEAD(scx_unlink_waitq);
+
+static void drain_descendants(struct scx_sched *sch)
+{
+ /*
+ * Child scheds that finished the critical part of disabling will take
+ * themselves off @sch->children. Wait for it to drain. As propagation
+ * is recursive, empty @sch->children means that all proper descendant
+ * scheds reached unlinking stage.
+ */
+ wait_event(scx_unlink_waitq, list_empty(&sch->children));
+}
+
+static void scx_fail_parent(struct scx_sched *sch,
+ struct task_struct *failed, s32 fail_code)
+{
+ struct scx_sched *parent = scx_parent(sch);
struct scx_task_iter sti;
struct task_struct *p;
- int kind, cpu;
- kind = atomic_read(&sch->exit_kind);
- while (true) {
- if (kind == SCX_EXIT_DONE) /* already disabled? */
- return;
- WARN_ON_ONCE(kind == SCX_EXIT_NONE);
- if (atomic_try_cmpxchg(&sch->exit_kind, &kind, SCX_EXIT_DONE))
+ scx_error(parent, "ops.init_task() failed (%d) for %s[%d] while disabling a sub-scheduler",
+ fail_code, failed->comm, failed->pid);
+
+ /*
+ * Once $parent is bypassed, it's safe to put SCX_TASK_NONE tasks into
+ * it. This may cause downstream failures on the BPF side but $parent is
+ * dying anyway.
+ */
+ scx_bypass(parent, true);
+
+ scx_task_iter_start(&sti, sch->cgrp);
+ while ((p = scx_task_iter_next_locked(&sti))) {
+ if (scx_task_on_sched(parent, p))
+ continue;
+
+ scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) {
+ scx_disable_and_exit_task(sch, p);
+ rcu_assign_pointer(p->scx.sched, parent);
+ }
+ }
+ scx_task_iter_stop(&sti);
+}
+
+static void scx_sub_disable(struct scx_sched *sch)
+{
+ struct scx_sched *parent = scx_parent(sch);
+ struct scx_task_iter sti;
+ struct task_struct *p;
+ int ret;
+
+ /*
+ * Guarantee forward progress and wait for descendants to be disabled.
+ * To limit disruptions, $parent is not bypassed. Tasks are fully
+ * prepped and then inserted back into $parent.
+ */
+ scx_bypass(sch, true);
+ drain_descendants(sch);
+
+ /*
+ * Here, every runnable task is guaranteed to make forward progress and
+ * we can safely use blocking synchronization constructs. Actually
+ * disable ops.
+ */
+ mutex_lock(&scx_enable_mutex);
+ percpu_down_write(&scx_fork_rwsem);
+ scx_cgroup_lock();
+
+ set_cgroup_sched(sch_cgroup(sch), parent);
+
+ scx_task_iter_start(&sti, sch->cgrp);
+ while ((p = scx_task_iter_next_locked(&sti))) {
+ struct rq *rq;
+ struct rq_flags rf;
+
+ /* filter out duplicate visits */
+ if (scx_task_on_sched(parent, p))
+ continue;
+
+ /*
+ * By the time control reaches here, all descendant schedulers
+ * should already have been disabled.
+ */
+ WARN_ON_ONCE(!scx_task_on_sched(sch, p));
+
+ /*
+ * If $p is about to be freed, nothing prevents $sch from
+ * unloading before $p reaches sched_ext_free(). Disable and
+ * exit $p right away.
+ */
+ if (!tryget_task_struct(p)) {
+ scx_disable_and_exit_task(sch, p);
+ continue;
+ }
+
+ scx_task_iter_unlock(&sti);
+
+ /*
+ * $p is READY or ENABLED on @sch. Initialize for $parent,
+ * disable and exit from @sch, and then switch over to $parent.
+ *
+ * If a task fails to initialize for $parent, the only available
+ * action is disabling $parent too. While this allows disabling
+ * of a child sched to cause the parent scheduler to fail, the
+ * failure can only originate from ops.init_task() of the
+ * parent. A child can't directly affect the parent through its
+ * own failures.
+ */
+ ret = __scx_init_task(parent, p, false);
+ if (ret) {
+ scx_fail_parent(sch, p, ret);
+ put_task_struct(p);
break;
+ }
+
+ rq = task_rq_lock(p, &rf);
+ scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) {
+ /*
+ * $p is initialized for $parent and still attached to
+ * @sch. Disable and exit for @sch, switch over to
+ * $parent, override the state to READY to account for
+ * $p having already been initialized, and then enable.
+ */
+ scx_disable_and_exit_task(sch, p);
+ scx_set_task_state(p, SCX_TASK_INIT);
+ rcu_assign_pointer(p->scx.sched, parent);
+ scx_set_task_state(p, SCX_TASK_READY);
+ scx_enable_task(parent, p);
+ }
+ task_rq_unlock(rq, p, &rf);
+
+ put_task_struct(p);
}
- ei->kind = kind;
- ei->reason = scx_exit_reason(ei->kind);
+ scx_task_iter_stop(&sti);
- /* guarantee forward progress by bypassing scx_ops */
- scx_bypass(true);
- WRITE_ONCE(scx_aborting, false);
+ scx_cgroup_unlock();
+ percpu_up_write(&scx_fork_rwsem);
+
+ /*
+ * All tasks are moved off of @sch but there may still be on-going
+ * operations (e.g. ops.select_cpu()). Drain them by flushing RCU. Use
+ * the expedited version as ancestors may be waiting in bypass mode.
+ * Also, tell the parent that there is no need to keep running bypass
+ * DSQs for us.
+ */
+ synchronize_rcu_expedited();
+ disable_bypass_dsp(sch);
+
+ scx_unlink_sched(sch);
+
+ mutex_unlock(&scx_enable_mutex);
+
+ /*
+ * @sch is now unlinked from the parent's children list. Notify and call
+ * ops.sub_detach/exit(). Note that ops.sub_detach/exit() must be called
+ * after unlinking and releasing all locks. See scx_claim_exit().
+ */
+ wake_up_all(&scx_unlink_waitq);
+
+ if (sch->ops.sub_detach && sch->sub_attached) {
+ struct scx_sub_detach_args sub_detach_args = {
+ .ops = &sch->ops,
+ .cgroup_path = sch->cgrp_path,
+ };
+ SCX_CALL_OP(parent, SCX_KF_UNLOCKED, sub_detach, NULL,
+ &sub_detach_args);
+ }
+
+ if (sch->ops.exit)
+ SCX_CALL_OP(sch, SCX_KF_UNLOCKED, exit, NULL, sch->exit_info);
+ kobject_del(&sch->kobj);
+}
+#else /* CONFIG_EXT_SUB_SCHED */
+static void drain_descendants(struct scx_sched *sch) { }
+static void scx_sub_disable(struct scx_sched *sch) { }
+#endif /* CONFIG_EXT_SUB_SCHED */
+
+static void scx_root_disable(struct scx_sched *sch)
+{
+ struct scx_exit_info *ei = sch->exit_info;
+ struct scx_task_iter sti;
+ struct task_struct *p;
+ int cpu;
+
+ /* guarantee forward progress and wait for descendants to be disabled */
+ scx_bypass(sch, true);
+ drain_descendants(sch);
switch (scx_set_enable_state(SCX_DISABLING)) {
case SCX_DISABLING: