static atomic_long_t scx_nr_rejected = ATOMIC_LONG_INIT(0);
static atomic_long_t scx_hotplug_seq = ATOMIC_LONG_INIT(0);
+#ifdef CONFIG_EXT_SUB_SCHED
+/*
+ * The sub-sched being enabled. Used by scx_disable_and_exit_task() to exit
+ * tasks for the sub-sched being enabled. Use a global variable instead of a
+ * per-task field as all enables are serialized.
+ */
+static struct scx_sched *scx_enabling_sub_sched;
+#else
+/* fully parenthesized so the cast binds correctly in any expression context */
+#define scx_enabling_sub_sched ((struct scx_sched *)NULL)
+#endif /* CONFIG_EXT_SUB_SCHED */
+
/*
* A monotically increasing sequence number that is incremented every time a
* scheduler is enabled. This can be used by to check if any custom sched_ext
{
__scx_disable_and_exit_task(sch, p);
+ /*
+ * If set, @p exited between __scx_init_task() and scx_enable_task() in
+ * scx_sub_enable() and is initialized for both the associated sched and
+ * its parent. Disable and exit for the child too.
+ */
+ if ((p->scx.flags & SCX_TASK_SUB_INIT) &&
+ !WARN_ON_ONCE(!scx_enabling_sub_sched)) {
+ __scx_disable_and_exit_task(scx_enabling_sub_sched, p);
+ p->scx.flags &= ~SCX_TASK_SUB_INIT;
+ }
+
scx_set_task_sched(p, NULL);
scx_set_task_state(p, SCX_TASK_NONE);
}
percpu_rwsem_assert_held(&scx_fork_rwsem);
if (scx_init_task_enabled) {
- ret = scx_init_task(scx_root, p, true);
+#ifdef CONFIG_EXT_SUB_SCHED
+ struct scx_sched *sch = kargs->cset->dfl_cgrp->scx_sched;
+#else
+ struct scx_sched *sch = scx_root;
+#endif
+ ret = scx_init_task(sch, p, true);
if (!ret)
- scx_set_task_sched(p, scx_root);
+ scx_set_task_sched(p, sch);
return ret;
}
struct rq *rq = cpu_rq(cpu);
struct task_struct *p, *n;
+ raw_spin_lock(&scx_sched_lock);
raw_spin_rq_lock(rq);
- raw_spin_lock(&scx_sched_lock);
scx_for_each_descendant_pre(pos, sch) {
struct scx_sched_pcpu *pcpu = per_cpu_ptr(pos->pcpu, cpu);
else
pcpu->flags &= ~SCX_SCHED_PCPU_BYPASSING;
}
+
raw_spin_unlock(&scx_sched_lock);
/*
wait_event(scx_unlink_waitq, list_empty(&sch->children));
}
+/*
+ * scx_fail_parent - handle ops.init_task() failure on the parent while a
+ * sub-scheduler is being disabled
+ * @sch: the sub-scheduler being disabled
+ * @failed: the task whose ops.init_task() on the parent failed
+ * @fail_code: error code returned by the parent's ops.init_task()
+ *
+ * Reports the failure against $parent via scx_error(), puts $parent into
+ * bypass mode, and then walks every task in @sch's cgroup that is still on
+ * @sch, exiting it from @sch and reassigning it to $parent.
+ */
+static void scx_fail_parent(struct scx_sched *sch,
+			    struct task_struct *failed, s32 fail_code)
+{
+	struct scx_sched *parent = scx_parent(sch);
+	struct scx_task_iter sti;
+	struct task_struct *p;
+
+	scx_error(parent, "ops.init_task() failed (%d) for %s[%d] while disabling a sub-scheduler",
+		  fail_code, failed->comm, failed->pid);
+
+	/*
+	 * Once $parent is bypassed, it's safe to put SCX_TASK_NONE tasks into
+	 * it. This may cause downstream failures on the BPF side but $parent is
+	 * dying anyway.
+	 */
+	scx_bypass(parent, true);
+
+	scx_task_iter_start(&sti, sch->cgrp);
+	while ((p = scx_task_iter_next_locked(&sti))) {
+		/* already moved over (or never on @sch) - nothing to do */
+		if (scx_task_on_sched(parent, p))
+			continue;
+
+		scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) {
+			scx_disable_and_exit_task(sch, p);
+			rcu_assign_pointer(p->scx.sched, parent);
+		}
+	}
+	scx_task_iter_stop(&sti);
+}
+
static void scx_sub_disable(struct scx_sched *sch)
{
struct scx_sched *parent = scx_parent(sch);
+ struct scx_task_iter sti;
+ struct task_struct *p;
+ int ret;
+ /*
+ * Guarantee forward progress and wait for descendants to be disabled.
+	 * To limit disruptions, $parent is not bypassed. Tasks are fully
+	 * prepped and then inserted back into $parent.
+ */
+ scx_bypass(sch, true);
drain_descendants(sch);
+ /*
+ * Here, every runnable task is guaranteed to make forward progress and
+ * we can safely use blocking synchronization constructs. Actually
+ * disable ops.
+ */
mutex_lock(&scx_enable_mutex);
percpu_down_write(&scx_fork_rwsem);
scx_cgroup_lock();
set_cgroup_sched(sch_cgroup(sch), parent);
- /* TODO - perform actual disabling here */
+ scx_task_iter_start(&sti, sch->cgrp);
+ while ((p = scx_task_iter_next_locked(&sti))) {
+ struct rq *rq;
+ struct rq_flags rf;
+
+ /* filter out duplicate visits */
+ if (scx_task_on_sched(parent, p))
+ continue;
+
+ /*
+ * By the time control reaches here, all descendant schedulers
+ * should already have been disabled.
+ */
+ WARN_ON_ONCE(!scx_task_on_sched(sch, p));
+
+ /*
+ * If $p is about to be freed, nothing prevents $sch from
+ * unloading before $p reaches sched_ext_free(). Disable and
+ * exit $p right away.
+ */
+ if (!tryget_task_struct(p)) {
+ scx_disable_and_exit_task(sch, p);
+ continue;
+ }
+
+ scx_task_iter_unlock(&sti);
+
+ /*
+ * $p is READY or ENABLED on @sch. Initialize for $parent,
+ * disable and exit from @sch, and then switch over to $parent.
+ *
+ * If a task fails to initialize for $parent, the only available
+ * action is disabling $parent too. While this allows disabling
+ * of a child sched to cause the parent scheduler to fail, the
+ * failure can only originate from ops.init_task() of the
+ * parent. A child can't directly affect the parent through its
+ * own failures.
+ */
+ ret = __scx_init_task(parent, p, false);
+ if (ret) {
+ scx_fail_parent(sch, p, ret);
+ put_task_struct(p);
+ break;
+ }
+
+ rq = task_rq_lock(p, &rf);
+ scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) {
+ /*
+ * $p is initialized for $parent and still attached to
+ * @sch. Disable and exit for @sch, switch over to
+ * $parent, override the state to READY to account for
+ * $p having already been initialized, and then enable.
+ */
+ scx_disable_and_exit_task(sch, p);
+ scx_set_task_state(p, SCX_TASK_INIT);
+ rcu_assign_pointer(p->scx.sched, parent);
+ scx_set_task_state(p, SCX_TASK_READY);
+ scx_enable_task(parent, p);
+ }
+ task_rq_unlock(rq, p, &rf);
+
+ put_task_struct(p);
+ }
+ scx_task_iter_stop(&sti);
scx_cgroup_unlock();
percpu_up_write(&scx_fork_rwsem);
+ /*
+ * All tasks are moved off of @sch but there may still be on-going
+ * operations (e.g. ops.select_cpu()). Drain them by flushing RCU. Use
+ * the expedited version as ancestors may be waiting in bypass mode.
+ * Also, tell the parent that there is no need to keep running bypass
+ * DSQs for us.
+ */
+ synchronize_rcu_expedited();
disable_bypass_dsp(sch);
raw_spin_lock_irq(&scx_sched_lock);
return parent;
}
+/*
+ * assert_task_ready_or_enabled - verify @p is in READY or ENABLED state
+ * @p: task whose scx state is checked
+ *
+ * Returns %true if @p's scx task state is %SCX_TASK_READY or
+ * %SCX_TASK_ENABLED. Any other state triggers a one-shot warning and returns
+ * %false, as such states are invalid while a sub-scheduler is being enabled.
+ */
+static bool assert_task_ready_or_enabled(struct task_struct *p)
+{
+	enum scx_task_state state = scx_get_task_state(p);
+
+	switch (state) {
+	case SCX_TASK_READY:
+	case SCX_TASK_ENABLED:
+		return true;
+	default:
+		WARN_ONCE(true, "sched_ext: Invalid task state %d for %s[%d] during enabling sub sched",
+			  state, p->comm, p->pid);
+		return false;
+	}
+}
+
static void scx_sub_enable_workfn(struct kthread_work *work)
{
struct scx_enable_cmd *cmd = container_of(work, struct scx_enable_cmd, work);
struct sched_ext_ops *ops = cmd->ops;
struct cgroup *cgrp;
struct scx_sched *parent, *sch;
- s32 ret;
+ struct scx_task_iter sti;
+ struct task_struct *p;
+ s32 i, ret;
mutex_lock(&scx_enable_mutex);
}
sch->sub_attached = true;
+ scx_bypass(sch, true);
+
+ for (i = SCX_OPI_BEGIN; i < SCX_OPI_END; i++)
+ if (((void (**)(void))ops)[i])
+ set_bit(i, sch->has_op);
+
percpu_down_write(&scx_fork_rwsem);
scx_cgroup_lock();
goto err_unlock_and_disable;
}
- /* TODO - perform actual enabling here */
+ /*
+ * Initialize tasks for the new child $sch without exiting them for
+ * $parent so that the tasks can always be reverted back to $parent
+ * sched on child init failure.
+ */
+ WARN_ON_ONCE(scx_enabling_sub_sched);
+ scx_enabling_sub_sched = sch;
+
+ scx_task_iter_start(&sti, sch->cgrp);
+ while ((p = scx_task_iter_next_locked(&sti))) {
+ struct rq *rq;
+ struct rq_flags rf;
+
+ /*
+ * Task iteration may visit the same task twice when racing
+ * against exiting. Use %SCX_TASK_SUB_INIT to mark tasks which
+ * finished __scx_init_task() and skip if set.
+ *
+ * A task may exit and get freed between __scx_init_task()
+ * completion and scx_enable_task(). In such cases,
+ * scx_disable_and_exit_task() must exit the task for both the
+ * parent and child scheds.
+ */
+ if (p->scx.flags & SCX_TASK_SUB_INIT)
+ continue;
+
+ /* see scx_root_enable() */
+ if (!tryget_task_struct(p))
+ continue;
+
+ if (!assert_task_ready_or_enabled(p)) {
+ ret = -EINVAL;
+ goto abort;
+ }
+
+ scx_task_iter_unlock(&sti);
+
+ /*
+ * As $p is still on $parent, it can't be transitioned to INIT.
+ * Let's worry about task state later. Use __scx_init_task().
+ */
+ ret = __scx_init_task(sch, p, false);
+ if (ret)
+ goto abort;
+
+ rq = task_rq_lock(p, &rf);
+ p->scx.flags |= SCX_TASK_SUB_INIT;
+ task_rq_unlock(rq, p, &rf);
+
+ put_task_struct(p);
+ }
+ scx_task_iter_stop(&sti);
+
+ /*
+ * All tasks are prepped. Disable/exit tasks for $parent and enable for
+ * the new @sch.
+ */
+ scx_task_iter_start(&sti, sch->cgrp);
+ while ((p = scx_task_iter_next_locked(&sti))) {
+ /*
+ * Use clearing of %SCX_TASK_SUB_INIT to detect and skip
+ * duplicate iterations.
+ */
+ if (!(p->scx.flags & SCX_TASK_SUB_INIT))
+ continue;
+
+ scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) {
+ /*
+ * $p must be either READY or ENABLED. If ENABLED,
+			 * __scx_disable_and_exit_task() first disables and
+ * makes it READY. However, after exiting $p, it will
+ * leave $p as READY.
+ */
+ assert_task_ready_or_enabled(p);
+ __scx_disable_and_exit_task(parent, p);
+
+ /*
+ * $p is now only initialized for @sch and READY, which
+ * is what we want. Assign it to @sch and enable.
+ */
+ rcu_assign_pointer(p->scx.sched, sch);
+ scx_enable_task(sch, p);
+
+ p->scx.flags &= ~SCX_TASK_SUB_INIT;
+ }
+ }
+ scx_task_iter_stop(&sti);
+
+ scx_enabling_sub_sched = NULL;
scx_cgroup_unlock();
percpu_up_write(&scx_fork_rwsem);
+ scx_bypass(sch, false);
+
pr_info("sched_ext: BPF sub-scheduler \"%s\" enabled\n", sch->ops.name);
kobject_uevent(&sch->kobj, KOBJ_ADD);
ret = 0;
goto out_unlock;
+abort:
+ put_task_struct(p);
+ scx_task_iter_stop(&sti);
+ scx_enabling_sub_sched = NULL;
+
+ scx_task_iter_start(&sti, sch->cgrp);
+ while ((p = scx_task_iter_next_locked(&sti))) {
+ if (p->scx.flags & SCX_TASK_SUB_INIT) {
+ __scx_disable_and_exit_task(sch, p);
+ p->scx.flags &= ~SCX_TASK_SUB_INIT;
+ }
+ }
+ scx_task_iter_stop(&sti);
+ scx_cgroup_unlock();
+ percpu_up_write(&scx_fork_rwsem);
out_put_cgrp:
cgroup_put(cgrp);
out_unlock:
return;
err_unlock_and_disable:
+ /* we'll soon enter disable path, keep bypass on */
scx_cgroup_unlock();
percpu_up_write(&scx_fork_rwsem);
err_disable: