struct rq *rq = dl_se->rq;
dl_se->dl_defer_idle = 0;
- if (!dl_server(dl_se) || dl_se->dl_server_active || !dl_se->dl_runtime)
+ if (!dl_server(dl_se) || dl_se->dl_server_active || !dl_se->dl_runtime ||
+ !dl_se->dl_bw_attached)
return;
/*
dl_se->dl_server = 1;
dl_se->dl_defer = 1;
setup_new_dl_entity(dl_se);
+
+ /*
+ * No BPF scheduler is loaded at boot, so the ext_server has no
+ * tasks to protect. Detach its bandwidth reservation; it will
+ * be attached when a BPF scheduler is loaded.
+ */
+ dl_server_detach_bw(dl_se);
#endif
}
}
int cpu = cpu_of(rq);
struct dl_bw *dl_b;
+ if (!dl_se->dl_bw_attached)
+ return;
+
dl_b = dl_bw_of(cpu_of(rq));
guard(raw_spinlock)(&dl_b->lock);
int dl_server_apply_params(struct sched_dl_entity *dl_se, u64 runtime, u64 period, bool init)
{
- u64 old_bw = init ? 0 : to_ratio(dl_se->dl_period, dl_se->dl_runtime);
+ u64 old_bw = (init || !dl_se->dl_bw_attached) ? 0 :
+ to_ratio(dl_se->dl_period, dl_se->dl_runtime);
u64 new_bw = to_ratio(period, runtime);
struct rq *rq = dl_se->rq;
int cpu = cpu_of(rq);
if (init) {
__add_rq_bw(new_bw, &rq->dl);
__dl_add(dl_b, new_bw, cpus);
- } else {
+ dl_se->dl_bw_attached = 1;
+ } else if (dl_se->dl_bw_attached) {
__dl_sub(dl_b, dl_se->dl_bw, cpus);
__dl_add(dl_b, new_bw, cpus);
return 0;
}
+/*
+ * Charge @dl_se's bandwidth to its rq and, for an active CPU, to @dl_b.
+ *
+ * @dl_se: server entity whose @dl_bw is being attached
+ * @dl_b:  bandwidth pool to charge; the callers in this file invoke this
+ *         helper with @dl_b->lock held
+ * @cpus:  CPU count forwarded to __dl_add()
+ *
+ * Return: 0 on success, with @dl_se->dl_bw_attached set; -EBUSY if
+ * __dl_overflow() rejects the additional bandwidth, in which case nothing
+ * has been modified.
+ */
+static int __dl_server_attach_bw_locked(struct sched_dl_entity *dl_se,
+					struct dl_bw *dl_b, int cpus)
+{
+	struct rq *rq = dl_se->rq;
+	int cpu = cpu_of(rq);
+
+	/*
+	 * The admission check and the @dl_b update are both gated on the CPU
+	 * being active, the same condition the root-domain rebuild path
+	 * (dl_server_add_bw()) uses before publishing a server's bandwidth
+	 * into @dl_b. The rq-level accounting below is updated regardless.
+	 */
+	if (cpu_active(cpu)) {
+		unsigned long capacity = dl_bw_capacity(cpu);
+
+		if (__dl_overflow(dl_b, capacity, 0, dl_se->dl_bw))
+			return -EBUSY;
+
+		__dl_add(dl_b, dl_se->dl_bw, cpus);
+	}
+
+	__add_rq_bw(dl_se->dl_bw, &rq->dl);
+	dl_se->dl_bw_attached = 1;
+
+	return 0;
+}
+
+/*
+ * Stop @dl_se if it is running and withdraw its bandwidth from the rq and
+ * root-domain accounting.
+ *
+ * @dl_se: server entity being detached
+ * @dl_b:  bandwidth pool to credit; the callers in this file invoke this
+ *         helper with @dl_b->lock held
+ * @cpus:  CPU count forwarded to __dl_sub()
+ *
+ * Always leaves @dl_se->dl_bw_attached cleared.
+ */
+static void __dl_server_detach_bw_locked(struct sched_dl_entity *dl_se,
+					 struct dl_bw *dl_b, int cpus)
+{
+	struct rq *rq = dl_se->rq;
+	int cpu = cpu_of(rq);
+
+	/*
+	 * A server that is still active (on_rq) has to be dequeued first.
+	 * dl_server_stop() goes through task_non_contending(), which either
+	 * removes @dl_bw from running_bw right away (0-lag time already
+	 * passed) or marks the entity dl_non_contending and arms the
+	 * inactive_timer.
+	 */
+	if (dl_se->dl_server_active)
+		dl_server_stop(dl_se);
+
+	/*
+	 * Shrink this entity's share of the rq bandwidth to zero; this is
+	 * the counterpart of the __add_rq_bw() performed when attaching.
+	 */
+	dl_rq_change_utilization(rq, dl_se, 0);
+
+	/*
+	 * @dl_b only carries bandwidth for active CPUs, consistent with
+	 * dl_server_add_bw() during root-domain rebuilds. For an inactive
+	 * CPU there is nothing to give back: either the attach never added
+	 * it, or a rebuild already dropped it when it re-published active
+	 * CPUs only. Clearing the flag below is then sufficient; should the
+	 * CPU become active again, the next rebuild re-publishes whatever
+	 * is attached at that point.
+	 */
+	if (cpu_active(cpu))
+		__dl_sub(dl_b, dl_se->dl_bw, cpus);
+
+	dl_se->dl_bw_attached = 0;
+}
+
+/*
+ * Attach @dl_se's bandwidth to the root domain's total_bw accounting.
+ *
+ * Use to dynamically register a dl_server's bandwidth reservation while
+ * preserving its configured @dl_runtime / @dl_period.
+ *
+ * @dl_se: server entity to attach
+ *
+ * Return: 0 if @dl_se is attached on return (including the no-op case where
+ * it already was), -EBUSY if attaching would overflow the root domain
+ * capacity; @dl_se stays detached in that case.
+ */
+int dl_server_attach_bw(struct sched_dl_entity *dl_se)
+{
+	int cpu = cpu_of(dl_se->rq);
+	struct dl_bw *dl_b;
+	int ret;
+
+	if (dl_se->dl_bw_attached)
+		return 0;
+
+	/*
+	 * Resolve the dl_bw once, so that the lock we take and the object we
+	 * account against are the same by construction (same pattern as
+	 * dl_server_detach_bw()).
+	 */
+	dl_b = dl_bw_of(cpu);
+	scoped_guard(raw_spinlock, &dl_b->lock) {
+		ret = __dl_server_attach_bw_locked(dl_se, dl_b,
+						   dl_bw_cpus(cpu));
+	}
+	if (ret)
+		return ret;
+
+	/*
+	 * The natural 0->nr_running transition that triggers dl_server_start()
+	 * may have happened while @dl_se was still detached (e.g., between
+	 * scx_bypass(false) and the scx_enable() re-balance loop), so kick a
+	 * start here.
+	 *
+	 * dl_server_start() bails out cleanly if there's nothing to schedule or
+	 * it's already active. Skip if @cpu is offline; the server will be
+	 * started naturally on the first enqueue once @cpu comes back.
+	 */
+	if (cpu_online(cpu))
+		dl_server_start(dl_se);
+
+	return 0;
+}
+
+/*
+ * Remove @dl_se's bandwidth from the root domain's total_bw accounting.
+ *
+ * Counterpart of dl_server_attach_bw(): the reservation is unregistered but
+ * the configured @dl_runtime / @dl_period are left untouched, so the server
+ * can be attached again later. Does nothing when @dl_se is not attached.
+ *
+ * @dl_se: server entity to detach
+ */
+void dl_server_detach_bw(struct sched_dl_entity *dl_se)
+{
+	struct dl_bw *dl_b;
+	int cpu;
+
+	if (!dl_se->dl_bw_attached)
+		return;
+
+	cpu = cpu_of(dl_se->rq);
+	dl_b = dl_bw_of(cpu);
+
+	/* The CPU count is sampled with @dl_b->lock already held. */
+	scoped_guard(raw_spinlock, &dl_b->lock) {
+		__dl_server_detach_bw_locked(dl_se, dl_b, dl_bw_cpus(cpu));
+	}
+}
+
+/*
+ * Atomically detach @detach_se and attach @attach_se on the same rq, holding
+ * @dl_b->lock across both operations so a concurrent sched_setattr() cannot
+ * steal the bandwidth freed by the detach before the attach can claim it.
+ *
+ * @detach_se: server entity to detach (skipped if it is not attached)
+ * @attach_se: server entity to attach (skipped if it is already attached)
+ *
+ * Both entities must live on the same rq (same root domain).
+ *
+ * Return:
+ *   0       - @attach_se is attached on return;
+ *   -EBUSY  - attaching @attach_se would overflow root domain capacity; both
+ *             servers end up detached;
+ *   -EINVAL - the entities are on different rqs; a warning is emitted and
+ *             neither entity is touched.
+ */
+int dl_server_swap_bw(struct sched_dl_entity *detach_se,
+		      struct sched_dl_entity *attach_se)
+{
+	struct rq *rq = detach_se->rq;
+	int cpu = cpu_of(rq);
+	struct dl_bw *dl_b;
+	int cpus, ret = 0;
+
+	/*
+	 * Everything below (lock, CPU count, online check) is derived from
+	 * @detach_se's rq. Going ahead with an entity on another rq would
+	 * update that rq's accounting under the wrong locks, so refuse
+	 * before changing any state.
+	 */
+	if (WARN_ON_ONCE(attach_se->rq != rq))
+		return -EINVAL;
+
+	/*
+	 * Resolve the dl_bw once, so that the lock we take and the object we
+	 * account against are the same by construction.
+	 */
+	dl_b = dl_bw_of(cpu);
+	scoped_guard(raw_spinlock, &dl_b->lock) {
+		cpus = dl_bw_cpus(cpu);
+
+		if (detach_se->dl_bw_attached)
+			__dl_server_detach_bw_locked(detach_se, dl_b, cpus);
+
+		/* An already attached @attach_se leaves ret at 0. */
+		if (!attach_se->dl_bw_attached)
+			ret = __dl_server_attach_bw_locked(attach_se, dl_b, cpus);
+	}
+	if (ret)
+		return ret;
+
+	/*
+	 * Kick the newly attached server; it may have missed its start
+	 * trigger while it was detached. Nothing to do for an offline CPU.
+	 */
+	if (cpu_online(cpu))
+		dl_server_start(attach_se);
+
+	return 0;
+}
+
/*
* Update the current task's runtime statistics (provided it is still
* a -deadline task and has not been removed from the dl_rq).
struct sched_dl_entity *dl_se;
dl_se = &cpu_rq(cpu)->fair_server;
- if (dl_server(dl_se) && cpu_active(cpu))
+ if (dl_server(dl_se) && dl_se->dl_bw_attached && cpu_active(cpu))
__dl_add(&rd->dl_bw, dl_se->dl_bw, dl_bw_cpus(cpu));
#ifdef CONFIG_SCHED_CLASS_EXT
dl_se = &cpu_rq(cpu)->ext_server;
- if (dl_server(dl_se) && cpu_active(cpu))
+ if (dl_server(dl_se) && dl_se->dl_bw_attached && cpu_active(cpu))
__dl_add(&rd->dl_bw, dl_se->dl_bw, dl_bw_cpus(cpu));
#endif
}
{
u64 dl_bw = 0;
- if (cpu_rq(cpu)->fair_server.dl_server)
+ if (cpu_rq(cpu)->fair_server.dl_server &&
+ cpu_rq(cpu)->fair_server.dl_bw_attached)
dl_bw += cpu_rq(cpu)->fair_server.dl_bw;
#ifdef CONFIG_SCHED_CLASS_EXT
- if (cpu_rq(cpu)->ext_server.dl_server)
+ if (cpu_rq(cpu)->ext_server.dl_server &&
+ cpu_rq(cpu)->ext_server.dl_bw_attached)
dl_bw += cpu_rq(cpu)->ext_server.dl_bw;
#endif
struct scx_exit_info *ei = sch->exit_info;
struct scx_task_iter sti;
struct task_struct *p;
+ bool was_switched_all;
int cpu;
/* guarantee forward progress and wait for descendants to be disabled */
*/
mutex_lock(&scx_enable_mutex);
+ was_switched_all = scx_switched_all();
+
static_branch_disable(&__scx_switched_all);
WRITE_ONCE(scx_switching_all, false);
/*
* Invalidate all the rq clocks to prevent getting outdated
* rq clocks from a previous scx scheduler.
+ *
+ * Also re-balance the dl_server bandwidth reservations: detach
+ * ext_server (no more sched_ext tasks) and reinstate fair_server if it
+ * was previously detached because we were running in full mode.
+ *
+ * Unlike the enable path, this runs on a recovery path that cannot
+ * fail, so we use dl_server_swap_bw() to atomically free ext_server's
+ * bandwidth and reclaim it for fair_server under the same dl_b lock.
+ *
+ * The swap can still fail with -EBUSY if someone bumped ext_server's
+ * runtime via debugfs between enable and disable; in that narrow case
+ * both servers end up detached and we just WARN.
*/
for_each_possible_cpu(cpu) {
struct rq *rq = cpu_rq(cpu);
+
scx_rq_clock_invalidate(rq);
+
+ scoped_guard(rq_lock_irqsave, rq) {
+ update_rq_clock(rq);
+ if (was_switched_all) {
+ if (WARN_ON_ONCE(dl_server_swap_bw(&rq->ext_server,
+ &rq->fair_server)))
+ pr_warn("failed to re-attach fair_server on CPU %d\n", cpu);
+ } else {
+ dl_server_detach_bw(&rq->ext_server);
+ }
+ }
}
/* no task is on scx, turn off all the switches and flush in-progress calls */
if (ret)
goto err_disable;
+ /*
+ * Attach the ext_server bandwidth reservation before anything is
+ * committed so that we can fail the enable if the root domain cannot
+ * accommodate it. The matching fair_server detach is deferred to the
+ * tail of this function, after the switch is fully committed and can no
+ * longer fail.
+ *
+ * On failure, err_disable funnels into scx_root_disable() which
+ * detaches ext_server, so partially-attached state is cleaned up
+ * automatically.
+ */
+ for_each_possible_cpu(cpu) {
+ struct rq *rq = cpu_rq(cpu);
+
+ scoped_guard(rq_lock_irqsave, rq) {
+ update_rq_clock(rq);
+ ret = dl_server_attach_bw(&rq->ext_server);
+ }
+ if (ret) {
+ pr_warn("sched_ext: failed to attach ext_server on CPU %d (%d)\n",
+ cpu, ret);
+ goto err_disable;
+ }
+ }
+
/*
* Once __scx_enabled is set, %current can be switched to SCX anytime.
* This can lead to stalls as some BPF schedulers (e.g. userspace
if (!(ops->flags & SCX_OPS_SWITCH_PARTIAL))
static_branch_enable(&__scx_switched_all);
+ /*
+ * Detach the fair_server bandwidth reservation now that the switch
+ * is fully committed. In full mode (!SCX_OPS_SWITCH_PARTIAL) no
+ * task will ever run in the fair class, so give that bandwidth
+ * back to the RT class. The matching ext_server attach already
+ * happened earlier; this only releases bandwidth and cannot fail.
+ *
+ * In partial mode keep fair_server attached.
+ */
+ if (scx_switched_all()) {
+ for_each_possible_cpu(cpu) {
+ struct rq *rq = cpu_rq(cpu);
+
+ guard(rq_lock_irqsave)(rq);
+ update_rq_clock(rq);
+ dl_server_detach_bw(&rq->fair_server);
+ }
+ }
+
pr_info("sched_ext: BPF scheduler \"%s\" enabled%s\n",
sch->ops.name, scx_switched_all() ? "" : " (partial)");
kobject_uevent(&sch->kobj, KOBJ_ADD);