sched_ext: Auto-register/unregister dl_server reservations

author Andrea Righi <arighi@nvidia.com>

Tue, 26 May 2026 16:42:48 +0000 (18:42 +0200)

committer Peter Zijlstra <peterz@infradead.org>

Fri, 29 May 2026 10:43:15 +0000 (12:43 +0200)
author Andrea Righi <arighi@nvidia.com>
Tue, 26 May 2026 16:42:48 +0000 (18:42 +0200)
committer Peter Zijlstra <peterz@infradead.org>
Fri, 29 May 2026 10:43:15 +0000 (12:43 +0200)
diff --git a/include/linux/sched.h b/include/linux/sched.h

index da6a0907a78c0ccf367e2827a93ba86a6287d924..8130d13850fc6c6ebc49ee2936aef08145cf6383 100644 (file)
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -702,6 +702,11 @@ struct sched_dl_entity {
          * running, skipping the defer phase.
          *
          * @dl_defer_idle tracks idle state
+        *
+        * @dl_bw_attached tells if this server's bandwidth currently
+        * contributes to the root domain's total_bw. Only meaningful for server
+        * entities (@dl_server == 1). Allows toggling the reservation on/off
+        * without losing the configured @dl_runtime/@dl_period.
          */
         unsigned int                    dl_throttled      : 1;
         unsigned int                    dl_yielded        : 1;
@@ -713,6 +718,7 @@ struct sched_dl_entity {
         unsigned int                    dl_defer_armed    : 1;
         unsigned int                    dl_defer_running  : 1;
         unsigned int                    dl_defer_idle     : 1;
+       unsigned int                    dl_bw_attached    : 1;
  
         /*
          * Bandwidth enforcement timer. Each -deadline task has its
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c

index b60e2df8ff9da3c7465745525a9ac32b5e3f9eae..f9e62ed08d775d3ce75139f6992d0f40737ada2a 100644 (file)
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1797,7 +1797,8 @@ void dl_server_start(struct sched_dl_entity *dl_se)
         struct rq *rq = dl_se->rq;
  
         dl_se->dl_defer_idle = 0;
-       if (!dl_server(dl_se) || dl_se->dl_server_active || !dl_se->dl_runtime)
+       if (!dl_server(dl_se) || dl_se->dl_server_active || !dl_se->dl_runtime ||
+           !dl_se->dl_bw_attached)
                 return;
  
         /*
@@ -1872,6 +1873,13 @@ void sched_init_dl_servers(void)
                 dl_se->dl_server = 1;
                 dl_se->dl_defer = 1;
                 setup_new_dl_entity(dl_se);
+
+               /*
+                * No BPF scheduler is loaded at boot, so the ext_server has no
+                * tasks to protect. Detach its bandwidth reservation, it will
+                * be attached when a BPF scheduler is loaded.
+                */
+               dl_server_detach_bw(dl_se);
  #endif
         }
  }
@@ -1882,6 +1890,9 @@ void __dl_server_attach_root(struct sched_dl_entity *dl_se, struct rq *rq)
         int cpu = cpu_of(rq);
         struct dl_bw *dl_b;
  
+       if (!dl_se->dl_bw_attached)
+               return;
+
         dl_b = dl_bw_of(cpu_of(rq));
         guard(raw_spinlock)(&dl_b->lock);
  
@@ -1893,7 +1904,8 @@ void __dl_server_attach_root(struct sched_dl_entity *dl_se, struct rq *rq)
  
  int dl_server_apply_params(struct sched_dl_entity *dl_se, u64 runtime, u64 period, bool init)
  {
-       u64 old_bw = init ? 0 : to_ratio(dl_se->dl_period, dl_se->dl_runtime);
+       u64 old_bw = (init || !dl_se->dl_bw_attached) ? 0 :
+                    to_ratio(dl_se->dl_period, dl_se->dl_runtime);
         u64 new_bw = to_ratio(period, runtime);
         struct rq *rq = dl_se->rq;
         int cpu = cpu_of(rq);
@@ -1913,7 +1925,8 @@ int dl_server_apply_params(struct sched_dl_entity *dl_se, u64 runtime, u64 perio
         if (init) {
                 __add_rq_bw(new_bw, &rq->dl);
                 __dl_add(dl_b, new_bw, cpus);
-       } else {
+               dl_se->dl_bw_attached = 1;
+       } else if (dl_se->dl_bw_attached) {
                 __dl_sub(dl_b, dl_se->dl_bw, cpus);
                 __dl_add(dl_b, new_bw, cpus);
  
@@ -1933,6 +1946,181 @@ int dl_server_apply_params(struct sched_dl_entity *dl_se, u64 runtime, u64 perio
         return 0;
  }
  
+/*
+ * Account @dl_se's bandwidth on its rq and, while the CPU is active, in
+ * @dl_b. Called with @dl_b->lock held. Returns 0 on success, or -EBUSY
+ * with nothing modified when the root domain would overflow.
+ */
+static int __dl_server_attach_bw_locked(struct sched_dl_entity *dl_se,
+                                       struct dl_bw *dl_b, int cpus)
+{
+       struct rq *rq = dl_se->rq;
+       u64 bw = dl_se->dl_bw;
+       int cpu = cpu_of(rq);
+
+       /*
+        * @dl_b->total_bw only carries bandwidth published by active CPUs
+        * (see dl_server_add_bw() on root-domain rebuilds), so the overflow
+        * check and the @dl_b update are both gated on cpu_active().
+        */
+       if (cpu_active(cpu)) {
+               if (__dl_overflow(dl_b, dl_bw_capacity(cpu), 0, bw))
+                       return -EBUSY;
+               __dl_add(dl_b, bw, cpus);
+       }
+
+       /* @rq->dl.this_bw is updated regardless of the CPU state. */
+       __add_rq_bw(bw, &rq->dl);
+       dl_se->dl_bw_attached = 1;
+
+       return 0;
+}
+
+/*
+ * Drain @dl_se and remove its bw from the rq and root-domain accounting.
+ * Called with @dl_b->lock held; @cpus is the caller's dl_bw_cpus() value.
+ * Always leaves @dl_se->dl_bw_attached cleared.
+ */
+static void __dl_server_detach_bw_locked(struct sched_dl_entity *dl_se,
+                                        struct dl_bw *dl_b, int cpus)
+{
+       struct rq *rq = dl_se->rq;
+
+       /*
+        * If the server is still active (on_rq), dequeue it via
+        * dl_server_stop(); task_non_contending() will either subtract
+        * @dl_bw from running_bw immediately (0-lag passed) or set
+        * dl_non_contending and arm the inactive_timer.
+        */
+       if (dl_se->dl_server_active)
+               dl_server_stop(dl_se);
+
+       /*
+        * Drop @dl_se's contribution from this rq's bandwidth accounting,
+        * mirroring the __add_rq_bw() done at attach time.
+        */
+       dl_rq_change_utilization(rq, dl_se, 0);
+
+       /*
+        * Update @dl_b only while this CPU is active, matching
+        * dl_server_add_bw() during root-domain rebuilds.
+        *
+        * If this CPU is inactive, its bandwidth is not currently accounted
+        * in @dl_b->total_bw: either attach skipped adding it, or a rebuild
+        * already dropped it while re-publishing active CPUs only. There is
+        * nothing to subtract then; clearing @dl_se->dl_bw_attached keeps
+        * later rebuilds from publishing it until the next attach.
+        */
+       if (cpu_active(cpu_of(rq)))
+               __dl_sub(dl_b, dl_se->dl_bw, cpus);
+       dl_se->dl_bw_attached = 0;
+}
+
+/*
+ * Register @dl_se's bandwidth reservation in the root domain's total_bw
+ * accounting, keeping the configured @dl_runtime / @dl_period untouched.
+ *
+ * Attaching an already attached server is a no-op that returns 0.
+ *
+ * Returns 0 on success, or -EBUSY when the reservation does not fit in the
+ * root domain capacity.
+ */
+int dl_server_attach_bw(struct sched_dl_entity *dl_se)
+{
+       int cpu = cpu_of(dl_se->rq);
+       struct dl_bw *dl_b;
+       int ret;
+
+       if (dl_se->dl_bw_attached)
+               return 0;
+
+       /* @dl_b->lock is dropped again before the server gets started. */
+       dl_b = dl_bw_of(cpu);
+       scoped_guard (raw_spinlock, &dl_b->lock) {
+               int cpus = dl_bw_cpus(cpu);
+
+               ret = __dl_server_attach_bw_locked(dl_se, dl_b, cpus);
+       }
+
+       /*
+        * @dl_se may have missed the 0->nr_running transition that normally
+        * triggers dl_server_start() because it was still detached at that
+        * point (e.g., between scx_bypass(false) and the scx_enable()
+        * re-balance loop), so kick a start once the attach succeeded.
+        *
+        * dl_server_start() bails out cleanly if there's nothing to schedule
+        * or the server is already active. An offline @cpu is skipped: its
+        * server gets started naturally on the first enqueue after @cpu
+        * comes back.
+        */
+       if (!ret && cpu_online(cpu))
+               dl_server_start(dl_se);
+
+       return ret;
+}
+
+/*
+ * Unregister @dl_se's bandwidth reservation from the root domain's total_bw
+ * accounting, keeping the configured @dl_runtime / @dl_period untouched.
+ *
+ * Detaching a server that is not attached is a no-op.
+ */
+void dl_server_detach_bw(struct sched_dl_entity *dl_se)
+{
+       int cpu = cpu_of(dl_se->rq);
+       struct dl_bw *dl_b;
+
+       if (!dl_se->dl_bw_attached)
+               return;
+
+       dl_b = dl_bw_of(cpu);
+       scoped_guard (raw_spinlock, &dl_b->lock) {
+               int cpus = dl_bw_cpus(cpu);
+
+               __dl_server_detach_bw_locked(dl_se, dl_b, cpus);
+       }
+}
+
+/*
+ * Move the bandwidth reservation from @detach_se to @attach_se, two servers
+ * of the same rq (and thus the same root domain), as one atomic step.
+ *
+ * @dl_b->lock is held across the detach and the attach, so a concurrent
+ * sched_setattr() cannot grab the bandwidth released by the detach before
+ * the attach claims it.
+ *
+ * Returns the attach result: 0 on success, or -EBUSY if @attach_se would
+ * overflow the root domain capacity, leaving both servers detached.
+ */
+int dl_server_swap_bw(struct sched_dl_entity *detach_se,
+                     struct sched_dl_entity *attach_se)
+{
+       struct rq *rq = detach_se->rq;
+       int cpu = cpu_of(rq);
+       struct dl_bw *dl_b;
+       int ret = 0;
+
+       WARN_ON_ONCE(attach_se->rq != rq);
+
+       dl_b = dl_bw_of(cpu);
+       scoped_guard (raw_spinlock, &dl_b->lock) {
+               int cpus = dl_bw_cpus(cpu);
+
+               if (detach_se->dl_bw_attached)
+                       __dl_server_detach_bw_locked(detach_se, dl_b, cpus);
+
+               /* An already attached @attach_se is left as is. */
+               if (!attach_se->dl_bw_attached)
+                       ret = __dl_server_attach_bw_locked(attach_se, dl_b, cpus);
+       }
+
+       /* Kick @attach_se on success; skipped while @cpu is offline. */
+       if (!ret && cpu_online(cpu))
+               dl_server_start(attach_se);
+
+       return ret;
+}
+
  /*
   * Update the current task's runtime statistics (provided it is still
   * a -deadline task and has not been removed from the dl_rq).
@@ -3233,12 +3421,12 @@ static void dl_server_add_bw(struct root_domain *rd, int cpu)
         struct sched_dl_entity *dl_se;
  
         dl_se = &cpu_rq(cpu)->fair_server;
-       if (dl_server(dl_se) && cpu_active(cpu))
+       if (dl_server(dl_se) && dl_se->dl_bw_attached && cpu_active(cpu))
                 __dl_add(&rd->dl_bw, dl_se->dl_bw, dl_bw_cpus(cpu));
  
  #ifdef CONFIG_SCHED_CLASS_EXT
         dl_se = &cpu_rq(cpu)->ext_server;
-       if (dl_server(dl_se) && cpu_active(cpu))
+       if (dl_server(dl_se) && dl_se->dl_bw_attached && cpu_active(cpu))
                 __dl_add(&rd->dl_bw, dl_se->dl_bw, dl_bw_cpus(cpu));
  #endif
  }
@@ -3247,11 +3435,13 @@ static u64 dl_server_read_bw(int cpu)
  {
         u64 dl_bw = 0;
  
-       if (cpu_rq(cpu)->fair_server.dl_server)
+       if (cpu_rq(cpu)->fair_server.dl_server &&
+           cpu_rq(cpu)->fair_server.dl_bw_attached)
                 dl_bw += cpu_rq(cpu)->fair_server.dl_bw;
  
  #ifdef CONFIG_SCHED_CLASS_EXT
-       if (cpu_rq(cpu)->ext_server.dl_server)
+       if (cpu_rq(cpu)->ext_server.dl_server &&
+           cpu_rq(cpu)->ext_server.dl_bw_attached)
                 dl_bw += cpu_rq(cpu)->ext_server.dl_bw;
  #endif
  
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c

index 345aa11b84b28e3e46c5319619724816a905c8d2..f412c4bb21c3e3846485f2008cf4f87a751e5599 100644 (file)
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -5814,6 +5814,7 @@ static void scx_root_disable(struct scx_sched *sch)
         struct scx_exit_info *ei = sch->exit_info;
         struct scx_task_iter sti;
         struct task_struct *p;
+       bool was_switched_all;
         int cpu;
  
         /* guarantee forward progress and wait for descendants to be disabled */
@@ -5840,6 +5841,8 @@ static void scx_root_disable(struct scx_sched *sch)
          */
         mutex_lock(&scx_enable_mutex);
  
+       was_switched_all = scx_switched_all();
+
         static_branch_disable(&__scx_switched_all);
         WRITE_ONCE(scx_switching_all, false);
  
@@ -5889,10 +5892,34 @@ static void scx_root_disable(struct scx_sched *sch)
         /*
          * Invalidate all the rq clocks to prevent getting outdated
          * rq clocks from a previous scx scheduler.
+        *
+        * Also re-balance the dl_server bandwidth reservations: detach
+        * ext_server (no more sched_ext tasks) and reinstate fair_server if it
+        * was previously detached because we were running in full mode.
+        *
+        * Unlike the enable path, this runs on a recovery path that cannot
+        * fail, so we use dl_server_swap_bw() to atomically free ext_server's
+        * bandwidth and reclaim it for fair_server under the same dl_b lock.
+        *
+        * The swap can still fail with -EBUSY if someone bumped ext_server's
+        * runtime via debugfs between enable and disable; in that narrow case
+        * both servers end up detached and we just WARN.
          */
         for_each_possible_cpu(cpu) {
                 struct rq *rq = cpu_rq(cpu);
+
                 scx_rq_clock_invalidate(rq);
+
+               scoped_guard(rq_lock_irqsave, rq) {
+                       update_rq_clock(rq);
+                       if (was_switched_all) {
+                               if (WARN_ON_ONCE(dl_server_swap_bw(&rq->ext_server,
+                                                                  &rq->fair_server)))
+                                       pr_warn("failed to re-attach fair_server on CPU %d\n", cpu);
+                       } else {
+                               dl_server_detach_bw(&rq->ext_server);
+                       }
+               }
         }
  
         /* no task is on scx, turn off all the switches and flush in-progress calls */
@@ -6810,6 +6837,31 @@ static void scx_root_enable_workfn(struct kthread_work *work)
         if (ret)
                 goto err_disable;
  
+       /*
+        * Attach the ext_server bandwidth reservation before anything is
+        * committed so that we can fail the enable if the root domain cannot
+        * accommodate it. The matching fair_server detach is deferred to the
+        * tail of this function, after the switch is fully committed and can no
+        * longer fail.
+        *
+        * On failure, err_disable funnels into scx_root_disable() which
+        * detaches ext_server, so partially-attached state is cleaned up
+        * automatically.
+        */
+       for_each_possible_cpu(cpu) {
+               struct rq *rq = cpu_rq(cpu);
+
+               scoped_guard(rq_lock_irqsave, rq) {
+                       update_rq_clock(rq);
+                       ret = dl_server_attach_bw(&rq->ext_server);
+               }
+               if (ret) {
+                       pr_warn("sched_ext: failed to attach ext_server on CPU %d (%d)\n",
+                               cpu, ret);
+                       goto err_disable;
+               }
+       }
+
         /*
          * Once __scx_enabled is set, %current can be switched to SCX anytime.
          * This can lead to stalls as some BPF schedulers (e.g. userspace
@@ -6926,6 +6978,25 @@ static void scx_root_enable_workfn(struct kthread_work *work)
         if (!(ops->flags & SCX_OPS_SWITCH_PARTIAL))
                 static_branch_enable(&__scx_switched_all);
  
+       /*
+        * Detach the fair_server bandwidth reservation now that the switch
+        * is fully committed. In full mode (!SCX_OPS_SWITCH_PARTIAL) no
+        * task will ever run in the fair class, so give that bandwidth
+        * back to the RT class. The matching ext_server attach already
+        * happened earlier; this only releases bandwidth and cannot fail.
+        *
+        * In partial mode keep fair_server attached.
+        */
+       if (scx_switched_all()) {
+               for_each_possible_cpu(cpu) {
+                       struct rq *rq = cpu_rq(cpu);
+
+                       guard(rq_lock_irqsave)(rq);
+                       update_rq_clock(rq);
+                       dl_server_detach_bw(&rq->fair_server);
+               }
+       }
+
         pr_info("sched_ext: BPF scheduler \"%s\" enabled%s\n",
                 sch->ops.name, scx_switched_all() ? "" : " (partial)");
         kobject_uevent(&sch->kobj, KOBJ_ADD);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h

index 6b48bb3074fe280c2c908cd94e2fc1f6434a3590..332ecf8930b46347afceb6f0c1584ceb10fbd4ff 100644 (file)
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -421,6 +421,10 @@ extern void ext_server_init(struct rq *rq);
  extern void __dl_server_attach_root(struct sched_dl_entity *dl_se, struct rq *rq);
  extern int dl_server_apply_params(struct sched_dl_entity *dl_se,
                     u64 runtime, u64 period, bool init);
+extern int dl_server_attach_bw(struct sched_dl_entity *dl_se);
+extern void dl_server_detach_bw(struct sched_dl_entity *dl_se);
+extern int dl_server_swap_bw(struct sched_dl_entity *detach_se,
+                            struct sched_dl_entity *attach_se);
  
  static inline bool dl_server_active(struct sched_dl_entity *dl_se)
  {
author	Andrea Righi <arighi@nvidia.com>
	Tue, 26 May 2026 16:42:48 +0000 (18:42 +0200)
committer	Peter Zijlstra <peterz@infradead.org>
	Fri, 29 May 2026 10:43:15 +0000 (12:43 +0200)
include/linux/sched.h		patch \| blob \| blame \| history
kernel/sched/deadline.c		patch \| blob \| blame \| history
kernel/sched/ext.c		patch \| blob \| blame \| history
kernel/sched/sched.h		patch \| blob \| blame \| history