Merge branch 'sched/core' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip...

author Tejun Heo <tj@kernel.org>

Mon, 9 Mar 2026 19:59:36 +0000 (09:59 -1000)

committer Tejun Heo <tj@kernel.org>

Mon, 9 Mar 2026 19:59:36 +0000 (09:59 -1000)
author Tejun Heo <tj@kernel.org>
Mon, 9 Mar 2026 19:59:36 +0000 (09:59 -1000)
committer Tejun Heo <tj@kernel.org>
Mon, 9 Mar 2026 19:59:36 +0000 (09:59 -1000)
diff --cc kernel/sched/core.c
Simple merge
diff --cc kernel/sched/ext.c

index 43fda125890352c71a0a51231a0e19ef78a83abf,7278d574964783194d97e6cb1042e97a3db51c61..b35b98020f3b2b9aef4a658f0e69dc8124d089ac
--- 1/kernel/sched/ext.c
--- 2/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@@ -3205,11 -2766,8 +3205,10 @@@ static void scx_watchdog_workfn(struct 
   
                 cond_resched();
         }
- -      queue_delayed_work(system_dfl_wq, to_delayed_work(work),
- -                         READ_ONCE(scx_watchdog_timeout) / 2);
+ +
+ +      intv = READ_ONCE(scx_watchdog_interval);
+ +      if (intv < ULONG_MAX)
-               queue_delayed_work(system_unbound_wq, to_delayed_work(work),
-                                  intv);
++              queue_delayed_work(system_dfl_wq, to_delayed_work(work), intv);
   }
   
   void scx_tick(struct rq *rq)
@@@ -5218,255 -4282,28 +5217,255 @@@ static void free_kick_syncs(void
         }
   }
   
- -static void scx_disable_workfn(struct kthread_work *work)
+ +static void refresh_watchdog(void)
   {
- -      struct scx_sched *sch = container_of(work, struct scx_sched, disable_work);
- -      struct scx_exit_info *ei = sch->exit_info;
+ +      struct scx_sched *sch;
+ +      unsigned long intv = ULONG_MAX;
+ +
+ +      /* take the shortest timeout and use its half for watchdog interval */
+ +      rcu_read_lock();
+ +      list_for_each_entry_rcu(sch, &scx_sched_all, all)
+ +              intv = max(min(intv, sch->watchdog_timeout / 2), 1);
+ +      rcu_read_unlock();
+ +
+ +      WRITE_ONCE(scx_watchdog_timestamp, jiffies);
+ +      WRITE_ONCE(scx_watchdog_interval, intv);
+ +
+ +      if (intv < ULONG_MAX)
-               mod_delayed_work(system_unbound_wq, &scx_watchdog_work, intv);
++              mod_delayed_work(system_dfl_wq, &scx_watchdog_work, intv);
+ +      else
+ +              cancel_delayed_work_sync(&scx_watchdog_work);
+ +}
+ +
+ +static s32 scx_link_sched(struct scx_sched *sch)
+ +{
+ +      scoped_guard(raw_spinlock_irq, &scx_sched_lock) {
+ +#ifdef CONFIG_EXT_SUB_SCHED
+ +              struct scx_sched *parent = scx_parent(sch);
+ +              s32 ret;
+ +
+ +              if (parent) {
+ +                      ret = rhashtable_lookup_insert_fast(&scx_sched_hash,
+ +                                      &sch->hash_node, scx_sched_hash_params);
+ +                      if (ret) {
+ +                              scx_error(sch, "failed to insert into scx_sched_hash (%d)", ret);
+ +                              return ret;
+ +                      }
+ +
+ +                      list_add_tail(&sch->sibling, &parent->children);
+ +              }
+ +#endif        /* CONFIG_EXT_SUB_SCHED */
+ +
+ +              list_add_tail_rcu(&sch->all, &scx_sched_all);
+ +      }
+ +
+ +      refresh_watchdog();
+ +      return 0;
+ +}
+ +
+ +static void scx_unlink_sched(struct scx_sched *sch)
+ +{
+ +      scoped_guard(raw_spinlock_irq, &scx_sched_lock) {
+ +#ifdef CONFIG_EXT_SUB_SCHED
+ +              if (scx_parent(sch)) {
+ +                      rhashtable_remove_fast(&scx_sched_hash, &sch->hash_node,
+ +                                             scx_sched_hash_params);
+ +                      list_del_init(&sch->sibling);
+ +              }
+ +#endif        /* CONFIG_EXT_SUB_SCHED */
+ +              list_del_rcu(&sch->all);
+ +      }
+ +
+ +      refresh_watchdog();
+ +}
+ +
+ +#ifdef CONFIG_EXT_SUB_SCHED
+ +static DECLARE_WAIT_QUEUE_HEAD(scx_unlink_waitq);
+ +
+ +static void drain_descendants(struct scx_sched *sch)
+ +{
+ +      /*
+ +       * Child scheds that finished the critical part of disabling will take
+ +       * themselves off @sch->children. Wait for it to drain. As propagation
+ +       * is recursive, empty @sch->children means that all proper descendant
+ +       * scheds reached unlinking stage.
+ +       */
+ +      wait_event(scx_unlink_waitq, list_empty(&sch->children));
+ +}
+ +
+ +static void scx_fail_parent(struct scx_sched *sch,
+ +                          struct task_struct *failed, s32 fail_code)
+ +{
+ +      struct scx_sched *parent = scx_parent(sch);
         struct scx_task_iter sti;
         struct task_struct *p;
- -      int kind, cpu;
   
- -      kind = atomic_read(&sch->exit_kind);
- -      while (true) {
- -              if (kind == SCX_EXIT_DONE)      /* already disabled? */
- -                      return;
- -              WARN_ON_ONCE(kind == SCX_EXIT_NONE);
- -              if (atomic_try_cmpxchg(&sch->exit_kind, &kind, SCX_EXIT_DONE))
+ +      scx_error(parent, "ops.init_task() failed (%d) for %s[%d] while disabling a sub-scheduler",
+ +                fail_code, failed->comm, failed->pid);
+ +
+ +      /*
+ +       * Once $parent is bypassed, it's safe to put SCX_TASK_NONE tasks into
+ +       * it. This may cause downstream failures on the BPF side but $parent is
+ +       * dying anyway.
+ +       */
+ +      scx_bypass(parent, true);
+ +
+ +      scx_task_iter_start(&sti, sch->cgrp);
+ +      while ((p = scx_task_iter_next_locked(&sti))) {
+ +              if (scx_task_on_sched(parent, p))
+ +                      continue;
+ +
+ +              scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) {
+ +                      scx_disable_and_exit_task(sch, p);
+ +                      rcu_assign_pointer(p->scx.sched, parent);
+ +              }
+ +      }
+ +      scx_task_iter_stop(&sti);
+ +}
+ +
+ +static void scx_sub_disable(struct scx_sched *sch)
+ +{
+ +      struct scx_sched *parent = scx_parent(sch);
+ +      struct scx_task_iter sti;
+ +      struct task_struct *p;
+ +      int ret;
+ +
+ +      /*
+ +       * Guarantee forward progress and wait for descendants to be disabled.
+ +       * To limit disruptions, $parent is not bypassed. Tasks are fully
+ +       * prepped and then inserted back into $parent.
+ +       */
+ +      scx_bypass(sch, true);
+ +      drain_descendants(sch);
+ +
+ +      /*
+ +       * Here, every runnable task is guaranteed to make forward progress and
+ +       * we can safely use blocking synchronization constructs. Actually
+ +       * disable ops.
+ +       */
+ +      mutex_lock(&scx_enable_mutex);
+ +      percpu_down_write(&scx_fork_rwsem);
+ +      scx_cgroup_lock();
+ +
+ +      set_cgroup_sched(sch_cgroup(sch), parent);
+ +
+ +      scx_task_iter_start(&sti, sch->cgrp);
+ +      while ((p = scx_task_iter_next_locked(&sti))) {
+ +              struct rq *rq;
+ +              struct rq_flags rf;
+ +
+ +              /* filter out duplicate visits */
+ +              if (scx_task_on_sched(parent, p))
+ +                      continue;
+ +
+ +              /*
+ +               * By the time control reaches here, all descendant schedulers
+ +               * should already have been disabled.
+ +               */
+ +              WARN_ON_ONCE(!scx_task_on_sched(sch, p));
+ +
+ +              /*
+ +               * If $p is about to be freed, nothing prevents $sch from
+ +               * unloading before $p reaches sched_ext_free(). Disable and
+ +               * exit $p right away.
+ +               */
+ +              if (!tryget_task_struct(p)) {
+ +                      scx_disable_and_exit_task(sch, p);
+ +                      continue;
+ +              }
+ +
+ +              scx_task_iter_unlock(&sti);
+ +
+ +              /*
+ +               * $p is READY or ENABLED on @sch. Initialize for $parent,
+ +               * disable and exit from @sch, and then switch over to $parent.
+ +               *
+ +               * If a task fails to initialize for $parent, the only available
+ +               * action is disabling $parent too. While this allows disabling
+ +               * of a child sched to cause the parent scheduler to fail, the
+ +               * failure can only originate from ops.init_task() of the
+ +               * parent. A child can't directly affect the parent through its
+ +               * own failures.
+ +               */
+ +              ret = __scx_init_task(parent, p, false);
+ +              if (ret) {
+ +                      scx_fail_parent(sch, p, ret);
+ +                      put_task_struct(p);
                         break;
+ +              }
+ +
+ +              rq = task_rq_lock(p, &rf);
+ +              scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) {
+ +                      /*
+ +                       * $p is initialized for $parent and still attached to
+ +                       * @sch. Disable and exit for @sch, switch over to
+ +                       * $parent, override the state to READY to account for
+ +                       * $p having already been initialized, and then enable.
+ +                       */
+ +                      scx_disable_and_exit_task(sch, p);
+ +                      scx_set_task_state(p, SCX_TASK_INIT);
+ +                      rcu_assign_pointer(p->scx.sched, parent);
+ +                      scx_set_task_state(p, SCX_TASK_READY);
+ +                      scx_enable_task(parent, p);
+ +              }
+ +              task_rq_unlock(rq, p, &rf);
+ +
+ +              put_task_struct(p);
         }
- -      ei->kind = kind;
- -      ei->reason = scx_exit_reason(ei->kind);
+ +      scx_task_iter_stop(&sti);
   
- -      /* guarantee forward progress by bypassing scx_ops */
- -      scx_bypass(true);
- -      WRITE_ONCE(scx_aborting, false);
+ +      scx_cgroup_unlock();
+ +      percpu_up_write(&scx_fork_rwsem);
+ +
+ +      /*
+ +       * All tasks are moved off of @sch but there may still be on-going
+ +       * operations (e.g. ops.select_cpu()). Drain them by flushing RCU. Use
+ +       * the expedited version as ancestors may be waiting in bypass mode.
+ +       * Also, tell the parent that there is no need to keep running bypass
+ +       * DSQs for us.
+ +       */
+ +      synchronize_rcu_expedited();
+ +      disable_bypass_dsp(sch);
+ +
+ +      scx_unlink_sched(sch);
+ +
+ +      mutex_unlock(&scx_enable_mutex);
+ +
+ +      /*
+ +       * @sch is now unlinked from the parent's children list. Notify and call
+ +       * ops.sub_detach/exit(). Note that ops.sub_detach/exit() must be called
+ +       * after unlinking and releasing all locks. See scx_claim_exit().
+ +       */
+ +      wake_up_all(&scx_unlink_waitq);
+ +
+ +      if (sch->ops.sub_detach && sch->sub_attached) {
+ +              struct scx_sub_detach_args sub_detach_args = {
+ +                      .ops = &sch->ops,
+ +                      .cgroup_path = sch->cgrp_path,
+ +              };
+ +              SCX_CALL_OP(parent, SCX_KF_UNLOCKED, sub_detach, NULL,
+ +                          &sub_detach_args);
+ +      }
+ +
+ +      if (sch->ops.exit)
+ +              SCX_CALL_OP(sch, SCX_KF_UNLOCKED, exit, NULL, sch->exit_info);
+ +      kobject_del(&sch->kobj);
+ +}
+ +#else /* CONFIG_EXT_SUB_SCHED */
+ +static void drain_descendants(struct scx_sched *sch) { }
+ +static void scx_sub_disable(struct scx_sched *sch) { }
+ +#endif        /* CONFIG_EXT_SUB_SCHED */
+ +
+ +static void scx_root_disable(struct scx_sched *sch)
+ +{
+ +      struct scx_exit_info *ei = sch->exit_info;
+ +      struct scx_task_iter sti;
+ +      struct task_struct *p;
+ +      int cpu;
+ +
+ +      /* guarantee forward progress and wait for descendants to be disabled */
+ +      scx_bypass(sch, true);
+ +      drain_descendants(sch);
   
         switch (scx_set_enable_state(SCX_DISABLING)) {
         case SCX_DISABLING:
diff --cc kernel/sched/sched.h
Simple merge
author	Tejun Heo <tj@kernel.org>
	Mon, 9 Mar 2026 19:59:36 +0000 (09:59 -1000)
committer	Tejun Heo <tj@kernel.org>
	Mon, 9 Mar 2026 19:59:36 +0000 (09:59 -1000)
		1	2
kernel/sched/core.c	patch |	diff1 |	diff2 |	blob | history
kernel/sched/ext.c	patch |	diff1 |	diff2 |	blob | history
kernel/sched/sched.h	patch |	diff1 |	diff2 |	blob | history