git.ipfire.org Git - thirdparty/kernel/linux.git/commitdiff
Merge branch 'sched/core' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip...
authorTejun Heo <tj@kernel.org>
Mon, 9 Mar 2026 19:59:36 +0000 (09:59 -1000)
committerTejun Heo <tj@kernel.org>
Mon, 9 Mar 2026 19:59:36 +0000 (09:59 -1000)
Pull sched/core to resolve conflicts between:

  c2a57380df9dd ("sched: Replace use of system_unbound_wq with system_dfl_wq")

from the tip tree and commit:

  cde94c032b32b ("sched_ext: Make watchdog sub-sched aware")

The latter moves around code modified by the former. Apply the changes in
the new locations.

Signed-off-by: Tejun Heo <tj@kernel.org>
1  2 
kernel/sched/core.c
kernel/sched/ext.c
kernel/sched/sched.h

Simple merge
index 43fda125890352c71a0a51231a0e19ef78a83abf,7278d574964783194d97e6cb1042e97a3db51c61..b35b98020f3b2b9aef4a658f0e69dc8124d089ac
@@@ -3205,11 -2766,8 +3205,10 @@@ static void scx_watchdog_workfn(struct 
  
                cond_resched();
        }
 -      queue_delayed_work(system_dfl_wq, to_delayed_work(work),
 -                         READ_ONCE(scx_watchdog_timeout) / 2);
 +
 +      intv = READ_ONCE(scx_watchdog_interval);
 +      if (intv < ULONG_MAX)
-               queue_delayed_work(system_unbound_wq, to_delayed_work(work),
-                                  intv);
++              queue_delayed_work(system_dfl_wq, to_delayed_work(work), intv);
  }
  
  void scx_tick(struct rq *rq)
@@@ -5218,255 -4282,28 +5217,255 @@@ static void free_kick_syncs(void
        }
  }
  
 -static void scx_disable_workfn(struct kthread_work *work)
 +static void refresh_watchdog(void)
  {
 -      struct scx_sched *sch = container_of(work, struct scx_sched, disable_work);
 -      struct scx_exit_info *ei = sch->exit_info;
 +      struct scx_sched *sch;
 +      unsigned long intv = ULONG_MAX;
 +
 +      /* take the shortest timeout and use its half for watchdog interval */
 +      rcu_read_lock();
 +      list_for_each_entry_rcu(sch, &scx_sched_all, all)
 +              intv = max(min(intv, sch->watchdog_timeout / 2), 1);
 +      rcu_read_unlock();
 +
 +      WRITE_ONCE(scx_watchdog_timestamp, jiffies);
 +      WRITE_ONCE(scx_watchdog_interval, intv);
 +
 +      if (intv < ULONG_MAX)
-               mod_delayed_work(system_unbound_wq, &scx_watchdog_work, intv);
++              mod_delayed_work(system_dfl_wq, &scx_watchdog_work, intv);
 +      else
 +              cancel_delayed_work_sync(&scx_watchdog_work);
 +}
 +
 +static s32 scx_link_sched(struct scx_sched *sch)
 +{
 +      scoped_guard(raw_spinlock_irq, &scx_sched_lock) {
 +#ifdef CONFIG_EXT_SUB_SCHED
 +              struct scx_sched *parent = scx_parent(sch);
 +              s32 ret;
 +
 +              if (parent) {
 +                      ret = rhashtable_lookup_insert_fast(&scx_sched_hash,
 +                                      &sch->hash_node, scx_sched_hash_params);
 +                      if (ret) {
 +                              scx_error(sch, "failed to insert into scx_sched_hash (%d)", ret);
 +                              return ret;
 +                      }
 +
 +                      list_add_tail(&sch->sibling, &parent->children);
 +              }
 +#endif        /* CONFIG_EXT_SUB_SCHED */
 +
 +              list_add_tail_rcu(&sch->all, &scx_sched_all);
 +      }
 +
 +      refresh_watchdog();
 +      return 0;
 +}
 +
 +static void scx_unlink_sched(struct scx_sched *sch)
 +{
 +      scoped_guard(raw_spinlock_irq, &scx_sched_lock) {
 +#ifdef CONFIG_EXT_SUB_SCHED
 +              if (scx_parent(sch)) {
 +                      rhashtable_remove_fast(&scx_sched_hash, &sch->hash_node,
 +                                             scx_sched_hash_params);
 +                      list_del_init(&sch->sibling);
 +              }
 +#endif        /* CONFIG_EXT_SUB_SCHED */
 +              list_del_rcu(&sch->all);
 +      }
 +
 +      refresh_watchdog();
 +}
 +
 +#ifdef CONFIG_EXT_SUB_SCHED
 +static DECLARE_WAIT_QUEUE_HEAD(scx_unlink_waitq);
 +
 +static void drain_descendants(struct scx_sched *sch)
 +{
 +      /*
 +       * Child scheds that finished the critical part of disabling will take
 +       * themselves off @sch->children. Wait for it to drain. As propagation
 +       * is recursive, empty @sch->children means that all proper descendant
 +       * scheds reached unlinking stage.
 +       */
 +      wait_event(scx_unlink_waitq, list_empty(&sch->children));
 +}
 +
 +static void scx_fail_parent(struct scx_sched *sch,
 +                          struct task_struct *failed, s32 fail_code)
 +{
 +      struct scx_sched *parent = scx_parent(sch);
        struct scx_task_iter sti;
        struct task_struct *p;
 -      int kind, cpu;
  
 -      kind = atomic_read(&sch->exit_kind);
 -      while (true) {
 -              if (kind == SCX_EXIT_DONE)      /* already disabled? */
 -                      return;
 -              WARN_ON_ONCE(kind == SCX_EXIT_NONE);
 -              if (atomic_try_cmpxchg(&sch->exit_kind, &kind, SCX_EXIT_DONE))
 +      scx_error(parent, "ops.init_task() failed (%d) for %s[%d] while disabling a sub-scheduler",
 +                fail_code, failed->comm, failed->pid);
 +
 +      /*
 +       * Once $parent is bypassed, it's safe to put SCX_TASK_NONE tasks into
 +       * it. This may cause downstream failures on the BPF side but $parent is
 +       * dying anyway.
 +       */
 +      scx_bypass(parent, true);
 +
 +      scx_task_iter_start(&sti, sch->cgrp);
 +      while ((p = scx_task_iter_next_locked(&sti))) {
 +              if (scx_task_on_sched(parent, p))
 +                      continue;
 +
 +              scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) {
 +                      scx_disable_and_exit_task(sch, p);
 +                      rcu_assign_pointer(p->scx.sched, parent);
 +              }
 +      }
 +      scx_task_iter_stop(&sti);
 +}
 +
 +static void scx_sub_disable(struct scx_sched *sch)
 +{
 +      struct scx_sched *parent = scx_parent(sch);
 +      struct scx_task_iter sti;
 +      struct task_struct *p;
 +      int ret;
 +
 +      /*
 +       * Guarantee forward progress and wait for descendants to be disabled.
 +       * To limit disruptions, $parent is not bypassed. Tasks are fully
 +       * prepped and then inserted back into $parent.
 +       */
 +      scx_bypass(sch, true);
 +      drain_descendants(sch);
 +
 +      /*
 +       * Here, every runnable task is guaranteed to make forward progress and
 +       * we can safely use blocking synchronization constructs. Actually
 +       * disable ops.
 +       */
 +      mutex_lock(&scx_enable_mutex);
 +      percpu_down_write(&scx_fork_rwsem);
 +      scx_cgroup_lock();
 +
 +      set_cgroup_sched(sch_cgroup(sch), parent);
 +
 +      scx_task_iter_start(&sti, sch->cgrp);
 +      while ((p = scx_task_iter_next_locked(&sti))) {
 +              struct rq *rq;
 +              struct rq_flags rf;
 +
 +              /* filter out duplicate visits */
 +              if (scx_task_on_sched(parent, p))
 +                      continue;
 +
 +              /*
 +               * By the time control reaches here, all descendant schedulers
 +               * should already have been disabled.
 +               */
 +              WARN_ON_ONCE(!scx_task_on_sched(sch, p));
 +
 +              /*
 +               * If $p is about to be freed, nothing prevents $sch from
 +               * unloading before $p reaches sched_ext_free(). Disable and
 +               * exit $p right away.
 +               */
 +              if (!tryget_task_struct(p)) {
 +                      scx_disable_and_exit_task(sch, p);
 +                      continue;
 +              }
 +
 +              scx_task_iter_unlock(&sti);
 +
 +              /*
 +               * $p is READY or ENABLED on @sch. Initialize for $parent,
 +               * disable and exit from @sch, and then switch over to $parent.
 +               *
 +               * If a task fails to initialize for $parent, the only available
 +               * action is disabling $parent too. While this allows disabling
 +               * of a child sched to cause the parent scheduler to fail, the
 +               * failure can only originate from ops.init_task() of the
 +               * parent. A child can't directly affect the parent through its
 +               * own failures.
 +               */
 +              ret = __scx_init_task(parent, p, false);
 +              if (ret) {
 +                      scx_fail_parent(sch, p, ret);
 +                      put_task_struct(p);
                        break;
 +              }
 +
 +              rq = task_rq_lock(p, &rf);
 +              scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) {
 +                      /*
 +                       * $p is initialized for $parent and still attached to
 +                       * @sch. Disable and exit for @sch, switch over to
 +                       * $parent, override the state to READY to account for
 +                       * $p having already been initialized, and then enable.
 +                       */
 +                      scx_disable_and_exit_task(sch, p);
 +                      scx_set_task_state(p, SCX_TASK_INIT);
 +                      rcu_assign_pointer(p->scx.sched, parent);
 +                      scx_set_task_state(p, SCX_TASK_READY);
 +                      scx_enable_task(parent, p);
 +              }
 +              task_rq_unlock(rq, p, &rf);
 +
 +              put_task_struct(p);
        }
 -      ei->kind = kind;
 -      ei->reason = scx_exit_reason(ei->kind);
 +      scx_task_iter_stop(&sti);
  
 -      /* guarantee forward progress by bypassing scx_ops */
 -      scx_bypass(true);
 -      WRITE_ONCE(scx_aborting, false);
 +      scx_cgroup_unlock();
 +      percpu_up_write(&scx_fork_rwsem);
 +
 +      /*
 +       * All tasks are moved off of @sch but there may still be on-going
 +       * operations (e.g. ops.select_cpu()). Drain them by flushing RCU. Use
 +       * the expedited version as ancestors may be waiting in bypass mode.
 +       * Also, tell the parent that there is no need to keep running bypass
 +       * DSQs for us.
 +       */
 +      synchronize_rcu_expedited();
 +      disable_bypass_dsp(sch);
 +
 +      scx_unlink_sched(sch);
 +
 +      mutex_unlock(&scx_enable_mutex);
 +
 +      /*
 +       * @sch is now unlinked from the parent's children list. Notify and call
 +       * ops.sub_detach/exit(). Note that ops.sub_detach/exit() must be called
 +       * after unlinking and releasing all locks. See scx_claim_exit().
 +       */
 +      wake_up_all(&scx_unlink_waitq);
 +
 +      if (sch->ops.sub_detach && sch->sub_attached) {
 +              struct scx_sub_detach_args sub_detach_args = {
 +                      .ops = &sch->ops,
 +                      .cgroup_path = sch->cgrp_path,
 +              };
 +              SCX_CALL_OP(parent, SCX_KF_UNLOCKED, sub_detach, NULL,
 +                          &sub_detach_args);
 +      }
 +
 +      if (sch->ops.exit)
 +              SCX_CALL_OP(sch, SCX_KF_UNLOCKED, exit, NULL, sch->exit_info);
 +      kobject_del(&sch->kobj);
 +}
 +#else /* CONFIG_EXT_SUB_SCHED */
 +static void drain_descendants(struct scx_sched *sch) { }
 +static void scx_sub_disable(struct scx_sched *sch) { }
 +#endif        /* CONFIG_EXT_SUB_SCHED */
 +
 +static void scx_root_disable(struct scx_sched *sch)
 +{
 +      struct scx_exit_info *ei = sch->exit_info;
 +      struct scx_task_iter sti;
 +      struct task_struct *p;
 +      int cpu;
 +
 +      /* guarantee forward progress and wait for descendants to be disabled */
 +      scx_bypass(sch, true);
 +      drain_descendants(sch);
  
        switch (scx_set_enable_state(SCX_DISABLING)) {
        case SCX_DISABLING:
Simple merge