From: Tejun Heo Date: Mon, 9 Mar 2026 19:59:36 +0000 (-1000) Subject: Merge branch 'sched/core' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip... X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=0e7cd9cef61fde36ebfb653fe9e7a9722185cb57;p=thirdparty%2Flinux.git Merge branch 'sched/core' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip into for-7.1 Pull sched/core to resolve conflicts between: c2a57380df9dd ("sched: Replace use of system_unbound_wq with system_dfl_wq") from the tip tree and commit: cde94c032b32b ("sched_ext: Make watchdog sub-sched aware") The latter moves around code modified by the former. Apply the changes in the new locations. Signed-off-by: Tejun Heo --- 0e7cd9cef61fde36ebfb653fe9e7a9722185cb57 diff --cc kernel/sched/ext.c index 43fda12589035,7278d57496478..b35b98020f3b2 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@@ -3205,11 -2766,8 +3205,10 @@@ static void scx_watchdog_workfn(struct cond_resched(); } - queue_delayed_work(system_dfl_wq, to_delayed_work(work), - READ_ONCE(scx_watchdog_timeout) / 2); + + intv = READ_ONCE(scx_watchdog_interval); + if (intv < ULONG_MAX) - queue_delayed_work(system_unbound_wq, to_delayed_work(work), - intv); ++ queue_delayed_work(system_dfl_wq, to_delayed_work(work), intv); } void scx_tick(struct rq *rq) @@@ -5218,255 -4282,28 +5217,255 @@@ static void free_kick_syncs(void } } -static void scx_disable_workfn(struct kthread_work *work) +static void refresh_watchdog(void) { - struct scx_sched *sch = container_of(work, struct scx_sched, disable_work); - struct scx_exit_info *ei = sch->exit_info; + struct scx_sched *sch; + unsigned long intv = ULONG_MAX; + + /* take the shortest timeout and use its half for watchdog interval */ + rcu_read_lock(); + list_for_each_entry_rcu(sch, &scx_sched_all, all) + intv = max(min(intv, sch->watchdog_timeout / 2), 1); + rcu_read_unlock(); + + WRITE_ONCE(scx_watchdog_timestamp, jiffies); + WRITE_ONCE(scx_watchdog_interval, intv); + + if 
(intv < ULONG_MAX) - mod_delayed_work(system_unbound_wq, &scx_watchdog_work, intv); ++ mod_delayed_work(system_dfl_wq, &scx_watchdog_work, intv); + else + cancel_delayed_work_sync(&scx_watchdog_work); +} + +static s32 scx_link_sched(struct scx_sched *sch) +{ + scoped_guard(raw_spinlock_irq, &scx_sched_lock) { +#ifdef CONFIG_EXT_SUB_SCHED + struct scx_sched *parent = scx_parent(sch); + s32 ret; + + if (parent) { + ret = rhashtable_lookup_insert_fast(&scx_sched_hash, + &sch->hash_node, scx_sched_hash_params); + if (ret) { + scx_error(sch, "failed to insert into scx_sched_hash (%d)", ret); + return ret; + } + + list_add_tail(&sch->sibling, &parent->children); + } +#endif /* CONFIG_EXT_SUB_SCHED */ + + list_add_tail_rcu(&sch->all, &scx_sched_all); + } + + refresh_watchdog(); + return 0; +} + +static void scx_unlink_sched(struct scx_sched *sch) +{ + scoped_guard(raw_spinlock_irq, &scx_sched_lock) { +#ifdef CONFIG_EXT_SUB_SCHED + if (scx_parent(sch)) { + rhashtable_remove_fast(&scx_sched_hash, &sch->hash_node, + scx_sched_hash_params); + list_del_init(&sch->sibling); + } +#endif /* CONFIG_EXT_SUB_SCHED */ + list_del_rcu(&sch->all); + } + + refresh_watchdog(); +} + +#ifdef CONFIG_EXT_SUB_SCHED +static DECLARE_WAIT_QUEUE_HEAD(scx_unlink_waitq); + +static void drain_descendants(struct scx_sched *sch) +{ + /* + * Child scheds that finished the critical part of disabling will take + * themselves off @sch->children. Wait for it to drain. As propagation + * is recursive, empty @sch->children means that all proper descendant + * scheds reached unlinking stage. + */ + wait_event(scx_unlink_waitq, list_empty(&sch->children)); +} + +static void scx_fail_parent(struct scx_sched *sch, + struct task_struct *failed, s32 fail_code) +{ + struct scx_sched *parent = scx_parent(sch); struct scx_task_iter sti; struct task_struct *p; - int kind, cpu; - kind = atomic_read(&sch->exit_kind); - while (true) { - if (kind == SCX_EXIT_DONE) /* already disabled? 
*/ - return; - WARN_ON_ONCE(kind == SCX_EXIT_NONE); - if (atomic_try_cmpxchg(&sch->exit_kind, &kind, SCX_EXIT_DONE)) + scx_error(parent, "ops.init_task() failed (%d) for %s[%d] while disabling a sub-scheduler", + fail_code, failed->comm, failed->pid); + + /* + * Once $parent is bypassed, it's safe to put SCX_TASK_NONE tasks into + * it. This may cause downstream failures on the BPF side but $parent is + * dying anyway. + */ + scx_bypass(parent, true); + + scx_task_iter_start(&sti, sch->cgrp); + while ((p = scx_task_iter_next_locked(&sti))) { + if (scx_task_on_sched(parent, p)) + continue; + + scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) { + scx_disable_and_exit_task(sch, p); + rcu_assign_pointer(p->scx.sched, parent); + } + } + scx_task_iter_stop(&sti); +} + +static void scx_sub_disable(struct scx_sched *sch) +{ + struct scx_sched *parent = scx_parent(sch); + struct scx_task_iter sti; + struct task_struct *p; + int ret; + + /* + * Guarantee forward progress and wait for descendants to be disabled. + * To limit disruptions, $parent is not bypassed. Tasks are fully + * prepped and then inserted back into $parent. + */ + scx_bypass(sch, true); + drain_descendants(sch); + + /* + * Here, every runnable task is guaranteed to make forward progress and + * we can safely use blocking synchronization constructs. Actually + * disable ops. + */ + mutex_lock(&scx_enable_mutex); + percpu_down_write(&scx_fork_rwsem); + scx_cgroup_lock(); + + set_cgroup_sched(sch_cgroup(sch), parent); + + scx_task_iter_start(&sti, sch->cgrp); + while ((p = scx_task_iter_next_locked(&sti))) { + struct rq *rq; + struct rq_flags rf; + + /* filter out duplicate visits */ + if (scx_task_on_sched(parent, p)) + continue; + + /* + * By the time control reaches here, all descendant schedulers + * should already have been disabled. 
+ */ + WARN_ON_ONCE(!scx_task_on_sched(sch, p)); + + /* + * If $p is about to be freed, nothing prevents $sch from + * unloading before $p reaches sched_ext_free(). Disable and + * exit $p right away. + */ + if (!tryget_task_struct(p)) { + scx_disable_and_exit_task(sch, p); + continue; + } + + scx_task_iter_unlock(&sti); + + /* + * $p is READY or ENABLED on @sch. Initialize for $parent, + * disable and exit from @sch, and then switch over to $parent. + * + * If a task fails to initialize for $parent, the only available + * action is disabling $parent too. While this allows disabling + * of a child sched to cause the parent scheduler to fail, the + * failure can only originate from ops.init_task() of the + * parent. A child can't directly affect the parent through its + * own failures. + */ + ret = __scx_init_task(parent, p, false); + if (ret) { + scx_fail_parent(sch, p, ret); + put_task_struct(p); break; + } + + rq = task_rq_lock(p, &rf); + scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) { + /* + * $p is initialized for $parent and still attached to + * @sch. Disable and exit for @sch, switch over to + * $parent, override the state to READY to account for + * $p having already been initialized, and then enable. + */ + scx_disable_and_exit_task(sch, p); + scx_set_task_state(p, SCX_TASK_INIT); + rcu_assign_pointer(p->scx.sched, parent); + scx_set_task_state(p, SCX_TASK_READY); + scx_enable_task(parent, p); + } + task_rq_unlock(rq, p, &rf); + + put_task_struct(p); } - ei->kind = kind; - ei->reason = scx_exit_reason(ei->kind); + scx_task_iter_stop(&sti); - /* guarantee forward progress by bypassing scx_ops */ - scx_bypass(true); - WRITE_ONCE(scx_aborting, false); + scx_cgroup_unlock(); + percpu_up_write(&scx_fork_rwsem); + + /* + * All tasks are moved off of @sch but there may still be on-going + * operations (e.g. ops.select_cpu()). Drain them by flushing RCU. Use + * the expedited version as ancestors may be waiting in bypass mode. 
+ * Also, tell the parent that there is no need to keep running bypass + * DSQs for us. + */ + synchronize_rcu_expedited(); + disable_bypass_dsp(sch); + + scx_unlink_sched(sch); + + mutex_unlock(&scx_enable_mutex); + + /* + * @sch is now unlinked from the parent's children list. Notify and call + * ops.sub_detach/exit(). Note that ops.sub_detach/exit() must be called + * after unlinking and releasing all locks. See scx_claim_exit(). + */ + wake_up_all(&scx_unlink_waitq); + + if (sch->ops.sub_detach && sch->sub_attached) { + struct scx_sub_detach_args sub_detach_args = { + .ops = &sch->ops, + .cgroup_path = sch->cgrp_path, + }; + SCX_CALL_OP(parent, SCX_KF_UNLOCKED, sub_detach, NULL, + &sub_detach_args); + } + + if (sch->ops.exit) + SCX_CALL_OP(sch, SCX_KF_UNLOCKED, exit, NULL, sch->exit_info); + kobject_del(&sch->kobj); +} +#else /* CONFIG_EXT_SUB_SCHED */ +static void drain_descendants(struct scx_sched *sch) { } +static void scx_sub_disable(struct scx_sched *sch) { } +#endif /* CONFIG_EXT_SUB_SCHED */ + +static void scx_root_disable(struct scx_sched *sch) +{ + struct scx_exit_info *ei = sch->exit_info; + struct scx_task_iter sti; + struct task_struct *p; + int cpu; + + /* guarantee forward progress and wait for descendants to be disabled */ + scx_bypass(sch, true); + drain_descendants(sch); switch (scx_set_enable_state(SCX_DISABLING)) { case SCX_DISABLING: