From: Tejun Heo <tj@kernel.org>
Date: Mon, 9 Mar 2026 19:59:36 +0000 (-1000)
Subject: Merge branch 'sched/core' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip... 
X-Git-Tag: v7.1-rc1~162^2~73
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=0e7cd9cef61fde36ebfb653fe9e7a9722185cb57;p=thirdparty%2Flinux.git

Merge branch 'sched/core' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip into for-7.1

Pull sched/core to resolve conflicts between:

  c2a57380df9dd ("sched: Replace use of system_unbound_wq with system_dfl_wq")

from the tip tree and commit:

  cde94c032b32b ("sched_ext: Make watchdog sub-sched aware")

The latter moves around code modified by the former. Apply the changes in
the new locations.

Signed-off-by: Tejun Heo <tj@kernel.org>
---

0e7cd9cef61fde36ebfb653fe9e7a9722185cb57
diff --cc kernel/sched/ext.c
index 43fda12589035,7278d57496478..b35b98020f3b2
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@@ -3205,11 -2766,8 +3205,10 @@@ static void scx_watchdog_workfn(struct 
  
  		cond_resched();
  	}
 -	queue_delayed_work(system_dfl_wq, to_delayed_work(work),
 -			   READ_ONCE(scx_watchdog_timeout) / 2);
 +
 +	intv = READ_ONCE(scx_watchdog_interval);
 +	if (intv < ULONG_MAX)
- 		queue_delayed_work(system_unbound_wq, to_delayed_work(work),
- 				   intv);
++		queue_delayed_work(system_dfl_wq, to_delayed_work(work), intv);
  }
  
  void scx_tick(struct rq *rq)
@@@ -5218,255 -4282,28 +5217,255 @@@ static void free_kick_syncs(void
  	}
  }
  
 -static void scx_disable_workfn(struct kthread_work *work)
 +static void refresh_watchdog(void)
  {
 -	struct scx_sched *sch = container_of(work, struct scx_sched, disable_work);
 -	struct scx_exit_info *ei = sch->exit_info;
 +	struct scx_sched *sch;
 +	unsigned long intv = ULONG_MAX;
 +
 +	/* take the shortest timeout and use its half for watchdog interval */
 +	rcu_read_lock();
 +	list_for_each_entry_rcu(sch, &scx_sched_all, all)
 +		intv = max(min(intv, sch->watchdog_timeout / 2), 1);
 +	rcu_read_unlock();
 +
 +	WRITE_ONCE(scx_watchdog_timestamp, jiffies);
 +	WRITE_ONCE(scx_watchdog_interval, intv);
 +
 +	if (intv < ULONG_MAX)
- 		mod_delayed_work(system_unbound_wq, &scx_watchdog_work, intv);
++		mod_delayed_work(system_dfl_wq, &scx_watchdog_work, intv);
 +	else
 +		cancel_delayed_work_sync(&scx_watchdog_work);
 +}
 +
 +static s32 scx_link_sched(struct scx_sched *sch)
 +{
 +	scoped_guard(raw_spinlock_irq, &scx_sched_lock) {
 +#ifdef CONFIG_EXT_SUB_SCHED
 +		struct scx_sched *parent = scx_parent(sch);
 +		s32 ret;
 +
 +		if (parent) {
 +			ret = rhashtable_lookup_insert_fast(&scx_sched_hash,
 +					&sch->hash_node, scx_sched_hash_params);
 +			if (ret) {
 +				scx_error(sch, "failed to insert into scx_sched_hash (%d)", ret);
 +				return ret;
 +			}
 +
 +			list_add_tail(&sch->sibling, &parent->children);
 +		}
 +#endif	/* CONFIG_EXT_SUB_SCHED */
 +
 +		list_add_tail_rcu(&sch->all, &scx_sched_all);
 +	}
 +
 +	refresh_watchdog();
 +	return 0;
 +}
 +
 +static void scx_unlink_sched(struct scx_sched *sch)
 +{
 +	scoped_guard(raw_spinlock_irq, &scx_sched_lock) {
 +#ifdef CONFIG_EXT_SUB_SCHED
 +		if (scx_parent(sch)) {
 +			rhashtable_remove_fast(&scx_sched_hash, &sch->hash_node,
 +					       scx_sched_hash_params);
 +			list_del_init(&sch->sibling);
 +		}
 +#endif	/* CONFIG_EXT_SUB_SCHED */
 +		list_del_rcu(&sch->all);
 +	}
 +
 +	refresh_watchdog();
 +}
 +
 +#ifdef CONFIG_EXT_SUB_SCHED
 +static DECLARE_WAIT_QUEUE_HEAD(scx_unlink_waitq);
 +
 +static void drain_descendants(struct scx_sched *sch)
 +{
 +	/*
 +	 * Child scheds that finished the critical part of disabling will take
 +	 * themselves off @sch->children. Wait for it to drain. As propagation
 +	 * is recursive, empty @sch->children means that all proper descendant
 +	 * scheds reached unlinking stage.
 +	 */
 +	wait_event(scx_unlink_waitq, list_empty(&sch->children));
 +}
 +
 +static void scx_fail_parent(struct scx_sched *sch,
 +			    struct task_struct *failed, s32 fail_code)
 +{
 +	struct scx_sched *parent = scx_parent(sch);
  	struct scx_task_iter sti;
  	struct task_struct *p;
 -	int kind, cpu;
  
 -	kind = atomic_read(&sch->exit_kind);
 -	while (true) {
 -		if (kind == SCX_EXIT_DONE)	/* already disabled? */
 -			return;
 -		WARN_ON_ONCE(kind == SCX_EXIT_NONE);
 -		if (atomic_try_cmpxchg(&sch->exit_kind, &kind, SCX_EXIT_DONE))
 +	scx_error(parent, "ops.init_task() failed (%d) for %s[%d] while disabling a sub-scheduler",
 +		  fail_code, failed->comm, failed->pid);
 +
 +	/*
 +	 * Once $parent is bypassed, it's safe to put SCX_TASK_NONE tasks into
 +	 * it. This may cause downstream failures on the BPF side but $parent is
 +	 * dying anyway.
 +	 */
 +	scx_bypass(parent, true);
 +
 +	scx_task_iter_start(&sti, sch->cgrp);
 +	while ((p = scx_task_iter_next_locked(&sti))) {
 +		if (scx_task_on_sched(parent, p))
 +			continue;
 +
 +		scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) {
 +			scx_disable_and_exit_task(sch, p);
 +			rcu_assign_pointer(p->scx.sched, parent);
 +		}
 +	}
 +	scx_task_iter_stop(&sti);
 +}
 +
 +static void scx_sub_disable(struct scx_sched *sch)
 +{
 +	struct scx_sched *parent = scx_parent(sch);
 +	struct scx_task_iter sti;
 +	struct task_struct *p;
 +	int ret;
 +
 +	/*
 +	 * Guarantee forward progress and wait for descendants to be disabled.
 +	 * To limit disruptions, $parent is not bypassed. Tasks are fully
 +	 * prepped and then inserted back into $parent.
 +	 */
 +	scx_bypass(sch, true);
 +	drain_descendants(sch);
 +
 +	/*
 +	 * Here, every runnable task is guaranteed to make forward progress and
 +	 * we can safely use blocking synchronization constructs. Actually
 +	 * disable ops.
 +	 */
 +	mutex_lock(&scx_enable_mutex);
 +	percpu_down_write(&scx_fork_rwsem);
 +	scx_cgroup_lock();
 +
 +	set_cgroup_sched(sch_cgroup(sch), parent);
 +
 +	scx_task_iter_start(&sti, sch->cgrp);
 +	while ((p = scx_task_iter_next_locked(&sti))) {
 +		struct rq *rq;
 +		struct rq_flags rf;
 +
 +		/* filter out duplicate visits */
 +		if (scx_task_on_sched(parent, p))
 +			continue;
 +
 +		/*
 +		 * By the time control reaches here, all descendant schedulers
 +		 * should already have been disabled.
 +		 */
 +		WARN_ON_ONCE(!scx_task_on_sched(sch, p));
 +
 +		/*
 +		 * If $p is about to be freed, nothing prevents $sch from
 +		 * unloading before $p reaches sched_ext_free(). Disable and
 +		 * exit $p right away.
 +		 */
 +		if (!tryget_task_struct(p)) {
 +			scx_disable_and_exit_task(sch, p);
 +			continue;
 +		}
 +
 +		scx_task_iter_unlock(&sti);
 +
 +		/*
 +		 * $p is READY or ENABLED on @sch. Initialize for $parent,
 +		 * disable and exit from @sch, and then switch over to $parent.
 +		 *
 +		 * If a task fails to initialize for $parent, the only available
 +		 * action is disabling $parent too. While this allows disabling
 +		 * of a child sched to cause the parent scheduler to fail, the
 +		 * failure can only originate from ops.init_task() of the
 +		 * parent. A child can't directly affect the parent through its
 +		 * own failures.
 +		 */
 +		ret = __scx_init_task(parent, p, false);
 +		if (ret) {
 +			scx_fail_parent(sch, p, ret);
 +			put_task_struct(p);
  			break;
 +		}
 +
 +		rq = task_rq_lock(p, &rf);
 +		scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) {
 +			/*
 +			 * $p is initialized for $parent and still attached to
 +			 * @sch. Disable and exit for @sch, switch over to
 +			 * $parent, override the state to READY to account for
 +			 * $p having already been initialized, and then enable.
 +			 */
 +			scx_disable_and_exit_task(sch, p);
 +			scx_set_task_state(p, SCX_TASK_INIT);
 +			rcu_assign_pointer(p->scx.sched, parent);
 +			scx_set_task_state(p, SCX_TASK_READY);
 +			scx_enable_task(parent, p);
 +		}
 +		task_rq_unlock(rq, p, &rf);
 +
 +		put_task_struct(p);
  	}
 -	ei->kind = kind;
 -	ei->reason = scx_exit_reason(ei->kind);
 +	scx_task_iter_stop(&sti);
  
 -	/* guarantee forward progress by bypassing scx_ops */
 -	scx_bypass(true);
 -	WRITE_ONCE(scx_aborting, false);
 +	scx_cgroup_unlock();
 +	percpu_up_write(&scx_fork_rwsem);
 +
 +	/*
 +	 * All tasks are moved off of @sch but there may still be on-going
 +	 * operations (e.g. ops.select_cpu()). Drain them by flushing RCU. Use
 +	 * the expedited version as ancestors may be waiting in bypass mode.
 +	 * Also, tell the parent that there is no need to keep running bypass
 +	 * DSQs for us.
 +	 */
 +	synchronize_rcu_expedited();
 +	disable_bypass_dsp(sch);
 +
 +	scx_unlink_sched(sch);
 +
 +	mutex_unlock(&scx_enable_mutex);
 +
 +	/*
 +	 * @sch is now unlinked from the parent's children list. Notify and call
 +	 * ops.sub_detach/exit(). Note that ops.sub_detach/exit() must be called
 +	 * after unlinking and releasing all locks. See scx_claim_exit().
 +	 */
 +	wake_up_all(&scx_unlink_waitq);
 +
 +	if (sch->ops.sub_detach && sch->sub_attached) {
 +		struct scx_sub_detach_args sub_detach_args = {
 +			.ops = &sch->ops,
 +			.cgroup_path = sch->cgrp_path,
 +		};
 +		SCX_CALL_OP(parent, SCX_KF_UNLOCKED, sub_detach, NULL,
 +			    &sub_detach_args);
 +	}
 +
 +	if (sch->ops.exit)
 +		SCX_CALL_OP(sch, SCX_KF_UNLOCKED, exit, NULL, sch->exit_info);
 +	kobject_del(&sch->kobj);
 +}
 +#else	/* CONFIG_EXT_SUB_SCHED */
 +static void drain_descendants(struct scx_sched *sch) { }
 +static void scx_sub_disable(struct scx_sched *sch) { }
 +#endif	/* CONFIG_EXT_SUB_SCHED */
 +
 +static void scx_root_disable(struct scx_sched *sch)
 +{
 +	struct scx_exit_info *ei = sch->exit_info;
 +	struct scx_task_iter sti;
 +	struct task_struct *p;
 +	int cpu;
 +
 +	/* guarantee forward progress and wait for descendants to be disabled */
 +	scx_bypass(sch, true);
 +	drain_descendants(sch);
  
  	switch (scx_set_enable_state(SCX_DISABLING)) {
  	case SCX_DISABLING: