From: Tejun Heo <tj@kernel.org>
Date: Fri, 3 Apr 2026 17:48:28 +0000 (-1000)
Subject: Merge branch 'for-7.0-fixes' into for-7.1
X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=744ab12a5bd1a0dc59c5ba5354ae40030c834a46;p=thirdparty%2Fkernel%2Flinux.git

Merge branch 'for-7.0-fixes' into for-7.1

Conflict in kernel/sched/ext.c between:

  7e0ffb72de8a ("sched_ext: Fix stale direct dispatch state in
  ddsp_dsq_id")

which clears ddsp state at individual call sites instead of
dispatch_enqueue(), and sub-sched related code reorg and API updates on
for-7.1. Resolved by applying the ddsp fix with for-7.1's signatures.

Signed-off-by: Tejun Heo <tj@kernel.org>
---

744ab12a5bd1a0dc59c5ba5354ae40030c834a46
diff --cc kernel/sched/ext.c
index 0253887e63c03,064eaa76be4b9..b757b853b42bb
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@@ -1586,40 -1106,9 +1586,31 @@@ static void dispatch_enqueue(struct scx
  	WRITE_ONCE(dsq->seq, dsq->seq + 1);
  	p->scx.dsq_seq = dsq->seq;
  
 -	dsq_mod_nr(dsq, 1);
 +	dsq_inc_nr(dsq, p, enq_flags);
  	p->scx.dsq = dsq;
  
- 	/*
- 	 * scx.ddsp_dsq_id and scx.ddsp_enq_flags are only relevant on the
- 	 * direct dispatch path, but we clear them here because the direct
- 	 * dispatch verdict may be overridden on the enqueue path during e.g.
- 	 * bypass.
- 	 */
- 	p->scx.ddsp_dsq_id = SCX_DSQ_INVALID;
- 	p->scx.ddsp_enq_flags = 0;
- 
 +	/*
 +	 * Update custody and call ops.dequeue() before clearing ops_state:
 +	 * once ops_state is cleared, waiters in ops_dequeue() can proceed
 +	 * and dequeue_task_scx() will RMW p->scx.flags. If we clear
 +	 * ops_state first, both sides would modify p->scx.flags
 +	 * concurrently in a non-atomic way.
 +	 */
 +	if (is_local) {
 +		local_dsq_post_enq(dsq, p, enq_flags);
 +	} else {
 +		/*
 +		 * Task on global/bypass DSQ: leave custody, task on
 +		 * non-terminal DSQ: enter custody.
 +		 */
 +		if (dsq->id == SCX_DSQ_GLOBAL || dsq->id == SCX_DSQ_BYPASS)
 +			call_task_dequeue(sch, rq, p, 0);
 +		else
 +			p->scx.flags |= SCX_TASK_IN_CUSTODY;
 +
 +		raw_spin_unlock(&dsq->lock);
 +	}
 +
  	/*
  	 * We're transitioning out of QUEUEING or DISPATCHING. store_release to
  	 * match waiters' load_acquire.
@@@ -1784,7 -1300,8 +1796,8 @@@ static void direct_dispatch(struct scx_
  {
  	struct rq *rq = task_rq(p);
  	struct scx_dispatch_q *dsq =
 -		find_dsq_for_dispatch(sch, rq, p->scx.ddsp_dsq_id, p);
 +		find_dsq_for_dispatch(sch, rq, p->scx.ddsp_dsq_id, task_cpu(p));
+ 	u64 ddsp_enq_flags;
  
  	touch_core_sched_dispatch(rq, p);
  
@@@ -1825,8 -1342,10 +1838,10 @@@
  		return;
  	}
  
- 	dispatch_enqueue(sch, rq, dsq, p,
- 			 p->scx.ddsp_enq_flags | SCX_ENQ_CLEAR_OPSS);
+ 	ddsp_enq_flags = p->scx.ddsp_enq_flags;
+ 	clear_direct_dispatch(p);
+ 
 -	dispatch_enqueue(sch, dsq, p, ddsp_enq_flags | SCX_ENQ_CLEAR_OPSS);
++	dispatch_enqueue(sch, rq, dsq, p, ddsp_enq_flags | SCX_ENQ_CLEAR_OPSS);
  }
  
  static bool scx_rq_online(struct rq *rq)
@@@ -1949,7 -1454,8 +1964,8 @@@ enqueue
  	 */
  	touch_core_sched(rq, p);
  	refill_task_slice_dfl(sch, p);
+ 	clear_direct_dispatch(p);
 -	dispatch_enqueue(sch, dsq, p, enq_flags);
 +	dispatch_enqueue(sch, rq, dsq, p, enq_flags);
  }
  
  static bool task_runnable(const struct task_struct *p)
@@@ -3931,313 -3267,6 +3950,315 @@@ int scx_check_setscheduler(struct task_
  	return 0;
  }
  
 +static void process_ddsp_deferred_locals(struct rq *rq)
 +{
 +	struct task_struct *p;
 +
 +	lockdep_assert_rq_held(rq);
 +
 +	/*
 +	 * Now that @rq can be unlocked, execute the deferred enqueueing of
 +	 * tasks directly dispatched to the local DSQs of other CPUs. See
 +	 * direct_dispatch(). Keep popping from the head instead of using
 +	 * list_for_each_entry_safe() as dispatch_local_dsq() may unlock @rq
 +	 * temporarily.
 +	 */
 +	while ((p = list_first_entry_or_null(&rq->scx.ddsp_deferred_locals,
 +				struct task_struct, scx.dsq_list.node))) {
 +		struct scx_sched *sch = scx_task_sched(p);
 +		struct scx_dispatch_q *dsq;
++		u64 dsq_id = p->scx.ddsp_dsq_id;
++		u64 enq_flags = p->scx.ddsp_enq_flags;
 +
 +		list_del_init(&p->scx.dsq_list.node);
++		clear_direct_dispatch(p);
 +
- 		dsq = find_dsq_for_dispatch(sch, rq, p->scx.ddsp_dsq_id, task_cpu(p));
++		dsq = find_dsq_for_dispatch(sch, rq, dsq_id, task_cpu(p));
 +		if (!WARN_ON_ONCE(dsq->id != SCX_DSQ_LOCAL))
- 			dispatch_to_local_dsq(sch, rq, dsq, p,
- 					      p->scx.ddsp_enq_flags);
++			dispatch_to_local_dsq(sch, rq, dsq, p, enq_flags);
 +	}
 +}
 +
 +/*
 + * Determine whether @p should be reenqueued from a local DSQ.
 + *
 + * @reenq_flags is mutable and accumulates state across the DSQ walk:
 + *
 + * - %SCX_REENQ_TSR_NOT_FIRST: Set after the first task is visited. "First"
 + *   tracks position in the DSQ list, not among IMMED tasks. A non-IMMED task at
 + *   the head consumes the first slot.
 + *
 + * - %SCX_REENQ_TSR_RQ_OPEN: Set by reenq_local() before the walk if
 + *   rq_is_open() is true.
 + *
 + * An IMMED task is kept (returns %false) only if it's the first task in the DSQ
 + * AND the current task is done — i.e. it will execute immediately. All other
 + * IMMED tasks are reenqueued. This means if a non-IMMED task sits at the head,
 + * every IMMED task behind it gets reenqueued.
 + *
 + * Reenqueued tasks go through ops.enqueue() with %SCX_ENQ_REENQ |
 + * %SCX_TASK_REENQ_IMMED. If the BPF scheduler dispatches back to the same local
 + * DSQ with %SCX_ENQ_IMMED while the CPU is still unavailable, this triggers
 + * another reenq cycle. Repetitions are bounded by %SCX_REENQ_LOCAL_MAX_REPEAT
 + * in process_deferred_reenq_locals().
 + */
 +static bool local_task_should_reenq(struct task_struct *p, u64 *reenq_flags, u32 *reason)
 +{
 +	bool first;
 +
 +	first = !(*reenq_flags & SCX_REENQ_TSR_NOT_FIRST);
 +	*reenq_flags |= SCX_REENQ_TSR_NOT_FIRST;
 +
 +	*reason = SCX_TASK_REENQ_KFUNC;
 +
 +	if ((p->scx.flags & SCX_TASK_IMMED) &&
 +	    (!first || !(*reenq_flags & SCX_REENQ_TSR_RQ_OPEN))) {
 +		__scx_add_event(scx_task_sched(p), SCX_EV_REENQ_IMMED, 1);
 +		*reason = SCX_TASK_REENQ_IMMED;
 +		return true;
 +	}
 +
 +	return *reenq_flags & SCX_REENQ_ANY;
 +}
 +
 +static u32 reenq_local(struct scx_sched *sch, struct rq *rq, u64 reenq_flags)
 +{
 +	LIST_HEAD(tasks);
 +	u32 nr_enqueued = 0;
 +	struct task_struct *p, *n;
 +
 +	lockdep_assert_rq_held(rq);
 +
 +	if (WARN_ON_ONCE(reenq_flags & __SCX_REENQ_TSR_MASK))
 +		reenq_flags &= ~__SCX_REENQ_TSR_MASK;
 +	if (rq_is_open(rq, 0))
 +		reenq_flags |= SCX_REENQ_TSR_RQ_OPEN;
 +
 +	/*
 +	 * The BPF scheduler may choose to dispatch tasks back to
 +	 * @rq->scx.local_dsq. Move all candidate tasks off to a private list
 +	 * first to avoid processing the same tasks repeatedly.
 +	 */
 +	list_for_each_entry_safe(p, n, &rq->scx.local_dsq.list,
 +				 scx.dsq_list.node) {
 +		struct scx_sched *task_sch = scx_task_sched(p);
 +		u32 reason;
 +
 +		/*
 +		 * If @p is being migrated, @p's current CPU may not agree with
 +		 * its allowed CPUs and the migration_cpu_stop is about to
 +		 * deactivate and re-activate @p anyway. Skip re-enqueueing.
 +		 *
 +		 * While racing sched property changes may also dequeue and
 +		 * re-enqueue a migrating task while its current CPU and allowed
 +		 * CPUs disagree, they use %ENQUEUE_RESTORE which is bypassed to
 +		 * the current local DSQ for running tasks and thus are not
 +		 * visible to the BPF scheduler.
 +		 */
 +		if (p->migration_pending)
 +			continue;
 +
 +		if (!scx_is_descendant(task_sch, sch))
 +			continue;
 +
 +		if (!local_task_should_reenq(p, &reenq_flags, &reason))
 +			continue;
 +
 +		dispatch_dequeue(rq, p);
 +
 +		if (WARN_ON_ONCE(p->scx.flags & SCX_TASK_REENQ_REASON_MASK))
 +			p->scx.flags &= ~SCX_TASK_REENQ_REASON_MASK;
 +		p->scx.flags |= reason;
 +
 +		list_add_tail(&p->scx.dsq_list.node, &tasks);
 +	}
 +
 +	list_for_each_entry_safe(p, n, &tasks, scx.dsq_list.node) {
 +		list_del_init(&p->scx.dsq_list.node);
 +
 +		do_enqueue_task(rq, p, SCX_ENQ_REENQ, -1);
 +
 +		p->scx.flags &= ~SCX_TASK_REENQ_REASON_MASK;
 +		nr_enqueued++;
 +	}
 +
 +	return nr_enqueued;
 +}
 +
 +static void process_deferred_reenq_locals(struct rq *rq)
 +{
 +	u64 seq = ++rq->scx.deferred_reenq_locals_seq;
 +
 +	lockdep_assert_rq_held(rq);
 +
 +	while (true) {
 +		struct scx_sched *sch;
 +		u64 reenq_flags;
 +		bool skip = false;
 +
 +		scoped_guard (raw_spinlock, &rq->scx.deferred_reenq_lock) {
 +			struct scx_deferred_reenq_local *drl =
 +				list_first_entry_or_null(&rq->scx.deferred_reenq_locals,
 +							 struct scx_deferred_reenq_local,
 +							 node);
 +			struct scx_sched_pcpu *sch_pcpu;
 +
 +			if (!drl)
 +				return;
 +
 +			sch_pcpu = container_of(drl, struct scx_sched_pcpu,
 +						deferred_reenq_local);
 +			sch = sch_pcpu->sch;
 +
 +			reenq_flags = drl->flags;
 +			WRITE_ONCE(drl->flags, 0);
 +			list_del_init(&drl->node);
 +
 +			if (likely(drl->seq != seq)) {
 +				drl->seq = seq;
 +				drl->cnt = 0;
 +			} else {
 +				if (unlikely(++drl->cnt > SCX_REENQ_LOCAL_MAX_REPEAT)) {
 +					scx_error(sch, "SCX_ENQ_REENQ on SCX_DSQ_LOCAL repeated %u times",
 +						  drl->cnt);
 +					skip = true;
 +				}
 +
 +				__scx_add_event(sch, SCX_EV_REENQ_LOCAL_REPEAT, 1);
 +			}
 +		}
 +
 +		if (!skip) {
 +			/* see schedule_dsq_reenq() */
 +			smp_mb();
 +
 +			reenq_local(sch, rq, reenq_flags);
 +		}
 +	}
 +}
 +
 +static bool user_task_should_reenq(struct task_struct *p, u64 reenq_flags, u32 *reason)
 +{
 +	*reason = SCX_TASK_REENQ_KFUNC;
 +	return reenq_flags & SCX_REENQ_ANY;
 +}
 +
 +static void reenq_user(struct rq *rq, struct scx_dispatch_q *dsq, u64 reenq_flags)
 +{
 +	struct rq *locked_rq = rq;
 +	struct scx_sched *sch = dsq->sched;
 +	struct scx_dsq_list_node cursor = INIT_DSQ_LIST_CURSOR(cursor, dsq, 0);
 +	struct task_struct *p;
 +	s32 nr_enqueued = 0;
 +
 +	lockdep_assert_rq_held(rq);
 +
 +	raw_spin_lock(&dsq->lock);
 +
 +	while (likely(!READ_ONCE(sch->bypass_depth))) {
 +		struct rq *task_rq;
 +		u32 reason;
 +
 +		p = nldsq_cursor_next_task(&cursor, dsq);
 +		if (!p)
 +			break;
 +
 +		if (!user_task_should_reenq(p, reenq_flags, &reason))
 +			continue;
 +
 +		task_rq = task_rq(p);
 +
 +		if (locked_rq != task_rq) {
 +			if (locked_rq)
 +				raw_spin_rq_unlock(locked_rq);
 +			if (unlikely(!raw_spin_rq_trylock(task_rq))) {
 +				raw_spin_unlock(&dsq->lock);
 +				raw_spin_rq_lock(task_rq);
 +				raw_spin_lock(&dsq->lock);
 +			}
 +			locked_rq = task_rq;
 +
 +			/* did we lose @p while switching locks? */
 +			if (nldsq_cursor_lost_task(&cursor, task_rq, dsq, p))
 +				continue;
 +		}
 +
 +		/* @p is on @dsq, its rq and @dsq are locked */
 +		dispatch_dequeue_locked(p, dsq);
 +		raw_spin_unlock(&dsq->lock);
 +
 +		if (WARN_ON_ONCE(p->scx.flags & SCX_TASK_REENQ_REASON_MASK))
 +			p->scx.flags &= ~SCX_TASK_REENQ_REASON_MASK;
 +		p->scx.flags |= reason;
 +
 +		do_enqueue_task(task_rq, p, SCX_ENQ_REENQ, -1);
 +
 +		p->scx.flags &= ~SCX_TASK_REENQ_REASON_MASK;
 +
 +		if (!(++nr_enqueued % SCX_TASK_ITER_BATCH)) {
 +			raw_spin_rq_unlock(locked_rq);
 +			locked_rq = NULL;
 +			cpu_relax();
 +		}
 +
 +		raw_spin_lock(&dsq->lock);
 +	}
 +
 +	list_del_init(&cursor.node);
 +	raw_spin_unlock(&dsq->lock);
 +
 +	if (locked_rq != rq) {
 +		if (locked_rq)
 +			raw_spin_rq_unlock(locked_rq);
 +		raw_spin_rq_lock(rq);
 +	}
 +}
 +
 +static void process_deferred_reenq_users(struct rq *rq)
 +{
 +	lockdep_assert_rq_held(rq);
 +
 +	while (true) {
 +		struct scx_dispatch_q *dsq;
 +		u64 reenq_flags;
 +
 +		scoped_guard (raw_spinlock, &rq->scx.deferred_reenq_lock) {
 +			struct scx_deferred_reenq_user *dru =
 +				list_first_entry_or_null(&rq->scx.deferred_reenq_users,
 +							 struct scx_deferred_reenq_user,
 +							 node);
 +			struct scx_dsq_pcpu *dsq_pcpu;
 +
 +			if (!dru)
 +				return;
 +
 +			dsq_pcpu = container_of(dru, struct scx_dsq_pcpu,
 +						deferred_reenq_user);
 +			dsq = dsq_pcpu->dsq;
 +			reenq_flags = dru->flags;
 +			WRITE_ONCE(dru->flags, 0);
 +			list_del_init(&dru->node);
 +		}
 +
 +		/* see schedule_dsq_reenq() */
 +		smp_mb();
 +
 +		BUG_ON(dsq->id & SCX_DSQ_FLAG_BUILTIN);
 +		reenq_user(rq, dsq, reenq_flags);
 +	}
 +}
 +
 +static void run_deferred(struct rq *rq)
 +{
 +	process_ddsp_deferred_locals(rq);
 +
 +	if (!list_empty(&rq->scx.deferred_reenq_locals))
 +		process_deferred_reenq_locals(rq);
 +
 +	if (!list_empty(&rq->scx.deferred_reenq_users))
 +		process_deferred_reenq_users(rq);
 +}
 +
  #ifdef CONFIG_NO_HZ_FULL
  bool scx_can_stop_tick(struct rq *rq)
  {