From: Tejun Heo Date: Fri, 3 Apr 2026 17:48:28 +0000 (-1000) Subject: Merge branch 'for-7.0-fixes' into for-7.1 X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=744ab12a5bd1a0dc59c5ba5354ae40030c834a46;p=thirdparty%2Fkernel%2Flinux.git Merge branch 'for-7.0-fixes' into for-7.1 Conflict in kernel/sched/ext.c between: 7e0ffb72de8a ("sched_ext: Fix stale direct dispatch state in ddsp_dsq_id") which clears ddsp state at individual call sites instead of dispatch_enqueue(), and sub-sched related code reorg and API updates on for-7.1. Resolved by applying the ddsp fix with for-7.1's signatures. Signed-off-by: Tejun Heo --- 744ab12a5bd1a0dc59c5ba5354ae40030c834a46 diff --cc kernel/sched/ext.c index 0253887e63c03,064eaa76be4b9..b757b853b42bb --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@@ -1586,40 -1106,9 +1586,31 @@@ static void dispatch_enqueue(struct scx WRITE_ONCE(dsq->seq, dsq->seq + 1); p->scx.dsq_seq = dsq->seq; - dsq_mod_nr(dsq, 1); + dsq_inc_nr(dsq, p, enq_flags); p->scx.dsq = dsq; - /* - * scx.ddsp_dsq_id and scx.ddsp_enq_flags are only relevant on the - * direct dispatch path, but we clear them here because the direct - * dispatch verdict may be overridden on the enqueue path during e.g. - * bypass. - */ - p->scx.ddsp_dsq_id = SCX_DSQ_INVALID; - p->scx.ddsp_enq_flags = 0; - + /* + * Update custody and call ops.dequeue() before clearing ops_state: + * once ops_state is cleared, waiters in ops_dequeue() can proceed + * and dequeue_task_scx() will RMW p->scx.flags. If we clear + * ops_state first, both sides would modify p->scx.flags + * concurrently in a non-atomic way. + */ + if (is_local) { + local_dsq_post_enq(dsq, p, enq_flags); + } else { + /* + * Task on global/bypass DSQ: leave custody, task on + * non-terminal DSQ: enter custody. + */ + if (dsq->id == SCX_DSQ_GLOBAL || dsq->id == SCX_DSQ_BYPASS) + call_task_dequeue(sch, rq, p, 0); + else + p->scx.flags |= SCX_TASK_IN_CUSTODY; + + raw_spin_unlock(&dsq->lock); + } + /* * We're transitioning out of QUEUEING or DISPATCHING. store_release to * match waiters' load_acquire. @@@ -1784,7 -1300,8 +1796,8 @@@ static void direct_dispatch(struct scx_ { struct rq *rq = task_rq(p); struct scx_dispatch_q *dsq = - find_dsq_for_dispatch(sch, rq, p->scx.ddsp_dsq_id, p); + find_dsq_for_dispatch(sch, rq, p->scx.ddsp_dsq_id, task_cpu(p)); + u64 ddsp_enq_flags; touch_core_sched_dispatch(rq, p); @@@ -1825,8 -1342,10 +1838,10 @@@ return; } - dispatch_enqueue(sch, rq, dsq, p, - p->scx.ddsp_enq_flags | SCX_ENQ_CLEAR_OPSS); + ddsp_enq_flags = p->scx.ddsp_enq_flags; + clear_direct_dispatch(p); + - dispatch_enqueue(sch, dsq, p, ddsp_enq_flags | SCX_ENQ_CLEAR_OPSS); ++ dispatch_enqueue(sch, rq, dsq, p, ddsp_enq_flags | SCX_ENQ_CLEAR_OPSS); } static bool scx_rq_online(struct rq *rq) @@@ -1949,7 -1454,8 +1964,8 @@@ enqueue */ touch_core_sched(rq, p); refill_task_slice_dfl(sch, p); + clear_direct_dispatch(p); - dispatch_enqueue(sch, dsq, p, enq_flags); + dispatch_enqueue(sch, rq, dsq, p, enq_flags); } static bool task_runnable(const struct task_struct *p) @@@ -3931,313 -3267,6 +3950,315 @@@ int scx_check_setscheduler(struct task_ return 0; } +static void process_ddsp_deferred_locals(struct rq *rq) +{ + struct task_struct *p; + + lockdep_assert_rq_held(rq); + + /* + * Now that @rq can be unlocked, execute the deferred enqueueing of + * tasks directly dispatched to the local DSQs of other CPUs. See + * direct_dispatch(). Keep popping from the head instead of using + * list_for_each_entry_safe() as dispatch_local_dsq() may unlock @rq + * temporarily. + */ + while ((p = list_first_entry_or_null(&rq->scx.ddsp_deferred_locals, + struct task_struct, scx.dsq_list.node))) { + struct scx_sched *sch = scx_task_sched(p); + struct scx_dispatch_q *dsq; ++ u64 dsq_id = p->scx.ddsp_dsq_id; ++ u64 enq_flags = p->scx.ddsp_enq_flags; + + list_del_init(&p->scx.dsq_list.node); ++ clear_direct_dispatch(p); + - dsq = find_dsq_for_dispatch(sch, rq, p->scx.ddsp_dsq_id, task_cpu(p)); ++ dsq = find_dsq_for_dispatch(sch, rq, dsq_id, task_cpu(p)); + if (!WARN_ON_ONCE(dsq->id != SCX_DSQ_LOCAL)) - dispatch_to_local_dsq(sch, rq, dsq, p, - p->scx.ddsp_enq_flags); ++ dispatch_to_local_dsq(sch, rq, dsq, p, enq_flags); + } +} + +/* + * Determine whether @p should be reenqueued from a local DSQ. + * + * @reenq_flags is mutable and accumulates state across the DSQ walk: + * + * - %SCX_REENQ_TSR_NOT_FIRST: Set after the first task is visited. "First" + * tracks position in the DSQ list, not among IMMED tasks. A non-IMMED task at + * the head consumes the first slot. + * + * - %SCX_REENQ_TSR_RQ_OPEN: Set by reenq_local() before the walk if + * rq_is_open() is true. + * + * An IMMED task is kept (returns %false) only if it's the first task in the DSQ + * AND the current task is done — i.e. it will execute immediately. All other + * IMMED tasks are reenqueued. This means if a non-IMMED task sits at the head, + * every IMMED task behind it gets reenqueued. + * + * Reenqueued tasks go through ops.enqueue() with %SCX_ENQ_REENQ | + * %SCX_TASK_REENQ_IMMED. If the BPF scheduler dispatches back to the same local + * DSQ with %SCX_ENQ_IMMED while the CPU is still unavailable, this triggers + * another reenq cycle. Repetitions are bounded by %SCX_REENQ_LOCAL_MAX_REPEAT + * in process_deferred_reenq_locals(). + */ +static bool local_task_should_reenq(struct task_struct *p, u64 *reenq_flags, u32 *reason) +{ + bool first; + + first = !(*reenq_flags & SCX_REENQ_TSR_NOT_FIRST); + *reenq_flags |= SCX_REENQ_TSR_NOT_FIRST; + + *reason = SCX_TASK_REENQ_KFUNC; + + if ((p->scx.flags & SCX_TASK_IMMED) && + (!first || !(*reenq_flags & SCX_REENQ_TSR_RQ_OPEN))) { + __scx_add_event(scx_task_sched(p), SCX_EV_REENQ_IMMED, 1); + *reason = SCX_TASK_REENQ_IMMED; + return true; + } + + return *reenq_flags & SCX_REENQ_ANY; +} + +static u32 reenq_local(struct scx_sched *sch, struct rq *rq, u64 reenq_flags) +{ + LIST_HEAD(tasks); + u32 nr_enqueued = 0; + struct task_struct *p, *n; + + lockdep_assert_rq_held(rq); + + if (WARN_ON_ONCE(reenq_flags & __SCX_REENQ_TSR_MASK)) + reenq_flags &= ~__SCX_REENQ_TSR_MASK; + if (rq_is_open(rq, 0)) + reenq_flags |= SCX_REENQ_TSR_RQ_OPEN; + + /* + * The BPF scheduler may choose to dispatch tasks back to + * @rq->scx.local_dsq. Move all candidate tasks off to a private list + * first to avoid processing the same tasks repeatedly. + */ + list_for_each_entry_safe(p, n, &rq->scx.local_dsq.list, + scx.dsq_list.node) { + struct scx_sched *task_sch = scx_task_sched(p); + u32 reason; + + /* + * If @p is being migrated, @p's current CPU may not agree with + * its allowed CPUs and the migration_cpu_stop is about to + * deactivate and re-activate @p anyway. Skip re-enqueueing. + * + * While racing sched property changes may also dequeue and + * re-enqueue a migrating task while its current CPU and allowed + * CPUs disagree, they use %ENQUEUE_RESTORE which is bypassed to + * the current local DSQ for running tasks and thus are not + * visible to the BPF scheduler. + */ + if (p->migration_pending) + continue; + + if (!scx_is_descendant(task_sch, sch)) + continue; + + if (!local_task_should_reenq(p, &reenq_flags, &reason)) + continue; + + dispatch_dequeue(rq, p); + + if (WARN_ON_ONCE(p->scx.flags & SCX_TASK_REENQ_REASON_MASK)) + p->scx.flags &= ~SCX_TASK_REENQ_REASON_MASK; + p->scx.flags |= reason; + + list_add_tail(&p->scx.dsq_list.node, &tasks); + } + + list_for_each_entry_safe(p, n, &tasks, scx.dsq_list.node) { + list_del_init(&p->scx.dsq_list.node); + + do_enqueue_task(rq, p, SCX_ENQ_REENQ, -1); + + p->scx.flags &= ~SCX_TASK_REENQ_REASON_MASK; + nr_enqueued++; + } + + return nr_enqueued; +} + +static void process_deferred_reenq_locals(struct rq *rq) +{ + u64 seq = ++rq->scx.deferred_reenq_locals_seq; + + lockdep_assert_rq_held(rq); + + while (true) { + struct scx_sched *sch; + u64 reenq_flags; + bool skip = false; + + scoped_guard (raw_spinlock, &rq->scx.deferred_reenq_lock) { + struct scx_deferred_reenq_local *drl = + list_first_entry_or_null(&rq->scx.deferred_reenq_locals, + struct scx_deferred_reenq_local, + node); + struct scx_sched_pcpu *sch_pcpu; + + if (!drl) + return; + + sch_pcpu = container_of(drl, struct scx_sched_pcpu, + deferred_reenq_local); + sch = sch_pcpu->sch; + + reenq_flags = drl->flags; + WRITE_ONCE(drl->flags, 0); + list_del_init(&drl->node); + + if (likely(drl->seq != seq)) { + drl->seq = seq; + drl->cnt = 0; + } else { + if (unlikely(++drl->cnt > SCX_REENQ_LOCAL_MAX_REPEAT)) { + scx_error(sch, "SCX_ENQ_REENQ on SCX_DSQ_LOCAL repeated %u times", + drl->cnt); + skip = true; + } + + __scx_add_event(sch, SCX_EV_REENQ_LOCAL_REPEAT, 1); + } + } + + if (!skip) { + /* see schedule_dsq_reenq() */ + smp_mb(); + + reenq_local(sch, rq, reenq_flags); + } + } +} + +static bool user_task_should_reenq(struct task_struct *p, u64 reenq_flags, u32 *reason) +{ + *reason = SCX_TASK_REENQ_KFUNC; + return reenq_flags & SCX_REENQ_ANY; +} + +static void reenq_user(struct rq *rq, struct scx_dispatch_q *dsq, u64 reenq_flags) +{ + struct rq *locked_rq = rq; + struct scx_sched *sch = dsq->sched; + struct scx_dsq_list_node cursor = INIT_DSQ_LIST_CURSOR(cursor, dsq, 0); + struct task_struct *p; + s32 nr_enqueued = 0; + + lockdep_assert_rq_held(rq); + + raw_spin_lock(&dsq->lock); + + while (likely(!READ_ONCE(sch->bypass_depth))) { + struct rq *task_rq; + u32 reason; + + p = nldsq_cursor_next_task(&cursor, dsq); + if (!p) + break; + + if (!user_task_should_reenq(p, reenq_flags, &reason)) + continue; + + task_rq = task_rq(p); + + if (locked_rq != task_rq) { + if (locked_rq) + raw_spin_rq_unlock(locked_rq); + if (unlikely(!raw_spin_rq_trylock(task_rq))) { + raw_spin_unlock(&dsq->lock); + raw_spin_rq_lock(task_rq); + raw_spin_lock(&dsq->lock); + } + locked_rq = task_rq; + + /* did we lose @p while switching locks? */ + if (nldsq_cursor_lost_task(&cursor, task_rq, dsq, p)) + continue; + } + + /* @p is on @dsq, its rq and @dsq are locked */ + dispatch_dequeue_locked(p, dsq); + raw_spin_unlock(&dsq->lock); + + if (WARN_ON_ONCE(p->scx.flags & SCX_TASK_REENQ_REASON_MASK)) + p->scx.flags &= ~SCX_TASK_REENQ_REASON_MASK; + p->scx.flags |= reason; + + do_enqueue_task(task_rq, p, SCX_ENQ_REENQ, -1); + + p->scx.flags &= ~SCX_TASK_REENQ_REASON_MASK; + + if (!(++nr_enqueued % SCX_TASK_ITER_BATCH)) { + raw_spin_rq_unlock(locked_rq); + locked_rq = NULL; + cpu_relax(); + } + + raw_spin_lock(&dsq->lock); + } + + list_del_init(&cursor.node); + raw_spin_unlock(&dsq->lock); + + if (locked_rq != rq) { + if (locked_rq) + raw_spin_rq_unlock(locked_rq); + raw_spin_rq_lock(rq); + } +} + +static void process_deferred_reenq_users(struct rq *rq) +{ + lockdep_assert_rq_held(rq); + + while (true) { + struct scx_dispatch_q *dsq; + u64 reenq_flags; + + scoped_guard (raw_spinlock, &rq->scx.deferred_reenq_lock) { + struct scx_deferred_reenq_user *dru = + list_first_entry_or_null(&rq->scx.deferred_reenq_users, + struct scx_deferred_reenq_user, + node); + struct scx_dsq_pcpu *dsq_pcpu; + + if (!dru) + return; + + dsq_pcpu = container_of(dru, struct scx_dsq_pcpu, + deferred_reenq_user); + dsq = dsq_pcpu->dsq; + reenq_flags = dru->flags; + WRITE_ONCE(dru->flags, 0); + list_del_init(&dru->node); + } + + /* see schedule_dsq_reenq() */ + smp_mb(); + + BUG_ON(dsq->id & SCX_DSQ_FLAG_BUILTIN); + reenq_user(rq, dsq, reenq_flags); + } +} + +static void run_deferred(struct rq *rq) +{ + process_ddsp_deferred_locals(rq); + + if (!list_empty(&rq->scx.deferred_reenq_locals)) + process_deferred_reenq_locals(rq); + + if (!list_empty(&rq->scx.deferred_reenq_users)) + process_deferred_reenq_users(rq); +} + #ifdef CONFIG_NO_HZ_FULL bool scx_can_stop_tick(struct rq *rq) {