WRITE_ONCE(dsq->seq, dsq->seq + 1);
p->scx.dsq_seq = dsq->seq;
- dsq_mod_nr(dsq, 1);
+ dsq_inc_nr(dsq, p, enq_flags);
p->scx.dsq = dsq;
- /*
- * scx.ddsp_dsq_id and scx.ddsp_enq_flags are only relevant on the
- * direct dispatch path, but we clear them here because the direct
- * dispatch verdict may be overridden on the enqueue path during e.g.
- * bypass.
- */
- p->scx.ddsp_dsq_id = SCX_DSQ_INVALID;
- p->scx.ddsp_enq_flags = 0;
-
+ /*
+ * Update custody and call ops.dequeue() before clearing ops_state:
+ * once ops_state is cleared, waiters in ops_dequeue() can proceed
+ * and dequeue_task_scx() will RMW p->scx.flags. If we clear
+ * ops_state first, both sides would modify p->scx.flags
+ * concurrently in a non-atomic way.
+ */
+ if (is_local) {
+ local_dsq_post_enq(dsq, p, enq_flags);
+ } else {
+ /*
+ * Task on global/bypass DSQ: leave custody, task on
+ * non-terminal DSQ: enter custody.
+ */
+ if (dsq->id == SCX_DSQ_GLOBAL || dsq->id == SCX_DSQ_BYPASS)
+ call_task_dequeue(sch, rq, p, 0);
+ else
+ p->scx.flags |= SCX_TASK_IN_CUSTODY;
+
+ raw_spin_unlock(&dsq->lock);
+ }
+
/*
* We're transitioning out of QUEUEING or DISPATCHING. store_release to
* match waiters' load_acquire.
{
struct rq *rq = task_rq(p);
struct scx_dispatch_q *dsq =
- find_dsq_for_dispatch(sch, rq, p->scx.ddsp_dsq_id, p);
+ find_dsq_for_dispatch(sch, rq, p->scx.ddsp_dsq_id, task_cpu(p));
+ u64 ddsp_enq_flags;
touch_core_sched_dispatch(rq, p);
return;
}
- dispatch_enqueue(sch, rq, dsq, p,
- p->scx.ddsp_enq_flags | SCX_ENQ_CLEAR_OPSS);
+ ddsp_enq_flags = p->scx.ddsp_enq_flags;
+ clear_direct_dispatch(p);
+
- dispatch_enqueue(sch, dsq, p, ddsp_enq_flags | SCX_ENQ_CLEAR_OPSS);
++ dispatch_enqueue(sch, rq, dsq, p, ddsp_enq_flags | SCX_ENQ_CLEAR_OPSS);
}
static bool scx_rq_online(struct rq *rq)
*/
touch_core_sched(rq, p);
refill_task_slice_dfl(sch, p);
- dispatch_enqueue(sch, dsq, p, enq_flags);
+ clear_direct_dispatch(p);
+ dispatch_enqueue(sch, rq, dsq, p, enq_flags);
}
static bool task_runnable(const struct task_struct *p)
return 0;
}
- dsq = find_dsq_for_dispatch(sch, rq, p->scx.ddsp_dsq_id, task_cpu(p));
+static void process_ddsp_deferred_locals(struct rq *rq)
+{
+ struct task_struct *p;
+
+ lockdep_assert_rq_held(rq);
+
+ /*
+ * Now that @rq can be unlocked, execute the deferred enqueueing of
+ * tasks directly dispatched to the local DSQs of other CPUs. See
+ * direct_dispatch(). Keep popping from the head instead of using
+ * list_for_each_entry_safe() as dispatch_local_dsq() may unlock @rq
+ * temporarily.
+ */
+ while ((p = list_first_entry_or_null(&rq->scx.ddsp_deferred_locals,
+ struct task_struct, scx.dsq_list.node))) {
+ struct scx_sched *sch = scx_task_sched(p);
+ struct scx_dispatch_q *dsq;
++ u64 dsq_id = p->scx.ddsp_dsq_id;
++ u64 enq_flags = p->scx.ddsp_enq_flags;
+
+ list_del_init(&p->scx.dsq_list.node);
++ clear_direct_dispatch(p);
+
- dispatch_to_local_dsq(sch, rq, dsq, p,
- p->scx.ddsp_enq_flags);
++ dsq = find_dsq_for_dispatch(sch, rq, dsq_id, task_cpu(p));
+ if (!WARN_ON_ONCE(dsq->id != SCX_DSQ_LOCAL))
++ dispatch_to_local_dsq(sch, rq, dsq, p, enq_flags);
+ }
+}
+
+/*
+ * Determine whether @p should be reenqueued from a local DSQ.
+ *
+ * @reenq_flags is mutable and accumulates state across the DSQ walk:
+ *
+ * - %SCX_REENQ_TSR_NOT_FIRST: Set after the first task is visited. "First"
+ * tracks position in the DSQ list, not among IMMED tasks. A non-IMMED task at
+ * the head consumes the first slot.
+ *
+ * - %SCX_REENQ_TSR_RQ_OPEN: Set by reenq_local() before the walk if
+ * rq_is_open() is true.
+ *
+ * An IMMED task is kept (returns %false) only if it's the first task in the DSQ
+ * AND the current task is done — i.e. it will execute immediately. All other
+ * IMMED tasks are reenqueued. This means if a non-IMMED task sits at the head,
+ * every IMMED task behind it gets reenqueued.
+ *
+ * Reenqueued tasks go through ops.enqueue() with %SCX_ENQ_REENQ |
+ * %SCX_TASK_REENQ_IMMED. If the BPF scheduler dispatches back to the same local
+ * DSQ with %SCX_ENQ_IMMED while the CPU is still unavailable, this triggers
+ * another reenq cycle. Repetitions are bounded by %SCX_REENQ_LOCAL_MAX_REPEAT
+ * in process_deferred_reenq_locals().
+ */
+static bool local_task_should_reenq(struct task_struct *p, u64 *reenq_flags, u32 *reason)
+{
+ bool first;
+
+ first = !(*reenq_flags & SCX_REENQ_TSR_NOT_FIRST);
+ *reenq_flags |= SCX_REENQ_TSR_NOT_FIRST;
+
+ *reason = SCX_TASK_REENQ_KFUNC;
+
+ if ((p->scx.flags & SCX_TASK_IMMED) &&
+ (!first || !(*reenq_flags & SCX_REENQ_TSR_RQ_OPEN))) {
+ __scx_add_event(scx_task_sched(p), SCX_EV_REENQ_IMMED, 1);
+ *reason = SCX_TASK_REENQ_IMMED;
+ return true;
+ }
+
+ return *reenq_flags & SCX_REENQ_ANY;
+}
+
+static u32 reenq_local(struct scx_sched *sch, struct rq *rq, u64 reenq_flags)
+{
+ LIST_HEAD(tasks);
+ u32 nr_enqueued = 0;
+ struct task_struct *p, *n;
+
+ lockdep_assert_rq_held(rq);
+
+ if (WARN_ON_ONCE(reenq_flags & __SCX_REENQ_TSR_MASK))
+ reenq_flags &= ~__SCX_REENQ_TSR_MASK;
+ if (rq_is_open(rq, 0))
+ reenq_flags |= SCX_REENQ_TSR_RQ_OPEN;
+
+ /*
+ * The BPF scheduler may choose to dispatch tasks back to
+ * @rq->scx.local_dsq. Move all candidate tasks off to a private list
+ * first to avoid processing the same tasks repeatedly.
+ */
+ list_for_each_entry_safe(p, n, &rq->scx.local_dsq.list,
+ scx.dsq_list.node) {
+ struct scx_sched *task_sch = scx_task_sched(p);
+ u32 reason;
+
+ /*
+ * If @p is being migrated, @p's current CPU may not agree with
+ * its allowed CPUs and the migration_cpu_stop is about to
+ * deactivate and re-activate @p anyway. Skip re-enqueueing.
+ *
+ * While racing sched property changes may also dequeue and
+ * re-enqueue a migrating task while its current CPU and allowed
+ * CPUs disagree, they use %ENQUEUE_RESTORE which is bypassed to
+ * the current local DSQ for running tasks and thus are not
+ * visible to the BPF scheduler.
+ */
+ if (p->migration_pending)
+ continue;
+
+ if (!scx_is_descendant(task_sch, sch))
+ continue;
+
+ if (!local_task_should_reenq(p, &reenq_flags, &reason))
+ continue;
+
+ dispatch_dequeue(rq, p);
+
+ if (WARN_ON_ONCE(p->scx.flags & SCX_TASK_REENQ_REASON_MASK))
+ p->scx.flags &= ~SCX_TASK_REENQ_REASON_MASK;
+ p->scx.flags |= reason;
+
+ list_add_tail(&p->scx.dsq_list.node, &tasks);
+ }
+
+ list_for_each_entry_safe(p, n, &tasks, scx.dsq_list.node) {
+ list_del_init(&p->scx.dsq_list.node);
+
+ do_enqueue_task(rq, p, SCX_ENQ_REENQ, -1);
+
+ p->scx.flags &= ~SCX_TASK_REENQ_REASON_MASK;
+ nr_enqueued++;
+ }
+
+ return nr_enqueued;
+}
+
+static void process_deferred_reenq_locals(struct rq *rq)
+{
+ u64 seq = ++rq->scx.deferred_reenq_locals_seq;
+
+ lockdep_assert_rq_held(rq);
+
+ while (true) {
+ struct scx_sched *sch;
+ u64 reenq_flags;
+ bool skip = false;
+
+ scoped_guard (raw_spinlock, &rq->scx.deferred_reenq_lock) {
+ struct scx_deferred_reenq_local *drl =
+ list_first_entry_or_null(&rq->scx.deferred_reenq_locals,
+ struct scx_deferred_reenq_local,
+ node);
+ struct scx_sched_pcpu *sch_pcpu;
+
+ if (!drl)
+ return;
+
+ sch_pcpu = container_of(drl, struct scx_sched_pcpu,
+ deferred_reenq_local);
+ sch = sch_pcpu->sch;
+
+ reenq_flags = drl->flags;
+ WRITE_ONCE(drl->flags, 0);
+ list_del_init(&drl->node);
+
+ if (likely(drl->seq != seq)) {
+ drl->seq = seq;
+ drl->cnt = 0;
+ } else {
+ if (unlikely(++drl->cnt > SCX_REENQ_LOCAL_MAX_REPEAT)) {
+ scx_error(sch, "SCX_ENQ_REENQ on SCX_DSQ_LOCAL repeated %u times",
+ drl->cnt);
+ skip = true;
+ }
+
+ __scx_add_event(sch, SCX_EV_REENQ_LOCAL_REPEAT, 1);
+ }
+ }
+
+ if (!skip) {
+ /* see schedule_dsq_reenq() */
+ smp_mb();
+
+ reenq_local(sch, rq, reenq_flags);
+ }
+ }
+}
+
+static bool user_task_should_reenq(struct task_struct *p, u64 reenq_flags, u32 *reason)
+{
+ *reason = SCX_TASK_REENQ_KFUNC;
+ return reenq_flags & SCX_REENQ_ANY;
+}
+
+static void reenq_user(struct rq *rq, struct scx_dispatch_q *dsq, u64 reenq_flags)
+{
+ struct rq *locked_rq = rq;
+ struct scx_sched *sch = dsq->sched;
+ struct scx_dsq_list_node cursor = INIT_DSQ_LIST_CURSOR(cursor, dsq, 0);
+ struct task_struct *p;
+ s32 nr_enqueued = 0;
+
+ lockdep_assert_rq_held(rq);
+
+ raw_spin_lock(&dsq->lock);
+
+ while (likely(!READ_ONCE(sch->bypass_depth))) {
+ struct rq *task_rq;
+ u32 reason;
+
+ p = nldsq_cursor_next_task(&cursor, dsq);
+ if (!p)
+ break;
+
+ if (!user_task_should_reenq(p, reenq_flags, &reason))
+ continue;
+
+ task_rq = task_rq(p);
+
+ if (locked_rq != task_rq) {
+ if (locked_rq)
+ raw_spin_rq_unlock(locked_rq);
+ if (unlikely(!raw_spin_rq_trylock(task_rq))) {
+ raw_spin_unlock(&dsq->lock);
+ raw_spin_rq_lock(task_rq);
+ raw_spin_lock(&dsq->lock);
+ }
+ locked_rq = task_rq;
+
+ /* did we lose @p while switching locks? */
+ if (nldsq_cursor_lost_task(&cursor, task_rq, dsq, p))
+ continue;
+ }
+
+ /* @p is on @dsq, its rq and @dsq are locked */
+ dispatch_dequeue_locked(p, dsq);
+ raw_spin_unlock(&dsq->lock);
+
+ if (WARN_ON_ONCE(p->scx.flags & SCX_TASK_REENQ_REASON_MASK))
+ p->scx.flags &= ~SCX_TASK_REENQ_REASON_MASK;
+ p->scx.flags |= reason;
+
+ do_enqueue_task(task_rq, p, SCX_ENQ_REENQ, -1);
+
+ p->scx.flags &= ~SCX_TASK_REENQ_REASON_MASK;
+
+ if (!(++nr_enqueued % SCX_TASK_ITER_BATCH)) {
+ raw_spin_rq_unlock(locked_rq);
+ locked_rq = NULL;
+ cpu_relax();
+ }
+
+ raw_spin_lock(&dsq->lock);
+ }
+
+ list_del_init(&cursor.node);
+ raw_spin_unlock(&dsq->lock);
+
+ if (locked_rq != rq) {
+ if (locked_rq)
+ raw_spin_rq_unlock(locked_rq);
+ raw_spin_rq_lock(rq);
+ }
+}
+
+static void process_deferred_reenq_users(struct rq *rq)
+{
+ lockdep_assert_rq_held(rq);
+
+ while (true) {
+ struct scx_dispatch_q *dsq;
+ u64 reenq_flags;
+
+ scoped_guard (raw_spinlock, &rq->scx.deferred_reenq_lock) {
+ struct scx_deferred_reenq_user *dru =
+ list_first_entry_or_null(&rq->scx.deferred_reenq_users,
+ struct scx_deferred_reenq_user,
+ node);
+ struct scx_dsq_pcpu *dsq_pcpu;
+
+ if (!dru)
+ return;
+
+ dsq_pcpu = container_of(dru, struct scx_dsq_pcpu,
+ deferred_reenq_user);
+ dsq = dsq_pcpu->dsq;
+ reenq_flags = dru->flags;
+ WRITE_ONCE(dru->flags, 0);
+ list_del_init(&dru->node);
+ }
+
+ /* see schedule_dsq_reenq() */
+ smp_mb();
+
+ BUG_ON(dsq->id & SCX_DSQ_FLAG_BUILTIN);
+ reenq_user(rq, dsq, reenq_flags);
+ }
+}
+
+static void run_deferred(struct rq *rq)
+{
+ process_ddsp_deferred_locals(rq);
+
+ if (!list_empty(&rq->scx.deferred_reenq_locals))
+ process_deferred_reenq_locals(rq);
+
+ if (!list_empty(&rq->scx.deferred_reenq_users))
+ process_deferred_reenq_users(rq);
+}
+
#ifdef CONFIG_NO_HZ_FULL
bool scx_can_stop_tick(struct rq *rq)
{