Merge branch 'for-7.0-fixes' into for-7.1

author    Tejun Heo <tj@kernel.org>  Fri, 3 Apr 2026 17:48:28 +0000 (07:48 -1000)
committer Tejun Heo <tj@kernel.org>  Fri, 3 Apr 2026 17:48:28 +0000 (07:48 -1000)

diff --cc kernel/sched/ext.c

index 0253887e63c0387479f33739e73a8f0cac07c73f,064eaa76be4b9f8146c1249c6326a7ae4cbaa472..b757b853b42bb0d45014bea77f922c025fe1c1dd
--- 1/kernel/sched/ext.c
--- 2/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@@ -1586,40 -1106,9 +1586,31 @@@ static void dispatch_enqueue(struct scx
         WRITE_ONCE(dsq->seq, dsq->seq + 1);
         p->scx.dsq_seq = dsq->seq;
   
- -      dsq_mod_nr(dsq, 1);
+ +      dsq_inc_nr(dsq, p, enq_flags);
         p->scx.dsq = dsq;
   
-       /*
-        * scx.ddsp_dsq_id and scx.ddsp_enq_flags are only relevant on the
-        * direct dispatch path, but we clear them here because the direct
-        * dispatch verdict may be overridden on the enqueue path during e.g.
-        * bypass.
-        */
-       p->scx.ddsp_dsq_id = SCX_DSQ_INVALID;
-       p->scx.ddsp_enq_flags = 0;
- 
+ +      /*
+ +       * Update custody and call ops.dequeue() before clearing ops_state:
+ +       * once ops_state is cleared, waiters in ops_dequeue() can proceed
+ +       * and dequeue_task_scx() will RMW p->scx.flags. If we clear
+ +       * ops_state first, both sides would modify p->scx.flags
+ +       * concurrently in a non-atomic way.
+ +       */
+ +      if (is_local) {
+ +              local_dsq_post_enq(dsq, p, enq_flags);
+ +      } else {
+ +              /*
+ +               * Task on global/bypass DSQ: leave custody, task on
+ +               * non-terminal DSQ: enter custody.
+ +               */
+ +              if (dsq->id == SCX_DSQ_GLOBAL || dsq->id == SCX_DSQ_BYPASS)
+ +                      call_task_dequeue(sch, rq, p, 0);
+ +              else
+ +                      p->scx.flags |= SCX_TASK_IN_CUSTODY;
+ +
+ +              raw_spin_unlock(&dsq->lock);
+ +      }
+ +
         /*
          * We're transitioning out of QUEUEING or DISPATCHING. store_release to
          * match waiters' load_acquire.
@@@ -1784,7 -1300,8 +1796,8 @@@ static void direct_dispatch(struct scx_
   {
         struct rq *rq = task_rq(p);
         struct scx_dispatch_q *dsq =
- -              find_dsq_for_dispatch(sch, rq, p->scx.ddsp_dsq_id, p);
+ +              find_dsq_for_dispatch(sch, rq, p->scx.ddsp_dsq_id, task_cpu(p));
+       u64 ddsp_enq_flags;
   
         touch_core_sched_dispatch(rq, p);
   
@@@ -1825,8 -1342,10 +1838,10 @@@
                 return;
         }
   
-       dispatch_enqueue(sch, rq, dsq, p,
-                        p->scx.ddsp_enq_flags | SCX_ENQ_CLEAR_OPSS);
+       ddsp_enq_flags = p->scx.ddsp_enq_flags;
+       clear_direct_dispatch(p);
+ 
- -      dispatch_enqueue(sch, dsq, p, ddsp_enq_flags | SCX_ENQ_CLEAR_OPSS);
++      dispatch_enqueue(sch, rq, dsq, p, ddsp_enq_flags | SCX_ENQ_CLEAR_OPSS);
   }
   
   static bool scx_rq_online(struct rq *rq)
@@@ -1949,7 -1454,8 +1964,8 @@@ enqueue
          */
         touch_core_sched(rq, p);
         refill_task_slice_dfl(sch, p);
- -      dispatch_enqueue(sch, dsq, p, enq_flags);
+       clear_direct_dispatch(p);
+ +      dispatch_enqueue(sch, rq, dsq, p, enq_flags);
   }
   
   static bool task_runnable(const struct task_struct *p)
@@@ -3931,313 -3267,6 +3950,315 @@@ int scx_check_setscheduler(struct task_
         return 0;
   }
   
-               dsq = find_dsq_for_dispatch(sch, rq, p->scx.ddsp_dsq_id, task_cpu(p));
+ +static void process_ddsp_deferred_locals(struct rq *rq)
+ +{
+ +      struct task_struct *p;
+ +
+ +      lockdep_assert_rq_held(rq);
+ +
+ +      /*
+ +       * Now that @rq can be unlocked, execute the deferred enqueueing of
+ +       * tasks directly dispatched to the local DSQs of other CPUs. See
+ +       * direct_dispatch(). Keep popping from the head instead of using
+ +       * list_for_each_entry_safe() as dispatch_local_dsq() may unlock @rq
+ +       * temporarily.
+ +       */
+ +      while ((p = list_first_entry_or_null(&rq->scx.ddsp_deferred_locals,
+ +                              struct task_struct, scx.dsq_list.node))) {
+ +              struct scx_sched *sch = scx_task_sched(p);
+ +              struct scx_dispatch_q *dsq;
++              u64 dsq_id = p->scx.ddsp_dsq_id;
++              u64 enq_flags = p->scx.ddsp_enq_flags;
+ +
+ +              list_del_init(&p->scx.dsq_list.node);
++              clear_direct_dispatch(p);
+ +
-                       dispatch_to_local_dsq(sch, rq, dsq, p,
-                                             p->scx.ddsp_enq_flags);
++              dsq = find_dsq_for_dispatch(sch, rq, dsq_id, task_cpu(p));
+ +              if (!WARN_ON_ONCE(dsq->id != SCX_DSQ_LOCAL))
++                      dispatch_to_local_dsq(sch, rq, dsq, p, enq_flags);
+ +      }
+ +}
+ +
+ +/*
+ + * Determine whether @p should be reenqueued from a local DSQ.
+ + *
+ + * @reenq_flags is mutable and accumulates state across the DSQ walk:
+ + *
+ + * - %SCX_REENQ_TSR_NOT_FIRST: Set after the first task is visited. "First"
+ + *   tracks position in the DSQ list, not among IMMED tasks. A non-IMMED task at
+ + *   the head consumes the first slot.
+ + *
+ + * - %SCX_REENQ_TSR_RQ_OPEN: Set by reenq_local() before the walk if
+ + *   rq_is_open() is true.
+ + *
+ + * An IMMED task is kept (returns %false) only if it's the first task in the DSQ
+ + * AND the current task is done — i.e. it will execute immediately. All other
+ + * IMMED tasks are reenqueued. This means if a non-IMMED task sits at the head,
+ + * every IMMED task behind it gets reenqueued.
+ + *
+ + * Reenqueued tasks go through ops.enqueue() with %SCX_ENQ_REENQ |
+ + * %SCX_TASK_REENQ_IMMED. If the BPF scheduler dispatches back to the same local
+ + * DSQ with %SCX_ENQ_IMMED while the CPU is still unavailable, this triggers
+ + * another reenq cycle. Repetitions are bounded by %SCX_REENQ_LOCAL_MAX_REPEAT
+ + * in process_deferred_reenq_locals().
+ + */
+ +static bool local_task_should_reenq(struct task_struct *p, u64 *reenq_flags, u32 *reason)
+ +{
+ +      bool first;
+ +
+ +      first = !(*reenq_flags & SCX_REENQ_TSR_NOT_FIRST);
+ +      *reenq_flags |= SCX_REENQ_TSR_NOT_FIRST;
+ +
+ +      *reason = SCX_TASK_REENQ_KFUNC;
+ +
+ +      if ((p->scx.flags & SCX_TASK_IMMED) &&
+ +          (!first || !(*reenq_flags & SCX_REENQ_TSR_RQ_OPEN))) {
+ +              __scx_add_event(scx_task_sched(p), SCX_EV_REENQ_IMMED, 1);
+ +              *reason = SCX_TASK_REENQ_IMMED;
+ +              return true;
+ +      }
+ +
+ +      return *reenq_flags & SCX_REENQ_ANY;
+ +}
+ +
+ +static u32 reenq_local(struct scx_sched *sch, struct rq *rq, u64 reenq_flags)
+ +{
+ +      LIST_HEAD(tasks);
+ +      u32 nr_enqueued = 0;
+ +      struct task_struct *p, *n;
+ +
+ +      lockdep_assert_rq_held(rq);
+ +
+ +      if (WARN_ON_ONCE(reenq_flags & __SCX_REENQ_TSR_MASK))
+ +              reenq_flags &= ~__SCX_REENQ_TSR_MASK;
+ +      if (rq_is_open(rq, 0))
+ +              reenq_flags |= SCX_REENQ_TSR_RQ_OPEN;
+ +
+ +      /*
+ +       * The BPF scheduler may choose to dispatch tasks back to
+ +       * @rq->scx.local_dsq. Move all candidate tasks off to a private list
+ +       * first to avoid processing the same tasks repeatedly.
+ +       */
+ +      list_for_each_entry_safe(p, n, &rq->scx.local_dsq.list,
+ +                               scx.dsq_list.node) {
+ +              struct scx_sched *task_sch = scx_task_sched(p);
+ +              u32 reason;
+ +
+ +              /*
+ +               * If @p is being migrated, @p's current CPU may not agree with
+ +               * its allowed CPUs and the migration_cpu_stop is about to
+ +               * deactivate and re-activate @p anyway. Skip re-enqueueing.
+ +               *
+ +               * While racing sched property changes may also dequeue and
+ +               * re-enqueue a migrating task while its current CPU and allowed
+ +               * CPUs disagree, they use %ENQUEUE_RESTORE which is bypassed to
+ +               * the current local DSQ for running tasks and thus are not
+ +               * visible to the BPF scheduler.
+ +               */
+ +              if (p->migration_pending)
+ +                      continue;
+ +
+ +              if (!scx_is_descendant(task_sch, sch))
+ +                      continue;
+ +
+ +              if (!local_task_should_reenq(p, &reenq_flags, &reason))
+ +                      continue;
+ +
+ +              dispatch_dequeue(rq, p);
+ +
+ +              if (WARN_ON_ONCE(p->scx.flags & SCX_TASK_REENQ_REASON_MASK))
+ +                      p->scx.flags &= ~SCX_TASK_REENQ_REASON_MASK;
+ +              p->scx.flags |= reason;
+ +
+ +              list_add_tail(&p->scx.dsq_list.node, &tasks);
+ +      }
+ +
+ +      list_for_each_entry_safe(p, n, &tasks, scx.dsq_list.node) {
+ +              list_del_init(&p->scx.dsq_list.node);
+ +
+ +              do_enqueue_task(rq, p, SCX_ENQ_REENQ, -1);
+ +
+ +              p->scx.flags &= ~SCX_TASK_REENQ_REASON_MASK;
+ +              nr_enqueued++;
+ +      }
+ +
+ +      return nr_enqueued;
+ +}
+ +
+ +static void process_deferred_reenq_locals(struct rq *rq)
+ +{
+ +      u64 seq = ++rq->scx.deferred_reenq_locals_seq;
+ +
+ +      lockdep_assert_rq_held(rq);
+ +
+ +      while (true) {
+ +              struct scx_sched *sch;
+ +              u64 reenq_flags;
+ +              bool skip = false;
+ +
+ +              scoped_guard (raw_spinlock, &rq->scx.deferred_reenq_lock) {
+ +                      struct scx_deferred_reenq_local *drl =
+ +                              list_first_entry_or_null(&rq->scx.deferred_reenq_locals,
+ +                                                       struct scx_deferred_reenq_local,
+ +                                                       node);
+ +                      struct scx_sched_pcpu *sch_pcpu;
+ +
+ +                      if (!drl)
+ +                              return;
+ +
+ +                      sch_pcpu = container_of(drl, struct scx_sched_pcpu,
+ +                                              deferred_reenq_local);
+ +                      sch = sch_pcpu->sch;
+ +
+ +                      reenq_flags = drl->flags;
+ +                      WRITE_ONCE(drl->flags, 0);
+ +                      list_del_init(&drl->node);
+ +
+ +                      if (likely(drl->seq != seq)) {
+ +                              drl->seq = seq;
+ +                              drl->cnt = 0;
+ +                      } else {
+ +                              if (unlikely(++drl->cnt > SCX_REENQ_LOCAL_MAX_REPEAT)) {
+ +                                      scx_error(sch, "SCX_ENQ_REENQ on SCX_DSQ_LOCAL repeated %u times",
+ +                                                drl->cnt);
+ +                                      skip = true;
+ +                              }
+ +
+ +                              __scx_add_event(sch, SCX_EV_REENQ_LOCAL_REPEAT, 1);
+ +                      }
+ +              }
+ +
+ +              if (!skip) {
+ +                      /* see schedule_dsq_reenq() */
+ +                      smp_mb();
+ +
+ +                      reenq_local(sch, rq, reenq_flags);
+ +              }
+ +      }
+ +}
+ +
+ +static bool user_task_should_reenq(struct task_struct *p, u64 reenq_flags, u32 *reason)
+ +{
+ +      *reason = SCX_TASK_REENQ_KFUNC;
+ +      return reenq_flags & SCX_REENQ_ANY;
+ +}
+ +
+ +static void reenq_user(struct rq *rq, struct scx_dispatch_q *dsq, u64 reenq_flags)
+ +{
+ +      struct rq *locked_rq = rq;
+ +      struct scx_sched *sch = dsq->sched;
+ +      struct scx_dsq_list_node cursor = INIT_DSQ_LIST_CURSOR(cursor, dsq, 0);
+ +      struct task_struct *p;
+ +      s32 nr_enqueued = 0;
+ +
+ +      lockdep_assert_rq_held(rq);
+ +
+ +      raw_spin_lock(&dsq->lock);
+ +
+ +      while (likely(!READ_ONCE(sch->bypass_depth))) {
+ +              struct rq *task_rq;
+ +              u32 reason;
+ +
+ +              p = nldsq_cursor_next_task(&cursor, dsq);
+ +              if (!p)
+ +                      break;
+ +
+ +              if (!user_task_should_reenq(p, reenq_flags, &reason))
+ +                      continue;
+ +
+ +              task_rq = task_rq(p);
+ +
+ +              if (locked_rq != task_rq) {
+ +                      if (locked_rq)
+ +                              raw_spin_rq_unlock(locked_rq);
+ +                      if (unlikely(!raw_spin_rq_trylock(task_rq))) {
+ +                              raw_spin_unlock(&dsq->lock);
+ +                              raw_spin_rq_lock(task_rq);
+ +                              raw_spin_lock(&dsq->lock);
+ +                      }
+ +                      locked_rq = task_rq;
+ +
+ +                      /* did we lose @p while switching locks? */
+ +                      if (nldsq_cursor_lost_task(&cursor, task_rq, dsq, p))
+ +                              continue;
+ +              }
+ +
+ +              /* @p is on @dsq, its rq and @dsq are locked */
+ +              dispatch_dequeue_locked(p, dsq);
+ +              raw_spin_unlock(&dsq->lock);
+ +
+ +              if (WARN_ON_ONCE(p->scx.flags & SCX_TASK_REENQ_REASON_MASK))
+ +                      p->scx.flags &= ~SCX_TASK_REENQ_REASON_MASK;
+ +              p->scx.flags |= reason;
+ +
+ +              do_enqueue_task(task_rq, p, SCX_ENQ_REENQ, -1);
+ +
+ +              p->scx.flags &= ~SCX_TASK_REENQ_REASON_MASK;
+ +
+ +              if (!(++nr_enqueued % SCX_TASK_ITER_BATCH)) {
+ +                      raw_spin_rq_unlock(locked_rq);
+ +                      locked_rq = NULL;
+ +                      cpu_relax();
+ +              }
+ +
+ +              raw_spin_lock(&dsq->lock);
+ +      }
+ +
+ +      list_del_init(&cursor.node);
+ +      raw_spin_unlock(&dsq->lock);
+ +
+ +      if (locked_rq != rq) {
+ +              if (locked_rq)
+ +                      raw_spin_rq_unlock(locked_rq);
+ +              raw_spin_rq_lock(rq);
+ +      }
+ +}
+ +
+ +static void process_deferred_reenq_users(struct rq *rq)
+ +{
+ +      lockdep_assert_rq_held(rq);
+ +
+ +      while (true) {
+ +              struct scx_dispatch_q *dsq;
+ +              u64 reenq_flags;
+ +
+ +              scoped_guard (raw_spinlock, &rq->scx.deferred_reenq_lock) {
+ +                      struct scx_deferred_reenq_user *dru =
+ +                              list_first_entry_or_null(&rq->scx.deferred_reenq_users,
+ +                                                       struct scx_deferred_reenq_user,
+ +                                                       node);
+ +                      struct scx_dsq_pcpu *dsq_pcpu;
+ +
+ +                      if (!dru)
+ +                              return;
+ +
+ +                      dsq_pcpu = container_of(dru, struct scx_dsq_pcpu,
+ +                                              deferred_reenq_user);
+ +                      dsq = dsq_pcpu->dsq;
+ +                      reenq_flags = dru->flags;
+ +                      WRITE_ONCE(dru->flags, 0);
+ +                      list_del_init(&dru->node);
+ +              }
+ +
+ +              /* see schedule_dsq_reenq() */
+ +              smp_mb();
+ +
+ +              BUG_ON(dsq->id & SCX_DSQ_FLAG_BUILTIN);
+ +              reenq_user(rq, dsq, reenq_flags);
+ +      }
+ +}
+ +
+ +static void run_deferred(struct rq *rq)
+ +{
+ +      process_ddsp_deferred_locals(rq);
+ +
+ +      if (!list_empty(&rq->scx.deferred_reenq_locals))
+ +              process_deferred_reenq_locals(rq);
+ +
+ +      if (!list_empty(&rq->scx.deferred_reenq_users))
+ +              process_deferred_reenq_users(rq);
+ +}
+ +
   #ifdef CONFIG_NO_HZ_FULL
   bool scx_can_stop_tick(struct rq *rq)
   {
diff --cc kernel/sched/ext_idle.c
Simple merge
author	Tejun Heo <tj@kernel.org>
	Fri, 3 Apr 2026 17:48:28 +0000 (07:48 -1000)
committer	Tejun Heo <tj@kernel.org>
	Fri, 3 Apr 2026 17:48:28 +0000 (07:48 -1000)
Files changed in this merge (combined diff against sides 1 and 2):
kernel/sched/ext.c
kernel/sched/ext_idle.c