git.ipfire.org Git - thirdparty/kernel/linux.git/commitdiff
Merge branch 'for-7.0-fixes' into for-7.1
authorTejun Heo <tj@kernel.org>
Fri, 3 Apr 2026 17:48:28 +0000 (07:48 -1000)
committerTejun Heo <tj@kernel.org>
Fri, 3 Apr 2026 17:48:28 +0000 (07:48 -1000)
Conflict in kernel/sched/ext.c between:

  7e0ffb72de8a ("sched_ext: Fix stale direct dispatch state in
  ddsp_dsq_id")

which clears ddsp state at individual call sites instead of
dispatch_enqueue(), and sub-sched related code reorg and API updates on
for-7.1. Resolved by applying the ddsp fix with for-7.1's signatures.

Signed-off-by: Tejun Heo <tj@kernel.org>
1  2 
kernel/sched/ext.c
kernel/sched/ext_idle.c

index 0253887e63c0387479f33739e73a8f0cac07c73f,064eaa76be4b9f8146c1249c6326a7ae4cbaa472..b757b853b42bb0d45014bea77f922c025fe1c1dd
@@@ -1586,40 -1106,9 +1586,31 @@@ static void dispatch_enqueue(struct scx
        WRITE_ONCE(dsq->seq, dsq->seq + 1);
        p->scx.dsq_seq = dsq->seq;
  
 -      dsq_mod_nr(dsq, 1);
 +      dsq_inc_nr(dsq, p, enq_flags);
        p->scx.dsq = dsq;
  
-       /*
-        * scx.ddsp_dsq_id and scx.ddsp_enq_flags are only relevant on the
-        * direct dispatch path, but we clear them here because the direct
-        * dispatch verdict may be overridden on the enqueue path during e.g.
-        * bypass.
-        */
-       p->scx.ddsp_dsq_id = SCX_DSQ_INVALID;
-       p->scx.ddsp_enq_flags = 0;
 +      /*
 +       * Update custody and call ops.dequeue() before clearing ops_state:
 +       * once ops_state is cleared, waiters in ops_dequeue() can proceed
 +       * and dequeue_task_scx() will RMW p->scx.flags. If we clear
 +       * ops_state first, both sides would modify p->scx.flags
 +       * concurrently in a non-atomic way.
 +       */
 +      if (is_local) {
 +              local_dsq_post_enq(dsq, p, enq_flags);
 +      } else {
 +              /*
 +               * Task on global/bypass DSQ: leave custody, task on
 +               * non-terminal DSQ: enter custody.
 +               */
 +              if (dsq->id == SCX_DSQ_GLOBAL || dsq->id == SCX_DSQ_BYPASS)
 +                      call_task_dequeue(sch, rq, p, 0);
 +              else
 +                      p->scx.flags |= SCX_TASK_IN_CUSTODY;
 +
 +              raw_spin_unlock(&dsq->lock);
 +      }
 +
        /*
         * We're transitioning out of QUEUEING or DISPATCHING. store_release to
         * match waiters' load_acquire.
@@@ -1784,7 -1300,8 +1796,8 @@@ static void direct_dispatch(struct scx_
  {
        struct rq *rq = task_rq(p);
        struct scx_dispatch_q *dsq =
 -              find_dsq_for_dispatch(sch, rq, p->scx.ddsp_dsq_id, p);
 +              find_dsq_for_dispatch(sch, rq, p->scx.ddsp_dsq_id, task_cpu(p));
+       u64 ddsp_enq_flags;
  
        touch_core_sched_dispatch(rq, p);
  
                return;
        }
  
-       dispatch_enqueue(sch, rq, dsq, p,
-                        p->scx.ddsp_enq_flags | SCX_ENQ_CLEAR_OPSS);
+       ddsp_enq_flags = p->scx.ddsp_enq_flags;
+       clear_direct_dispatch(p);
 -      dispatch_enqueue(sch, dsq, p, ddsp_enq_flags | SCX_ENQ_CLEAR_OPSS);
++      dispatch_enqueue(sch, rq, dsq, p, ddsp_enq_flags | SCX_ENQ_CLEAR_OPSS);
  }
  
  static bool scx_rq_online(struct rq *rq)
@@@ -1949,7 -1454,8 +1964,8 @@@ enqueue
         */
        touch_core_sched(rq, p);
        refill_task_slice_dfl(sch, p);
 -      dispatch_enqueue(sch, dsq, p, enq_flags);
+       clear_direct_dispatch(p);
 +      dispatch_enqueue(sch, rq, dsq, p, enq_flags);
  }
  
  static bool task_runnable(const struct task_struct *p)
@@@ -3931,313 -3267,6 +3950,315 @@@ int scx_check_setscheduler(struct task_
        return 0;
  }
  
-               dsq = find_dsq_for_dispatch(sch, rq, p->scx.ddsp_dsq_id, task_cpu(p));
 +static void process_ddsp_deferred_locals(struct rq *rq)
 +{
 +      struct task_struct *p;
 +
 +      lockdep_assert_rq_held(rq);
 +
 +      /*
 +       * Now that @rq can be unlocked, execute the deferred enqueueing of
 +       * tasks directly dispatched to the local DSQs of other CPUs. See
 +       * direct_dispatch(). Keep popping from the head instead of using
 +       * list_for_each_entry_safe() as dispatch_local_dsq() may unlock @rq
 +       * temporarily.
 +       */
 +      while ((p = list_first_entry_or_null(&rq->scx.ddsp_deferred_locals,
 +                              struct task_struct, scx.dsq_list.node))) {
 +              struct scx_sched *sch = scx_task_sched(p);
 +              struct scx_dispatch_q *dsq;
++              u64 dsq_id = p->scx.ddsp_dsq_id;
++              u64 enq_flags = p->scx.ddsp_enq_flags;
 +
 +              list_del_init(&p->scx.dsq_list.node);
++              clear_direct_dispatch(p);
 +
-                       dispatch_to_local_dsq(sch, rq, dsq, p,
-                                             p->scx.ddsp_enq_flags);
++              dsq = find_dsq_for_dispatch(sch, rq, dsq_id, task_cpu(p));
 +              if (!WARN_ON_ONCE(dsq->id != SCX_DSQ_LOCAL))
++                      dispatch_to_local_dsq(sch, rq, dsq, p, enq_flags);
 +      }
 +}
 +
 +/*
 + * Determine whether @p should be reenqueued from a local DSQ.
 + *
 + * @reenq_flags is mutable and accumulates state across the DSQ walk:
 + *
 + * - %SCX_REENQ_TSR_NOT_FIRST: Set after the first task is visited. "First"
 + *   tracks position in the DSQ list, not among IMMED tasks. A non-IMMED task at
 + *   the head consumes the first slot.
 + *
 + * - %SCX_REENQ_TSR_RQ_OPEN: Set by reenq_local() before the walk if
 + *   rq_is_open() is true.
 + *
 + * An IMMED task is kept (returns %false) only if it's the first task in the DSQ
 + * AND the current task is done — i.e. it will execute immediately. All other
 + * IMMED tasks are reenqueued. This means if a non-IMMED task sits at the head,
 + * every IMMED task behind it gets reenqueued.
 + *
 + * Reenqueued tasks go through ops.enqueue() with %SCX_ENQ_REENQ |
 + * %SCX_TASK_REENQ_IMMED. If the BPF scheduler dispatches back to the same local
 + * DSQ with %SCX_ENQ_IMMED while the CPU is still unavailable, this triggers
 + * another reenq cycle. Repetitions are bounded by %SCX_REENQ_LOCAL_MAX_REPEAT
 + * in process_deferred_reenq_locals().
 + */
 +static bool local_task_should_reenq(struct task_struct *p, u64 *reenq_flags, u32 *reason)
 +{
 +      bool first;
 +
 +      first = !(*reenq_flags & SCX_REENQ_TSR_NOT_FIRST);
 +      *reenq_flags |= SCX_REENQ_TSR_NOT_FIRST;
 +
 +      *reason = SCX_TASK_REENQ_KFUNC;
 +
 +      if ((p->scx.flags & SCX_TASK_IMMED) &&
 +          (!first || !(*reenq_flags & SCX_REENQ_TSR_RQ_OPEN))) {
 +              __scx_add_event(scx_task_sched(p), SCX_EV_REENQ_IMMED, 1);
 +              *reason = SCX_TASK_REENQ_IMMED;
 +              return true;
 +      }
 +
 +      return *reenq_flags & SCX_REENQ_ANY;
 +}
 +
 +static u32 reenq_local(struct scx_sched *sch, struct rq *rq, u64 reenq_flags)
 +{
 +      LIST_HEAD(tasks);
 +      u32 nr_enqueued = 0;
 +      struct task_struct *p, *n;
 +
 +      lockdep_assert_rq_held(rq);
 +
 +      if (WARN_ON_ONCE(reenq_flags & __SCX_REENQ_TSR_MASK))
 +              reenq_flags &= ~__SCX_REENQ_TSR_MASK;
 +      if (rq_is_open(rq, 0))
 +              reenq_flags |= SCX_REENQ_TSR_RQ_OPEN;
 +
 +      /*
 +       * The BPF scheduler may choose to dispatch tasks back to
 +       * @rq->scx.local_dsq. Move all candidate tasks off to a private list
 +       * first to avoid processing the same tasks repeatedly.
 +       */
 +      list_for_each_entry_safe(p, n, &rq->scx.local_dsq.list,
 +                               scx.dsq_list.node) {
 +              struct scx_sched *task_sch = scx_task_sched(p);
 +              u32 reason;
 +
 +              /*
 +               * If @p is being migrated, @p's current CPU may not agree with
 +               * its allowed CPUs and the migration_cpu_stop is about to
 +               * deactivate and re-activate @p anyway. Skip re-enqueueing.
 +               *
 +               * While racing sched property changes may also dequeue and
 +               * re-enqueue a migrating task while its current CPU and allowed
 +               * CPUs disagree, they use %ENQUEUE_RESTORE which is bypassed to
 +               * the current local DSQ for running tasks and thus are not
 +               * visible to the BPF scheduler.
 +               */
 +              if (p->migration_pending)
 +                      continue;
 +
 +              if (!scx_is_descendant(task_sch, sch))
 +                      continue;
 +
 +              if (!local_task_should_reenq(p, &reenq_flags, &reason))
 +                      continue;
 +
 +              dispatch_dequeue(rq, p);
 +
 +              if (WARN_ON_ONCE(p->scx.flags & SCX_TASK_REENQ_REASON_MASK))
 +                      p->scx.flags &= ~SCX_TASK_REENQ_REASON_MASK;
 +              p->scx.flags |= reason;
 +
 +              list_add_tail(&p->scx.dsq_list.node, &tasks);
 +      }
 +
 +      list_for_each_entry_safe(p, n, &tasks, scx.dsq_list.node) {
 +              list_del_init(&p->scx.dsq_list.node);
 +
 +              do_enqueue_task(rq, p, SCX_ENQ_REENQ, -1);
 +
 +              p->scx.flags &= ~SCX_TASK_REENQ_REASON_MASK;
 +              nr_enqueued++;
 +      }
 +
 +      return nr_enqueued;
 +}
 +
 +static void process_deferred_reenq_locals(struct rq *rq)
 +{
 +      u64 seq = ++rq->scx.deferred_reenq_locals_seq;
 +
 +      lockdep_assert_rq_held(rq);
 +
 +      while (true) {
 +              struct scx_sched *sch;
 +              u64 reenq_flags;
 +              bool skip = false;
 +
 +              scoped_guard (raw_spinlock, &rq->scx.deferred_reenq_lock) {
 +                      struct scx_deferred_reenq_local *drl =
 +                              list_first_entry_or_null(&rq->scx.deferred_reenq_locals,
 +                                                       struct scx_deferred_reenq_local,
 +                                                       node);
 +                      struct scx_sched_pcpu *sch_pcpu;
 +
 +                      if (!drl)
 +                              return;
 +
 +                      sch_pcpu = container_of(drl, struct scx_sched_pcpu,
 +                                              deferred_reenq_local);
 +                      sch = sch_pcpu->sch;
 +
 +                      reenq_flags = drl->flags;
 +                      WRITE_ONCE(drl->flags, 0);
 +                      list_del_init(&drl->node);
 +
 +                      if (likely(drl->seq != seq)) {
 +                              drl->seq = seq;
 +                              drl->cnt = 0;
 +                      } else {
 +                              if (unlikely(++drl->cnt > SCX_REENQ_LOCAL_MAX_REPEAT)) {
 +                                      scx_error(sch, "SCX_ENQ_REENQ on SCX_DSQ_LOCAL repeated %u times",
 +                                                drl->cnt);
 +                                      skip = true;
 +                              }
 +
 +                              __scx_add_event(sch, SCX_EV_REENQ_LOCAL_REPEAT, 1);
 +                      }
 +              }
 +
 +              if (!skip) {
 +                      /* see schedule_dsq_reenq() */
 +                      smp_mb();
 +
 +                      reenq_local(sch, rq, reenq_flags);
 +              }
 +      }
 +}
 +
 +static bool user_task_should_reenq(struct task_struct *p, u64 reenq_flags, u32 *reason)
 +{
 +      *reason = SCX_TASK_REENQ_KFUNC;
 +      return reenq_flags & SCX_REENQ_ANY;
 +}
 +
 +static void reenq_user(struct rq *rq, struct scx_dispatch_q *dsq, u64 reenq_flags)
 +{
 +      struct rq *locked_rq = rq;
 +      struct scx_sched *sch = dsq->sched;
 +      struct scx_dsq_list_node cursor = INIT_DSQ_LIST_CURSOR(cursor, dsq, 0);
 +      struct task_struct *p;
 +      s32 nr_enqueued = 0;
 +
 +      lockdep_assert_rq_held(rq);
 +
 +      raw_spin_lock(&dsq->lock);
 +
 +      while (likely(!READ_ONCE(sch->bypass_depth))) {
 +              struct rq *task_rq;
 +              u32 reason;
 +
 +              p = nldsq_cursor_next_task(&cursor, dsq);
 +              if (!p)
 +                      break;
 +
 +              if (!user_task_should_reenq(p, reenq_flags, &reason))
 +                      continue;
 +
 +              task_rq = task_rq(p);
 +
 +              if (locked_rq != task_rq) {
 +                      if (locked_rq)
 +                              raw_spin_rq_unlock(locked_rq);
 +                      if (unlikely(!raw_spin_rq_trylock(task_rq))) {
 +                              raw_spin_unlock(&dsq->lock);
 +                              raw_spin_rq_lock(task_rq);
 +                              raw_spin_lock(&dsq->lock);
 +                      }
 +                      locked_rq = task_rq;
 +
 +                      /* did we lose @p while switching locks? */
 +                      if (nldsq_cursor_lost_task(&cursor, task_rq, dsq, p))
 +                              continue;
 +              }
 +
 +              /* @p is on @dsq, its rq and @dsq are locked */
 +              dispatch_dequeue_locked(p, dsq);
 +              raw_spin_unlock(&dsq->lock);
 +
 +              if (WARN_ON_ONCE(p->scx.flags & SCX_TASK_REENQ_REASON_MASK))
 +                      p->scx.flags &= ~SCX_TASK_REENQ_REASON_MASK;
 +              p->scx.flags |= reason;
 +
 +              do_enqueue_task(task_rq, p, SCX_ENQ_REENQ, -1);
 +
 +              p->scx.flags &= ~SCX_TASK_REENQ_REASON_MASK;
 +
 +              if (!(++nr_enqueued % SCX_TASK_ITER_BATCH)) {
 +                      raw_spin_rq_unlock(locked_rq);
 +                      locked_rq = NULL;
 +                      cpu_relax();
 +              }
 +
 +              raw_spin_lock(&dsq->lock);
 +      }
 +
 +      list_del_init(&cursor.node);
 +      raw_spin_unlock(&dsq->lock);
 +
 +      if (locked_rq != rq) {
 +              if (locked_rq)
 +                      raw_spin_rq_unlock(locked_rq);
 +              raw_spin_rq_lock(rq);
 +      }
 +}
 +
 +static void process_deferred_reenq_users(struct rq *rq)
 +{
 +      lockdep_assert_rq_held(rq);
 +
 +      while (true) {
 +              struct scx_dispatch_q *dsq;
 +              u64 reenq_flags;
 +
 +              scoped_guard (raw_spinlock, &rq->scx.deferred_reenq_lock) {
 +                      struct scx_deferred_reenq_user *dru =
 +                              list_first_entry_or_null(&rq->scx.deferred_reenq_users,
 +                                                       struct scx_deferred_reenq_user,
 +                                                       node);
 +                      struct scx_dsq_pcpu *dsq_pcpu;
 +
 +                      if (!dru)
 +                              return;
 +
 +                      dsq_pcpu = container_of(dru, struct scx_dsq_pcpu,
 +                                              deferred_reenq_user);
 +                      dsq = dsq_pcpu->dsq;
 +                      reenq_flags = dru->flags;
 +                      WRITE_ONCE(dru->flags, 0);
 +                      list_del_init(&dru->node);
 +              }
 +
 +              /* see schedule_dsq_reenq() */
 +              smp_mb();
 +
 +              BUG_ON(dsq->id & SCX_DSQ_FLAG_BUILTIN);
 +              reenq_user(rq, dsq, reenq_flags);
 +      }
 +}
 +
 +static void run_deferred(struct rq *rq)
 +{
 +      process_ddsp_deferred_locals(rq);
 +
 +      if (!list_empty(&rq->scx.deferred_reenq_locals))
 +              process_deferred_reenq_locals(rq);
 +
 +      if (!list_empty(&rq->scx.deferred_reenq_users))
 +              process_deferred_reenq_users(rq);
 +}
 +
  #ifdef CONFIG_NO_HZ_FULL
  bool scx_can_stop_tick(struct rq *rq)
  {
Simple merge