sched_ext: Allow SCX_DSQ_LOCAL_ON for direct dispatches
author    Tejun Heo <tj@kernel.org>    Fri, 12 Jul 2024 18:20:33 +0000 (08:20 -1000)
committer Tejun Heo <tj@kernel.org>    Fri, 12 Jul 2024 18:20:33 +0000 (08:20 -1000)
In ops.dispatch(), SCX_DSQ_LOCAL_ON can be used to dispatch the task to the
local DSQ of any CPU. However, during direct dispatch from ops.select_cpu()
and ops.enqueue(), this isn't allowed. This is because dispatching to the
local DSQ of a remote CPU requires locking both the task's current and new
rq's, and such double locking can't be done directly from ops.enqueue().

While waking up a task, this is not a real limitation: ops.select_cpu() can
pick any CPU, and both ops.select_cpu() and ops.enqueue() can use
SCX_DSQ_LOCAL as the dispatch target to dispatch to the local DSQ of the
picked CPU, so the BPF scheduler can still place the task wherever it wants.
However, when a task is being enqueued for a different reason, e.g. after its
slice expired, only ops.enqueue() is called and there is no way for the BPF
scheduler to directly dispatch to the local DSQ of a remote CPU. This gap in
the API forces schedulers into workarounds which are neither straightforward
nor optimal, such as skipping direct dispatch in those cases.
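
For illustration, a minimal sketch of the pre-patch wakeup-time workaround,
assuming the scx_bpf_dispatch() kfunc and the BPF_STRUCT_OPS() helper from
the in-tree tools/sched_ext headers; pick_target_cpu() is a hypothetical
policy helper, not part of the API:

/*
 * Wakeup-only workaround sketch: pick the target CPU in ops.select_cpu()
 * so that a SCX_DSQ_LOCAL direct dispatch lands in that CPU's local DSQ.
 * Non-wakeup enqueues never go through this path.
 */
s32 BPF_STRUCT_OPS(example_select_cpu, struct task_struct *p,
		   s32 prev_cpu, u64 wake_flags)
{
	s32 cpu = pick_target_cpu(p);	/* hypothetical helper */

	/* direct dispatch; SCX_DSQ_LOCAL here means @cpu's local DSQ */
	scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
	return cpu;
}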

Implement deferred enqueueing to allow directly dispatching to the local DSQ
of a remote CPU from ops.select_cpu() and ops.enqueue(). Such tasks are
temporarily queued on rq->scx.ddsp_deferred_locals. When the rq lock can be
safely released, the tasks are taken off the list and queued on the target
local DSQs using dispatch_to_local_dsq().
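
As a minimal sketch of what this enables, again assuming the
scx_bpf_dispatch() kfunc and the hypothetical pick_target_cpu() helper from
the sketch above:

/*
 * Post-patch sketch: ops.enqueue() can now direct-dispatch to a remote
 * CPU's local DSQ with SCX_DSQ_LOCAL_ON; the core defers the actual
 * insertion until the rq lock can be dropped.
 */
void BPF_STRUCT_OPS(example_enqueue, struct task_struct *p, u64 enq_flags)
{
	s32 cpu = pick_target_cpu(p);	/* hypothetical helper */

	scx_bpf_dispatch(p, SCX_DSQ_LOCAL_ON | cpu, SCX_SLICE_DFL, enq_flags);
}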

v2: - Add missing return after queue_balance_callback() in
      schedule_deferred(). (David).

    - dispatch_to_local_dsq() now assumes that @rq is locked but unpinned
      and thus no longer takes @rf. Updated accordingly.

    - UP build warning fix.

Signed-off-by: Tejun Heo <tj@kernel.org>
Tested-by: Andrea Righi <righi.andrea@gmail.com>
Acked-by: David Vernet <void@manifault.com>
Cc: Dan Schatzberg <schatzberg.dan@gmail.com>
Cc: Changwoo Min <changwoo@igalia.com>
kernel/sched/ext.c
kernel/sched/sched.h

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 57d6ea65f857b2e40372f7c34ff399f85b62a1cf..03da2cecb54716e0f2a105d0b937ecd585ce6ff4 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -888,6 +888,7 @@ static struct kobject *scx_root_kobj;
 #define CREATE_TRACE_POINTS
 #include <trace/events/sched_ext.h>
 
+static void process_ddsp_deferred_locals(struct rq *rq);
 static void scx_bpf_kick_cpu(s32 cpu, u64 flags);
 static __printf(3, 4) void scx_ops_exit_kind(enum scx_exit_kind kind,
                                             s64 exit_code,
@@ -1362,6 +1363,67 @@ static int ops_sanitize_err(const char *ops_name, s32 err)
        return -EPROTO;
 }
 
+static void run_deferred(struct rq *rq)
+{
+       process_ddsp_deferred_locals(rq);
+}
+
+#ifdef CONFIG_SMP
+static void deferred_bal_cb_workfn(struct rq *rq)
+{
+       run_deferred(rq);
+}
+#endif
+
+static void deferred_irq_workfn(struct irq_work *irq_work)
+{
+       struct rq *rq = container_of(irq_work, struct rq, scx.deferred_irq_work);
+
+       raw_spin_rq_lock(rq);
+       run_deferred(rq);
+       raw_spin_rq_unlock(rq);
+}
+
+/**
+ * schedule_deferred - Schedule execution of deferred actions on an rq
+ * @rq: target rq
+ *
+ * Schedule execution of deferred actions on @rq. Must be called with @rq
+ * locked. Deferred actions are executed with @rq locked but unpinned, and thus
+ * can unlock @rq to e.g. migrate tasks to other rqs.
+ */
+static void schedule_deferred(struct rq *rq)
+{
+       lockdep_assert_rq_held(rq);
+
+#ifdef CONFIG_SMP
+       /*
+        * If in the middle of waking up a task, task_woken_scx() will be
+        * called afterwards, which will then run the deferred actions; no
+        * need to schedule anything.
+        */
+       if (rq->scx.flags & SCX_RQ_IN_WAKEUP)
+               return;
+
+       /*
+        * If in balance, the balance callbacks will be called before the rq
+        * lock is released. Schedule one.
+        */
+       if (rq->scx.flags & SCX_RQ_IN_BALANCE) {
+               queue_balance_callback(rq, &rq->scx.deferred_bal_cb,
+                                      deferred_bal_cb_workfn);
+               return;
+       }
+#endif
+       /*
+        * No scheduler hooks available. Queue an irq work. It is executed on
+        * IRQ re-enable, which may take a bit longer than the scheduler hooks.
+        * The above WAKEUP and BALANCE paths should cover most of the cases and
+        * the time to IRQ re-enable shouldn't be long.
+        */
+       irq_work_queue(&rq->scx.deferred_irq_work);
+}
+
 /**
  * touch_core_sched - Update timestamp used for core-sched task ordering
  * @rq: rq to read clock from, must be locked
@@ -1577,7 +1639,13 @@ static void dispatch_dequeue(struct rq *rq, struct task_struct *p)
        bool is_local = dsq == &rq->scx.local_dsq;
 
        if (!dsq) {
-               WARN_ON_ONCE(!list_empty(&p->scx.dsq_list.node));
+               /*
+                * If !dsq && on-list, @p is on @rq's ddsp_deferred_locals.
+                * Unlinking is all that's needed to cancel.
+                */
+               if (unlikely(!list_empty(&p->scx.dsq_list.node)))
+                       list_del_init(&p->scx.dsq_list.node);
+
                /*
                 * When dispatching directly from the BPF scheduler to a local
                 * DSQ, the task isn't associated with any DSQ but
@@ -1586,6 +1654,7 @@ static void dispatch_dequeue(struct rq *rq, struct task_struct *p)
                 */
                if (p->scx.holding_cpu >= 0)
                        p->scx.holding_cpu = -1;
+
                return;
        }
 
@@ -1673,17 +1742,6 @@ static void mark_direct_dispatch(struct task_struct *ddsp_task,
                return;
        }
 
-       /*
-        * %SCX_DSQ_LOCAL_ON is not supported during direct dispatch because
-        * dispatching to the local DSQ of a different CPU requires unlocking
-        * the current rq which isn't allowed in the enqueue path. Use
-        * ops.select_cpu() to be on the target CPU and then %SCX_DSQ_LOCAL.
-        */
-       if (unlikely((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON)) {
-               scx_ops_error("SCX_DSQ_LOCAL_ON can't be used for direct-dispatch");
-               return;
-       }
-
        WARN_ON_ONCE(p->scx.ddsp_dsq_id != SCX_DSQ_INVALID);
        WARN_ON_ONCE(p->scx.ddsp_enq_flags);
 
@@ -1693,13 +1751,58 @@ static void mark_direct_dispatch(struct task_struct *ddsp_task,
 
 static void direct_dispatch(struct task_struct *p, u64 enq_flags)
 {
+       struct rq *rq = task_rq(p);
        struct scx_dispatch_q *dsq;
+       u64 dsq_id = p->scx.ddsp_dsq_id;
+
+       touch_core_sched_dispatch(rq, p);
+
+       p->scx.ddsp_enq_flags |= enq_flags;
+
+       /*
+        * We are in the enqueue path with @rq locked and pinned, and thus can't
+        * double lock a remote rq and enqueue to its local DSQ. For
+        * DSQ_LOCAL_ON verdicts targeting the local DSQ of a remote CPU, defer
+        * the enqueue so that it's executed when @rq can be unlocked.
+        */
+       if ((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON) {
+               s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK;
+               unsigned long opss;
 
-       touch_core_sched_dispatch(task_rq(p), p);
+               if (cpu == cpu_of(rq)) {
+                       dsq_id = SCX_DSQ_LOCAL;
+                       goto dispatch;
+               }
+
+               opss = atomic_long_read(&p->scx.ops_state) & SCX_OPSS_STATE_MASK;
+
+               switch (opss) {
+               case SCX_OPSS_NONE:
+                       break;
+               case SCX_OPSS_QUEUEING:
+                       /*
+                        * As @p was never passed to the BPF side, _release is
+                        * not strictly necessary. Still do it for consistency.
+                        */
+                       atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE);
+                       break;
+               default:
+                       WARN_ONCE(true, "sched_ext: %s[%d] has invalid ops state 0x%lx in direct_dispatch()",
+                                 p->comm, p->pid, opss);
+                       atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE);
+                       break;
+               }
 
-       enq_flags |= (p->scx.ddsp_enq_flags | SCX_ENQ_CLEAR_OPSS);
-       dsq = find_dsq_for_dispatch(task_rq(p), p->scx.ddsp_dsq_id, p);
-       dispatch_enqueue(dsq, p, enq_flags);
+               WARN_ON_ONCE(p->scx.dsq || !list_empty(&p->scx.dsq_list.node));
+               list_add_tail(&p->scx.dsq_list.node,
+                             &rq->scx.ddsp_deferred_locals);
+               schedule_deferred(rq);
+               return;
+       }
+
+dispatch:
+       dsq = find_dsq_for_dispatch(rq, dsq_id, p);
+       dispatch_enqueue(dsq, p, p->scx.ddsp_enq_flags | SCX_ENQ_CLEAR_OPSS);
 }
 
 static bool scx_rq_online(struct rq *rq)
@@ -2601,6 +2704,29 @@ static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first)
        }
 }
 
+static void process_ddsp_deferred_locals(struct rq *rq)
+{
+       struct task_struct *p, *tmp;
+
+       lockdep_assert_rq_held(rq);
+
+       /*
+        * Now that @rq can be unlocked, execute the deferred enqueueing of
+        * tasks directly dispatched to the local DSQs of other CPUs. See
+        * direct_dispatch().
+        */
+       list_for_each_entry_safe(p, tmp, &rq->scx.ddsp_deferred_locals,
+                                scx.dsq_list.node) {
+               s32 ret;
+
+               list_del_init(&p->scx.dsq_list.node);
+
+               ret = dispatch_to_local_dsq(rq, p->scx.ddsp_dsq_id, p,
+                                           p->scx.ddsp_enq_flags);
+               WARN_ON_ONCE(ret == DTL_NOT_LOCAL);
+       }
+}
+
 static void put_prev_task_scx(struct rq *rq, struct task_struct *p)
 {
 #ifndef CONFIG_SMP
@@ -3022,6 +3148,11 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag
        }
 }
 
+static void task_woken_scx(struct rq *rq, struct task_struct *p)
+{
+       run_deferred(rq);
+}
+
 static void set_cpus_allowed_scx(struct task_struct *p,
                                 struct affinity_context *ac)
 {
@@ -3538,8 +3669,6 @@ bool scx_can_stop_tick(struct rq *rq)
  *
  * - task_fork/dead: We need fork/dead notifications for all tasks regardless of
  *   their current sched_class. Call them directly from sched core instead.
- *
- * - task_woken: Unnecessary.
  */
 DEFINE_SCHED_CLASS(ext) = {
        .enqueue_task           = enqueue_task_scx,
@@ -3559,6 +3688,7 @@ DEFINE_SCHED_CLASS(ext) = {
 #ifdef CONFIG_SMP
        .balance                = balance_scx,
        .select_task_rq         = select_task_rq_scx,
+       .task_woken             = task_woken_scx,
        .set_cpus_allowed       = set_cpus_allowed_scx,
 
        .rq_online              = rq_online_scx,
@@ -5263,11 +5393,13 @@ void __init init_sched_ext_class(void)
 
                init_dsq(&rq->scx.local_dsq, SCX_DSQ_LOCAL);
                INIT_LIST_HEAD(&rq->scx.runnable_list);
+               INIT_LIST_HEAD(&rq->scx.ddsp_deferred_locals);
 
                BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_kick, GFP_KERNEL));
                BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_kick_if_idle, GFP_KERNEL));
                BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_preempt, GFP_KERNEL));
                BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_wait, GFP_KERNEL));
+               init_irq_work(&rq->scx.deferred_irq_work, deferred_irq_workfn);
                init_irq_work(&rq->scx.kick_cpus_irq_work, kick_cpus_irq_workfn);
 
                if (cpu_online(cpu))
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 8a0e8052f6b03844faa01a06897eff6c8fb3c62d..be7be54484c08dfde2fc27c1aa1d19862e5fdbfe 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -746,6 +746,7 @@ enum scx_rq_flags {
 struct scx_rq {
        struct scx_dispatch_q   local_dsq;
        struct list_head        runnable_list;          /* runnable tasks on this rq */
+       struct list_head        ddsp_deferred_locals;   /* deferred ddsps from enq */
        unsigned long           ops_qseq;
        u64                     extra_enq_flags;        /* see move_task_to_local_dsq() */
        u32                     nr_running;
@@ -757,6 +758,8 @@ struct scx_rq {
        cpumask_var_t           cpus_to_preempt;
        cpumask_var_t           cpus_to_wait;
        unsigned long           pnt_seq;
+       struct balance_callback deferred_bal_cb;
+       struct irq_work         deferred_irq_work;
        struct irq_work         kick_cpus_irq_work;
 };
 #endif /* CONFIG_SCHED_CLASS_EXT */