return &per_cpu_ptr(sch->pcpu, cpu)->bypass_dsq;
}
+static struct scx_dispatch_q *bypass_enq_target_dsq(struct scx_sched *sch, s32 cpu)
+{
+#ifdef CONFIG_EXT_SUB_SCHED
+ /*
+ * If @sch is a sub-sched which is bypassing, its tasks should go into
+ * the bypass DSQs of the nearest ancestor which is not bypassing. The
+ * not-bypassing ancestor is responsible for scheduling all tasks from
+ * bypassing sub-trees. If all ancestors including root are bypassing,
+ * all tasks should go to the root's bypass DSQs.
+ *
+ * Whenever a sched starts bypassing, all runnable tasks in its subtree
+ * are re-enqueued after scx_bypassing() is turned on, guaranteeing that
+ * all tasks are transferred to the right DSQs.
+ */
+ while (scx_parent(sch) && scx_bypassing(sch, cpu))
+ sch = scx_parent(sch);
+#endif /* CONFIG_EXT_SUB_SCHED */
+
+ return bypass_dsq(sch, cpu);
+}
+
/**
* bypass_dsp_enabled - Check if bypass dispatch path is enabled
* @sch: scheduler to check
dsq = find_global_dsq(sch, p);
goto enqueue;
bypass:
- dsq = bypass_dsq(sch, task_cpu(p));
+ dsq = bypass_enq_target_dsq(sch, task_cpu(p));
goto enqueue;
enqueue:
if (consume_global_dsq(sch, rq))
return true;
- if (bypass_dsp_enabled(sch) && scx_bypassing(sch, cpu))
- return consume_dispatch_q(sch, rq, bypass_dsq(sch, cpu));
+ if (bypass_dsp_enabled(sch)) {
+ /* if @sch is bypassing, only the bypass DSQs are active */
+ if (scx_bypassing(sch, cpu))
+ return consume_dispatch_q(sch, rq, bypass_dsq(sch, cpu));
+
+#ifdef CONFIG_EXT_SUB_SCHED
+ /*
+ * If @sch isn't bypassing but its children are, @sch is
+ * responsible for making forward progress for both its own
+ * tasks that aren't bypassing and the bypassing descendants'
+ * tasks. The following implements a simple built-in behavior -
+ * let each CPU try to run the bypass DSQ every Nth time.
+ *
+ * Later, if necessary, we can add an ops flag to suppress the
+ * auto-consumption and a kfunc to consume the bypass DSQ and,
+ * so that the BPF scheduler can fully control scheduling of
+ * bypassed tasks.
+ */
+ struct scx_sched_pcpu *pcpu = per_cpu_ptr(sch->pcpu, cpu);
+
+ if (!(pcpu->bypass_host_seq++ % SCX_BYPASS_HOST_NTH) &&
+ consume_dispatch_q(sch, rq, bypass_dsq(sch, cpu))) {
+ __scx_add_event(sch, SCX_EV_SUB_BYPASS_DISPATCH, 1);
+ return true;
+ }
+#endif /* CONFIG_EXT_SUB_SCHED */
+ }
if (unlikely(!SCX_HAS_OP(sch, dispatch)) || !scx_rq_online(rq))
return false;
}
} while (dspc->nr_tasks);
+ /*
+ * Prevent the CPU from going idle while bypassed descendants have tasks
+ * queued. Without this fallback, bypassed tasks could stall if the host
+ * scheduler's ops.dispatch() doesn't yield any tasks.
+ */
+ if (bypass_dsp_enabled(sch))
+ return consume_dispatch_q(sch, rq, bypass_dsq(sch, cpu));
+
return false;
}
at += scx_attr_event_show(buf, at, &events, SCX_EV_BYPASS_DISPATCH);
at += scx_attr_event_show(buf, at, &events, SCX_EV_BYPASS_ACTIVATE);
at += scx_attr_event_show(buf, at, &events, SCX_EV_INSERT_NOT_OWNED);
+ at += scx_attr_event_show(buf, at, &events, SCX_EV_SUB_BYPASS_DISPATCH);
return at;
}
SCX_ATTR(events);
static void enable_bypass_dsp(struct scx_sched *sch)
{
+ struct scx_sched *host = scx_parent(sch) ?: sch;
u32 intv_us = READ_ONCE(scx_bypass_lb_intv_us);
s32 ret;
return;
/*
- * The LB timer will stop running if bypass_arm_depth is 0. Increment
- * before starting the LB timer.
+ * When a sub-sched bypasses, its tasks are queued on the bypass DSQs of
+ * the nearest non-bypassing ancestor or root. As enable_bypass_dsp() is
+ * called iff @sch is not already bypassed due to an ancestor bypassing,
+ * we can assume that the parent is not bypassing and thus will be the
+ * host of the bypass DSQs.
+ *
+ * While the situation may change in the future, the following
+ * guarantees that the nearest non-bypassing ancestor or root has bypass
+ * dispatch enabled while a descendant is bypassing, which is all that's
+ * required.
+ *
+ * bypass_dsp_enabled() test is used to determine whether to enter the
+ * bypass dispatch handling path from both bypassing and hosting scheds.
+ * Bump enable depth on both @sch and bypass dispatch host.
*/
ret = atomic_inc_return(&sch->bypass_dsp_enable_depth);
WARN_ON_ONCE(ret <= 0);
- if (intv_us && !timer_pending(&sch->bypass_lb_timer))
- mod_timer(&sch->bypass_lb_timer,
+ if (host != sch) {
+ ret = atomic_inc_return(&host->bypass_dsp_enable_depth);
+ WARN_ON_ONCE(ret <= 0);
+ }
+
+ /*
+ * The LB timer will stop running if bypass dispatch is disabled. Start
+ * after enabling bypass dispatch.
+ */
+ if (intv_us && !timer_pending(&host->bypass_lb_timer))
+ mod_timer(&host->bypass_lb_timer,
jiffies + usecs_to_jiffies(intv_us));
}
ret = atomic_dec_return(&sch->bypass_dsp_enable_depth);
WARN_ON_ONCE(ret < 0);
+
+ if (scx_parent(sch)) {
+ ret = atomic_dec_return(&scx_parent(sch)->bypass_dsp_enable_depth);
+ WARN_ON_ONCE(ret < 0);
+ }
}
/**
scx_dump_event(s, &events, SCX_EV_BYPASS_DISPATCH);
scx_dump_event(s, &events, SCX_EV_BYPASS_ACTIVATE);
scx_dump_event(s, &events, SCX_EV_INSERT_NOT_OWNED);
+ scx_dump_event(s, &events, SCX_EV_SUB_BYPASS_DISPATCH);
if (seq_buf_has_overflowed(&s) && dump_len >= sizeof(trunc_marker))
memcpy(ei->dump + dump_len - sizeof(trunc_marker),