DEFINE_STATIC_KEY_FALSE(__scx_enabled);
DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem);
static atomic_t scx_enable_state_var = ATOMIC_INIT(SCX_DISABLED);
+static DEFINE_RAW_SPINLOCK(scx_bypass_lock);
static cpumask_var_t scx_bypass_lb_donee_cpumask;
static cpumask_var_t scx_bypass_lb_resched_cpumask;
static bool scx_init_task_enabled;
mod_timer(timer, jiffies + usecs_to_jiffies(intv_us));
}
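+
+/*
+ * inc_bypass_depth() and dec_bypass_depth() adjust @sch's bypass depth and
+ * report whether the 0 <-> 1 edge was crossed, i.e. whether bypass was
+ * actually activated or deactivated. Nesting is allowed; only the outermost
+ * transition returns true.
+ */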
+static bool inc_bypass_depth(struct scx_sched *sch)
+{
+ lockdep_assert_held(&scx_bypass_lock);
+
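+ /* depth is also read locklessly, hence the WRITE_ONCE() updates */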
+ WARN_ON_ONCE(sch->bypass_depth < 0);
+ WRITE_ONCE(sch->bypass_depth, sch->bypass_depth + 1);
+ if (sch->bypass_depth != 1)
+ return false;
+
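+ /* first activation - switch to the bypass default slice */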
+ WRITE_ONCE(sch->slice_dfl, READ_ONCE(scx_slice_bypass_us) * NSEC_PER_USEC);
+ sch->bypass_timestamp = ktime_get_ns();
+ scx_add_event(sch, SCX_EV_BYPASS_ACTIVATE, 1);
+ return true;
+}
+
+static bool dec_bypass_depth(struct scx_sched *sch)
+{
+ lockdep_assert_held(&scx_bypass_lock);
+
+ WARN_ON_ONCE(sch->bypass_depth < 1);
+ WRITE_ONCE(sch->bypass_depth, sch->bypass_depth - 1);
+ if (sch->bypass_depth != 0)
+ return false;
+
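+ /* last deactivation - restore the normal default slice */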
+ WRITE_ONCE(sch->slice_dfl, SCX_SLICE_DFL);
+ scx_add_event(sch, SCX_EV_BYPASS_DURATION,
+ ktime_get_ns() - sch->bypass_timestamp);
+ return true;
+}
+
/**
 * scx_bypass - [Un]bypass scx_ops and guarantee forward progress
 * @sch: sched to bypass
 * @bypass: true for bypass, false for unbypass
 *
 * Bypassing guarantees that all runnable tasks make forward progress without
 * trusting the BPF scheduler. Queued tasks are cycled through dequeue/enqueue
 * so that they are re-queued according to the new bypass state.
 */
static void scx_bypass(struct scx_sched *sch, bool bypass)
{
- static DEFINE_RAW_SPINLOCK(bypass_lock);
+ struct scx_sched *pos;
unsigned long flags;
int cpu;
- raw_spin_lock_irqsave(&bypass_lock, flags);
+ raw_spin_lock_irqsave(&scx_bypass_lock, flags);
if (bypass) {
u32 intv_us;
- WRITE_ONCE(sch->bypass_depth, sch->bypass_depth + 1);
- WARN_ON_ONCE(sch->bypass_depth <= 0);
- if (sch->bypass_depth != 1)
+ if (!inc_bypass_depth(sch))
goto unlock;
- WRITE_ONCE(sch->slice_dfl, READ_ONCE(scx_slice_bypass_us) * NSEC_PER_USEC);
- sch->bypass_timestamp = ktime_get_ns();
- scx_add_event(sch, SCX_EV_BYPASS_ACTIVATE, 1);
intv_us = READ_ONCE(scx_bypass_lb_intv_us);
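/*
 * Start the bypass load-balancing timer if an interval is configured.
 * Once running, the timer re-arms itself with mod_timer().
 */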
if (intv_us && !timer_pending(&sch->bypass_lb_timer)) {
sch->bypass_lb_timer.expires = jiffies + usecs_to_jiffies(intv_us);
add_timer_global(&sch->bypass_lb_timer);
}
} else {
- WRITE_ONCE(sch->bypass_depth, sch->bypass_depth - 1);
- WARN_ON_ONCE(sch->bypass_depth < 0);
- if (sch->bypass_depth != 0)
+ if (!dec_bypass_depth(sch))
goto unlock;
- WRITE_ONCE(sch->slice_dfl, SCX_SLICE_DFL);
- scx_add_event(sch, SCX_EV_BYPASS_DURATION,
- ktime_get_ns() - sch->bypass_timestamp);
}
+ /*
+ * Bypass state is propagated to all descendants - an scx_sched is
+ * bypassed if it or any of its ancestors is in bypass mode.
+ */
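+ /* scx_sched_lock nests inside scx_bypass_lock */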
+ raw_spin_lock(&scx_sched_lock);
+ scx_for_each_descendant_pre(pos, sch) {
+ if (pos == sch)
+ continue;
+ if (bypass)
+ inc_bypass_depth(pos);
+ else
+ dec_bypass_depth(pos);
+ }
+ raw_spin_unlock(&scx_sched_lock);
+
/*
 * No task property is changing. We just need to make sure all currently
 * queued tasks are re-queued according to the new scx_bypassing() state.
 */
for_each_possible_cpu(cpu) {
struct rq *rq = cpu_rq(cpu);
- struct scx_sched_pcpu *pcpu = per_cpu_ptr(sch->pcpu, cpu);
struct task_struct *p, *n;
raw_spin_rq_lock(rq);
- if (bypass) {
- WARN_ON_ONCE(pcpu->flags & SCX_SCHED_PCPU_BYPASSING);
- pcpu->flags |= SCX_SCHED_PCPU_BYPASSING;
- } else {
- WARN_ON_ONCE(!(pcpu->flags & SCX_SCHED_PCPU_BYPASSING));
- pcpu->flags &= ~SCX_SCHED_PCPU_BYPASSING;
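+ /*
+ * Recompute each scheduler's per-CPU bypassing flag from its
+ * current depth so overlapping bypasses compose correctly.
+ */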
+ raw_spin_lock(&scx_sched_lock);
+ scx_for_each_descendant_pre(pos, sch) {
+ struct scx_sched_pcpu *pcpu = per_cpu_ptr(pos->pcpu, cpu);
+
+ if (pos->bypass_depth)
+ pcpu->flags |= SCX_SCHED_PCPU_BYPASSING;
+ else
+ pcpu->flags &= ~SCX_SCHED_PCPU_BYPASSING;
}
+ raw_spin_unlock(&scx_sched_lock);
/*
 * We need to guarantee that no tasks are on the BPF scheduler while
 * bypassing.
 */
list_for_each_entry_safe_reverse(p, n, &rq->scx.runnable_list,
scx.runnable_node) {
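+ /* only tasks in @sch's hierarchy need to be cycled */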
+ if (!scx_is_descendant(scx_task_sched(p), sch))
+ continue;
+
/* cycling deq/enq is enough, see the function comment */
scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) {
/* nothing */ ;
}
}

raw_spin_rq_unlock(rq);
}

unlock:
- raw_spin_unlock_irqrestore(&bypass_lock, flags);
+ raw_spin_unlock_irqrestore(&scx_bypass_lock, flags);
}
static void free_exit_info(struct scx_exit_info *ei)