git.ipfire.org Git - thirdparty/linux.git/commitdiff
sched_ext: add p->scx.tid and SCX_OPS_TID_TO_TASK lookup
author Tejun Heo <tj@kernel.org>
Sun, 19 Apr 2026 18:36:45 +0000 (08:36 -1000)
committer Tejun Heo <tj@kernel.org>
Mon, 20 Apr 2026 16:55:33 +0000 (06:55 -1000)
BPF schedulers that can't hold task_struct pointers (arena-backed ones in
particular) key tasks by pid. During exit, pid is released before the
task finishes passing through scheduler callbacks, so a dying task
becomes invisible to the BPF side mid-schedule. scx_qmap hits this: an
exiting task's dispatch callback can't recover its queue entry, stalling
dispatch until SCX_EXIT_ERROR_STALL.

Add a unique non-zero u64 p->scx.tid assigned at fork that survives the
full task lifetime including exit. scx_bpf_tid_to_task() looks up the
task; unlike bpf_task_from_pid(), it handles exiting tasks.

The lookup costs an rhashtable insert/remove under scx_tasks_lock, so
root schedulers opt in via SCX_OPS_TID_TO_TASK. Sub-schedulers that set
the flag to declare a dependency are rejected at attach if root didn't
opt in.

scx_qmap converted: keys tasks by tid and enables SCX_OPS_ENQ_EXITING.
Pre-patch it stalls within seconds under a non-leader-exec workload;
with the patch it runs cleanly.

v3: Warn on rhashtable_lookup_insert_fast() failure via new
    scx_tid_hash_insert() helper (Cheng-Yang Chou).

v2: Guard scx_root deref in scx_bpf_tid_to_task() error path. The kfunc
    is registered via scx_kfunc_set_any and reachable from tracing and
    syscall programs when no scheduler is attached (Cheng-Yang Chou).

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Cheng-Yang Chou <yphbchou0911@gmail.com>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
include/linux/sched/ext.h
kernel/sched/ext.c
kernel/sched/ext_internal.h
tools/sched_ext/include/scx/common.bpf.h
tools/sched_ext/scx_qmap.bpf.c

index 1a3af2ea2a794ab2031691413f36a19538ca3617..d05efcac794d684c25eae4f90923f9eb323f9bf3 100644 (file)
@@ -203,6 +203,15 @@ struct sched_ext_entity {
        u64                     core_sched_at;  /* see scx_prio_less() */
 #endif
 
+       /*
+        * Unique non-zero task ID assigned at fork. Persists across exec and
+        * is never reused. Lets BPF schedulers identify tasks without storing
+        * kernel pointers - arena-backed schedulers being one example. See
+        * scx_bpf_tid_to_task().
+        */
+       u64                     tid;
+       struct rhash_head       tid_hash_node;  /* see SCX_OPS_TID_TO_TASK */
+
        /* BPF scheduler modifiable fields */
 
        /*
index 4b0527840f2f04871154ca72685d3496df99462b..b34f1e5df1c5afb5fa5d677dc2a554a6de8c037d 100644 (file)
@@ -38,6 +38,15 @@ static const struct rhashtable_params scx_sched_hash_params = {
 static struct rhashtable scx_sched_hash;
 #endif
 
+/* see SCX_OPS_TID_TO_TASK */
+static const struct rhashtable_params scx_tid_hash_params = {
+       .key_len                = sizeof_field(struct sched_ext_entity, tid),
+       .key_offset             = offsetof(struct sched_ext_entity, tid),
+       .head_offset            = offsetof(struct sched_ext_entity, tid_hash_node),
+       .insecure_elasticity    = true, /* inserted/removed under scx_tasks_lock */
+};
+static struct rhashtable scx_tid_hash;
+
 /*
  * During exit, a task may schedule after losing its PIDs. When disabling the
  * BPF scheduler, we need to be able to iterate tasks in every state to
@@ -58,10 +67,25 @@ static cpumask_var_t scx_bypass_lb_resched_cpumask;
 static bool scx_init_task_enabled;
 static bool scx_switching_all;
 DEFINE_STATIC_KEY_FALSE(__scx_switched_all);
+static DEFINE_STATIC_KEY_FALSE(__scx_tid_to_task_enabled);
+
+/*
+ * True once SCX_OPS_TID_TO_TASK has been negotiated with the root scheduler
+ * and the tid->task table is live. Wraps the static key so callers don't
+ * take the address, and hints "likely enabled" for the common case where
+ * the feature is in use.
+ */
+static inline bool scx_tid_to_task_enabled(void)
+{
+       return static_branch_likely(&__scx_tid_to_task_enabled);
+}
 
 static atomic_long_t scx_nr_rejected = ATOMIC_LONG_INIT(0);
 static atomic_long_t scx_hotplug_seq = ATOMIC_LONG_INIT(0);
 
+/* Global cursor for the per-CPU tid allocator. Starts at 1; tid 0 is reserved. */
+static atomic64_t scx_tid_cursor = ATOMIC64_INIT(1);
+
 #ifdef CONFIG_EXT_SUB_SCHED
 /*
  * The sub sched being enabled. Used by scx_disable_and_exit_task() to exit
@@ -110,6 +134,17 @@ struct scx_kick_syncs {
 
 static DEFINE_PER_CPU(struct scx_kick_syncs __rcu *, scx_kick_syncs);
 
+/*
+ * Per-CPU buffered allocator state for p->scx.tid. Each CPU pulls a chunk of
+ * SCX_TID_CHUNK ids from scx_tid_cursor and hands them out locally without
+ * further synchronization. See scx_alloc_tid().
+ */
+struct scx_tid_alloc {
+       u64     next;
+       u64     end;
+};
+static DEFINE_PER_CPU(struct scx_tid_alloc, scx_tid_alloc);
+
 /*
  * Direct dispatch marker.
  *
@@ -3665,6 +3700,33 @@ void init_scx_entity(struct sched_ext_entity *scx)
        scx->slice = SCX_SLICE_DFL;
 }
 
+/* See scx_tid_alloc / scx_tid_cursor. */
+static u64 scx_alloc_tid(void)
+{
+       struct scx_tid_alloc *ta;
+
+       guard(preempt)();
+       ta = this_cpu_ptr(&scx_tid_alloc);
+
+       if (unlikely(ta->next >= ta->end)) {
+               ta->next = atomic64_fetch_add(SCX_TID_CHUNK, &scx_tid_cursor);
+               ta->end = ta->next + SCX_TID_CHUNK;
+       }
+       return ta->next++;
+}
+
+static void scx_tid_hash_insert(struct task_struct *p)
+{
+       int ret;
+
+       lockdep_assert_held(&scx_tasks_lock);
+
+       ret = rhashtable_lookup_insert_fast(&scx_tid_hash,
+                                           &p->scx.tid_hash_node,
+                                           scx_tid_hash_params);
+       WARN_ON_ONCE(ret);
+}
+
 void scx_pre_fork(struct task_struct *p)
 {
        /*
@@ -3682,6 +3744,8 @@ int scx_fork(struct task_struct *p, struct kernel_clone_args *kargs)
 
        percpu_rwsem_assert_held(&scx_fork_rwsem);
 
+       p->scx.tid = scx_alloc_tid();
+
        if (scx_init_task_enabled) {
 #ifdef CONFIG_EXT_SUB_SCHED
                struct scx_sched *sch = kargs->cset->dfl_cgrp->scx_sched;
@@ -3717,9 +3781,11 @@ void scx_post_fork(struct task_struct *p)
                }
        }
 
-       raw_spin_lock_irq(&scx_tasks_lock);
-       list_add_tail(&p->scx.tasks_node, &scx_tasks);
-       raw_spin_unlock_irq(&scx_tasks_lock);
+       scoped_guard(raw_spinlock_irq, &scx_tasks_lock) {
+               list_add_tail(&p->scx.tasks_node, &scx_tasks);
+               if (scx_tid_to_task_enabled())
+                       scx_tid_hash_insert(p);
+       }
 
        percpu_up_read(&scx_fork_rwsem);
 }
@@ -3770,17 +3836,19 @@ static bool task_dead_and_done(struct task_struct *p)
 
 void sched_ext_dead(struct task_struct *p)
 {
-       unsigned long flags;
-
        /*
         * By the time control reaches here, @p has %TASK_DEAD set, switched out
         * for the last time and then dropped the rq lock - task_dead_and_done()
         * should be returning %true nullifying the straggling sched_class ops.
         * Remove from scx_tasks and exit @p.
         */
-       raw_spin_lock_irqsave(&scx_tasks_lock, flags);
-       list_del_init(&p->scx.tasks_node);
-       raw_spin_unlock_irqrestore(&scx_tasks_lock, flags);
+       scoped_guard(raw_spinlock_irqsave, &scx_tasks_lock) {
+               list_del_init(&p->scx.tasks_node);
+               if (scx_tid_to_task_enabled())
+                       rhashtable_remove_fast(&scx_tid_hash,
+                                              &p->scx.tid_hash_node,
+                                              scx_tid_hash_params);
+       }
 
        /*
         * @p is off scx_tasks and wholly ours. scx_root_enable()'s READY ->
@@ -5815,9 +5883,13 @@ static void scx_root_disable(struct scx_sched *sch)
 
        /* no task is on scx, turn off all the switches and flush in-progress calls */
        static_branch_disable(&__scx_enabled);
+       if (sch->ops.flags & SCX_OPS_TID_TO_TASK)
+               static_branch_disable(&__scx_tid_to_task_enabled);
        bitmap_zero(sch->has_op, SCX_OPI_END);
        scx_idle_disable();
        synchronize_rcu();
+       if (sch->ops.flags & SCX_OPS_TID_TO_TASK)
+               rhashtable_free_and_destroy(&scx_tid_hash, NULL, NULL);
 
        scx_log_sched_disable(sch);
 
@@ -6561,6 +6633,17 @@ static int validate_ops(struct scx_sched *sch, const struct sched_ext_ops *ops)
                return -EINVAL;
        }
 
+       /*
+        * SCX_OPS_TID_TO_TASK is enabled by the root scheduler. A sub-sched
+        * may set it to declare a dependency; reject if the root hasn't
+        * enabled it.
+        */
+       if ((ops->flags & SCX_OPS_TID_TO_TASK) && scx_parent(sch) &&
+           !(scx_root->ops.flags & SCX_OPS_TID_TO_TASK)) {
+               scx_error(sch, "SCX_OPS_TID_TO_TASK requires root scheduler to enable it");
+               return -EINVAL;
+       }
+
        /*
         * SCX_OPS_BUILTIN_IDLE_PER_NODE requires built-in CPU idle
         * selection policy to be enabled.
@@ -6611,13 +6694,19 @@ static void scx_root_enable_workfn(struct kthread_work *work)
        if (ret)
                goto err_unlock;
 
+       if (ops->flags & SCX_OPS_TID_TO_TASK) {
+               ret = rhashtable_init(&scx_tid_hash, &scx_tid_hash_params);
+               if (ret)
+                       goto err_free_ksyncs;
+       }
+
 #if defined(CONFIG_EXT_GROUP_SCHED) || defined(CONFIG_EXT_SUB_SCHED)
        cgroup_get(cgrp);
 #endif
        sch = scx_alloc_and_add_sched(ops, cgrp, NULL);
        if (IS_ERR(sch)) {
                ret = PTR_ERR(sch);
-               goto err_free_ksyncs;
+               goto err_free_tid_hash;
        }
 
        /*
@@ -6706,6 +6795,10 @@ static void scx_root_enable_workfn(struct kthread_work *work)
        WARN_ON_ONCE(scx_init_task_enabled);
        scx_init_task_enabled = true;
 
+       /* flip under fork_rwsem; the iter below covers existing tasks */
+       if (ops->flags & SCX_OPS_TID_TO_TASK)
+               static_branch_enable(&__scx_tid_to_task_enabled);
+
        /*
         * Enable ops for every task. Fork is excluded by scx_fork_rwsem
         * preventing new tasks from being added. No need to exclude tasks
@@ -6749,6 +6842,17 @@ static void scx_root_enable_workfn(struct kthread_work *work)
                scx_set_task_sched(p, sch);
                scx_set_task_state(p, SCX_TASK_READY);
 
+               /*
+                * Insert into the tid hash under scx_tasks_lock so we can't
+                * race sched_ext_dead() and leave a stale entry for an already
+                * exited task.
+                */
+               if (scx_tid_to_task_enabled()) {
+                       guard(raw_spinlock_irq)(&scx_tasks_lock);
+                       if (!list_empty(&p->scx.tasks_node))
+                               scx_tid_hash_insert(p);
+               }
+
                put_task_struct(p);
        }
        scx_task_iter_stop(&sti);
@@ -6808,6 +6912,9 @@ static void scx_root_enable_workfn(struct kthread_work *work)
        cmd->ret = 0;
        return;
 
+err_free_tid_hash:
+       if (ops->flags & SCX_OPS_TID_TO_TASK)
+               rhashtable_free_and_destroy(&scx_tid_hash, NULL, NULL);
 err_free_ksyncs:
        free_kick_syncs();
 err_unlock:
@@ -9296,6 +9403,34 @@ __bpf_kfunc struct task_struct *scx_bpf_cpu_curr(s32 cpu, const struct bpf_prog_
        return rcu_dereference(cpu_rq(cpu)->curr);
 }
 
+/**
+ * scx_bpf_tid_to_task - Look up a task by its scx tid
+ * @tid: task ID previously read from p->scx.tid
+ *
+ * Returns the task with the given tid, or NULL if no such task exists. The
+ * returned pointer is valid until the end of the current RCU read section
+ * (KF_RCU_PROTECTED). Requires SCX_OPS_TID_TO_TASK to be set on the root
+ * scheduler; otherwise an error is raised and NULL returned.
+ */
+__bpf_kfunc struct task_struct *scx_bpf_tid_to_task(u64 tid)
+{
+       struct sched_ext_entity *scx;
+
+       if (!scx_tid_to_task_enabled()) {
+               struct scx_sched *sch = rcu_dereference(scx_root);
+
+               if (sch)
+                       scx_error(sch, "scx_bpf_tid_to_task() called without SCX_OPS_TID_TO_TASK");
+               return NULL;
+       }
+
+       scx = rhashtable_lookup(&scx_tid_hash, &tid, scx_tid_hash_params);
+       if (!scx)
+               return NULL;
+
+       return container_of(scx, struct task_struct, scx);
+}
+
 /**
  * scx_bpf_now - Returns a high-performance monotonically non-decreasing
  * clock for the current CPU. The clock returned is in nanoseconds.
@@ -9479,6 +9614,7 @@ BTF_ID_FLAGS(func, scx_bpf_task_cpu, KF_RCU)
 BTF_ID_FLAGS(func, scx_bpf_cpu_rq, KF_IMPLICIT_ARGS)
 BTF_ID_FLAGS(func, scx_bpf_locked_rq, KF_IMPLICIT_ARGS | KF_RET_NULL)
 BTF_ID_FLAGS(func, scx_bpf_cpu_curr, KF_IMPLICIT_ARGS | KF_RET_NULL | KF_RCU_PROTECTED)
+BTF_ID_FLAGS(func, scx_bpf_tid_to_task, KF_RET_NULL | KF_RCU_PROTECTED)
 BTF_ID_FLAGS(func, scx_bpf_now)
 BTF_ID_FLAGS(func, scx_bpf_events)
 #ifdef CONFIG_CGROUP_SCHED
index 62ce4eaf6a3f560ce26c25ed7ac9618f1da42c09..4a7ffc7f55d2a72fdeab432259996bc2c89b7d7d 100644 (file)
@@ -13,6 +13,9 @@ enum scx_consts {
        SCX_DSP_MAX_LOOPS               = 32,
        SCX_WATCHDOG_MAX_TIMEOUT        = 30 * HZ,
 
+       /* per-CPU chunk size for p->scx.tid allocation, see scx_alloc_tid() */
+       SCX_TID_CHUNK                   = 1024,
+
        SCX_EXIT_BT_LEN                 = 64,
        SCX_EXIT_MSG_LEN                = 1024,
        SCX_EXIT_DUMP_DFL_LEN           = 32768,
@@ -138,7 +141,8 @@ enum scx_ops_flags {
         * To mask this problem, by default, unhashed tasks are automatically
         * dispatched to the local DSQ on enqueue. If the BPF scheduler doesn't
         * depend on pid lookups and wants to handle these tasks directly, the
-        * following flag can be used.
+        * following flag can be used. With %SCX_OPS_TID_TO_TASK,
+        * scx_bpf_tid_to_task() can find exiting tasks reliably.
         */
        SCX_OPS_ENQ_EXITING             = 1LLU << 2,
 
@@ -189,6 +193,17 @@ enum scx_ops_flags {
         */
        SCX_OPS_ALWAYS_ENQ_IMMED        = 1LLU << 7,
 
+       /*
+        * Maintain a mapping from p->scx.tid to task_struct so the BPF
+        * scheduler can recover task pointers from stored tids via
+        * scx_bpf_tid_to_task().
+        *
+        * Only the root scheduler turns this on. A sub-sched may set the flag
+        * to declare a dependency on the lookup; if the root scheduler hasn't
+        * enabled it, attaching the sub-sched is rejected.
+        */
+       SCX_OPS_TID_TO_TASK             = 1LLU << 8,
+
        SCX_OPS_ALL_FLAGS               = SCX_OPS_KEEP_BUILTIN_IDLE |
                                          SCX_OPS_ENQ_LAST |
                                          SCX_OPS_ENQ_EXITING |
@@ -196,7 +211,8 @@ enum scx_ops_flags {
                                          SCX_OPS_ALLOW_QUEUED_WAKEUP |
                                          SCX_OPS_SWITCH_PARTIAL |
                                          SCX_OPS_BUILTIN_IDLE_PER_NODE |
-                                         SCX_OPS_ALWAYS_ENQ_IMMED,
+                                         SCX_OPS_ALWAYS_ENQ_IMMED |
+                                         SCX_OPS_TID_TO_TASK,
 
        /* high 8 bits are internal, don't include in SCX_OPS_ALL_FLAGS */
        __SCX_OPS_INTERNAL_MASK         = 0xffLLU << 56,
index 35fc625562415f7a85a581644439757dba139374..67b4b179b422759c1e9812562d699ffd23d99421 100644 (file)
@@ -99,6 +99,7 @@ s32 scx_bpf_task_cpu(const struct task_struct *p) __ksym;
 struct rq *scx_bpf_cpu_rq(s32 cpu) __ksym;
 struct rq *scx_bpf_locked_rq(void) __ksym;
 struct task_struct *scx_bpf_cpu_curr(s32 cpu) __ksym __weak;
+struct task_struct *scx_bpf_tid_to_task(u64 tid) __ksym __weak;
 u64 scx_bpf_now(void) __ksym __weak;
 void scx_bpf_events(struct scx_event_stats *events, size_t events__sz) __ksym __weak;
 
index 480ae934a52623e804240f2a16ebd43c039d35e6..2f4c45f6544d861a5252a35c5bba1e2f8767e0d4 100644 (file)
@@ -127,7 +127,8 @@ struct task_ctx {
        struct task_ctx __arena *q_next;        /* queue link, NULL if tail */
        struct task_ctx __arena *q_prev;        /* queue link, NULL if head */
        struct qmap_fifo __arena *fifo;         /* queue we're on, NULL if not queued */
-       s32                     pid;
+       u64                     tid;
+       s32                     pid;    /* for dump only */
        bool                    force_local;    /* Dispatch directly to local_dsq */
        bool                    highpri;
        u64                     core_sched_seq;
@@ -547,7 +548,7 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
                        if (!taskc)
                                break;
 
-                       p = bpf_task_from_pid(taskc->pid);
+                       p = scx_bpf_tid_to_task(taskc->tid);
                        if (!p)
                                continue;
 
@@ -598,8 +599,6 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
                        if (!bpf_cpumask_test_cpu(cpu, p->cpus_ptr))
                                scx_bpf_kick_cpu(scx_bpf_task_cpu(p), 0);
 
-                       bpf_task_release(p);
-
                        batch--;
                        cpuc->dsp_cnt--;
                        if (!batch || !scx_bpf_dispatch_nr_slots()) {
@@ -724,6 +723,7 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init_task, struct task_struct *p,
        taskc->q_next = NULL;
        taskc->q_prev = NULL;
        taskc->fifo = NULL;
+       taskc->tid = p->scx.tid;
        taskc->pid = p->pid;
        taskc->force_local = false;
        taskc->highpri = false;
@@ -776,7 +776,7 @@ void BPF_STRUCT_OPS(qmap_dump, struct scx_dump_ctx *dctx)
        /*
         * Walk the queue lists without locking - kfunc calls (scx_bpf_dump)
         * aren't in the verifier's kfunc_spin_allowed() list so we can't hold
-        * a lock and dump. Best-effort; racing may print stale pids but the
+        * a lock and dump. Best-effort; racing may print stale tids but the
         * walk is bounded by bpf_repeat() so it always terminates.
         */
        bpf_for(i, 0, 5) {
@@ -785,7 +785,7 @@ void BPF_STRUCT_OPS(qmap_dump, struct scx_dump_ctx *dctx)
                bpf_repeat(4096) {
                        if (!taskc)
                                break;
-                       scx_bpf_dump(" %d", taskc->pid);
+                       scx_bpf_dump(" %d:%llu", taskc->pid, taskc->tid);
                        taskc = taskc->q_next;
                }
                scx_bpf_dump("\n");
@@ -1159,6 +1159,7 @@ void BPF_STRUCT_OPS(qmap_sub_detach, struct scx_sub_detach_args *args)
 }
 
 SCX_OPS_DEFINE(qmap_ops,
+              .flags                   = SCX_OPS_ENQ_EXITING | SCX_OPS_TID_TO_TASK,
               .select_cpu              = (void *)qmap_select_cpu,
               .enqueue                 = (void *)qmap_enqueue,
               .dequeue                 = (void *)qmap_dequeue,