sched_ext: add p->scx.tid and SCX_OPS_TID_TO_TASK lookup

author Tejun Heo <tj@kernel.org>

Sun, 19 Apr 2026 18:36:45 +0000 (08:36 -1000)

committer Tejun Heo <tj@kernel.org>

Mon, 20 Apr 2026 16:55:33 +0000 (06:55 -1000)
author Tejun Heo <tj@kernel.org>
Sun, 19 Apr 2026 18:36:45 +0000 (08:36 -1000)
committer Tejun Heo <tj@kernel.org>
Mon, 20 Apr 2026 16:55:33 +0000 (06:55 -1000)
diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h

index 1a3af2ea2a794ab2031691413f36a19538ca3617..d05efcac794d684c25eae4f90923f9eb323f9bf3 100644 (file)
--- a/include/linux/sched/ext.h
+++ b/include/linux/sched/ext.h
@@ -203,6 +203,15 @@ struct sched_ext_entity {
         u64                     core_sched_at;  /* see scx_prio_less() */
  #endif
  
+       /*
+        * Unique non-zero task ID assigned at fork. Persists across exec and
+        * is never reused. Lets BPF schedulers identify tasks without storing
+        * kernel pointers - arena-backed schedulers being one example. See
+        * scx_bpf_tid_to_task().
+        */
+       u64                     tid;
+       struct rhash_head       tid_hash_node;  /* see SCX_OPS_TID_TO_TASK */
+
         /* BPF scheduler modifiable fields */
  
         /*
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c

index 4b0527840f2f04871154ca72685d3496df99462b..b34f1e5df1c5afb5fa5d677dc2a554a6de8c037d 100644 (file)
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -38,6 +38,15 @@ static const struct rhashtable_params scx_sched_hash_params = {
  static struct rhashtable scx_sched_hash;
  #endif
  
+/* p->scx.tid -> task lookup table, see SCX_OPS_TID_TO_TASK */
+static const struct rhashtable_params scx_tid_hash_params = {
+       .key_len                = sizeof_field(struct sched_ext_entity, tid),
+       .key_offset             = offsetof(struct sched_ext_entity, tid),
+       .head_offset            = offsetof(struct sched_ext_entity, tid_hash_node),
+       .insecure_elasticity    = true, /* inserted/removed under scx_tasks_lock */
+};
+static struct rhashtable scx_tid_hash;  /* init in scx_root_enable_workfn(), freed in scx_root_disable() */
+
  /*
   * During exit, a task may schedule after losing its PIDs. When disabling the
   * BPF scheduler, we need to be able to iterate tasks in every state to
@@ -58,10 +67,25 @@ static cpumask_var_t scx_bypass_lb_resched_cpumask;
  static bool scx_init_task_enabled;
  static bool scx_switching_all;
  DEFINE_STATIC_KEY_FALSE(__scx_switched_all);
+static DEFINE_STATIC_KEY_FALSE(__scx_tid_to_task_enabled);
+
+/*
+ * True while the tid->task table is live. The key starts out false, is
+ * enabled by scx_root_enable_workfn() when the root scheduler sets
+ * SCX_OPS_TID_TO_TASK and disabled again by scx_root_disable(). Wrapped so
+ * callers don't take the key's address; hints "likely enabled".
+ */
+static inline bool scx_tid_to_task_enabled(void)
+{
+       return static_branch_likely(&__scx_tid_to_task_enabled);
+}
  
  static atomic_long_t scx_nr_rejected = ATOMIC_LONG_INIT(0);
  static atomic_long_t scx_hotplug_seq = ATOMIC_LONG_INIT(0);
  
+/* Global cursor for the per-CPU tid allocator. Starts at 1; tid 0 is reserved. */
+static atomic64_t scx_tid_cursor = ATOMIC64_INIT(1);
+
  #ifdef CONFIG_EXT_SUB_SCHED
  /*
   * The sub sched being enabled. Used by scx_disable_and_exit_task() to exit
@@ -110,6 +134,17 @@ struct scx_kick_syncs {
  
  static DEFINE_PER_CPU(struct scx_kick_syncs __rcu *, scx_kick_syncs);
  
+/*
+ * Per-CPU buffered allocator state for p->scx.tid. Each CPU pulls a chunk of
+ * SCX_TID_CHUNK ids from scx_tid_cursor and hands them out locally without
+ * further synchronization. See scx_alloc_tid().
+ */
+struct scx_tid_alloc {
+       u64     next;   /* next tid to hand out on this CPU */
+       u64     end;    /* one past the last tid of the current chunk */
+};
+static DEFINE_PER_CPU(struct scx_tid_alloc, scx_tid_alloc);
+
  /*
   * Direct dispatch marker.
   *
@@ -3665,6 +3700,33 @@ void init_scx_entity(struct sched_ext_entity *scx)
         scx->slice = SCX_SLICE_DFL;
  }
  
+/* Return the next tid from this CPU's chunk, refilling from scx_tid_cursor. */
+static u64 scx_alloc_tid(void)
+{
+       struct scx_tid_alloc *ta;
+
+       guard(preempt)();       /* stay on this CPU while touching its chunk */
+       ta = this_cpu_ptr(&scx_tid_alloc);
+
+       if (unlikely(ta->next >= ta->end)) {    /* chunk empty: claim a new one */
+               ta->next = atomic64_fetch_add(SCX_TID_CHUNK, &scx_tid_cursor);
+               ta->end = ta->next + SCX_TID_CHUNK;
+       }
+       return ta->next++;
+}
+
+static void scx_tid_hash_insert(struct task_struct *p)
+{
+       int ret;
+
+       lockdep_assert_held(&scx_tasks_lock);
+       /* add @p keyed by p->scx.tid; any failure here is unexpected */
+       ret = rhashtable_lookup_insert_fast(&scx_tid_hash,
+                                           &p->scx.tid_hash_node,
+                                           scx_tid_hash_params);
+       WARN_ON_ONCE(ret);      /* void return: warn once instead of propagating */
+}
+
  void scx_pre_fork(struct task_struct *p)
  {
         /*
@@ -3682,6 +3744,8 @@ int scx_fork(struct task_struct *p, struct kernel_clone_args *kargs)
  
         percpu_rwsem_assert_held(&scx_fork_rwsem);
  
+       p->scx.tid = scx_alloc_tid();
+
         if (scx_init_task_enabled) {
  #ifdef CONFIG_EXT_SUB_SCHED
                 struct scx_sched *sch = kargs->cset->dfl_cgrp->scx_sched;
@@ -3717,9 +3781,11 @@ void scx_post_fork(struct task_struct *p)
                 }
         }
  
-       raw_spin_lock_irq(&scx_tasks_lock);
-       list_add_tail(&p->scx.tasks_node, &scx_tasks);
-       raw_spin_unlock_irq(&scx_tasks_lock);
+       scoped_guard(raw_spinlock_irq, &scx_tasks_lock) {
+               list_add_tail(&p->scx.tasks_node, &scx_tasks);
+               if (scx_tid_to_task_enabled())
+                       scx_tid_hash_insert(p);
+       }
  
         percpu_up_read(&scx_fork_rwsem);
  }
@@ -3770,17 +3836,19 @@ static bool task_dead_and_done(struct task_struct *p)
  
  void sched_ext_dead(struct task_struct *p)
  {
-       unsigned long flags;
-
         /*
          * By the time control reaches here, @p has %TASK_DEAD set, switched out
          * for the last time and then dropped the rq lock - task_dead_and_done()
          * should be returning %true nullifying the straggling sched_class ops.
          * Remove from scx_tasks and exit @p.
          */
-       raw_spin_lock_irqsave(&scx_tasks_lock, flags);
-       list_del_init(&p->scx.tasks_node);
-       raw_spin_unlock_irqrestore(&scx_tasks_lock, flags);
+       scoped_guard(raw_spinlock_irqsave, &scx_tasks_lock) {
+               list_del_init(&p->scx.tasks_node);
+               if (scx_tid_to_task_enabled())
+                       rhashtable_remove_fast(&scx_tid_hash,
+                                              &p->scx.tid_hash_node,
+                                              scx_tid_hash_params);
+       }
  
         /*
          * @p is off scx_tasks and wholly ours. scx_root_enable()'s READY ->
@@ -5815,9 +5883,13 @@ static void scx_root_disable(struct scx_sched *sch)
  
         /* no task is on scx, turn off all the switches and flush in-progress calls */
         static_branch_disable(&__scx_enabled);
+       if (sch->ops.flags & SCX_OPS_TID_TO_TASK)
+               static_branch_disable(&__scx_tid_to_task_enabled);
         bitmap_zero(sch->has_op, SCX_OPI_END);
         scx_idle_disable();
         synchronize_rcu();
+       if (sch->ops.flags & SCX_OPS_TID_TO_TASK)
+               rhashtable_free_and_destroy(&scx_tid_hash, NULL, NULL);
  
         scx_log_sched_disable(sch);
  
@@ -6561,6 +6633,17 @@ static int validate_ops(struct scx_sched *sch, const struct sched_ext_ops *ops)
                 return -EINVAL;
         }
  
+       /*
+        * SCX_OPS_TID_TO_TASK is enabled by the root scheduler. A sub-sched
+        * may set it to declare a dependency; reject if the root hasn't
+        * enabled it.
+        */
+       if ((ops->flags & SCX_OPS_TID_TO_TASK) && scx_parent(sch) &&
+           !(scx_root->ops.flags & SCX_OPS_TID_TO_TASK)) {
+               scx_error(sch, "SCX_OPS_TID_TO_TASK requires root scheduler to enable it");
+               return -EINVAL;
+       }
+
         /*
          * SCX_OPS_BUILTIN_IDLE_PER_NODE requires built-in CPU idle
          * selection policy to be enabled.
@@ -6611,13 +6694,19 @@ static void scx_root_enable_workfn(struct kthread_work *work)
         if (ret)
                 goto err_unlock;
  
+       if (ops->flags & SCX_OPS_TID_TO_TASK) {
+               ret = rhashtable_init(&scx_tid_hash, &scx_tid_hash_params);
+               if (ret)
+                       goto err_free_ksyncs;
+       }
+
  #if defined(CONFIG_EXT_GROUP_SCHED) || defined(CONFIG_EXT_SUB_SCHED)
         cgroup_get(cgrp);
  #endif
         sch = scx_alloc_and_add_sched(ops, cgrp, NULL);
         if (IS_ERR(sch)) {
                 ret = PTR_ERR(sch);
-               goto err_free_ksyncs;
+               goto err_free_tid_hash;
         }
  
         /*
@@ -6706,6 +6795,10 @@ static void scx_root_enable_workfn(struct kthread_work *work)
         WARN_ON_ONCE(scx_init_task_enabled);
         scx_init_task_enabled = true;
  
+       /* flip under fork_rwsem; the iter below covers existing tasks */
+       if (ops->flags & SCX_OPS_TID_TO_TASK)
+               static_branch_enable(&__scx_tid_to_task_enabled);
+
         /*
          * Enable ops for every task. Fork is excluded by scx_fork_rwsem
          * preventing new tasks from being added. No need to exclude tasks
@@ -6749,6 +6842,17 @@ static void scx_root_enable_workfn(struct kthread_work *work)
                 scx_set_task_sched(p, sch);
                 scx_set_task_state(p, SCX_TASK_READY);
  
+               /*
+                * Insert into the tid hash under scx_tasks_lock so we can't
+                * race sched_ext_dead() and leave a stale entry for an already
+                * exited task.
+                */
+               if (scx_tid_to_task_enabled()) {
+                       guard(raw_spinlock_irq)(&scx_tasks_lock);
+                       if (!list_empty(&p->scx.tasks_node))
+                               scx_tid_hash_insert(p);
+               }
+
                 put_task_struct(p);
         }
         scx_task_iter_stop(&sti);
@@ -6808,6 +6912,9 @@ static void scx_root_enable_workfn(struct kthread_work *work)
         cmd->ret = 0;
         return;
  
+err_free_tid_hash:
+       if (ops->flags & SCX_OPS_TID_TO_TASK)
+               rhashtable_free_and_destroy(&scx_tid_hash, NULL, NULL);
  err_free_ksyncs:
         free_kick_syncs();
  err_unlock:
@@ -9296,6 +9403,34 @@ __bpf_kfunc struct task_struct *scx_bpf_cpu_curr(s32 cpu, const struct bpf_prog_
         return rcu_dereference(cpu_rq(cpu)->curr);
  }
  
+/**
+ * scx_bpf_tid_to_task - Look up a task by its scx tid
+ * @tid: task ID previously read from p->scx.tid
+ *
+ * Returns the task with the given tid, or NULL if no such task exists. The
+ * returned pointer is valid until the end of the current RCU read section
+ * (KF_RCU_PROTECTED). Requires SCX_OPS_TID_TO_TASK to be set on the root
+ * scheduler; otherwise NULL is returned and an error is raised on scx_root if set.
+ */
+__bpf_kfunc struct task_struct *scx_bpf_tid_to_task(u64 tid)
+{
+       struct sched_ext_entity *scx;
+
+       if (!scx_tid_to_task_enabled()) {       /* table not live, nothing to search */
+               struct scx_sched *sch = rcu_dereference(scx_root);
+
+               if (sch)        /* no root scheduler means nobody to report to */
+                       scx_error(sch, "scx_bpf_tid_to_task() called without SCX_OPS_TID_TO_TASK");
+               return NULL;
+       }
+
+       scx = rhashtable_lookup(&scx_tid_hash, &tid, scx_tid_hash_params);
+       if (!scx)
+               return NULL;
+
+       return container_of(scx, struct task_struct, scx);      /* entity -> owning task */
+}
+
  /**
   * scx_bpf_now - Returns a high-performance monotonically non-decreasing
   * clock for the current CPU. The clock returned is in nanoseconds.
@@ -9479,6 +9614,7 @@ BTF_ID_FLAGS(func, scx_bpf_task_cpu, KF_RCU)
  BTF_ID_FLAGS(func, scx_bpf_cpu_rq, KF_IMPLICIT_ARGS)
  BTF_ID_FLAGS(func, scx_bpf_locked_rq, KF_IMPLICIT_ARGS | KF_RET_NULL)
  BTF_ID_FLAGS(func, scx_bpf_cpu_curr, KF_IMPLICIT_ARGS | KF_RET_NULL | KF_RCU_PROTECTED)
+BTF_ID_FLAGS(func, scx_bpf_tid_to_task, KF_RET_NULL | KF_RCU_PROTECTED)
  BTF_ID_FLAGS(func, scx_bpf_now)
  BTF_ID_FLAGS(func, scx_bpf_events)
  #ifdef CONFIG_CGROUP_SCHED
diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h

index 62ce4eaf6a3f560ce26c25ed7ac9618f1da42c09..4a7ffc7f55d2a72fdeab432259996bc2c89b7d7d 100644 (file)
--- a/kernel/sched/ext_internal.h
+++ b/kernel/sched/ext_internal.h
@@ -13,6 +13,9 @@ enum scx_consts {
         SCX_DSP_MAX_LOOPS               = 32,
         SCX_WATCHDOG_MAX_TIMEOUT        = 30 * HZ,
  
+       /* per-CPU chunk size for p->scx.tid allocation, see scx_alloc_tid() */
+       SCX_TID_CHUNK                   = 1024,
+
         SCX_EXIT_BT_LEN                 = 64,
         SCX_EXIT_MSG_LEN                = 1024,
         SCX_EXIT_DUMP_DFL_LEN           = 32768,
@@ -138,7 +141,8 @@ enum scx_ops_flags {
          * To mask this problem, by default, unhashed tasks are automatically
          * dispatched to the local DSQ on enqueue. If the BPF scheduler doesn't
          * depend on pid lookups and wants to handle these tasks directly, the
-        * following flag can be used.
+        * following flag can be used. With %SCX_OPS_TID_TO_TASK,
+        * scx_bpf_tid_to_task() can find exiting tasks reliably.
          */
         SCX_OPS_ENQ_EXITING             = 1LLU << 2,
  
@@ -189,6 +193,17 @@ enum scx_ops_flags {
          */
         SCX_OPS_ALWAYS_ENQ_IMMED        = 1LLU << 7,
  
+       /*
+        * Maintain a mapping from p->scx.tid to task_struct so the BPF
+        * scheduler can recover task pointers from stored tids via
+        * scx_bpf_tid_to_task().
+        *
+        * Only the root scheduler turns this on. A sub-sched may set the flag
+        * to declare a dependency on the lookup; if the root scheduler hasn't
+        * enabled it, attaching the sub-sched is rejected.
+        */
+       SCX_OPS_TID_TO_TASK             = 1LLU << 8,
+
         SCX_OPS_ALL_FLAGS               = SCX_OPS_KEEP_BUILTIN_IDLE |
                                           SCX_OPS_ENQ_LAST |
                                           SCX_OPS_ENQ_EXITING |
@@ -196,7 +211,8 @@ enum scx_ops_flags {
                                           SCX_OPS_ALLOW_QUEUED_WAKEUP |
                                           SCX_OPS_SWITCH_PARTIAL |
                                           SCX_OPS_BUILTIN_IDLE_PER_NODE |
-                                         SCX_OPS_ALWAYS_ENQ_IMMED,
+                                         SCX_OPS_ALWAYS_ENQ_IMMED |
+                                         SCX_OPS_TID_TO_TASK,
  
         /* high 8 bits are internal, don't include in SCX_OPS_ALL_FLAGS */
         __SCX_OPS_INTERNAL_MASK         = 0xffLLU << 56,
diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h

index 35fc625562415f7a85a581644439757dba139374..67b4b179b422759c1e9812562d699ffd23d99421 100644 (file)
--- a/tools/sched_ext/include/scx/common.bpf.h
+++ b/tools/sched_ext/include/scx/common.bpf.h
@@ -99,6 +99,7 @@ s32 scx_bpf_task_cpu(const struct task_struct *p) __ksym;
  struct rq *scx_bpf_cpu_rq(s32 cpu) __ksym;
  struct rq *scx_bpf_locked_rq(void) __ksym;
  struct task_struct *scx_bpf_cpu_curr(s32 cpu) __ksym __weak;
+struct task_struct *scx_bpf_tid_to_task(u64 tid) __ksym __weak;
  u64 scx_bpf_now(void) __ksym __weak;
  void scx_bpf_events(struct scx_event_stats *events, size_t events__sz) __ksym __weak;
  
diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c

index 480ae934a52623e804240f2a16ebd43c039d35e6..2f4c45f6544d861a5252a35c5bba1e2f8767e0d4 100644 (file)
--- a/tools/sched_ext/scx_qmap.bpf.c
+++ b/tools/sched_ext/scx_qmap.bpf.c
@@ -127,7 +127,8 @@ struct task_ctx {
         struct task_ctx __arena *q_next;        /* queue link, NULL if tail */
         struct task_ctx __arena *q_prev;        /* queue link, NULL if head */
         struct qmap_fifo __arena *fifo;         /* queue we're on, NULL if not queued */
-       s32                     pid;
+       u64                     tid;
+       s32                     pid;    /* for dump only */
         bool                    force_local;    /* Dispatch directly to local_dsq */
         bool                    highpri;
         u64                     core_sched_seq;
@@ -547,7 +548,7 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
                         if (!taskc)
                                 break;
  
-                       p = bpf_task_from_pid(taskc->pid);
+                       p = scx_bpf_tid_to_task(taskc->tid);
                         if (!p)
                                 continue;
  
@@ -598,8 +599,6 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
                         if (!bpf_cpumask_test_cpu(cpu, p->cpus_ptr))
                                 scx_bpf_kick_cpu(scx_bpf_task_cpu(p), 0);
  
-                       bpf_task_release(p);
-
                         batch--;
                         cpuc->dsp_cnt--;
                         if (!batch || !scx_bpf_dispatch_nr_slots()) {
@@ -724,6 +723,7 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init_task, struct task_struct *p,
         taskc->q_next = NULL;
         taskc->q_prev = NULL;
         taskc->fifo = NULL;
+       taskc->tid = p->scx.tid;
         taskc->pid = p->pid;
         taskc->force_local = false;
         taskc->highpri = false;
@@ -776,7 +776,7 @@ void BPF_STRUCT_OPS(qmap_dump, struct scx_dump_ctx *dctx)
         /*
          * Walk the queue lists without locking - kfunc calls (scx_bpf_dump)
          * aren't in the verifier's kfunc_spin_allowed() list so we can't hold
-        * a lock and dump. Best-effort; racing may print stale pids but the
+        * a lock and dump. Best-effort; racing may print stale tids but the
          * walk is bounded by bpf_repeat() so it always terminates.
          */
         bpf_for(i, 0, 5) {
@@ -785,7 +785,7 @@ void BPF_STRUCT_OPS(qmap_dump, struct scx_dump_ctx *dctx)
                 bpf_repeat(4096) {
                         if (!taskc)
                                 break;
-                       scx_bpf_dump(" %d", taskc->pid);
+                       scx_bpf_dump(" %d:%llu", taskc->pid, taskc->tid);
                         taskc = taskc->q_next;
                 }
                 scx_bpf_dump("\n");
@@ -1159,6 +1159,7 @@ void BPF_STRUCT_OPS(qmap_sub_detach, struct scx_sub_detach_args *args)
  }
  
  SCX_OPS_DEFINE(qmap_ops,
+              .flags                   = SCX_OPS_ENQ_EXITING | SCX_OPS_TID_TO_TASK,
                .select_cpu              = (void *)qmap_select_cpu,
                .enqueue                 = (void *)qmap_enqueue,
                .dequeue                 = (void *)qmap_dequeue,
author	Tejun Heo <tj@kernel.org>
	Sun, 19 Apr 2026 18:36:45 +0000 (08:36 -1000)
committer	Tejun Heo <tj@kernel.org>
	Mon, 20 Apr 2026 16:55:33 +0000 (06:55 -1000)
include/linux/sched/ext.h		patch \| blob \| blame \| history
kernel/sched/ext.c		patch \| blob \| blame \| history
kernel/sched/ext_internal.h		patch \| blob \| blame \| history
tools/sched_ext/include/scx/common.bpf.h		patch \| blob \| blame \| history
tools/sched_ext/scx_qmap.bpf.c		patch \| blob \| blame \| history