git.ipfire.org Git - thirdparty/linux.git/commitdiff
sched_ext: Convert ops.set_cmask() to arena-resident cmask
author: Tejun Heo <tj@kernel.org>
Fri, 22 May 2026 17:06:01 +0000 (07:06 -1000)
committer: Tejun Heo <tj@kernel.org>
Mon, 25 May 2026 19:44:07 +0000 (09:44 -1000)
ops_cid.set_cmask() expects a cmask. The kernel couldn't write into the
arena, so it translated cpumask -> cmask in kernel memory and passed the
result as a trusted pointer. The BPF cmask helpers all operate on arena
cmasks though, so the BPF side had to word-by-word probe-read the kernel
cmask into an arena cmask via cmask_copy_from_kernel() before any helper
could touch it. It works, but is clumsy.

With direct kernel-side arena access now in place, build the cmask in the
arena. The kernel writes to it through the kern_va side of the dual mapping.
BPF directly dereferences it via an __arena pointer like any other arena
struct.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Emil Tsalapatis <emil@etsalapatis.com>
kernel/sched/ext.c
kernel/sched/ext_cid.c
kernel/sched/ext_internal.h
tools/sched_ext/include/scx/cid.bpf.h
tools/sched_ext/scx_qmap.bpf.c

index f5c67e3ff0753a9396a6b87c7a6a19d87de52626..83272acf176371de29db5d2485f6fbdde6c2e1fe 100644 (file)
@@ -621,11 +621,16 @@ static inline void scx_call_op_set_cpumask(struct scx_sched *sch, struct rq *rq,
                update_locked_rq(rq);
 
        if (scx_is_cid_type()) {
-               struct scx_cmask *cmask = this_cpu_ptr(scx_set_cmask_scratch);
-
-               lockdep_assert_irqs_disabled();
-               scx_cpumask_to_cmask(cpumask, cmask);
-               sch->ops_cid.set_cmask(task, cmask);
+               struct scx_cmask *kern_va = *this_cpu_ptr(sch->set_cmask_scratch);
+               unsigned long uaddr = (unsigned long)kern_va -
+                       bpf_arena_map_kern_vm_start(sch->arena_map);
+               /*
+                * Build the per-CPU arena cmask and hand BPF the uaddr. Caller
+                * holds the rq lock with IRQs disabled, which makes us the sole
+                * user of the scratch area.
+                */
+               scx_cpumask_to_cmask(cpumask, kern_va);
+               sch->ops_cid.set_cmask(task, (struct scx_cmask *)uaddr);
        } else {
                sch->ops.set_cpumask(task, cpumask);
        }
@@ -4949,6 +4954,48 @@ static const struct attribute_group scx_global_attr_group = {
 static void free_pnode(struct scx_sched_pnode *pnode);
 static void free_exit_info(struct scx_exit_info *ei);
 
+static s32 scx_set_cmask_scratch_alloc(struct scx_sched *sch)
+{ /* Set up one arena-allocated scratch cmask per possible CPU; returns 0 or -ENOMEM. */
+       size_t size = struct_size_t(struct scx_cmask, bits,
+                                   SCX_CMASK_NR_WORDS(num_possible_cpus()));
+       int cpu;
+
+       if (!sch->is_cid_type || !sch->arena_pool)
+               return 0; /* not cid-type or no arena pool: nothing is allocated */
+
+       sch->set_cmask_scratch = alloc_percpu(struct scx_cmask *);
+       if (!sch->set_cmask_scratch)
+               return -ENOMEM; /* the percpu array of slot pointers could not be allocated */
+
+       for_each_possible_cpu(cpu) {
+               struct scx_cmask **slot = per_cpu_ptr(sch->set_cmask_scratch, cpu);
+
+               *slot = scx_arena_alloc(sch, size); /* this CPU's slot keeps the pointer returned by the arena allocator */
+               if (!*slot)
+                       return -ENOMEM; /* returns without releasing earlier slots or the percpu array */
+               scx_cmask_init(*slot, 0, num_possible_cpus()); /* presumably base 0, one bit per possible CPU - confirm */
+       }
+       return 0;
+}
+
+static void scx_set_cmask_scratch_free(struct scx_sched *sch)
+{ /* Release every per-CPU scratch cmask and then the percpu slot array itself. */
+       size_t size = struct_size_t(struct scx_cmask, bits,
+                                   SCX_CMASK_NR_WORDS(num_possible_cpus()));
+       int cpu;
+
+       if (!sch->set_cmask_scratch)
+               return; /* the slot array was never allocated */
+
+       for_each_possible_cpu(cpu) {
+               struct scx_cmask **slot = per_cpu_ptr(sch->set_cmask_scratch, cpu);
+
+               scx_arena_free(sch, *slot, size); /* NOTE(review): a slot may be unset if alloc failed midway - confirm scx_arena_free() copes */
+       }
+       free_percpu(sch->set_cmask_scratch);
+       sch->set_cmask_scratch = NULL; /* a repeated call now returns early at the check above */
+}
+
 static void scx_sched_free_rcu_work(struct work_struct *work)
 {
        struct rcu_work *rcu_work = to_rcu_work(work);
@@ -5003,6 +5050,7 @@ static void scx_sched_free_rcu_work(struct work_struct *work)
 
        rhashtable_free_and_destroy(&sch->dsq_hash, NULL, NULL);
        free_exit_info(sch->exit_info);
+       scx_set_cmask_scratch_free(sch);
        scx_arena_pool_destroy(sch);
        if (sch->arena_map)
                bpf_map_put(sch->arena_map);
@@ -7162,6 +7210,12 @@ static void scx_root_enable_workfn(struct kthread_work *work)
                goto err_disable;
        }
 
+       ret = scx_set_cmask_scratch_alloc(sch);
+       if (ret) {
+               cpus_read_unlock();
+               goto err_disable;
+       }
+
        for (i = SCX_OPI_CPU_HOTPLUG_BEGIN; i < SCX_OPI_CPU_HOTPLUG_END; i++)
                if (((void (**)(void))ops)[i])
                        set_bit(i, sch->has_op);
@@ -7484,6 +7538,10 @@ static void scx_sub_enable_workfn(struct kthread_work *work)
        if (ret)
                goto err_disable;
 
+       ret = scx_set_cmask_scratch_alloc(sch);
+       if (ret)
+               goto err_disable;
+
        if (validate_ops(sch, ops))
                goto err_disable;
 
index 0c91b951fd33c1b9a7c619f5189eca2de694f5e3..808c6390da5a249a8cfb4a91adc89f2088718122 100644 (file)
@@ -7,14 +7,6 @@
  */
 #include <linux/cacheinfo.h>
 
-/*
- * Per-cpu scratch cmask used by scx_call_op_set_cpumask() to synthesize a
- * cmask from a cpumask. Allocated alongside the cid arrays on first enable
- * and never freed. Sized to the full cid space. Caller holds rq lock so
- * this_cpu_ptr is safe.
- */
-struct scx_cmask __percpu *scx_set_cmask_scratch;
-
 /*
  * cid tables.
  *
@@ -54,8 +46,6 @@ static s32 scx_cid_arrays_alloc(void)
        u32 npossible = num_possible_cpus();
        s16 *cid_to_cpu, *cpu_to_cid;
        struct scx_cid_topo *cid_topo;
-       struct scx_cmask __percpu *set_cmask_scratch;
-       s32 cpu;
 
        if (scx_cid_to_cpu_tbl)
                return 0;
@@ -63,25 +53,17 @@ static s32 scx_cid_arrays_alloc(void)
        cid_to_cpu = kzalloc_objs(*scx_cid_to_cpu_tbl, npossible, GFP_KERNEL);
        cpu_to_cid = kzalloc_objs(*scx_cpu_to_cid_tbl, nr_cpu_ids, GFP_KERNEL);
        cid_topo = kmalloc_objs(*scx_cid_topo, npossible, GFP_KERNEL);
-       set_cmask_scratch = __alloc_percpu(struct_size(set_cmask_scratch, bits,
-                                                      SCX_CMASK_NR_WORDS(npossible)),
-                                          sizeof(u64));
 
-       if (!cid_to_cpu || !cpu_to_cid || !cid_topo || !set_cmask_scratch) {
+       if (!cid_to_cpu || !cpu_to_cid || !cid_topo) {
                kfree(cid_to_cpu);
                kfree(cpu_to_cid);
                kfree(cid_topo);
-               free_percpu(set_cmask_scratch);
                return -ENOMEM;
        }
 
        WRITE_ONCE(scx_cid_to_cpu_tbl, cid_to_cpu);
        WRITE_ONCE(scx_cpu_to_cid_tbl, cpu_to_cid);
        WRITE_ONCE(scx_cid_topo, cid_topo);
-       for_each_possible_cpu(cpu)
-               scx_cmask_init(per_cpu_ptr(set_cmask_scratch, cpu),
-                              0, npossible);
-       WRITE_ONCE(scx_set_cmask_scratch, set_cmask_scratch);
        return 0;
 }
 
index ff7e882bd67a80ed2e2be33cd38989e3485f92a5..9bb65367f5105b2419f32f56bff2f51922223e9a 100644 (file)
@@ -1124,6 +1124,14 @@ struct scx_sched {
        struct bpf_map          *arena_map;
        struct gen_pool         *arena_pool;
 
+       /*
+        * Per-CPU arena cmask used by scx_call_op_set_cpumask() to hand a cmask
+        * to ops_cid.set_cmask(). The kernel writes through the stored kern_va;
+        * the BPF-arena uaddr handed to BPF is recovered by subtracting the
+        * arena's kern_vm_start.
+        */
+       struct scx_cmask * __percpu *set_cmask_scratch;
+
        DECLARE_BITMAP(has_op, SCX_OPI_END);
 
        /*
@@ -1480,8 +1488,6 @@ enum scx_ops_state {
 extern struct scx_sched __rcu *scx_root;
 DECLARE_PER_CPU(struct rq *, scx_locked_rq_state);
 
-extern struct scx_cmask __percpu *scx_set_cmask_scratch;
-
 /*
  * True when the currently loaded scheduler hierarchy is cid-form. All scheds
  * in a hierarchy share one form, so this single key tells callsites which
index e281c88fa824dfdaa1787d056d859234a26759a8..70f2a3829af4d8dcdb079536c56a95387f76dc51 100644 (file)
@@ -675,56 +675,4 @@ static __always_inline void cmask_from_cpumask(struct scx_cmask __arena *m,
        }
 }
 
-/**
- * cmask_copy_from_kernel - probe-read a kernel cmask into an arena cmask
- * @dst: arena cmask to fill; must have @dst->base == 0 and be sized for @src.
- * @src: kernel-memory cmask (e.g. ops.set_cmask() arg); @src->base must be 0.
- *
- * Word-for-word copy; @src and @dst must share base 0 alignment. Triggers
- * scx_bpf_error() on probe failure or precondition violation.
- */
-static __always_inline void cmask_copy_from_kernel(struct scx_cmask __arena *dst,
-                                                  const struct scx_cmask *src)
-{
-       u32 base = 0, nr_cids = 0, nr_words, wi;
-
-       if (dst->base != 0) {
-               scx_bpf_error("cmask_copy_from_kernel requires dst->base == 0");
-               return;
-       }
-
-       if (bpf_probe_read_kernel(&base, sizeof(base), &src->base)) {
-               scx_bpf_error("probe-read cmask->base failed");
-               return;
-       }
-       if (base != 0) {
-               scx_bpf_error("cmask_copy_from_kernel requires src->base == 0");
-               return;
-       }
-
-       if (bpf_probe_read_kernel(&nr_cids, sizeof(nr_cids), &src->nr_cids)) {
-               scx_bpf_error("probe-read cmask->nr_cids failed");
-               return;
-       }
-
-       if (nr_cids > dst->nr_cids) {
-               scx_bpf_error("src cmask nr_cids=%u exceeds dst nr_cids=%u",
-                             nr_cids, dst->nr_cids);
-               return;
-       }
-
-       nr_words = CMASK_NR_WORDS(nr_cids);
-       cmask_zero(dst);
-       bpf_for(wi, 0, CMASK_MAX_WORDS) {
-               u64 word = 0;
-               if (wi >= nr_words)
-                       break;
-               if (bpf_probe_read_kernel(&word, sizeof(u64), &src->bits[wi])) {
-                       scx_bpf_error("probe-read cmask->bits[%u] failed", wi);
-                       return;
-               }
-               dst->bits[wi] = word;
-       }
-}
-
 #endif /* __SCX_CID_BPF_H */
index 7e77f22674eaa1ece3ca27b20252325139c6009a..8a2d6a8ebd8eddee8dd02ee55cfd7c4dcd680342 100644 (file)
@@ -919,14 +919,15 @@ void BPF_STRUCT_OPS(qmap_update_idle, s32 cid, bool idle)
 }
 
 void BPF_STRUCT_OPS(qmap_set_cmask, struct task_struct *p,
-                   const struct scx_cmask *cmask)
+                   const struct scx_cmask *cmask_in)
 {
+       struct scx_cmask __arena *cmask = (struct scx_cmask __arena *)(long)cmask_in;
        task_ctx_t *taskc;
 
        taskc = lookup_task_ctx(p);
        if (!taskc)
                return;
-       cmask_copy_from_kernel(&taskc->cpus_allowed, cmask);
+       cmask_copy(&taskc->cpus_allowed, cmask);
 }
 
 struct monitor_timer {