]> git.ipfire.org Git - thirdparty/linux.git/commitdiff
sched_ext: Sub-allocator over kernel-claimed BPF arena pages
author Tejun Heo <tj@kernel.org>
Fri, 22 May 2026 17:06:01 +0000 (07:06 -1000)
committer Tejun Heo <tj@kernel.org>
Mon, 25 May 2026 19:44:07 +0000 (09:44 -1000)
Build a per-scheduler sub-allocator on top of pages claimed from the BPF
arena registered in the previous patch. Subsequent kernel-managed
arena-resident structures (e.g. per-CPU set_cmask cmask) carve their storage
from this pool.

scx_arena_pool_init() creates a gen_pool. scx_arena_alloc() returns the
kernel VA. On exhaustion, the pool grows by claiming more pages via
bpf_arena_alloc_pages_sleepable(). Chunks are added at the kernel-side
mapping address. Callers translate to the BPF-arena form themselves if
needed.

Allocations sleep (GFP_KERNEL) - they may grow the pool through vzalloc and
arena page allocation. All current consumers run from the enable path (after
ops.init() and the kernel-side arena auto-discovery, before validate_ops()),
where sleeping is fine.

scx_arena_pool_destroy() walks each chunk, returns outstanding ranges to the
gen_pool with gen_pool_free() and then calls gen_pool_destroy(). The
underlying arena pages are released when the arena map itself is torn down,
so the pool destroy doesn't free them explicitly.

v2: Switch scx_arena_alloc() to a loop. (Andrea)

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
kernel/sched/build_policy.c
kernel/sched/ext.c
kernel/sched/ext_arena.c [new file with mode: 0644]
kernel/sched/ext_arena.h [new file with mode: 0644]
kernel/sched/ext_internal.h

index 5e76c9177d5400b73a4003245f0a5bf954a80741..067979a7b69ec8b9d889d7c4efda7b9ca0d9400a 100644 (file)
 
 #ifdef CONFIG_SCHED_CLASS_EXT
 # include <linux/btf_ids.h>
+# include <linux/find.h>
+# include <linux/genalloc.h>
 # include "ext_types.h"
 # include "ext_internal.h"
 # include "ext_cid.h"
+# include "ext_arena.h"
 # include "ext_idle.h"
 # include "ext.c"
 # include "ext_cid.c"
+# include "ext_arena.c"
 # include "ext_idle.c"
 #endif
 
index 53708fd3cc349a85fe7c39f83e618c6f57c07913..f5c67e3ff0753a9396a6b87c7a6a19d87de52626 100644 (file)
@@ -5003,6 +5003,7 @@ static void scx_sched_free_rcu_work(struct work_struct *work)
 
        rhashtable_free_and_destroy(&sch->dsq_hash, NULL, NULL);
        free_exit_info(sch->exit_info);
+       scx_arena_pool_destroy(sch);
        if (sch->arena_map)
                bpf_map_put(sch->arena_map);
        kfree(sch);
@@ -7155,6 +7156,12 @@ static void scx_root_enable_workfn(struct kthread_work *work)
                sch->exit_info->flags |= SCX_EFLAG_INITIALIZED;
        }
 
+       ret = scx_arena_pool_init(sch);
+       if (ret) {
+               cpus_read_unlock();
+               goto err_disable;
+       }
+
        for (i = SCX_OPI_CPU_HOTPLUG_BEGIN; i < SCX_OPI_CPU_HOTPLUG_END; i++)
                if (((void (**)(void))ops)[i])
                        set_bit(i, sch->has_op);
@@ -7473,6 +7480,10 @@ static void scx_sub_enable_workfn(struct kthread_work *work)
                sch->exit_info->flags |= SCX_EFLAG_INITIALIZED;
        }
 
+       ret = scx_arena_pool_init(sch);
+       if (ret)
+               goto err_disable;
+
        if (validate_ops(sch, ops))
                goto err_disable;
 
diff --git a/kernel/sched/ext_arena.c b/kernel/sched/ext_arena.c
new file mode 100644 (file)
index 0000000..b413e15
--- /dev/null
@@ -0,0 +1,126 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst
+ *
+ * scx_arena_pool: kernel-side sub-allocator over BPF-arena pages.
+ *
+ * Each chunk added to @sch->arena_pool comes from one
+ * bpf_arena_alloc_pages_sleepable() call and is registered at the
+ * kernel-side mapping address. Callers translate to the BPF-arena form
+ * themselves if needed.
+ *
+ * Allocations grow the pool on demand. Underlying arena pages are released
+ * when the arena map itself is torn down.
+ *
+ * Copyright (c) 2026 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2026 Tejun Heo <tj@kernel.org>
+ */
+
+enum scx_arena_consts {
+       SCX_ARENA_MIN_ORDER             = 3,    /* min_alloc_order passed to gen_pool_create(): 1 << 3 = 8-byte minimum sub-allocation */
+       SCX_ARENA_GROW_PAGES            = 4,    /* minimum number of arena pages claimed per pool growth */
+};
+
+/*
+ * Create the arena sub-allocator for @sch. A scheduler without an arena map
+ * gets no pool, which is not an error. Returns 0 on success, -ENOMEM if the
+ * gen_pool can't be created.
+ */
+s32 scx_arena_pool_init(struct scx_sched *sch)
+{
+       /* no arena to sub-allocate from */
+       if (!sch->arena_map)
+               return 0;
+
+       sch->arena_pool = gen_pool_create(SCX_ARENA_MIN_ORDER, NUMA_NO_NODE);
+       return sch->arena_pool ? 0 : -ENOMEM;
+}
+
+/*
+ * gen_pool_for_each_chunk() callback used by scx_arena_pool_destroy(). Walk
+ * @chunk's allocation bitmap and hand every run of set bits back to @pool.
+ * Each bit is treated as covering 1 << min_alloc_order bytes. @data is unused.
+ */
+static void scx_arena_clear_chunk(struct gen_pool *pool, struct gen_pool_chunk *chunk,
+                                 void *data)
+{
+       int shift = pool->min_alloc_order;
+       unsigned long nr_bits, start, end;
+
+       /* number of bitmap bits backing the chunk's address range */
+       nr_bits = (chunk->end_addr - chunk->start_addr + 1) >> shift;
+
+       for_each_set_bitrange(start, end, chunk->bits, nr_bits) {
+               unsigned long addr = chunk->start_addr + (start << shift);
+               size_t len = (end - start) << shift;
+
+               gen_pool_free(pool, addr, len);
+       }
+}
+
+/*
+ * Destroy @sch's arena pool, if it has one. Ranges that are still allocated
+ * are first returned to the gen_pool through scx_arena_clear_chunk() so that
+ * gen_pool_destroy() doesn't BUG on them. The arena pages backing the chunks
+ * are not freed here; they go away when the arena map itself is torn down.
+ */
+void scx_arena_pool_destroy(struct scx_sched *sch)
+{
+       struct gen_pool *pool = sch->arena_pool;
+
+       if (!pool)
+               return;
+
+       gen_pool_for_each_chunk(pool, scx_arena_clear_chunk, NULL);
+       gen_pool_destroy(pool);
+       sch->arena_pool = NULL;
+}
+
+/*
+ * Add a new @page_cnt page chunk to @sch's arena pool. Must be called from a
+ * sleepable context as both bpf_arena_alloc_pages_sleepable() and
+ * gen_pool_add() (which calls vzalloc(GFP_KERNEL)) may sleep.
+ *
+ * Returns 0 on success, -EINVAL if @sch has no arena map or pool, -ENOMEM if
+ * the arena pages can't be allocated, or the gen_pool_add() error.
+ */
+static int scx_arena_grow(struct scx_sched *sch, u32 page_cnt)
+{
+       u64 chunk_va;
+       void *pages;
+       int err;
+
+       if (!sch->arena_map || !sch->arena_pool)
+               return -EINVAL;
+
+       pages = bpf_arena_alloc_pages_sleepable(sch->arena_map, NULL,
+                                               page_cnt, NUMA_NO_NODE, 0);
+       if (!pages)
+               return -ENOMEM;
+
+       /*
+        * Register the chunk at its kernel-side mapping address: the low 32
+        * bits of the returned pointer added to the arena's kernel VM start.
+        */
+       chunk_va = bpf_arena_map_kern_vm_start(sch->arena_map) +
+                  (u32)(unsigned long)pages;
+
+       err = gen_pool_add(sch->arena_pool, chunk_va, page_cnt * PAGE_SIZE,
+                          NUMA_NO_NODE);
+       /* the pool didn't take the chunk, give the pages back to the arena */
+       if (err)
+               bpf_arena_free_pages_non_sleepable(sch->arena_map, pages, page_cnt);
+
+       return err;
+}
+
+/*
+ * Allocate @size bytes from the arena pool. Returns kernel VA on success, NULL
+ * on failure, when @sch has no arena pool or when @size is zero. May grow the
+ * pool via scx_arena_grow() which sleeps. Caller must be in a GFP_KERNEL
+ * context.
+ */
+void *scx_arena_alloc(struct scx_sched *sch, size_t size)
+{
+       unsigned long kern_va;
+       u32 page_cnt;
+
+       might_sleep();
+
+       /*
+        * gen_pool_alloc() returns 0 for a zero-sized request. The loop below
+        * would take that for exhaustion and keep claiming arena pages until
+        * scx_arena_grow() fails, so reject it upfront.
+        */
+       if (!sch->arena_pool || !size)
+               return NULL;
+
+       /* grow by at least SCX_ARENA_GROW_PAGES, more if @size needs it */
+       page_cnt = max_t(u32, SCX_ARENA_GROW_PAGES,
+                        (size + PAGE_SIZE - 1) >> PAGE_SHIFT);
+
+       while (true) {
+               kern_va = gen_pool_alloc(sch->arena_pool, size);
+               if (kern_va)
+                       break;
+               /* pool exhausted, add a chunk and retry */
+               if (scx_arena_grow(sch, page_cnt))
+                       return NULL;
+       }
+
+       return (void *)kern_va;
+}
+
+/*
+ * Return @size bytes at @kern_va to @sch's arena pool. Does nothing if @sch
+ * has no arena pool or @kern_va is NULL.
+ */
+void scx_arena_free(struct scx_sched *sch, void *kern_va, size_t size)
+{
+       if (!sch->arena_pool || !kern_va)
+               return;
+
+       gen_pool_free(sch->arena_pool, (unsigned long)kern_va, size);
+}
diff --git a/kernel/sched/ext_arena.h b/kernel/sched/ext_arena.h
new file mode 100644 (file)
index 0000000..4f36101
--- /dev/null
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst
+ *
+ * Copyright (c) 2025 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2025 Tejun Heo <tj@kernel.org>
+ */
+#ifndef _KERNEL_SCHED_EXT_ARENA_H
+#define _KERNEL_SCHED_EXT_ARENA_H
+
+struct scx_sched;
+
+s32 scx_arena_pool_init(struct scx_sched *sch);
+void scx_arena_pool_destroy(struct scx_sched *sch);
+void *scx_arena_alloc(struct scx_sched *sch, size_t size);
+void scx_arena_free(struct scx_sched *sch, void *kern_va, size_t size);
+
+#endif /* _KERNEL_SCHED_EXT_ARENA_H */
index d40cfd29ddaa261389ce638e3f37c14d4401d8cb..ff7e882bd67a80ed2e2be33cd38989e3485f92a5 100644 (file)
@@ -1116,8 +1116,13 @@ struct scx_sched {
         * Arena map auto-discovered from member progs at struct_ops attach.
         * cid-form schedulers must use exactly one arena across all member
         * progs. NULL on cpu-form.
+        *
+        * @arena_pool sub-allocates @arena_map. Each gen_pool chunk is added
+        * at the kernel-side mapping address. Grows on demand and pages are
+        * not released until sched destroy.
         */
        struct bpf_map          *arena_map;
+       struct gen_pool         *arena_pool;
 
        DECLARE_BITMAP(has_op, SCX_OPI_END);