--- /dev/null
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Arena-based task data scheduler. This is a variation of scx_simple
+ * that uses a combined allocator and indexing structure to organize
+ * task data. Task contexts are allocated when a task enters the
+ * scheduler and freed when it exits. They are retrieved through
+ * task-local storage entries that point to the allocated arena memory.
+ *
+ * The main purpose of this scheduler is to demonstrate arena memory
+ * management.
+ *
+ * Copyright (c) 2024-2025 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2024-2025 Emil Tsalapatis <etsal@meta.com>
+ * Copyright (c) 2024-2025 Tejun Heo <tj@kernel.org>
+ *
+ */
+#include <scx/common.bpf.h>
+#include <scx/bpf_arena_common.bpf.h>
+
+#include "scx_sdt.h"
+
+char _license[] SEC("license") = "GPL";
+
+UEI_DEFINE(uei);
+
+struct {
+ __uint(type, BPF_MAP_TYPE_ARENA);
+ __uint(map_flags, BPF_F_MMAPABLE);
+#if defined(__TARGET_ARCH_arm64) || defined(__aarch64__)
+ __uint(max_entries, 1 << 16); /* number of pages */
+ __ulong(map_extra, (1ull << 32)); /* start of mmap() region */
+#else
+ __uint(max_entries, 1 << 20); /* number of pages */
+ __ulong(map_extra, (1ull << 44)); /* start of mmap() region */
+#endif
+} arena __weak SEC(".maps");
+
+#define SHARED_DSQ 0
+
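+/*
+ * Define a per-metric helper that bumps the counter in a task's
+ * arena-resident scx_stats, along with a global counter of the same name
+ * into which the per-task values are folded when the task exits.
+ */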
+#define DEFINE_SDT_STAT(metric) \
+static inline void \
+stat_inc_##metric(struct scx_stats __arena *stats) \
+{ \
+ cast_kern(stats); \
+ stats->metric += 1; \
+} \
+__u64 stat_##metric;
+
+DEFINE_SDT_STAT(enqueue);
+DEFINE_SDT_STAT(init);
+DEFINE_SDT_STAT(exit);
+DEFINE_SDT_STAT(select_idle_cpu);
+DEFINE_SDT_STAT(select_busy_cpu);
+
+/*
+ * Necessary for cond_break/can_loop semantics. According to kernel commit
+ * 011832b, the verifier must see the loop counter as imprecise and bounded
+ * for may_goto to help the loop converge; initializing the counter from a
+ * constant (e.g., i = 0;) makes it precise. Such loops must therefore
+ * initialize their counter from a variable whose value the verifier cannot
+ * reason about at verification time, so that the counter stays imprecise.
+ */
+static __u64 zero = 0;
+
+/*
+ * XXX Hack to get the verifier to find the arena for sdt_exit_task.
+ * As of 6.12-rc5, the verifier associates arenas with programs by
+ * checking LD.IMM instruction operands for an arena and populating
+ * the program state with the first instance it finds. This requires
+ * accessing our global arena variable, but scx methods do not necessarily
+ * do so while still using pointers from that arena. Insert a bpf_printk
+ * statement that triggers at most once to generate an LD.IMM instruction
+ * to access the arena and help the verifier.
+ */
+static volatile bool scx_arena_verify_once;
+
+__hidden void scx_arena_subprog_init(void)
+{
+ if (scx_arena_verify_once)
+ return;
+
+ bpf_printk("%s: arena pointer %p", __func__, &arena);
+ scx_arena_verify_once = true;
+}
+
+
+private(LOCK) struct bpf_spin_lock alloc_lock;
+private(POOL_LOCK) struct bpf_spin_lock alloc_pool_lock;
+
+/* allocation pools */
+struct sdt_pool desc_pool;
+struct sdt_pool chunk_pool;
+
+/* Protected by alloc_lock. */
+struct scx_alloc_stats alloc_stats;
+
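+/*
+ * Allocations are tracked by a SDT_TASK_LEVELS-deep tree. Internal and
+ * leaf nodes are sdt_chunk structs with SDT_TASK_ENTS_PER_CHUNK slots
+ * each, paired with an sdt_desc that keeps the slot occupancy bitmap and
+ * a free-slot count. The chunk and descriptor nodes come from chunk_pool
+ * and desc_pool, while leaf slots point to the actual data elements,
+ * which are carved out of the per-allocator pool.
+ */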
+
+/* Allocate an element from the pool. Must be called with the pool lock held. */
+static
+void __arena *scx_alloc_from_pool(struct sdt_pool *pool)
+{
+ __u64 elem_size, max_elems;
+ void __arena *slab;
+ void __arena *ptr;
+
+ elem_size = pool->elem_size;
+ max_elems = pool->max_elems;
+
+ /* If the current slab is exhausted, allocate a new one. */
+ if (pool->idx >= max_elems) {
+ slab = bpf_arena_alloc_pages(&arena, NULL,
+ div_round_up(max_elems * elem_size, PAGE_SIZE), NUMA_NO_NODE, 0);
+ if (!slab)
+ return NULL;
+
+ pool->slab = slab;
+ pool->idx = 0;
+ }
+
+ ptr = (void __arena *)((__u64) pool->slab + elem_size * pool->idx);
+ pool->idx += 1;
+
+ return ptr;
+}
+
+/* Allocate a descriptor and its associated chunk. Called with the allocator spinlock held. */
+static sdt_desc_t *scx_alloc_chunk(void)
+{
+ struct sdt_chunk __arena *chunk;
+ sdt_desc_t *desc;
+ sdt_desc_t *out;
+
+ chunk = scx_alloc_from_pool(&chunk_pool);
+ if (!chunk)
+ return NULL;
+
+ desc = scx_alloc_from_pool(&desc_pool);
+ if (!desc) {
+ /*
+ * Effectively frees the previous chunk allocation.
+ * Index cannot be 0, so decrementing is always
+ * valid.
+ */
+ chunk_pool.idx -= 1;
+ return NULL;
+ }
+
+ out = desc;
+
+ desc->nr_free = SDT_TASK_ENTS_PER_CHUNK;
+ desc->chunk = chunk;
+
+ alloc_stats.chunk_allocs += 1;
+
+ return out;
+}
+
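+/*
+ * Size a pool so that it carves data_size-byte elements out of slabs of
+ * nr_pages pages, deferring the first slab allocation to first use.
+ */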
+static int pool_set_size(struct sdt_pool *pool, __u64 data_size, __u64 nr_pages)
+{
+ if (unlikely(data_size % 8))
+ return -EINVAL;
+
+ if (unlikely(nr_pages == 0))
+ return -EINVAL;
+
+ pool->elem_size = data_size;
+ pool->max_elems = (PAGE_SIZE * nr_pages) / pool->elem_size;
+ /* Populate the pool slab on the first allocation. */
+ pool->idx = pool->max_elems;
+
+ return 0;
+}
+
+/* Initialize both the base pool allocators and the root chunk of the index. */
+__hidden int
+scx_alloc_init(struct scx_allocator *alloc, __u64 data_size)
+{
+ size_t min_chunk_size;
+ int ret;
+
+ _Static_assert(sizeof(struct sdt_chunk) <= PAGE_SIZE,
+ "chunk size must fit into a page");
+
+ ret = pool_set_size(&chunk_pool, sizeof(struct sdt_chunk), 1);
+ if (ret != 0)
+ return ret;
+
+ ret = pool_set_size(&desc_pool, sizeof(struct sdt_desc), 1);
+ if (ret != 0)
+ return ret;
+
+ /* Wrap data into a descriptor and word align. */
+ data_size += sizeof(struct sdt_data);
+ data_size = round_up(data_size, 8);
+
+ /*
+ * Ensure we allocate large enough slabs from the arena to avoid excessive
+ * internal fragmentation when carving them into elements.
+ */
+ min_chunk_size = div_round_up(SDT_TASK_MIN_ELEM_PER_ALLOC * data_size, PAGE_SIZE);
+ ret = pool_set_size(&alloc->pool, data_size, min_chunk_size);
+ if (ret != 0)
+ return ret;
+
+ bpf_spin_lock(&alloc_lock);
+ alloc->root = scx_alloc_chunk();
+ bpf_spin_unlock(&alloc_lock);
+ if (!alloc->root)
+ return -ENOMEM;
+
+ return 0;
+}
+
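+/* Set or clear the bit tracking whether slot pos of the descriptor is allocated. */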
+static
+int set_idx_state(sdt_desc_t *desc, __u64 pos, bool state)
+{
+ __u64 __arena *allocated = desc->allocated;
+ __u64 bit;
+
+ if (unlikely(pos >= SDT_TASK_ENTS_PER_CHUNK))
+ return -EINVAL;
+
+ bit = (__u64)1 << (pos % 64);
+
+ if (state)
+ allocated[pos / 64] |= bit;
+ else
+ allocated[pos / 64] &= ~bit;
+
+ return 0;
+}
+
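+/*
+ * Walk the recorded path from the leaf level towards the root, marking the
+ * freed slot as available at each level. Stop as soon as a descriptor
+ * already had free entries, since its ancestors already advertise
+ * availability.
+ */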
+static __noinline
+int mark_nodes_avail(sdt_desc_t *lv_desc[SDT_TASK_LEVELS], __u64 lv_pos[SDT_TASK_LEVELS])
+{
+ sdt_desc_t *desc;
+ __u64 u, level;
+ int ret;
+
+ for (u = zero; u < SDT_TASK_LEVELS && can_loop; u++) {
+ level = SDT_TASK_LEVELS - 1 - u;
+
+ /*
+ * Mark the slot free at this level. Keep propagating upwards only
+ * if this was the descriptor's first free entry; otherwise the
+ * ancestors already advertise availability.
+ */
+ desc = lv_desc[level];
+
+ ret = set_idx_state(desc, lv_pos[level], false);
+ if (unlikely(ret != 0))
+ return ret;
+
+ desc->nr_free += 1;
+ if (desc->nr_free > 1)
+ return 0;
+ }
+
+ return 0;
+}
+
+/*
+ * Free the allocated element with the given index. Called with the
+ * allocator lock held.
+ */
+__hidden
+int scx_alloc_free_idx(struct scx_allocator *alloc, __u64 idx)
+{
+ const __u64 mask = (1 << SDT_TASK_ENTS_PER_PAGE_SHIFT) - 1;
+ sdt_desc_t *lv_desc[SDT_TASK_LEVELS];
+ sdt_desc_t * __arena *desc_children;
+ struct sdt_chunk __arena *chunk;
+ sdt_desc_t *desc;
+ struct sdt_data __arena *data;
+ __u64 level, shift, pos;
+ __u64 lv_pos[SDT_TASK_LEVELS];
+ int ret;
+ int i;
+
+ if (!alloc)
+ return 0;
+
+ desc = alloc->root;
+ if (unlikely(!desc))
+ return -EINVAL;
+
+ /* To appease the verifier. */
+ for (level = zero; level < SDT_TASK_LEVELS && can_loop; level++) {
+ lv_desc[level] = NULL;
+ lv_pos[level] = 0;
+ }
+
+ /* Find the leaf node containing the index. */
+ for (level = zero; level < SDT_TASK_LEVELS && can_loop; level++) {
+ shift = (SDT_TASK_LEVELS - 1 - level) * SDT_TASK_ENTS_PER_PAGE_SHIFT;
+ pos = (idx >> shift) & mask;
+
+ lv_desc[level] = desc;
+ lv_pos[level] = pos;
+
+ if (level == SDT_TASK_LEVELS - 1)
+ break;
+
+ chunk = desc->chunk;
+
+ desc_children = (sdt_desc_t * __arena *)chunk->descs;
+ desc = desc_children[pos];
+
+ if (unlikely(!desc))
+ return -EINVAL;
+ }
+
+ chunk = desc->chunk;
+
+ pos = idx & mask;
+ data = chunk->data[pos];
+ if (likely(data)) {
+ data[0] = (struct sdt_data) {
+ .tid.genn = data->tid.genn + 1,
+ };
+
+ /* Zero out one word at a time. */
+ for (i = zero; i < alloc->pool.elem_size / 8 && can_loop; i++) {
+ data->payload[i] = 0;
+ }
+ }
+
+ ret = mark_nodes_avail(lv_desc, lv_pos);
+ if (unlikely(ret != 0))
+ return ret;
+
+ alloc_stats.active_allocs -= 1;
+ alloc_stats.free_ops += 1;
+
+ return 0;
+}
+
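+/*
+ * Return the 0-based index of the least significant set bit. Callers must
+ * pass a nonzero word.
+ */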
+static inline
+int ffs(__u64 word)
+{
+ unsigned int num = 0;
+
+ if ((word & 0xffffffff) == 0) {
+ num += 32;
+ word >>= 32;
+ }
+
+ if ((word & 0xffff) == 0) {
+ num += 16;
+ word >>= 16;
+ }
+
+ if ((word & 0xff) == 0) {
+ num += 8;
+ word >>= 8;
+ }
+
+ if ((word & 0xf) == 0) {
+ num += 4;
+ word >>= 4;
+ }
+
+ if ((word & 0x3) == 0) {
+ num += 2;
+ word >>= 2;
+ }
+
+ if ((word & 0x1) == 0) {
+ num += 1;
+ word >>= 1;
+ }
+
+ return num;
+}
+
+
+/* Find the first empty slot in the chunk's allocation bitmap. */
+__hidden
+__u64 chunk_find_empty(sdt_desc_t __arg_arena *desc)
+{
+ __u64 freeslots;
+ __u64 i;
+
+ for (i = 0; i < SDT_TASK_CHUNK_BITMAP_U64S; i++) {
+ freeslots = ~desc->allocated[i];
+ if (freeslots == (__u64)0)
+ continue;
+
+ return (i * 64) + ffs(freeslots);
+ }
+
+ return SDT_TASK_ENTS_PER_CHUNK;
+}
+
+/*
+ * Find and return an available index in the allocator.
+ * Called with the allocator spinlock held.
+ */
+static sdt_desc_t * desc_find_empty(sdt_desc_t *desc, __u64 *idxp)
+{
+ sdt_desc_t *lv_desc[SDT_TASK_LEVELS];
+ sdt_desc_t * __arena *desc_children;
+ struct sdt_chunk __arena *chunk;
+ sdt_desc_t *tmp;
+ __u64 lv_pos[SDT_TASK_LEVELS];
+ __u64 u, pos, level;
+ __u64 idx = 0;
+ int ret;
+
+ for (level = zero; level < SDT_TASK_LEVELS && can_loop; level++) {
+ pos = chunk_find_empty(desc);
+
+ /* If we error out, something has gone very wrong. */
+ if (unlikely(pos > SDT_TASK_ENTS_PER_CHUNK))
+ return NULL;
+
+ if (pos == SDT_TASK_ENTS_PER_CHUNK)
+ return NULL;
+
+ idx <<= SDT_TASK_ENTS_PER_PAGE_SHIFT;
+ idx |= pos;
+
+ /* Record the path so the allocation can be committed below. */
+ lv_desc[level] = desc;
+ lv_pos[level] = pos;
+
+ /* The rest of the loop is for internal node traversal. */
+ if (level == SDT_TASK_LEVELS - 1)
+ break;
+
+ /* Allocate an internal node if necessary. */
+ chunk = desc->chunk;
+ desc_children = (sdt_desc_t * __arena *)chunk->descs;
+
+ desc = desc_children[pos];
+ if (!desc) {
+ desc = scx_alloc_chunk();
+ if (!desc)
+ return NULL;
+
+ desc_children[pos] = desc;
+ }
+ }
+
+ /*
+ * Finding the descriptor along with any internal node
+ * allocations was successful. Update all levels with
+ * the new allocation.
+ */
+ bpf_for(u, 0, SDT_TASK_LEVELS) {
+ level = SDT_TASK_LEVELS - 1 - u;
+ tmp = lv_desc[level];
+
+ ret = set_idx_state(tmp, lv_pos[level], true);
+ if (ret != 0)
+ break;
+
+ tmp->nr_free -= 1;
+ if (tmp->nr_free > 0)
+ break;
+ }
+
+ *idxp = idx;
+
+ return desc;
+}
+
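+/*
+ * Allocate an element from the allocator: claim a free index in the tree,
+ * populate the leaf slot from the element pool if it is empty, and stamp
+ * the element with its index.
+ */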
+__hidden
+void __arena *scx_alloc(struct scx_allocator *alloc)
+{
+ struct sdt_data __arena *data = NULL;
+ struct sdt_chunk __arena *chunk;
+ sdt_desc_t *desc;
+ __u64 idx, pos;
+
+ if (!alloc)
+ return NULL;
+
+ bpf_spin_lock(&alloc_lock);
+
+ /* The lock is dropped on every error path below. */
+ desc = desc_find_empty(alloc->root, &idx);
+ if (unlikely(desc == NULL)) {
+ bpf_spin_unlock(&alloc_lock);
+ return NULL;
+ }
+
+ chunk = desc->chunk;
+
+ /* Populate the leaf node if necessary. */
+ pos = idx & (SDT_TASK_ENTS_PER_CHUNK - 1);
+ data = chunk->data[pos];
+ if (!data) {
+ data = scx_alloc_from_pool(&alloc->pool);
+ if (!data) {
+ scx_alloc_free_idx(alloc, idx);
+ bpf_spin_unlock(&alloc_lock);
+ return NULL;
+ }
+ }
+
+ chunk->data[pos] = data;
+
+ /* The data counts as a chunk */
+ alloc_stats.data_allocs += 1;
+ alloc_stats.alloc_ops += 1;
+ alloc_stats.active_allocs += 1;
+
+ data->tid.idx = idx;
+
+ bpf_spin_unlock(&alloc_lock);
+
+ return data;
+}
+
+/*
+ * Task BPF map entry recording the task's assigned ID and pointing to the
+ * data area allocated in the arena.
+ */
+struct scx_task_map_val {
+ union sdt_id tid;
+ __u64 tptr;
+ struct sdt_data __arena *data;
+};
+
+struct {
+ __uint(type, BPF_MAP_TYPE_TASK_STORAGE);
+ __uint(map_flags, BPF_F_NO_PREALLOC);
+ __type(key, int);
+ __type(value, struct scx_task_map_val);
+} scx_task_map SEC(".maps");
+
+static struct scx_allocator scx_task_allocator;
+
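+/*
+ * Allocate an arena context for the task and record the element's ID and
+ * pointer in the task's local storage. Returns a pointer to the payload.
+ */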
+__hidden
+void __arena *scx_task_alloc(struct task_struct *p)
+{
+ struct sdt_data __arena *data = NULL;
+ struct scx_task_map_val *mval;
+
+ mval = bpf_task_storage_get(&scx_task_map, p, 0,
+ BPF_LOCAL_STORAGE_GET_F_CREATE);
+ if (!mval)
+ return NULL;
+
+ data = scx_alloc(&scx_task_allocator);
+ if (unlikely(!data))
+ return NULL;
+
+ mval->tid = data->tid;
+ mval->tptr = (__u64) p;
+ mval->data = data;
+
+ return (void __arena *)data->payload;
+}
+
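+/* Initialize the task context allocator for payloads of data_size bytes. */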
+__hidden
+int scx_task_init(__u64 data_size)
+{
+ return scx_alloc_init(&scx_task_allocator, data_size);
+}
+
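+/* Look up the task's arena context through its task-local storage entry. */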
+__hidden
+void __arena *scx_task_data(struct task_struct *p)
+{
+ struct sdt_data __arena *data;
+ struct scx_task_map_val *mval;
+
+ scx_arena_subprog_init();
+
+ mval = bpf_task_storage_get(&scx_task_map, p, 0, 0);
+ if (!mval)
+ return NULL;
+
+ data = mval->data;
+
+ return (void __arena *)data->payload;
+}
+
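+/* Release the task's arena context and drop its task-local storage entry. */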
+__hidden
+void scx_task_free(struct task_struct *p)
+{
+ struct scx_task_map_val *mval;
+
+ scx_arena_subprog_init();
+
+ mval = bpf_task_storage_get(&scx_task_map, p, 0, 0);
+ if (!mval)
+ return;
+
+ bpf_spin_lock(&alloc_lock);
+ scx_alloc_free_idx(&scx_task_allocator, mval->tid.idx);
+ bpf_spin_unlock(&alloc_lock);
+
+ bpf_task_storage_delete(&scx_task_map, p);
+}
+
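+/* Fold a task's per-task counters into the global counters. */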
+static inline void
+scx_stat_global_update(struct scx_stats __arena *stats)
+{
+ cast_kern(stats);
+ __sync_fetch_and_add(&stat_enqueue, stats->enqueue);
+ __sync_fetch_and_add(&stat_init, stats->init);
+ __sync_fetch_and_add(&stat_exit, stats->exit);
+ __sync_fetch_and_add(&stat_select_idle_cpu, stats->select_idle_cpu);
+ __sync_fetch_and_add(&stat_select_busy_cpu, stats->select_busy_cpu);
+}
+
+s32 BPF_STRUCT_OPS(sdt_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags)
+{
+ struct scx_stats __arena *stats;
+ bool is_idle = false;
+ s32 cpu;
+
+ stats = scx_task_data(p);
+ if (!stats) {
+ scx_bpf_error("%s: no stats for pid %d", __func__, p->pid);
+ return 0;
+ }
+
+ cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle);
+ if (is_idle) {
+ stat_inc_select_idle_cpu(stats);
+ scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
+ } else {
+ stat_inc_select_busy_cpu(stats);
+ }
+
+ return cpu;
+}
+
+void BPF_STRUCT_OPS(sdt_enqueue, struct task_struct *p, u64 enq_flags)
+{
+ struct scx_stats __arena *stats;
+
+ stats = scx_task_data(p);
+ if (!stats) {
+ scx_bpf_error("%s: no stats for pid %d", __func__, p->pid);
+ return;
+ }
+
+ stat_inc_enqueue(stats);
+
+ scx_bpf_dsq_insert(p, SHARED_DSQ, SCX_SLICE_DFL, enq_flags);
+}
+
+void BPF_STRUCT_OPS(sdt_dispatch, s32 cpu, struct task_struct *prev)
+{
+ scx_bpf_dsq_move_to_local(SHARED_DSQ);
+}
+
+s32 BPF_STRUCT_OPS_SLEEPABLE(sdt_init_task, struct task_struct *p,
+ struct scx_init_task_args *args)
+{
+ struct scx_stats __arena *stats;
+
+ stats = scx_task_alloc(p);
+ if (!stats) {
+ scx_bpf_error("arena allocator out of memory");
+ return -ENOMEM;
+ }
+
+ stats->pid = p->pid;
+
+ stat_inc_init(stats);
+
+ return 0;
+}
+
+void BPF_STRUCT_OPS(sdt_exit_task, struct task_struct *p,
+ struct scx_exit_task_args *args)
+{
+ struct scx_stats __arena *stats;
+
+ stats = scx_task_data(p);
+ if (!stats) {
+ scx_bpf_error("%s: no stats for pid %d", __func__, p->pid);
+ return;
+ }
+
+ stat_inc_exit(stats);
+ scx_stat_global_update(stats);
+
+ scx_task_free(p);
+}
+
+s32 BPF_STRUCT_OPS_SLEEPABLE(sdt_init)
+{
+ int ret;
+
+ ret = scx_task_init(sizeof(struct scx_stats));
+ if (ret < 0) {
+ scx_bpf_error("%s: failed with %d", __func__, ret);
+ return ret;
+ }
+
+ return scx_bpf_create_dsq(SHARED_DSQ, -1);
+}
+
+void BPF_STRUCT_OPS(sdt_exit, struct scx_exit_info *ei)
+{
+ UEI_RECORD(uei, ei);
+}
+
+SCX_OPS_DEFINE(sdt_ops,
+ .select_cpu = (void *)sdt_select_cpu,
+ .enqueue = (void *)sdt_enqueue,
+ .dispatch = (void *)sdt_dispatch,
+ .init_task = (void *)sdt_init_task,
+ .exit_task = (void *)sdt_exit_task,
+ .init = (void *)sdt_init,
+ .exit = (void *)sdt_exit,
+ .name = "sdt");