sched_ext: Allocate scx_kick_cpus_pnt_seqs lazily using kvzalloc()

author Tejun Heo <tj@kernel.org>

Wed, 8 Oct 2025 23:43:26 +0000 (13:43 -1000)

committer Tejun Heo <tj@kernel.org>

Mon, 13 Oct 2025 18:42:19 +0000 (08:42 -1000)
author Tejun Heo <tj@kernel.org>
Wed, 8 Oct 2025 23:43:26 +0000 (13:43 -1000)
committer Tejun Heo <tj@kernel.org>
Mon, 13 Oct 2025 18:42:19 +0000 (08:42 -1000)
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c

index 1352e6a5b0898898a791ad26ac7da342d1b496b9..c645d47124e72aa40762fc8c38791ff1454b03c5 100644 (file)
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -67,8 +67,19 @@ static unsigned long scx_watchdog_timestamp = INITIAL_JIFFIES;
  
  static struct delayed_work scx_watchdog_work;
  
-/* for %SCX_KICK_WAIT */
-static unsigned long __percpu *scx_kick_cpus_pnt_seqs;
+/*
+ * For %SCX_KICK_WAIT: Each CPU has a pointer to an array of pick_task sequence
+ * numbers. The arrays are allocated with kvzalloc() as size can exceed percpu
+ * allocator limits on large machines. O(nr_cpu_ids^2) allocation, allocated
+ * lazily when enabling and freed when disabling to avoid waste when sched_ext
+ * isn't active.
+ */
+struct scx_kick_pseqs {
+       struct rcu_head         rcu;    /* lets the array be freed only after an RCU grace period */
+       unsigned long           seqs[]; /* pick_task sequence numbers; alloc_kick_pseqs() sizes this to nr_cpu_ids entries */
+};
+
+static DEFINE_PER_CPU(struct scx_kick_pseqs __rcu *, scx_kick_pseqs);
  
  /*
   * Direct dispatch marker.
@@ -3877,6 +3888,27 @@ static const char *scx_exit_reason(enum scx_exit_kind kind)
         }
  }
  
+/*
+ * Detach every possible CPU's kick sequence array and free it once an RCU
+ * grace period has elapsed.
+ *
+ * Each per-CPU pointer is reset to NULL before the old array is queued for
+ * freeing, so a subsequent lookup on that CPU observes "no array" instead of
+ * memory that is about to go away. CPUs whose pointer is already NULL are
+ * skipped, which makes the function safe to call after a partially completed
+ * alloc_kick_pseqs().
+ *
+ * NOTE(review): the visible call sites appear to run with scx_enable_mutex
+ * held, which is presumably why the lockdep condition handed to
+ * rcu_replace_pointer() is simply true - confirm for any new caller.
+ */
+static void free_kick_pseqs(void)
+{
+       int cpu;
+
+       for_each_possible_cpu(cpu) {
+               struct scx_kick_pseqs **pseqs = per_cpu_ptr(&scx_kick_pseqs, cpu);
+               struct scx_kick_pseqs *to_free;
+
+               to_free = rcu_replace_pointer(*pseqs, NULL, true);
+
+               /*
+                * The arrays come from kvzalloc_node() and may therefore be
+                * either kmalloc or vmalloc backed. kvfree_rcu() copes with
+                * both and replaces a hand-written call_rcu() callback whose
+                * only job was to call kvfree().
+                */
+               if (to_free)
+                       kvfree_rcu(to_free, rcu);
+       }
+}
+
  static void scx_disable_workfn(struct kthread_work *work)
  {
         struct scx_sched *sch = container_of(work, struct scx_sched, disable_work);
@@ -4013,6 +4045,7 @@ static void scx_disable_workfn(struct kthread_work *work)
         free_percpu(scx_dsp_ctx);
         scx_dsp_ctx = NULL;
         scx_dsp_max_batch = 0;
+       free_kick_pseqs();
  
         mutex_unlock(&scx_enable_mutex);
  
@@ -4375,6 +4408,33 @@ static void scx_vexit(struct scx_sched *sch,
         irq_work_queue(&sch->error_irq_work);
  }
  
+/*
+ * Allocate one zeroed kick sequence array per possible CPU and publish it
+ * through the per-CPU scx_kick_pseqs pointer.
+ *
+ * Returns 0 on success, -ENOMEM if any allocation fails (everything allocated
+ * so far is released via free_kick_pseqs()), or -EBUSY if some CPU already has
+ * an array installed. In the -EBUSY case nothing is modified: the existing
+ * arrays are neither overwritten (which would leak them) nor freed, so a
+ * caller that reaches this function while the arrays are still in use cannot
+ * pull them out from under their current user.
+ */
+static int alloc_kick_pseqs(void)
+{
+       int cpu;
+
+       /*
+        * Verify up front that no CPU has an array yet. Doing this in a
+        * separate pass keeps the function all-or-nothing: we never install
+        * some new arrays and then discover a live one halfway through.
+        */
+       for_each_possible_cpu(cpu) {
+               struct scx_kick_pseqs **pseqs = per_cpu_ptr(&scx_kick_pseqs, cpu);
+
+               if (WARN_ON_ONCE(rcu_access_pointer(*pseqs)))
+                       return -EBUSY;
+       }
+
+       /*
+        * Allocate per-CPU arrays sized by nr_cpu_ids. Use kvzalloc as size
+        * can exceed percpu allocator limits on large machines.
+        */
+       for_each_possible_cpu(cpu) {
+               struct scx_kick_pseqs **pseqs = per_cpu_ptr(&scx_kick_pseqs, cpu);
+               struct scx_kick_pseqs *new_pseqs;
+
+               /* place each array on the memory node of the CPU that owns it */
+               new_pseqs = kvzalloc_node(struct_size(new_pseqs, seqs, nr_cpu_ids),
+                                         GFP_KERNEL, cpu_to_node(cpu));
+               if (!new_pseqs) {
+                       /* only arrays installed by this call can be present here */
+                       free_kick_pseqs();
+                       return -ENOMEM;
+               }
+
+               rcu_assign_pointer(*pseqs, new_pseqs);
+       }
+
+       return 0;
+}
+
  static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops)
  {
         struct scx_sched *sch;
@@ -4517,15 +4577,19 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
  
         mutex_lock(&scx_enable_mutex);
  
+       ret = alloc_kick_pseqs();
+       if (ret)
+               goto err_unlock;
+
         if (scx_enable_state() != SCX_DISABLED) {
                 ret = -EBUSY;
-               goto err_unlock;
+               goto err_free_pseqs;
         }
  
         sch = scx_alloc_and_add_sched(ops);
         if (IS_ERR(sch)) {
                 ret = PTR_ERR(sch);
-               goto err_unlock;
+               goto err_free_pseqs;
         }
  
         /*
@@ -4728,6 +4792,8 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
  
         return 0;
  
+err_free_pseqs:
+       free_kick_pseqs();
  err_unlock:
         mutex_unlock(&scx_enable_mutex);
         return ret;
@@ -5109,10 +5175,18 @@ static void kick_cpus_irq_workfn(struct irq_work *irq_work)
  {
         struct rq *this_rq = this_rq();
         struct scx_rq *this_scx = &this_rq->scx;
-       unsigned long *pseqs = this_cpu_ptr(scx_kick_cpus_pnt_seqs);
+       struct scx_kick_pseqs __rcu *pseqs_pcpu = __this_cpu_read(scx_kick_pseqs);
         bool should_wait = false;
+       unsigned long *pseqs;
         s32 cpu;
  
+       if (unlikely(!pseqs_pcpu)) {
+               pr_warn_once("kick_cpus_irq_workfn() called with NULL scx_kick_pseqs");
+               return;
+       }
+
+       pseqs = rcu_dereference_bh(pseqs_pcpu)->seqs;
+
         for_each_cpu(cpu, this_scx->cpus_to_kick) {
                 should_wait |= kick_one_cpu(cpu, this_rq, pseqs);
                 cpumask_clear_cpu(cpu, this_scx->cpus_to_kick);
@@ -5235,11 +5309,6 @@ void __init init_sched_ext_class(void)
  
         scx_idle_init_masks();
  
-       scx_kick_cpus_pnt_seqs =
-               __alloc_percpu(sizeof(scx_kick_cpus_pnt_seqs[0]) * nr_cpu_ids,
-                              __alignof__(scx_kick_cpus_pnt_seqs[0]));
-       BUG_ON(!scx_kick_cpus_pnt_seqs);
-
         for_each_possible_cpu(cpu) {
                 struct rq *rq = cpu_rq(cpu);
                 int  n = cpu_to_node(cpu);
author	Tejun Heo <tj@kernel.org>
	Wed, 8 Oct 2025 23:43:26 +0000 (13:43 -1000)
committer	Tejun Heo <tj@kernel.org>
	Mon, 13 Oct 2025 18:42:19 +0000 (08:42 -1000)