sched_ext: Add support for cgroup bandwidth control interface

author Tejun Heo <tj@kernel.org>

Sat, 14 Jun 2025 01:34:22 +0000 (15:34 -1000)

committer Tejun Heo <tj@kernel.org>

Sat, 21 Jun 2025 03:03:51 +0000 (17:03 -1000)
author Tejun Heo <tj@kernel.org>
Sat, 14 Jun 2025 01:34:22 +0000 (15:34 -1000)
committer Tejun Heo <tj@kernel.org>
Sat, 21 Jun 2025 03:03:51 +0000 (17:03 -1000)
diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h

index eda89acdb7ab1eccb5050fa7dbe11db63103cf11..8b92842776cb3b6292e72f40eb7625be00459f80 100644 (file)
--- a/include/linux/sched/ext.h
+++ b/include/linux/sched/ext.h
@@ -219,6 +219,9 @@ struct scx_task_group {
  #ifdef CONFIG_EXT_GROUP_SCHED
         u32                     flags;          /* SCX_TG_* */
         u32                     weight;
+       u64                     bw_period_us;
+       u64                     bw_quota_us;
+       u64                     bw_burst_us;
  #endif
  };
  
diff --git a/init/Kconfig b/init/Kconfig

index af4c2f0854554bbcdf193852cf5c1d2c2accc64f..baf59d2a20a29a7c4e0334acec7027a6ea482627 100644 (file)
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1065,6 +1065,9 @@ if CGROUP_SCHED
  config GROUP_SCHED_WEIGHT
         def_bool n
  
+config GROUP_SCHED_BANDWIDTH
+        def_bool n
+
  config FAIR_GROUP_SCHED
         bool "Group scheduling for SCHED_OTHER"
         depends on CGROUP_SCHED
@@ -1074,6 +1077,7 @@ config FAIR_GROUP_SCHED
  config CFS_BANDWIDTH
         bool "CPU bandwidth provisioning for FAIR_GROUP_SCHED"
         depends on FAIR_GROUP_SCHED
+       select GROUP_SCHED_BANDWIDTH
         default n
         help
           This option allows users to define CPU bandwidth rates (limits) for
@@ -1108,6 +1112,7 @@ config EXT_GROUP_SCHED
         bool
         depends on SCHED_CLASS_EXT && CGROUP_SCHED
         select GROUP_SCHED_WEIGHT
+       select GROUP_SCHED_BANDWIDTH
         default y
  
  endif #CGROUP_SCHED
diff --git a/kernel/sched/core.c b/kernel/sched/core.c

index 0e3a00e2a2cca23543402ed4c8cee4c6355c2761..91845d00a1cd244380069c137ddf24c4a7710e75 100644 (file)
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -9545,7 +9545,9 @@ static int cpu_cfs_local_stat_show(struct seq_file *sf, void *v)
  
         return 0;
  }
+#endif /* CONFIG_CFS_BANDWIDTH */
  
+#ifdef CONFIG_GROUP_SCHED_BANDWIDTH
  const u64 max_bw_quota_period_us = 1 * USEC_PER_SEC; /* 1s */
  static const u64 min_bw_quota_period_us = 1 * USEC_PER_MSEC; /* 1ms */
  /* More than 203 days if BW_SHIFT equals 20. */
@@ -9554,12 +9556,21 @@ static const u64 max_bw_runtime_us = MAX_BW;
  static void tg_bandwidth(struct task_group *tg,
                          u64 *period_us_p, u64 *quota_us_p, u64 *burst_us_p)
  {
+#ifdef CONFIG_CFS_BANDWIDTH
         if (period_us_p)
                 *period_us_p = tg_get_cfs_period(tg);
         if (quota_us_p)
                 *quota_us_p = tg_get_cfs_quota(tg);
         if (burst_us_p)
                 *burst_us_p = tg_get_cfs_burst(tg);
+#else /* !CONFIG_CFS_BANDWIDTH */
+       if (period_us_p)
+               *period_us_p = tg->scx.bw_period_us;
+       if (quota_us_p)
+               *quota_us_p = tg->scx.bw_quota_us;
+       if (burst_us_p)
+               *burst_us_p = tg->scx.bw_burst_us;
+#endif /* CONFIG_CFS_BANDWIDTH */
  }
  
  static u64 cpu_period_read_u64(struct cgroup_subsys_state *css,
@@ -9575,6 +9586,7 @@ static int tg_set_bandwidth(struct task_group *tg,
                             u64 period_us, u64 quota_us, u64 burst_us)
  {
         const u64 max_usec = U64_MAX / NSEC_PER_USEC;
+       int ret = 0;
  
         if (tg == &root_task_group)
                 return -EINVAL;
@@ -9612,7 +9624,12 @@ static int tg_set_bandwidth(struct task_group *tg,
                                         burst_us + quota_us > max_bw_runtime_us))
                 return -EINVAL;
  
-       return tg_set_cfs_bandwidth(tg, period_us, quota_us, burst_us);
+#ifdef CONFIG_CFS_BANDWIDTH
+       ret = tg_set_cfs_bandwidth(tg, period_us, quota_us, burst_us);
+#endif /* CONFIG_CFS_BANDWIDTH */
+       if (!ret)
+               scx_group_set_bandwidth(tg, period_us, quota_us, burst_us);
+       return ret;
  }
  
  static s64 cpu_quota_read_s64(struct cgroup_subsys_state *css,
@@ -9665,7 +9682,7 @@ static int cpu_burst_write_u64(struct cgroup_subsys_state *css,
         tg_bandwidth(tg, &period_us, &quota_us, NULL);
         return tg_set_bandwidth(tg, period_us, quota_us, burst_us);
  }
-#endif /* CONFIG_CFS_BANDWIDTH */
+#endif /* CONFIG_GROUP_SCHED_BANDWIDTH */
  
  #ifdef CONFIG_RT_GROUP_SCHED
  static int cpu_rt_runtime_write(struct cgroup_subsys_state *css,
@@ -9725,7 +9742,7 @@ static struct cftype cpu_legacy_files[] = {
                 .write_s64 = cpu_idle_write_s64,
         },
  #endif
-#ifdef CONFIG_CFS_BANDWIDTH
+#ifdef CONFIG_GROUP_SCHED_BANDWIDTH
         {
                 .name = "cfs_period_us",
                 .read_u64 = cpu_period_read_u64,
@@ -9741,6 +9758,8 @@ static struct cftype cpu_legacy_files[] = {
                 .read_u64 = cpu_burst_read_u64,
                 .write_u64 = cpu_burst_write_u64,
         },
+#endif
+#ifdef CONFIG_CFS_BANDWIDTH
         {
                 .name = "stat",
                 .seq_show = cpu_cfs_stat_show,
@@ -9954,7 +9973,7 @@ static int __maybe_unused cpu_period_quota_parse(char *buf, u64 *period_us_p,
         return 0;
  }
  
-#ifdef CONFIG_CFS_BANDWIDTH
+#ifdef CONFIG_GROUP_SCHED_BANDWIDTH
  static int cpu_max_show(struct seq_file *sf, void *v)
  {
         struct task_group *tg = css_tg(seq_css(sf));
@@ -10001,7 +10020,7 @@ static struct cftype cpu_files[] = {
                 .write_s64 = cpu_idle_write_s64,
         },
  #endif
-#ifdef CONFIG_CFS_BANDWIDTH
+#ifdef CONFIG_GROUP_SCHED_BANDWIDTH
         {
                 .name = "max",
                 .flags = CFTYPE_NOT_ON_ROOT,
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c

index 6732e50e0679334eb135be458888577a8496e1d0..39cba11688a9b5bf52ad824d591191b4bc45e748 100644 (file)
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -203,6 +203,11 @@ struct scx_exit_task_args {
  struct scx_cgroup_init_args {
         /* the weight of the cgroup [1..10000] */
         u32                     weight;
+
+       /* bandwidth control parameters from cpu.max and cpu.max.burst */
+       u64                     bw_period_us;
+       u64                     bw_quota_us;
+       u64                     bw_burst_us;
  };
  
  enum scx_cpu_preempt_reason {
@@ -664,9 +669,31 @@ struct sched_ext_ops {
          * @cgrp: cgroup whose weight is being updated
          * @weight: new weight [1..10000]
          *
-        * Update @tg's weight to @weight.
+        * Update @cgrp's weight to @weight.
          */
         void (*cgroup_set_weight)(struct cgroup *cgrp, u32 weight);
+
+       /**
+        * @cgroup_set_bandwidth: A cgroup's bandwidth is being changed
+        * @cgrp: cgroup whose bandwidth is being updated
+        * @period_us: bandwidth control period
+        * @quota_us: bandwidth control quota
+        * @burst_us: bandwidth control burst
+        *
+        * Update @cgrp's bandwidth control parameters. This is from the cpu.max
+        * cgroup interface.
+        *
+        * @quota_us / @period_us determines the CPU bandwidth @cgrp is entitled
+        * to. For example, if @period_us is 1_000_000 and @quota_us is
+        * 2_500_000, @cgrp is entitled to 2.5 CPUs. @burst_us can be
+        * interpreted in the same fashion and specifies how much @cgrp can
+        * burst temporarily. The specific control mechanism and thus the
+        * interpretation of @period_us and burstiness is up to the BPF
+        * scheduler.
+        */
+       void (*cgroup_set_bandwidth)(struct cgroup *cgrp,
+                                    u64 period_us, u64 quota_us, u64 burst_us);
+
  #endif /* CONFIG_EXT_GROUP_SCHED */
  
         /*
@@ -4059,6 +4086,8 @@ static bool scx_cgroup_enabled;
  void scx_tg_init(struct task_group *tg)
  {
         tg->scx.weight = CGROUP_WEIGHT_DFL;
+       tg->scx.bw_period_us = default_bw_period_us();
+       tg->scx.bw_quota_us = RUNTIME_INF;
  }
  
  int scx_tg_online(struct task_group *tg)
@@ -4073,7 +4102,10 @@ int scx_tg_online(struct task_group *tg)
         if (scx_cgroup_enabled) {
                 if (SCX_HAS_OP(sch, cgroup_init)) {
                         struct scx_cgroup_init_args args =
-                               { .weight = tg->scx.weight };
+                               { .weight = tg->scx.weight,
+                                 .bw_period_us = tg->scx.bw_period_us,
+                                 .bw_quota_us = tg->scx.bw_quota_us,
+                                 .bw_burst_us = tg->scx.bw_burst_us };
  
                         ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, cgroup_init,
                                               NULL, tg->css.cgroup, &args);
@@ -4225,6 +4257,27 @@ void scx_group_set_idle(struct task_group *tg, bool idle)
         /* TODO: Implement ops->cgroup_set_idle() */
  }
  
+void scx_group_set_bandwidth(struct task_group *tg,
+                            u64 period_us, u64 quota_us, u64 burst_us)
+{
+       struct scx_sched *sch = scx_root;
+
+       percpu_down_read(&scx_cgroup_rwsem);
+
+       if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_bandwidth) &&
+           (tg->scx.bw_period_us != period_us ||
+            tg->scx.bw_quota_us != quota_us ||
+            tg->scx.bw_burst_us != burst_us))
+               SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_set_bandwidth, NULL,
+                           tg_cgrp(tg), period_us, quota_us, burst_us);
+
+       tg->scx.bw_period_us = period_us;
+       tg->scx.bw_quota_us = quota_us;
+       tg->scx.bw_burst_us = burst_us;
+
+       percpu_up_read(&scx_cgroup_rwsem);
+}
+
  static void scx_cgroup_lock(void)
  {
         percpu_down_write(&scx_cgroup_rwsem);
@@ -4400,7 +4453,12 @@ static int scx_cgroup_init(struct scx_sched *sch)
         rcu_read_lock();
         css_for_each_descendant_pre(css, &root_task_group.css) {
                 struct task_group *tg = css_tg(css);
-               struct scx_cgroup_init_args args = { .weight = tg->scx.weight };
+               struct scx_cgroup_init_args args = {
+                       .weight = tg->scx.weight,
+                       .bw_period_us = tg->scx.bw_period_us,
+                       .bw_quota_us = tg->scx.bw_quota_us,
+                       .bw_burst_us = tg->scx.bw_burst_us,
+               };
  
                 if ((tg->scx.flags &
                      (SCX_TG_ONLINE | SCX_TG_INITED)) != SCX_TG_ONLINE)
@@ -5902,6 +5960,7 @@ static s32 sched_ext_ops__cgroup_prep_move(struct task_struct *p, struct cgroup
  static void sched_ext_ops__cgroup_move(struct task_struct *p, struct cgroup *from, struct cgroup *to) {}
  static void sched_ext_ops__cgroup_cancel_move(struct task_struct *p, struct cgroup *from, struct cgroup *to) {}
  static void sched_ext_ops__cgroup_set_weight(struct cgroup *cgrp, u32 weight) {}
+static void sched_ext_ops__cgroup_set_bandwidth(struct cgroup *cgrp, u64 period_us, u64 quota_us, u64 burst_us) {}
  #endif
  static void sched_ext_ops__cpu_online(s32 cpu) {}
  static void sched_ext_ops__cpu_offline(s32 cpu) {}
@@ -5939,6 +5998,7 @@ static struct sched_ext_ops __bpf_ops_sched_ext_ops = {
         .cgroup_move            = sched_ext_ops__cgroup_move,
         .cgroup_cancel_move     = sched_ext_ops__cgroup_cancel_move,
         .cgroup_set_weight      = sched_ext_ops__cgroup_set_weight,
+       .cgroup_set_bandwidth   = sched_ext_ops__cgroup_set_bandwidth,
  #endif
         .cpu_online             = sched_ext_ops__cpu_online,
         .cpu_offline            = sched_ext_ops__cpu_offline,
diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h

index e7bcaa02ea56d39a29b2a41e74742092f4aec0bd..292bb41a242ec1882d1f4bab06a78392e1c3d9b3 100644 (file)
--- a/kernel/sched/ext.h
+++ b/kernel/sched/ext.h
@@ -104,6 +104,7 @@ void scx_cgroup_finish_attach(void);
  void scx_cgroup_cancel_attach(struct cgroup_taskset *tset);
  void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight);
  void scx_group_set_idle(struct task_group *tg, bool idle);
+void scx_group_set_bandwidth(struct task_group *tg, u64 period_us, u64 quota_us, u64 burst_us);
  #else  /* CONFIG_EXT_GROUP_SCHED */
  static inline void scx_tg_init(struct task_group *tg) {}
  static inline int scx_tg_online(struct task_group *tg) { return 0; }
@@ -114,5 +115,6 @@ static inline void scx_cgroup_finish_attach(void) {}
  static inline void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) {}
  static inline void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight) {}
  static inline void scx_group_set_idle(struct task_group *tg, bool idle) {}
+static inline void scx_group_set_bandwidth(struct task_group *tg, u64 period_us, u64 quota_us, u64 burst_us) {}
  #endif /* CONFIG_EXT_GROUP_SCHED */
  #endif /* CONFIG_CGROUP_SCHED */
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h

index fdf5f52b54a3b11090d29ef4042b229d3433ee09..06767a21071754e83f24baa5ba9573cf91cccb89 100644 (file)
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -402,7 +402,7 @@ static inline bool dl_server_active(struct sched_dl_entity *dl_se)
  
  extern struct list_head task_groups;
  
-#ifdef CONFIG_CFS_BANDWIDTH
+#ifdef CONFIG_GROUP_SCHED_BANDWIDTH
  extern const u64 max_bw_quota_period_us;
  
  /*
@@ -413,7 +413,7 @@ static inline u64 default_bw_period_us(void)
  {
         return 100000ULL;
  }
-#endif /* CONFIG_CFS_BANDWIDTH */
+#endif /* CONFIG_GROUP_SCHED_BANDWIDTH */
  
  struct cfs_bandwidth {
  #ifdef CONFIG_CFS_BANDWIDTH
diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c

index c3cd9a17d48ef83e997a3a4f72e40b34de28688a..69d877501cb727fc315d2619af1a430f26da5d9e 100644 (file)
--- a/tools/sched_ext/scx_qmap.bpf.c
+++ b/tools/sched_ext/scx_qmap.bpf.c
@@ -615,6 +615,26 @@ void BPF_STRUCT_OPS(qmap_dump_task, struct scx_dump_ctx *dctx, struct task_struc
                      taskc->force_local, taskc->core_sched_seq);
  }
  
+s32 BPF_STRUCT_OPS(qmap_cgroup_init, struct cgroup *cgrp, struct scx_cgroup_init_args *args)
+{
+       bpf_printk("CGRP INIT %llu weight=%u period=%lu quota=%ld burst=%lu",
+                  cgrp->kn->id, args->weight, args->bw_period_us,
+                  args->bw_quota_us, args->bw_burst_us);
+       return 0;
+}
+
+void BPF_STRUCT_OPS(qmap_cgroup_set_weight, struct cgroup *cgrp, u32 weight)
+{
+       bpf_printk("CGRP SET %llu weight=%u", cgrp->kn->id, weight);
+}
+
+void BPF_STRUCT_OPS(qmap_cgroup_set_bandwidth, struct cgroup *cgrp,
+                   u64 period_us, u64 quota_us, u64 burst_us)
+{
+       bpf_printk("CGRP SET %llu period=%lu quota=%ld burst=%lu", cgrp->kn->id,
+                  period_us, quota_us, burst_us);
+}
+
  /*
   * Print out the online and possible CPU map using bpf_printk() as a
   * demonstration of using the cpumask kfuncs and ops.cpu_on/offline().
@@ -840,6 +860,9 @@ SCX_OPS_DEFINE(qmap_ops,
                .dump                    = (void *)qmap_dump,
                .dump_cpu                = (void *)qmap_dump_cpu,
                .dump_task               = (void *)qmap_dump_task,
+              .cgroup_init             = (void *)qmap_cgroup_init,
+              .cgroup_set_weight       = (void *)qmap_cgroup_set_weight,
+              .cgroup_set_bandwidth    = (void *)qmap_cgroup_set_bandwidth,
                .cpu_online              = (void *)qmap_cpu_online,
                .cpu_offline             = (void *)qmap_cpu_offline,
                .init                    = (void *)qmap_init,
diff --git a/tools/testing/selftests/sched_ext/maximal.bpf.c b/tools/testing/selftests/sched_ext/maximal.bpf.c

index 430f5e13bf554492e6505822f1654b7ebfd4d3af..01cf4f3da4e091366eedb3d7d8fb1f658063ce4b 100644 (file)
--- a/tools/testing/selftests/sched_ext/maximal.bpf.c
+++ b/tools/testing/selftests/sched_ext/maximal.bpf.c
@@ -123,6 +123,10 @@ void BPF_STRUCT_OPS(maximal_cgroup_cancel_move, struct task_struct *p,
  void BPF_STRUCT_OPS(maximal_cgroup_set_weight, struct cgroup *cgrp, u32 weight)
  {}
  
+void BPF_STRUCT_OPS(maximal_cgroup_set_bandwidth, struct cgroup *cgrp,
+                   u64 period_us, u64 quota_us, u64 burst_us)
+{}
+
  s32 BPF_STRUCT_OPS_SLEEPABLE(maximal_init)
  {
         return scx_bpf_create_dsq(DSQ_ID, -1);
@@ -160,6 +164,7 @@ struct sched_ext_ops maximal_ops = {
         .cgroup_move            = (void *) maximal_cgroup_move,
         .cgroup_cancel_move     = (void *) maximal_cgroup_cancel_move,
         .cgroup_set_weight      = (void *) maximal_cgroup_set_weight,
+       .cgroup_set_bandwidth   = (void *) maximal_cgroup_set_bandwidth,
         .init                   = (void *) maximal_init,
         .exit                   = (void *) maximal_exit,
         .name                   = "maximal",
author	Tejun Heo <tj@kernel.org>
	Sat, 14 Jun 2025 01:34:22 +0000 (15:34 -1000)
committer	Tejun Heo <tj@kernel.org>
	Sat, 21 Jun 2025 03:03:51 +0000 (17:03 -1000)
include/linux/sched/ext.h		patch \| blob \| blame \| history
init/Kconfig		patch \| blob \| blame \| history
kernel/sched/core.c		patch \| blob \| blame \| history
kernel/sched/ext.c		patch \| blob \| blame \| history
kernel/sched/ext.h		patch \| blob \| blame \| history
kernel/sched/sched.h		patch \| blob \| blame \| history
tools/sched_ext/scx_qmap.bpf.c		patch \| blob \| blame \| history
tools/testing/selftests/sched_ext/maximal.bpf.c		patch \| blob \| blame \| history