From a5bd6ba30b3364354269b81ac55c2edca9a96d6d Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Wed, 3 Sep 2025 11:36:07 -1000
Subject: [PATCH] sched_ext: Use cgroup_lock/unlock() to synchronize against
 cgroup operations

SCX hooks into CPU cgroup controller operations and read-locks
scx_cgroup_rwsem to exclude them while enabling and disabling
schedulers. While this works, it's unnecessarily complicated given that
cgroup_[un]lock() are available and thus the cgroup operations can be
locked out that way.

Drop scx_cgroup_rwsem locking from the tg on/offline and cgroup
[can_]attach operations. Instead, grab cgroup_lock() from
scx_cgroup_lock(). Drop scx_cgroup_finish_attach() which is no longer
necessary. Drop the now unnecessary rcu locking and css ref bumping in
scx_cgroup_init() and scx_cgroup_exit().

As scx_cgroup_set_weight/bandwidth() paths aren't protected by
cgroup_lock(), rename scx_cgroup_rwsem to scx_cgroup_ops_rwsem and
retain the locking there.

This is overall simpler and will also allow enable/disable paths to
synchronize against cgroup changes independent of the CPU controller.

Signed-off-by: Tejun Heo
Cc: Peter Zijlstra
Acked-by: Andrea Righi
---
 kernel/sched/core.c |  2 --
 kernel/sched/ext.c  | 66 ++++++++++-----------------------------------
 kernel/sched/ext.h  |  2 --
 3 files changed, 14 insertions(+), 56 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index be00629f0ba4c..27dda808ed839 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -9362,8 +9362,6 @@ static void cpu_cgroup_attach(struct cgroup_taskset *tset)
 
 	cgroup_taskset_for_each(task, css, tset)
 		sched_move_task(task, false);
-
-	scx_cgroup_finish_attach();
 }
 
 static void cpu_cgroup_cancel_attach(struct cgroup_taskset *tset)
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 701ca239ad003..520f20ffb7bfd 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -3055,7 +3055,7 @@ bool scx_can_stop_tick(struct rq *rq)
 
 #ifdef CONFIG_EXT_GROUP_SCHED
 
-DEFINE_STATIC_PERCPU_RWSEM(scx_cgroup_rwsem);
+DEFINE_STATIC_PERCPU_RWSEM(scx_cgroup_ops_rwsem);
 static bool scx_cgroup_enabled;
 
 void scx_tg_init(struct task_group *tg)
@@ -3072,8 +3072,6 @@ int scx_tg_online(struct task_group *tg)
 
 	WARN_ON_ONCE(tg->scx.flags & (SCX_TG_ONLINE | SCX_TG_INITED));
 
-	percpu_down_read(&scx_cgroup_rwsem);
-
 	if (scx_cgroup_enabled) {
 		if (SCX_HAS_OP(sch, cgroup_init)) {
 			struct scx_cgroup_init_args args =
@@ -3093,7 +3091,6 @@ int scx_tg_online(struct task_group *tg)
 		tg->scx.flags |= SCX_TG_ONLINE;
 	}
 
-	percpu_up_read(&scx_cgroup_rwsem);
 	return ret;
 }
 
@@ -3103,15 +3100,11 @@ void scx_tg_offline(struct task_group *tg)
 
 	WARN_ON_ONCE(!(tg->scx.flags & SCX_TG_ONLINE));
 
-	percpu_down_read(&scx_cgroup_rwsem);
-
 	if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_exit) &&
 	    (tg->scx.flags & SCX_TG_INITED))
 		SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_exit, NULL, tg->css.cgroup);
 	tg->scx.flags &= ~(SCX_TG_ONLINE | SCX_TG_INITED);
-
-	percpu_up_read(&scx_cgroup_rwsem);
 }
 
 int scx_cgroup_can_attach(struct cgroup_taskset *tset)
@@ -3121,9 +3114,6 @@ int scx_cgroup_can_attach(struct cgroup_taskset *tset)
 	struct task_struct *p;
 	int ret;
 
-	/* released in scx_finish/cancel_attach() */
-	percpu_down_read(&scx_cgroup_rwsem);
-
 	if (!scx_cgroup_enabled)
 		return 0;
 
@@ -3163,7 +3153,6 @@ err:
 		p->scx.cgrp_moving_from = NULL;
 	}
 
-	percpu_up_read(&scx_cgroup_rwsem);
 	return ops_sanitize_err(sch, "cgroup_prep_move", ret);
 }
 
@@ -3186,11 +3175,6 @@ void scx_cgroup_move_task(struct task_struct *p)
 	p->scx.cgrp_moving_from = NULL;
 }
 
-void scx_cgroup_finish_attach(void)
-{
-	percpu_up_read(&scx_cgroup_rwsem);
-}
-
 void scx_cgroup_cancel_attach(struct cgroup_taskset *tset)
 {
 	struct scx_sched *sch = scx_root;
@@ -3198,7 +3182,7 @@ void scx_cgroup_cancel_attach(struct cgroup_taskset *tset)
 	struct task_struct *p;
 
 	if (!scx_cgroup_enabled)
-		goto out_unlock;
+		return;
 
 	cgroup_taskset_for_each(p, css, tset) {
 		if (SCX_HAS_OP(sch, cgroup_cancel_move) &&
@@ -3207,15 +3191,13 @@ void scx_cgroup_cancel_attach(struct cgroup_taskset *tset)
 				    p, p->scx.cgrp_moving_from, css->cgroup);
 		p->scx.cgrp_moving_from = NULL;
 	}
-out_unlock:
-	percpu_up_read(&scx_cgroup_rwsem);
 }
 
 void scx_group_set_weight(struct task_group *tg, unsigned long weight)
 {
 	struct scx_sched *sch = scx_root;
 
-	percpu_down_read(&scx_cgroup_rwsem);
+	percpu_down_read(&scx_cgroup_ops_rwsem);
 
 	if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_weight) &&
 	    tg->scx.weight != weight)
@@ -3224,7 +3206,7 @@ void scx_group_set_weight(struct task_group *tg, unsigned long weight)
 
 	tg->scx.weight = weight;
 
-	percpu_up_read(&scx_cgroup_rwsem);
+	percpu_up_read(&scx_cgroup_ops_rwsem);
 }
 
 void scx_group_set_idle(struct task_group *tg, bool idle)
@@ -3237,7 +3219,7 @@ void scx_group_set_bandwidth(struct task_group *tg,
 {
 	struct scx_sched *sch = scx_root;
 
-	percpu_down_read(&scx_cgroup_rwsem);
+	percpu_down_read(&scx_cgroup_ops_rwsem);
 
 	if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_bandwidth) &&
 	    (tg->scx.bw_period_us != period_us ||
@@ -3250,23 +3232,25 @@ void scx_group_set_bandwidth(struct task_group *tg,
 	tg->scx.bw_quota_us = quota_us;
 	tg->scx.bw_burst_us = burst_us;
 
-	percpu_up_read(&scx_cgroup_rwsem);
+	percpu_up_read(&scx_cgroup_ops_rwsem);
 }
 
 static void scx_cgroup_lock(void)
 {
-	percpu_down_write(&scx_cgroup_rwsem);
+	percpu_down_write(&scx_cgroup_ops_rwsem);
+	cgroup_lock();
 }
 
 static void scx_cgroup_unlock(void)
 {
-	percpu_up_write(&scx_cgroup_rwsem);
+	cgroup_unlock();
+	percpu_up_write(&scx_cgroup_ops_rwsem);
 }
 
 #else	/* CONFIG_EXT_GROUP_SCHED */
 
-static inline void scx_cgroup_lock(void) {}
-static inline void scx_cgroup_unlock(void) {}
+static void scx_cgroup_lock(void) {}
+static void scx_cgroup_unlock(void) {}
 
 #endif	/* CONFIG_EXT_GROUP_SCHED */
 
@@ -3382,15 +3366,12 @@ static void scx_cgroup_exit(struct scx_sched *sch)
 {
 	struct cgroup_subsys_state *css;
 
-	percpu_rwsem_assert_held(&scx_cgroup_rwsem);
-
 	scx_cgroup_enabled = false;
 
 	/*
-	 * scx_tg_on/offline() are excluded through scx_cgroup_rwsem. If we walk
+	 * scx_tg_on/offline() are excluded through cgroup_lock(). If we walk
 	 * cgroups and exit all the inited ones, all online cgroups are exited.
 	 */
-	rcu_read_lock();
 	css_for_each_descendant_post(css, &root_task_group.css) {
 		struct task_group *tg = css_tg(css);
 
@@ -3401,17 +3382,9 @@ static void scx_cgroup_exit(struct scx_sched *sch)
 		if (!sch->ops.cgroup_exit)
 			continue;
 
-		if (WARN_ON_ONCE(!css_tryget(css)))
-			continue;
-		rcu_read_unlock();
-
 		SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_exit, NULL, css->cgroup);
-
-		rcu_read_lock();
-		css_put(css);
 	}
-	rcu_read_unlock();
 }
 
 static int scx_cgroup_init(struct scx_sched *sch)
@@ -3419,13 +3392,10 @@ static int scx_cgroup_init(struct scx_sched *sch)
 	struct cgroup_subsys_state *css;
 	int ret;
 
-	percpu_rwsem_assert_held(&scx_cgroup_rwsem);
-
 	/*
-	 * scx_tg_on/offline() are excluded through scx_cgroup_rwsem. If we walk
+	 * scx_tg_on/offline() are excluded through cgroup_lock(). If we walk
 	 * cgroups and init, all online cgroups are initialized.
 	 */
-	rcu_read_lock();
 	css_for_each_descendant_pre(css, &root_task_group.css) {
 		struct task_group *tg = css_tg(css);
 		struct scx_cgroup_init_args args = {
@@ -3444,10 +3414,6 @@ static int scx_cgroup_init(struct scx_sched *sch)
 			continue;
 		}
 
-		if (WARN_ON_ONCE(!css_tryget(css)))
-			continue;
-		rcu_read_unlock();
-
 		ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, cgroup_init, NULL,
 				      css->cgroup, &args);
 		if (ret) {
@@ -3456,11 +3422,7 @@ static int scx_cgroup_init(struct scx_sched *sch)
 			return ret;
 		}
 		tg->scx.flags |= SCX_TG_INITED;
-
-		rcu_read_lock();
-		css_put(css);
 	}
-	rcu_read_unlock();
 
 	WARN_ON_ONCE(scx_cgroup_enabled);
 	scx_cgroup_enabled = true;
diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h
index 33858607bc97f..43429b33e52c1 100644
--- a/kernel/sched/ext.h
+++ b/kernel/sched/ext.h
@@ -77,7 +77,6 @@ int scx_tg_online(struct task_group *tg);
 void scx_tg_offline(struct task_group *tg);
 int scx_cgroup_can_attach(struct cgroup_taskset *tset);
 void scx_cgroup_move_task(struct task_struct *p);
-void scx_cgroup_finish_attach(void);
 void scx_cgroup_cancel_attach(struct cgroup_taskset *tset);
 void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight);
 void scx_group_set_idle(struct task_group *tg, bool idle);
@@ -88,7 +87,6 @@ static inline int scx_tg_online(struct task_group *tg) { return 0; }
 static inline void scx_tg_offline(struct task_group *tg) {}
 static inline int scx_cgroup_can_attach(struct cgroup_taskset *tset) { return 0; }
 static inline void scx_cgroup_move_task(struct task_struct *p) {}
-static inline void scx_cgroup_finish_attach(void) {}
 static inline void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) {}
 static inline void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight) {}
 static inline void scx_group_set_idle(struct task_group *tg, bool idle) {}
-- 
2.47.3
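[Editor's note: for readers following the locking changes, the resulting
scheme can be summarized as below. This is a minimal sketch, not
additional kernel code: the two function bodies mirror the diff above,
while the comments are interpretive.]

/* The scheduler enable/disable paths take both locks, in this order. */
static void scx_cgroup_lock(void)
{
	/* excludes scx_group_set_weight/bandwidth(), which read-lock it */
	percpu_down_write(&scx_cgroup_ops_rwsem);
	/* excludes the tg on/offline and cgroup [can_]attach operations */
	cgroup_lock();
}

/* Release in the reverse order. */
static void scx_cgroup_unlock(void)
{
	cgroup_unlock();
	percpu_up_write(&scx_cgroup_ops_rwsem);
}

/*
 * scx_group_set_weight/bandwidth() are not called with cgroup_mutex
 * held, so they still bracket the call into the BPF scheduler with the
 * read side of the rwsem:
 *
 *	percpu_down_read(&scx_cgroup_ops_rwsem);
 *	...
 *	percpu_up_read(&scx_cgroup_ops_rwsem);
 *
 * The tg on/offline and [can_]attach paths, by contrast, need no
 * explicit locking anymore: they run with cgroup_mutex held, which the
 * enable/disable paths now also acquire via cgroup_lock().
 */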