From b099076305ba5d019a23eb66ec2be05f59330530 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Sun, 27 Aug 2023 10:34:40 +0200 Subject: [PATCH] 6.1-stable patches added patches: cgroup-cpuset-free-dl-bw-in-case-can_attach-fails.patch cgroup-cpuset-iterate-only-if-deadline-tasks-are-present.patch cgroup-cpuset-rename-functions-dealing-with-deadline-accounting.patch sched-cpuset-bring-back-cpuset_mutex.patch sched-cpuset-keep-track-of-sched_deadline-task-in-cpusets.patch sched-deadline-create-dl-bw-alloc-free-check-overflow-interface.patch --- ...-free-dl-bw-in-case-can_attach-fails.patch | 198 +++++ ...e-only-if-deadline-tasks-are-present.patch | 42 + ...ons-dealing-with-deadline-accounting.patch | 67 ++ ...sched-cpuset-bring-back-cpuset_mutex.patch | 722 ++++++++++++++++++ ...ck-of-sched_deadline-task-in-cpusets.patch | 161 ++++ ...-alloc-free-check-overflow-interface.patch | 166 ++++ queue-6.1/series | 6 + 7 files changed, 1362 insertions(+) create mode 100644 queue-6.1/cgroup-cpuset-free-dl-bw-in-case-can_attach-fails.patch create mode 100644 queue-6.1/cgroup-cpuset-iterate-only-if-deadline-tasks-are-present.patch create mode 100644 queue-6.1/cgroup-cpuset-rename-functions-dealing-with-deadline-accounting.patch create mode 100644 queue-6.1/sched-cpuset-bring-back-cpuset_mutex.patch create mode 100644 queue-6.1/sched-cpuset-keep-track-of-sched_deadline-task-in-cpusets.patch create mode 100644 queue-6.1/sched-deadline-create-dl-bw-alloc-free-check-overflow-interface.patch diff --git a/queue-6.1/cgroup-cpuset-free-dl-bw-in-case-can_attach-fails.patch b/queue-6.1/cgroup-cpuset-free-dl-bw-in-case-can_attach-fails.patch new file mode 100644 index 00000000000..4dd59ca22b3 --- /dev/null +++ b/queue-6.1/cgroup-cpuset-free-dl-bw-in-case-can_attach-fails.patch @@ -0,0 +1,198 @@ +From stable-owner@vger.kernel.org Sun Aug 20 17:25:08 2023 +From: Qais Yousef +Date: Sun, 20 Aug 2023 16:24:17 +0100 +Subject: cgroup/cpuset: Free DL BW in case can_attach() fails +To: stable@vger.kernel.org +Cc: Juri Lelli , Waiman Long , Tejun Heo , Dietmar Eggemann , Peter Zijlstra , Vincent Guittot , Ingo Molnar , Hao Luo , John Stultz , cgroups@vger.kernel.org, linux-kernel@vger.kernel.org, Qais Yousef +Message-ID: <20230820152417.518806-7-qyousef@layalina.io> + +From: Dietmar Eggemann + +commit 2ef269ef1ac006acf974793d975539244d77b28f upstream. + +cpuset_can_attach() can fail. Postpone DL BW allocation until all tasks +have been checked. DL BW is not allocated per-task but as a sum over +all DL tasks migrating. + +If multiple controllers are attached to the cgroup next to the cpuset +controller a non-cpuset can_attach() can fail. In this case free DL BW +in cpuset_cancel_attach(). + +Finally, update cpuset DL task count (nr_deadline_tasks) only in +cpuset_attach(). 
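The flow described above is essentially a reserve/rollback/commit protocol spread across the three cgroup migration callbacks. A rough, self-contained C sketch of that shape only (all toy_* names are invented for illustration, not the kernel API; the real code operates on struct cpuset and dl_bw_alloc()/dl_bw_free() exactly as in the hunks below):

struct toy_pool  { long cap, reserved; };
struct toy_group { int nr_dl_tasks; long pending_bw; };

/* can_attach(): check every task first, then reserve the sum once. */
static int toy_can_attach(struct toy_pool *pool, struct toy_group *dst,
			  const long *task_bw, int nr)
{
	long sum = 0;

	for (int i = 0; i < nr; i++)
		sum += task_bw[i];

	if (pool->reserved + sum > pool->cap)
		return -1;		/* overflow: nothing was reserved */

	pool->reserved += sum;		/* one reservation for the whole set */
	dst->pending_bw = sum;		/* remembered for rollback or commit */
	return 0;
}

/* cancel_attach(): another controller failed, hand the reservation back. */
static void toy_cancel_attach(struct toy_pool *pool, struct toy_group *dst)
{
	pool->reserved -= dst->pending_bw;
	dst->pending_bw = 0;
}

/* attach(): migration is now certain, only here do the counters move. */
static void toy_attach(struct toy_group *dst, struct toy_group *src, int nr_dl)
{
	dst->nr_dl_tasks += nr_dl;
	src->nr_dl_tasks -= nr_dl;
	dst->pending_bw = 0;
}

The property the patch relies on is that nothing irreversible happens before attach(): a later failure only ever has to undo a single summed reservation.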
+ +Suggested-by: Waiman Long +Signed-off-by: Dietmar Eggemann +Signed-off-by: Juri Lelli +Reviewed-by: Waiman Long +Signed-off-by: Tejun Heo +Signed-off-by: Qais Yousef (Google) +Signed-off-by: Greg Kroah-Hartman +--- + include/linux/sched.h | 2 - + kernel/cgroup/cpuset.c | 53 ++++++++++++++++++++++++++++++++++++++++++++----- + kernel/sched/core.c | 17 +-------------- + 3 files changed, 51 insertions(+), 21 deletions(-) + +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -1846,7 +1846,7 @@ current_restore_flags(unsigned long orig + } + + extern int cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial); +-extern int task_can_attach(struct task_struct *p, const struct cpumask *cs_effective_cpus); ++extern int task_can_attach(struct task_struct *p); + extern int dl_bw_alloc(int cpu, u64 dl_bw); + extern void dl_bw_free(int cpu, u64 dl_bw); + #ifdef CONFIG_SMP +--- a/kernel/cgroup/cpuset.c ++++ b/kernel/cgroup/cpuset.c +@@ -198,6 +198,8 @@ struct cpuset { + * know when to rebuild associated root domain bandwidth information. + */ + int nr_deadline_tasks; ++ int nr_migrate_dl_tasks; ++ u64 sum_migrate_dl_bw; + + /* Invalid partition error code, not lock protected */ + enum prs_errcode prs_err; +@@ -2491,16 +2493,23 @@ static int cpuset_can_attach_check(struc + return 0; + } + ++static void reset_migrate_dl_data(struct cpuset *cs) ++{ ++ cs->nr_migrate_dl_tasks = 0; ++ cs->sum_migrate_dl_bw = 0; ++} ++ + /* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */ + static int cpuset_can_attach(struct cgroup_taskset *tset) + { + struct cgroup_subsys_state *css; +- struct cpuset *cs; ++ struct cpuset *cs, *oldcs; + struct task_struct *task; + int ret; + + /* used later by cpuset_attach() */ + cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset, &css)); ++ oldcs = cpuset_attach_old_cs; + cs = css_cs(css); + + mutex_lock(&cpuset_mutex); +@@ -2511,7 +2520,7 @@ static int cpuset_can_attach(struct cgro + goto out_unlock; + + cgroup_taskset_for_each(task, css, tset) { +- ret = task_can_attach(task, cs->effective_cpus); ++ ret = task_can_attach(task); + if (ret) + goto out_unlock; + ret = security_task_setscheduler(task); +@@ -2519,11 +2528,31 @@ static int cpuset_can_attach(struct cgro + goto out_unlock; + + if (dl_task(task)) { +- cs->nr_deadline_tasks++; +- cpuset_attach_old_cs->nr_deadline_tasks--; ++ cs->nr_migrate_dl_tasks++; ++ cs->sum_migrate_dl_bw += task->dl.dl_bw; + } + } + ++ if (!cs->nr_migrate_dl_tasks) ++ goto out_success; ++ ++ if (!cpumask_intersects(oldcs->effective_cpus, cs->effective_cpus)) { ++ int cpu = cpumask_any_and(cpu_active_mask, cs->effective_cpus); ++ ++ if (unlikely(cpu >= nr_cpu_ids)) { ++ reset_migrate_dl_data(cs); ++ ret = -EINVAL; ++ goto out_unlock; ++ } ++ ++ ret = dl_bw_alloc(cpu, cs->sum_migrate_dl_bw); ++ if (ret) { ++ reset_migrate_dl_data(cs); ++ goto out_unlock; ++ } ++ } ++ ++out_success: + /* + * Mark attach is in progress. This makes validate_change() fail + * changes which zero cpus/mems_allowed. 
+@@ -2546,6 +2575,14 @@ static void cpuset_cancel_attach(struct + cs->attach_in_progress--; + if (!cs->attach_in_progress) + wake_up(&cpuset_attach_wq); ++ ++ if (cs->nr_migrate_dl_tasks) { ++ int cpu = cpumask_any(cs->effective_cpus); ++ ++ dl_bw_free(cpu, cs->sum_migrate_dl_bw); ++ reset_migrate_dl_data(cs); ++ } ++ + mutex_unlock(&cpuset_mutex); + } + +@@ -2623,6 +2660,12 @@ static void cpuset_attach(struct cgroup_ + + cs->old_mems_allowed = cpuset_attach_nodemask_to; + ++ if (cs->nr_migrate_dl_tasks) { ++ cs->nr_deadline_tasks += cs->nr_migrate_dl_tasks; ++ oldcs->nr_deadline_tasks -= cs->nr_migrate_dl_tasks; ++ reset_migrate_dl_data(cs); ++ } ++ + cs->attach_in_progress--; + if (!cs->attach_in_progress) + wake_up(&cpuset_attach_wq); +@@ -3298,7 +3341,7 @@ static int cpuset_can_fork(struct task_s + if (ret) + goto out_unlock; + +- ret = task_can_attach(task, cs->effective_cpus); ++ ret = task_can_attach(task); + if (ret) + goto out_unlock; + +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -9083,8 +9083,7 @@ int cpuset_cpumask_can_shrink(const stru + return ret; + } + +-int task_can_attach(struct task_struct *p, +- const struct cpumask *cs_effective_cpus) ++int task_can_attach(struct task_struct *p) + { + int ret = 0; + +@@ -9097,21 +9096,9 @@ int task_can_attach(struct task_struct * + * success of set_cpus_allowed_ptr() on all attached tasks + * before cpus_mask may be changed. + */ +- if (p->flags & PF_NO_SETAFFINITY) { ++ if (p->flags & PF_NO_SETAFFINITY) + ret = -EINVAL; +- goto out; +- } + +- if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span, +- cs_effective_cpus)) { +- int cpu = cpumask_any_and(cpu_active_mask, cs_effective_cpus); +- +- if (unlikely(cpu >= nr_cpu_ids)) +- return -EINVAL; +- ret = dl_bw_alloc(cpu, p->dl.dl_bw); +- } +- +-out: + return ret; + } + diff --git a/queue-6.1/cgroup-cpuset-iterate-only-if-deadline-tasks-are-present.patch b/queue-6.1/cgroup-cpuset-iterate-only-if-deadline-tasks-are-present.patch new file mode 100644 index 00000000000..8e377b5a0f2 --- /dev/null +++ b/queue-6.1/cgroup-cpuset-iterate-only-if-deadline-tasks-are-present.patch @@ -0,0 +1,42 @@ +From stable-owner@vger.kernel.org Sun Aug 20 17:25:09 2023 +From: Qais Yousef +Date: Sun, 20 Aug 2023 16:24:15 +0100 +Subject: cgroup/cpuset: Iterate only if DEADLINE tasks are present +To: stable@vger.kernel.org +Cc: Juri Lelli , Waiman Long , Tejun Heo , Dietmar Eggemann , Peter Zijlstra , Vincent Guittot , Ingo Molnar , Hao Luo , John Stultz , cgroups@vger.kernel.org, linux-kernel@vger.kernel.org, Qais Yousef +Message-ID: <20230820152417.518806-5-qyousef@layalina.io> + +From: Juri Lelli + +commit c0f78fd5edcf29b2822ac165f9248a6c165e8554 upstream. + +update_tasks_root_domain currently iterates over all tasks even if no +DEADLINE task is present on the cpuset/root domain for which bandwidth +accounting is being rebuilt. This has been reported to introduce 10+ ms +delays on suspend-resume operations. + +Skip the costly iteration for cpusets that don't contain DEADLINE tasks. 
+ +Reported-by: Qais Yousef (Google) +Link: https://lore.kernel.org/lkml/20230206221428.2125324-1-qyousef@layalina.io/ +Signed-off-by: Juri Lelli +Reviewed-by: Waiman Long +Signed-off-by: Tejun Heo +Signed-off-by: Qais Yousef (Google) +Signed-off-by: Greg Kroah-Hartman +--- + kernel/cgroup/cpuset.c | 3 +++ + 1 file changed, 3 insertions(+) + +--- a/kernel/cgroup/cpuset.c ++++ b/kernel/cgroup/cpuset.c +@@ -1092,6 +1092,9 @@ static void dl_update_tasks_root_domain( + struct css_task_iter it; + struct task_struct *task; + ++ if (cs->nr_deadline_tasks == 0) ++ return; ++ + css_task_iter_start(&cs->css, 0, &it); + + while ((task = css_task_iter_next(&it))) diff --git a/queue-6.1/cgroup-cpuset-rename-functions-dealing-with-deadline-accounting.patch b/queue-6.1/cgroup-cpuset-rename-functions-dealing-with-deadline-accounting.patch new file mode 100644 index 00000000000..e29a073aa0a --- /dev/null +++ b/queue-6.1/cgroup-cpuset-rename-functions-dealing-with-deadline-accounting.patch @@ -0,0 +1,67 @@ +From stable-owner@vger.kernel.org Sun Aug 20 17:25:03 2023 +From: Qais Yousef +Date: Sun, 20 Aug 2023 16:24:12 +0100 +Subject: cgroup/cpuset: Rename functions dealing with DEADLINE accounting +To: stable@vger.kernel.org +Cc: Juri Lelli , Waiman Long , Tejun Heo , Dietmar Eggemann , Peter Zijlstra , Vincent Guittot , Ingo Molnar , Hao Luo , John Stultz , cgroups@vger.kernel.org, linux-kernel@vger.kernel.org, Qais Yousef +Message-ID: <20230820152417.518806-2-qyousef@layalina.io> + +From: Juri Lelli + +commit ad3a557daf6915296a43ef97a3e9c48e076c9dd8 upstream. + +rebuild_root_domains() and update_tasks_root_domain() have neutral +names, but actually deal with DEADLINE bandwidth accounting. + +Rename them to use 'dl_' prefix so that intent is more clear. + +No functional change. 
+ +Suggested-by: Qais Yousef (Google) +Signed-off-by: Juri Lelli +Reviewed-by: Waiman Long +Signed-off-by: Tejun Heo +Signed-off-by: Qais Yousef (Google) +Signed-off-by: Greg Kroah-Hartman +--- + kernel/cgroup/cpuset.c | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +--- a/kernel/cgroup/cpuset.c ++++ b/kernel/cgroup/cpuset.c +@@ -1066,7 +1066,7 @@ done: + return ndoms; + } + +-static void update_tasks_root_domain(struct cpuset *cs) ++static void dl_update_tasks_root_domain(struct cpuset *cs) + { + struct css_task_iter it; + struct task_struct *task; +@@ -1079,7 +1079,7 @@ static void update_tasks_root_domain(str + css_task_iter_end(&it); + } + +-static void rebuild_root_domains(void) ++static void dl_rebuild_rd_accounting(void) + { + struct cpuset *cs = NULL; + struct cgroup_subsys_state *pos_css; +@@ -1107,7 +1107,7 @@ static void rebuild_root_domains(void) + + rcu_read_unlock(); + +- update_tasks_root_domain(cs); ++ dl_update_tasks_root_domain(cs); + + rcu_read_lock(); + css_put(&cs->css); +@@ -1121,7 +1121,7 @@ partition_and_rebuild_sched_domains(int + { + mutex_lock(&sched_domains_mutex); + partition_sched_domains_locked(ndoms_new, doms_new, dattr_new); +- rebuild_root_domains(); ++ dl_rebuild_rd_accounting(); + mutex_unlock(&sched_domains_mutex); + } + diff --git a/queue-6.1/sched-cpuset-bring-back-cpuset_mutex.patch b/queue-6.1/sched-cpuset-bring-back-cpuset_mutex.patch new file mode 100644 index 00000000000..6044e92b9d7 --- /dev/null +++ b/queue-6.1/sched-cpuset-bring-back-cpuset_mutex.patch @@ -0,0 +1,722 @@ +From stable-owner@vger.kernel.org Sun Aug 20 17:25:04 2023 +From: Qais Yousef +Date: Sun, 20 Aug 2023 16:24:13 +0100 +Subject: sched/cpuset: Bring back cpuset_mutex +To: stable@vger.kernel.org +Cc: Juri Lelli , Waiman Long , Tejun Heo , Dietmar Eggemann , Peter Zijlstra , Vincent Guittot , Ingo Molnar , Hao Luo , John Stultz , cgroups@vger.kernel.org, linux-kernel@vger.kernel.org, Qais Yousef +Message-ID: <20230820152417.518806-3-qyousef@layalina.io> + +From: Juri Lelli + +commit 111cd11bbc54850f24191c52ff217da88a5e639b upstream. + +Turns out percpu_cpuset_rwsem - commit 1243dc518c9d ("cgroup/cpuset: +Convert cpuset_mutex to percpu_rwsem") - wasn't such a brilliant idea, +as it has been reported to cause slowdowns in workloads that need to +change cpuset configuration frequently and it is also not implementing +priority inheritance (which causes troubles with realtime workloads). + +Convert percpu_cpuset_rwsem back to regular cpuset_mutex. Also grab it +only for SCHED_DEADLINE tasks (other policies don't care about stable +cpusets anyway). + +Signed-off-by: Juri Lelli +Reviewed-by: Waiman Long +Signed-off-by: Tejun Heo +[ Conflict in kernel/cgroup/cpuset.c due to pulling new code/comments. + Reject all new code. Remove BUG_ON() about rwsem that doesn't exist on + mainline. 
] +Signed-off-by: Qais Yousef (Google) +Signed-off-by: Greg Kroah-Hartman +--- + include/linux/cpuset.h | 8 +- + kernel/cgroup/cpuset.c | 161 ++++++++++++++++++++++++------------------------- + kernel/sched/core.c | 22 ++++-- + 3 files changed, 99 insertions(+), 92 deletions(-) + +--- a/include/linux/cpuset.h ++++ b/include/linux/cpuset.h +@@ -71,8 +71,8 @@ extern void cpuset_init_smp(void); + extern void cpuset_force_rebuild(void); + extern void cpuset_update_active_cpus(void); + extern void cpuset_wait_for_hotplug(void); +-extern void cpuset_read_lock(void); +-extern void cpuset_read_unlock(void); ++extern void cpuset_lock(void); ++extern void cpuset_unlock(void); + extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask); + extern bool cpuset_cpus_allowed_fallback(struct task_struct *p); + extern nodemask_t cpuset_mems_allowed(struct task_struct *p); +@@ -196,8 +196,8 @@ static inline void cpuset_update_active_ + + static inline void cpuset_wait_for_hotplug(void) { } + +-static inline void cpuset_read_lock(void) { } +-static inline void cpuset_read_unlock(void) { } ++static inline void cpuset_lock(void) { } ++static inline void cpuset_unlock(void) { } + + static inline void cpuset_cpus_allowed(struct task_struct *p, + struct cpumask *mask) +--- a/kernel/cgroup/cpuset.c ++++ b/kernel/cgroup/cpuset.c +@@ -366,22 +366,23 @@ static struct cpuset top_cpuset = { + if (is_cpuset_online(((des_cs) = css_cs((pos_css))))) + + /* +- * There are two global locks guarding cpuset structures - cpuset_rwsem and ++ * There are two global locks guarding cpuset structures - cpuset_mutex and + * callback_lock. We also require taking task_lock() when dereferencing a + * task's cpuset pointer. See "The task_lock() exception", at the end of this +- * comment. The cpuset code uses only cpuset_rwsem write lock. Other +- * kernel subsystems can use cpuset_read_lock()/cpuset_read_unlock() to +- * prevent change to cpuset structures. ++ * comment. The cpuset code uses only cpuset_mutex. Other kernel subsystems ++ * can use cpuset_lock()/cpuset_unlock() to prevent change to cpuset ++ * structures. Note that cpuset_mutex needs to be a mutex as it is used in ++ * paths that rely on priority inheritance (e.g. scheduler - on RT) for ++ * correctness. + * + * A task must hold both locks to modify cpusets. If a task holds +- * cpuset_rwsem, it blocks others wanting that rwsem, ensuring that it +- * is the only task able to also acquire callback_lock and be able to +- * modify cpusets. It can perform various checks on the cpuset structure +- * first, knowing nothing will change. It can also allocate memory while +- * just holding cpuset_rwsem. While it is performing these checks, various +- * callback routines can briefly acquire callback_lock to query cpusets. +- * Once it is ready to make the changes, it takes callback_lock, blocking +- * everyone else. ++ * cpuset_mutex, it blocks others, ensuring that it is the only task able to ++ * also acquire callback_lock and be able to modify cpusets. It can perform ++ * various checks on the cpuset structure first, knowing nothing will change. ++ * It can also allocate memory while just holding cpuset_mutex. While it is ++ * performing these checks, various callback routines can briefly acquire ++ * callback_lock to query cpusets. Once it is ready to make the changes, it ++ * takes callback_lock, blocking everyone else. 
+ * + * Calls to the kernel memory allocator can not be made while holding + * callback_lock, as that would risk double tripping on callback_lock +@@ -403,16 +404,16 @@ static struct cpuset top_cpuset = { + * guidelines for accessing subsystem state in kernel/cgroup.c + */ + +-DEFINE_STATIC_PERCPU_RWSEM(cpuset_rwsem); ++static DEFINE_MUTEX(cpuset_mutex); + +-void cpuset_read_lock(void) ++void cpuset_lock(void) + { +- percpu_down_read(&cpuset_rwsem); ++ mutex_lock(&cpuset_mutex); + } + +-void cpuset_read_unlock(void) ++void cpuset_unlock(void) + { +- percpu_up_read(&cpuset_rwsem); ++ mutex_unlock(&cpuset_mutex); + } + + static DEFINE_SPINLOCK(callback_lock); +@@ -496,7 +497,7 @@ static inline bool partition_is_populate + * One way or another, we guarantee to return some non-empty subset + * of cpu_online_mask. + * +- * Call with callback_lock or cpuset_rwsem held. ++ * Call with callback_lock or cpuset_mutex held. + */ + static void guarantee_online_cpus(struct task_struct *tsk, + struct cpumask *pmask) +@@ -538,7 +539,7 @@ out_unlock: + * One way or another, we guarantee to return some non-empty subset + * of node_states[N_MEMORY]. + * +- * Call with callback_lock or cpuset_rwsem held. ++ * Call with callback_lock or cpuset_mutex held. + */ + static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask) + { +@@ -550,7 +551,7 @@ static void guarantee_online_mems(struct + /* + * update task's spread flag if cpuset's page/slab spread flag is set + * +- * Call with callback_lock or cpuset_rwsem held. The check can be skipped ++ * Call with callback_lock or cpuset_mutex held. The check can be skipped + * if on default hierarchy. + */ + static void cpuset_update_task_spread_flags(struct cpuset *cs, +@@ -575,7 +576,7 @@ static void cpuset_update_task_spread_fl + * + * One cpuset is a subset of another if all its allowed CPUs and + * Memory Nodes are a subset of the other, and its exclusive flags +- * are only set if the other's are set. Call holding cpuset_rwsem. ++ * are only set if the other's are set. Call holding cpuset_mutex. + */ + + static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) +@@ -713,7 +714,7 @@ out: + * If we replaced the flag and mask values of the current cpuset + * (cur) with those values in the trial cpuset (trial), would + * our various subset and exclusive rules still be valid? Presumes +- * cpuset_rwsem held. ++ * cpuset_mutex held. + * + * 'cur' is the address of an actual, in-use cpuset. Operations + * such as list traversal that depend on the actual address of the +@@ -829,7 +830,7 @@ static void update_domain_attr_tree(stru + rcu_read_unlock(); + } + +-/* Must be called with cpuset_rwsem held. */ ++/* Must be called with cpuset_mutex held. */ + static inline int nr_cpusets(void) + { + /* jump label reference count + the top-level cpuset */ +@@ -855,7 +856,7 @@ static inline int nr_cpusets(void) + * domains when operating in the severe memory shortage situations + * that could cause allocation failures below. + * +- * Must be called with cpuset_rwsem held. ++ * Must be called with cpuset_mutex held. 
+ * + * The three key local variables below are: + * cp - cpuset pointer, used (together with pos_css) to perform a +@@ -1084,7 +1085,7 @@ static void dl_rebuild_rd_accounting(voi + struct cpuset *cs = NULL; + struct cgroup_subsys_state *pos_css; + +- percpu_rwsem_assert_held(&cpuset_rwsem); ++ lockdep_assert_held(&cpuset_mutex); + lockdep_assert_cpus_held(); + lockdep_assert_held(&sched_domains_mutex); + +@@ -1134,7 +1135,7 @@ partition_and_rebuild_sched_domains(int + * 'cpus' is removed, then call this routine to rebuild the + * scheduler's dynamic sched domains. + * +- * Call with cpuset_rwsem held. Takes cpus_read_lock(). ++ * Call with cpuset_mutex held. Takes cpus_read_lock(). + */ + static void rebuild_sched_domains_locked(void) + { +@@ -1145,7 +1146,7 @@ static void rebuild_sched_domains_locked + int ndoms; + + lockdep_assert_cpus_held(); +- percpu_rwsem_assert_held(&cpuset_rwsem); ++ lockdep_assert_held(&cpuset_mutex); + + /* + * If we have raced with CPU hotplug, return early to avoid +@@ -1196,9 +1197,9 @@ static void rebuild_sched_domains_locked + void rebuild_sched_domains(void) + { + cpus_read_lock(); +- percpu_down_write(&cpuset_rwsem); ++ mutex_lock(&cpuset_mutex); + rebuild_sched_domains_locked(); +- percpu_up_write(&cpuset_rwsem); ++ mutex_unlock(&cpuset_mutex); + cpus_read_unlock(); + } + +@@ -1208,7 +1209,7 @@ void rebuild_sched_domains(void) + * @new_cpus: the temp variable for the new effective_cpus mask + * + * Iterate through each task of @cs updating its cpus_allowed to the +- * effective cpuset's. As this function is called with cpuset_rwsem held, ++ * effective cpuset's. As this function is called with cpuset_mutex held, + * cpuset membership stays stable. + */ + static void update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus) +@@ -1317,7 +1318,7 @@ static int update_parent_subparts_cpumas + int old_prs, new_prs; + int part_error = PERR_NONE; /* Partition error? */ + +- percpu_rwsem_assert_held(&cpuset_rwsem); ++ lockdep_assert_held(&cpuset_mutex); + + /* + * The parent must be a partition root. +@@ -1540,7 +1541,7 @@ static int update_parent_subparts_cpumas + * + * On legacy hierarchy, effective_cpus will be the same with cpu_allowed. + * +- * Called with cpuset_rwsem held ++ * Called with cpuset_mutex held + */ + static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp, + bool force) +@@ -1700,7 +1701,7 @@ static void update_sibling_cpumasks(stru + struct cpuset *sibling; + struct cgroup_subsys_state *pos_css; + +- percpu_rwsem_assert_held(&cpuset_rwsem); ++ lockdep_assert_held(&cpuset_mutex); + + /* + * Check all its siblings and call update_cpumasks_hier() +@@ -1950,12 +1951,12 @@ static void *cpuset_being_rebound; + * @cs: the cpuset in which each task's mems_allowed mask needs to be changed + * + * Iterate through each task of @cs updating its mems_allowed to the +- * effective cpuset's. As this function is called with cpuset_rwsem held, ++ * effective cpuset's. As this function is called with cpuset_mutex held, + * cpuset membership stays stable. + */ + static void update_tasks_nodemask(struct cpuset *cs) + { +- static nodemask_t newmems; /* protected by cpuset_rwsem */ ++ static nodemask_t newmems; /* protected by cpuset_mutex */ + struct css_task_iter it; + struct task_struct *task; + +@@ -1968,7 +1969,7 @@ static void update_tasks_nodemask(struct + * take while holding tasklist_lock. Forks can happen - the + * mpol_dup() cpuset_being_rebound check will catch such forks, + * and rebind their vma mempolicies too. 
Because we still hold +- * the global cpuset_rwsem, we know that no other rebind effort ++ * the global cpuset_mutex, we know that no other rebind effort + * will be contending for the global variable cpuset_being_rebound. + * It's ok if we rebind the same mm twice; mpol_rebind_mm() + * is idempotent. Also migrate pages in each mm to new nodes. +@@ -2014,7 +2015,7 @@ static void update_tasks_nodemask(struct + * + * On legacy hierarchy, effective_mems will be the same with mems_allowed. + * +- * Called with cpuset_rwsem held ++ * Called with cpuset_mutex held + */ + static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems) + { +@@ -2067,7 +2068,7 @@ static void update_nodemasks_hier(struct + * mempolicies and if the cpuset is marked 'memory_migrate', + * migrate the tasks pages to the new memory. + * +- * Call with cpuset_rwsem held. May take callback_lock during call. ++ * Call with cpuset_mutex held. May take callback_lock during call. + * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, + * lock each such tasks mm->mmap_lock, scan its vma's and rebind + * their mempolicies to the cpusets new mems_allowed. +@@ -2159,7 +2160,7 @@ static int update_relax_domain_level(str + * @cs: the cpuset in which each task's spread flags needs to be changed + * + * Iterate through each task of @cs updating its spread flags. As this +- * function is called with cpuset_rwsem held, cpuset membership stays ++ * function is called with cpuset_mutex held, cpuset membership stays + * stable. + */ + static void update_tasks_flags(struct cpuset *cs) +@@ -2179,7 +2180,7 @@ static void update_tasks_flags(struct cp + * cs: the cpuset to update + * turning_on: whether the flag is being set or cleared + * +- * Call with cpuset_rwsem held. ++ * Call with cpuset_mutex held. + */ + + static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, +@@ -2229,7 +2230,7 @@ out: + * @new_prs: new partition root state + * Return: 0 if successful, != 0 if error + * +- * Call with cpuset_rwsem held. ++ * Call with cpuset_mutex held. + */ + static int update_prstate(struct cpuset *cs, int new_prs) + { +@@ -2467,7 +2468,7 @@ static int cpuset_can_attach_check(struc + return 0; + } + +-/* Called by cgroups to determine if a cpuset is usable; cpuset_rwsem held */ ++/* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */ + static int cpuset_can_attach(struct cgroup_taskset *tset) + { + struct cgroup_subsys_state *css; +@@ -2479,7 +2480,7 @@ static int cpuset_can_attach(struct cgro + cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset, &css)); + cs = css_cs(css); + +- percpu_down_write(&cpuset_rwsem); ++ mutex_lock(&cpuset_mutex); + + /* Check to see if task is allowed in the cpuset */ + ret = cpuset_can_attach_check(cs); +@@ -2501,7 +2502,7 @@ static int cpuset_can_attach(struct cgro + */ + cs->attach_in_progress++; + out_unlock: +- percpu_up_write(&cpuset_rwsem); ++ mutex_unlock(&cpuset_mutex); + return ret; + } + +@@ -2513,15 +2514,15 @@ static void cpuset_cancel_attach(struct + cgroup_taskset_first(tset, &css); + cs = css_cs(css); + +- percpu_down_write(&cpuset_rwsem); ++ mutex_lock(&cpuset_mutex); + cs->attach_in_progress--; + if (!cs->attach_in_progress) + wake_up(&cpuset_attach_wq); +- percpu_up_write(&cpuset_rwsem); ++ mutex_unlock(&cpuset_mutex); + } + + /* +- * Protected by cpuset_rwsem. cpus_attach is used only by cpuset_attach_task() ++ * Protected by cpuset_mutex. cpus_attach is used only by cpuset_attach_task() + * but we can't allocate it dynamically there. 
Define it global and + * allocate from cpuset_init(). + */ +@@ -2530,7 +2531,7 @@ static nodemask_t cpuset_attach_nodemask + + static void cpuset_attach_task(struct cpuset *cs, struct task_struct *task) + { +- percpu_rwsem_assert_held(&cpuset_rwsem); ++ lockdep_assert_held(&cpuset_mutex); + + if (cs != &top_cpuset) + guarantee_online_cpus(task, cpus_attach); +@@ -2558,7 +2559,7 @@ static void cpuset_attach(struct cgroup_ + cs = css_cs(css); + + lockdep_assert_cpus_held(); /* see cgroup_attach_lock() */ +- percpu_down_write(&cpuset_rwsem); ++ mutex_lock(&cpuset_mutex); + + guarantee_online_mems(cs, &cpuset_attach_nodemask_to); + +@@ -2598,7 +2599,7 @@ static void cpuset_attach(struct cgroup_ + if (!cs->attach_in_progress) + wake_up(&cpuset_attach_wq); + +- percpu_up_write(&cpuset_rwsem); ++ mutex_unlock(&cpuset_mutex); + } + + /* The various types of files and directories in a cpuset file system */ +@@ -2630,7 +2631,7 @@ static int cpuset_write_u64(struct cgrou + int retval = 0; + + cpus_read_lock(); +- percpu_down_write(&cpuset_rwsem); ++ mutex_lock(&cpuset_mutex); + if (!is_cpuset_online(cs)) { + retval = -ENODEV; + goto out_unlock; +@@ -2666,7 +2667,7 @@ static int cpuset_write_u64(struct cgrou + break; + } + out_unlock: +- percpu_up_write(&cpuset_rwsem); ++ mutex_unlock(&cpuset_mutex); + cpus_read_unlock(); + return retval; + } +@@ -2679,7 +2680,7 @@ static int cpuset_write_s64(struct cgrou + int retval = -ENODEV; + + cpus_read_lock(); +- percpu_down_write(&cpuset_rwsem); ++ mutex_lock(&cpuset_mutex); + if (!is_cpuset_online(cs)) + goto out_unlock; + +@@ -2692,7 +2693,7 @@ static int cpuset_write_s64(struct cgrou + break; + } + out_unlock: +- percpu_up_write(&cpuset_rwsem); ++ mutex_unlock(&cpuset_mutex); + cpus_read_unlock(); + return retval; + } +@@ -2725,7 +2726,7 @@ static ssize_t cpuset_write_resmask(stru + * operation like this one can lead to a deadlock through kernfs + * active_ref protection. Let's break the protection. Losing the + * protection is okay as we check whether @cs is online after +- * grabbing cpuset_rwsem anyway. This only happens on the legacy ++ * grabbing cpuset_mutex anyway. This only happens on the legacy + * hierarchies. 
+ */ + css_get(&cs->css); +@@ -2733,7 +2734,7 @@ static ssize_t cpuset_write_resmask(stru + flush_work(&cpuset_hotplug_work); + + cpus_read_lock(); +- percpu_down_write(&cpuset_rwsem); ++ mutex_lock(&cpuset_mutex); + if (!is_cpuset_online(cs)) + goto out_unlock; + +@@ -2757,7 +2758,7 @@ static ssize_t cpuset_write_resmask(stru + + free_cpuset(trialcs); + out_unlock: +- percpu_up_write(&cpuset_rwsem); ++ mutex_unlock(&cpuset_mutex); + cpus_read_unlock(); + kernfs_unbreak_active_protection(of->kn); + css_put(&cs->css); +@@ -2905,13 +2906,13 @@ static ssize_t sched_partition_write(str + + css_get(&cs->css); + cpus_read_lock(); +- percpu_down_write(&cpuset_rwsem); ++ mutex_lock(&cpuset_mutex); + if (!is_cpuset_online(cs)) + goto out_unlock; + + retval = update_prstate(cs, val); + out_unlock: +- percpu_up_write(&cpuset_rwsem); ++ mutex_unlock(&cpuset_mutex); + cpus_read_unlock(); + css_put(&cs->css); + return retval ?: nbytes; +@@ -3124,7 +3125,7 @@ static int cpuset_css_online(struct cgro + return 0; + + cpus_read_lock(); +- percpu_down_write(&cpuset_rwsem); ++ mutex_lock(&cpuset_mutex); + + set_bit(CS_ONLINE, &cs->flags); + if (is_spread_page(parent)) +@@ -3175,7 +3176,7 @@ static int cpuset_css_online(struct cgro + cpumask_copy(cs->effective_cpus, parent->cpus_allowed); + spin_unlock_irq(&callback_lock); + out_unlock: +- percpu_up_write(&cpuset_rwsem); ++ mutex_unlock(&cpuset_mutex); + cpus_read_unlock(); + return 0; + } +@@ -3196,7 +3197,7 @@ static void cpuset_css_offline(struct cg + struct cpuset *cs = css_cs(css); + + cpus_read_lock(); +- percpu_down_write(&cpuset_rwsem); ++ mutex_lock(&cpuset_mutex); + + if (is_partition_valid(cs)) + update_prstate(cs, 0); +@@ -3215,7 +3216,7 @@ static void cpuset_css_offline(struct cg + cpuset_dec(); + clear_bit(CS_ONLINE, &cs->flags); + +- percpu_up_write(&cpuset_rwsem); ++ mutex_unlock(&cpuset_mutex); + cpus_read_unlock(); + } + +@@ -3228,7 +3229,7 @@ static void cpuset_css_free(struct cgrou + + static void cpuset_bind(struct cgroup_subsys_state *root_css) + { +- percpu_down_write(&cpuset_rwsem); ++ mutex_lock(&cpuset_mutex); + spin_lock_irq(&callback_lock); + + if (is_in_v2_mode()) { +@@ -3241,7 +3242,7 @@ static void cpuset_bind(struct cgroup_su + } + + spin_unlock_irq(&callback_lock); +- percpu_up_write(&cpuset_rwsem); ++ mutex_unlock(&cpuset_mutex); + } + + /* +@@ -3262,7 +3263,7 @@ static int cpuset_can_fork(struct task_s + return 0; + + lockdep_assert_held(&cgroup_mutex); +- percpu_down_write(&cpuset_rwsem); ++ mutex_lock(&cpuset_mutex); + + /* Check to see if task is allowed in the cpuset */ + ret = cpuset_can_attach_check(cs); +@@ -3283,7 +3284,7 @@ static int cpuset_can_fork(struct task_s + */ + cs->attach_in_progress++; + out_unlock: +- percpu_up_write(&cpuset_rwsem); ++ mutex_unlock(&cpuset_mutex); + return ret; + } + +@@ -3299,11 +3300,11 @@ static void cpuset_cancel_fork(struct ta + if (same_cs) + return; + +- percpu_down_write(&cpuset_rwsem); ++ mutex_lock(&cpuset_mutex); + cs->attach_in_progress--; + if (!cs->attach_in_progress) + wake_up(&cpuset_attach_wq); +- percpu_up_write(&cpuset_rwsem); ++ mutex_unlock(&cpuset_mutex); + } + + /* +@@ -3331,7 +3332,7 @@ static void cpuset_fork(struct task_stru + } + + /* CLONE_INTO_CGROUP */ +- percpu_down_write(&cpuset_rwsem); ++ mutex_lock(&cpuset_mutex); + guarantee_online_mems(cs, &cpuset_attach_nodemask_to); + cpuset_attach_task(cs, task); + +@@ -3339,7 +3340,7 @@ static void cpuset_fork(struct task_stru + if (!cs->attach_in_progress) + wake_up(&cpuset_attach_wq); + +- 
percpu_up_write(&cpuset_rwsem); ++ mutex_unlock(&cpuset_mutex); + } + + struct cgroup_subsys cpuset_cgrp_subsys = { +@@ -3369,8 +3370,6 @@ struct cgroup_subsys cpuset_cgrp_subsys + + int __init cpuset_init(void) + { +- BUG_ON(percpu_init_rwsem(&cpuset_rwsem)); +- + BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL)); + BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL)); + BUG_ON(!zalloc_cpumask_var(&top_cpuset.subparts_cpus, GFP_KERNEL)); +@@ -3442,7 +3441,7 @@ hotplug_update_tasks_legacy(struct cpuse + is_empty = cpumask_empty(cs->cpus_allowed) || + nodes_empty(cs->mems_allowed); + +- percpu_up_write(&cpuset_rwsem); ++ mutex_unlock(&cpuset_mutex); + + /* + * Move tasks to the nearest ancestor with execution resources, +@@ -3452,7 +3451,7 @@ hotplug_update_tasks_legacy(struct cpuse + if (is_empty) + remove_tasks_in_empty_cpuset(cs); + +- percpu_down_write(&cpuset_rwsem); ++ mutex_lock(&cpuset_mutex); + } + + static void +@@ -3503,14 +3502,14 @@ static void cpuset_hotplug_update_tasks( + retry: + wait_event(cpuset_attach_wq, cs->attach_in_progress == 0); + +- percpu_down_write(&cpuset_rwsem); ++ mutex_lock(&cpuset_mutex); + + /* + * We have raced with task attaching. We wait until attaching + * is finished, so we won't attach a task to an empty cpuset. + */ + if (cs->attach_in_progress) { +- percpu_up_write(&cpuset_rwsem); ++ mutex_unlock(&cpuset_mutex); + goto retry; + } + +@@ -3604,7 +3603,7 @@ update_tasks: + hotplug_update_tasks_legacy(cs, &new_cpus, &new_mems, + cpus_updated, mems_updated); + +- percpu_up_write(&cpuset_rwsem); ++ mutex_unlock(&cpuset_mutex); + } + + /** +@@ -3634,7 +3633,7 @@ static void cpuset_hotplug_workfn(struct + if (on_dfl && !alloc_cpumasks(NULL, &tmp)) + ptmp = &tmp; + +- percpu_down_write(&cpuset_rwsem); ++ mutex_lock(&cpuset_mutex); + + /* fetch the available cpus/mems and find out which changed how */ + cpumask_copy(&new_cpus, cpu_active_mask); +@@ -3691,7 +3690,7 @@ static void cpuset_hotplug_workfn(struct + update_tasks_nodemask(&top_cpuset); + } + +- percpu_up_write(&cpuset_rwsem); ++ mutex_unlock(&cpuset_mutex); + + /* if cpus or mems changed, we need to propagate to descendants */ + if (cpus_updated || mems_updated) { +@@ -4101,7 +4100,7 @@ void __cpuset_memory_pressure_bump(void) + * - Used for /proc//cpuset. + * - No need to task_lock(tsk) on this tsk->cpuset reference, as it + * doesn't really matter if tsk->cpuset changes after we read it, +- * and we take cpuset_rwsem, keeping cpuset_attach() from changing it ++ * and we take cpuset_mutex, keeping cpuset_attach() from changing it + * anyway. + */ + int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns, +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -7475,6 +7475,7 @@ static int __sched_setscheduler(struct t + int reset_on_fork; + int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; + struct rq *rq; ++ bool cpuset_locked = false; + + /* The pi code expects interrupts enabled */ + BUG_ON(pi && in_interrupt()); +@@ -7524,8 +7525,14 @@ recheck: + return retval; + } + +- if (pi) +- cpuset_read_lock(); ++ /* ++ * SCHED_DEADLINE bandwidth accounting relies on stable cpusets ++ * information. 
++ */ ++ if (dl_policy(policy) || dl_policy(p->policy)) { ++ cpuset_locked = true; ++ cpuset_lock(); ++ } + + /* + * Make sure no PI-waiters arrive (or leave) while we are +@@ -7601,8 +7608,8 @@ change: + if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { + policy = oldpolicy = -1; + task_rq_unlock(rq, p, &rf); +- if (pi) +- cpuset_read_unlock(); ++ if (cpuset_locked) ++ cpuset_unlock(); + goto recheck; + } + +@@ -7669,7 +7676,8 @@ change: + task_rq_unlock(rq, p, &rf); + + if (pi) { +- cpuset_read_unlock(); ++ if (cpuset_locked) ++ cpuset_unlock(); + rt_mutex_adjust_pi(p); + } + +@@ -7681,8 +7689,8 @@ change: + + unlock: + task_rq_unlock(rq, p, &rf); +- if (pi) +- cpuset_read_unlock(); ++ if (cpuset_locked) ++ cpuset_unlock(); + return retval; + } + diff --git a/queue-6.1/sched-cpuset-keep-track-of-sched_deadline-task-in-cpusets.patch b/queue-6.1/sched-cpuset-keep-track-of-sched_deadline-task-in-cpusets.patch new file mode 100644 index 00000000000..f75529bdc66 --- /dev/null +++ b/queue-6.1/sched-cpuset-keep-track-of-sched_deadline-task-in-cpusets.patch @@ -0,0 +1,161 @@ +From stable-owner@vger.kernel.org Sun Aug 20 17:25:09 2023 +From: Qais Yousef +Date: Sun, 20 Aug 2023 16:24:14 +0100 +Subject: sched/cpuset: Keep track of SCHED_DEADLINE task in cpusets +To: stable@vger.kernel.org +Cc: Juri Lelli , Waiman Long , Tejun Heo , Dietmar Eggemann , Peter Zijlstra , Vincent Guittot , Ingo Molnar , Hao Luo , John Stultz , cgroups@vger.kernel.org, linux-kernel@vger.kernel.org, Qais Yousef +Message-ID: <20230820152417.518806-4-qyousef@layalina.io> + +From: Juri Lelli + +commit 6c24849f5515e4966d94fa5279bdff4acf2e9489 upstream. + +Qais reported that iterating over all tasks when rebuilding root domains +for finding out which ones are DEADLINE and need their bandwidth +correctly restored on such root domains can be a costly operation (10+ +ms delays on suspend-resume). + +To fix the problem keep track of the number of DEADLINE tasks belonging +to each cpuset and then use this information (followup patch) to only +perform the above iteration if DEADLINE tasks are actually present in +the cpuset for which a corresponding root domain is being rebuilt. 
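For readers less familiar with SCHED_DEADLINE, the tasks counted by nr_deadline_tasks are ones admitted via sched_setattr(2) with a runtime/deadline/period reservation; a task's bandwidth (p->dl.dl_bw) is essentially runtime/period. A minimal user-space sketch, for background only and not part of this series (the numbers are arbitrary, and it needs root or CAP_SYS_NICE):

#define _GNU_SOURCE
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

#ifndef SCHED_DEADLINE
#define SCHED_DEADLINE 6		/* from the sched uapi headers */
#endif

/* Layout as documented in sched_setattr(2); util clamp fields omitted. */
struct sched_attr {
	uint32_t size;
	uint32_t sched_policy;
	uint64_t sched_flags;
	int32_t  sched_nice;
	uint32_t sched_priority;
	uint64_t sched_runtime;
	uint64_t sched_deadline;
	uint64_t sched_period;
};

int main(void)
{
	struct sched_attr attr = {
		.size		= sizeof(attr),
		.sched_policy	= SCHED_DEADLINE,
		.sched_runtime	= 10 * 1000 * 1000,	/* 10ms of CPU time...   */
		.sched_deadline	= 100 * 1000 * 1000,	/* ...within 100ms...    */
		.sched_period	= 100 * 1000 * 1000,	/* ...every 100ms period */
	};

	if (syscall(SYS_sched_setattr, 0, &attr, 0))	/* pid 0 == self */
		perror("sched_setattr");
	else
		pause();	/* stay DEADLINE so its cpuset keeps accounting it */
	return 0;
}

With the hunks below, such a task switching into or out of SCHED_DEADLINE, being attached to another cpuset, or exiting keeps nr_deadline_tasks of the cpuset it belongs to in sync.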
+ +Reported-by: Qais Yousef (Google) +Link: https://lore.kernel.org/lkml/20230206221428.2125324-1-qyousef@layalina.io/ +Signed-off-by: Juri Lelli +Reviewed-by: Waiman Long +Signed-off-by: Tejun Heo +Signed-off-by: Qais Yousef (Google) +Signed-off-by: Greg Kroah-Hartman +--- + include/linux/cpuset.h | 4 ++++ + kernel/cgroup/cgroup.c | 4 ++++ + kernel/cgroup/cpuset.c | 25 +++++++++++++++++++++++++ + kernel/sched/deadline.c | 14 ++++++++++++++ + 4 files changed, 47 insertions(+) + +--- a/include/linux/cpuset.h ++++ b/include/linux/cpuset.h +@@ -71,6 +71,8 @@ extern void cpuset_init_smp(void); + extern void cpuset_force_rebuild(void); + extern void cpuset_update_active_cpus(void); + extern void cpuset_wait_for_hotplug(void); ++extern void inc_dl_tasks_cs(struct task_struct *task); ++extern void dec_dl_tasks_cs(struct task_struct *task); + extern void cpuset_lock(void); + extern void cpuset_unlock(void); + extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask); +@@ -196,6 +198,8 @@ static inline void cpuset_update_active_ + + static inline void cpuset_wait_for_hotplug(void) { } + ++static inline void inc_dl_tasks_cs(struct task_struct *task) { } ++static inline void dec_dl_tasks_cs(struct task_struct *task) { } + static inline void cpuset_lock(void) { } + static inline void cpuset_unlock(void) { } + +--- a/kernel/cgroup/cgroup.c ++++ b/kernel/cgroup/cgroup.c +@@ -57,6 +57,7 @@ + #include + #include + #include ++#include + #include + #include + +@@ -6681,6 +6682,9 @@ void cgroup_exit(struct task_struct *tsk + list_add_tail(&tsk->cg_list, &cset->dying_tasks); + cset->nr_tasks--; + ++ if (dl_task(tsk)) ++ dec_dl_tasks_cs(tsk); ++ + WARN_ON_ONCE(cgroup_task_frozen(tsk)); + if (unlikely(!(tsk->flags & PF_KTHREAD) && + test_bit(CGRP_FREEZE, &task_dfl_cgroup(tsk)->flags))) +--- a/kernel/cgroup/cpuset.c ++++ b/kernel/cgroup/cpuset.c +@@ -193,6 +193,12 @@ struct cpuset { + int use_parent_ecpus; + int child_ecpus_count; + ++ /* ++ * number of SCHED_DEADLINE tasks attached to this cpuset, so that we ++ * know when to rebuild associated root domain bandwidth information. ++ */ ++ int nr_deadline_tasks; ++ + /* Invalid partition error code, not lock protected */ + enum prs_errcode prs_err; + +@@ -245,6 +251,20 @@ static inline struct cpuset *parent_cs(s + return css_cs(cs->css.parent); + } + ++void inc_dl_tasks_cs(struct task_struct *p) ++{ ++ struct cpuset *cs = task_cs(p); ++ ++ cs->nr_deadline_tasks++; ++} ++ ++void dec_dl_tasks_cs(struct task_struct *p) ++{ ++ struct cpuset *cs = task_cs(p); ++ ++ cs->nr_deadline_tasks--; ++} ++ + /* bits in struct cpuset flags field */ + typedef enum { + CS_ONLINE, +@@ -2494,6 +2514,11 @@ static int cpuset_can_attach(struct cgro + ret = security_task_setscheduler(task); + if (ret) + goto out_unlock; ++ ++ if (dl_task(task)) { ++ cs->nr_deadline_tasks++; ++ cpuset_attach_old_cs->nr_deadline_tasks--; ++ } + } + + /* +--- a/kernel/sched/deadline.c ++++ b/kernel/sched/deadline.c +@@ -16,6 +16,8 @@ + * Fabio Checconi + */ + ++#include ++ + /* + * Default limits for DL period; on the top end we guard against small util + * tasks still getting ridiculously long effective runtimes, on the bottom end we +@@ -2597,6 +2599,12 @@ static void switched_from_dl(struct rq * + if (task_on_rq_queued(p) && p->dl.dl_runtime) + task_non_contending(p); + ++ /* ++ * In case a task is setscheduled out from SCHED_DEADLINE we need to ++ * keep track of that on its cpuset (for correct bandwidth tracking). 
++ */ ++ dec_dl_tasks_cs(p); ++ + if (!task_on_rq_queued(p)) { + /* + * Inactive timer is armed. However, p is leaving DEADLINE and +@@ -2637,6 +2645,12 @@ static void switched_to_dl(struct rq *rq + if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1) + put_task_struct(p); + ++ /* ++ * In case a task is setscheduled to SCHED_DEADLINE we need to keep ++ * track of that on its cpuset (for correct bandwidth tracking). ++ */ ++ inc_dl_tasks_cs(p); ++ + /* If p is not queued we will update its parameters at next wakeup. */ + if (!task_on_rq_queued(p)) { + add_rq_bw(&p->dl, &rq->dl); diff --git a/queue-6.1/sched-deadline-create-dl-bw-alloc-free-check-overflow-interface.patch b/queue-6.1/sched-deadline-create-dl-bw-alloc-free-check-overflow-interface.patch new file mode 100644 index 00000000000..69c708f85a4 --- /dev/null +++ b/queue-6.1/sched-deadline-create-dl-bw-alloc-free-check-overflow-interface.patch @@ -0,0 +1,166 @@ +From stable-owner@vger.kernel.org Sun Aug 20 17:25:09 2023 +From: Qais Yousef +Date: Sun, 20 Aug 2023 16:24:16 +0100 +Subject: sched/deadline: Create DL BW alloc, free & check overflow interface +To: stable@vger.kernel.org +Cc: Juri Lelli , Waiman Long , Tejun Heo , Dietmar Eggemann , Peter Zijlstra , Vincent Guittot , Ingo Molnar , Hao Luo , John Stultz , cgroups@vger.kernel.org, linux-kernel@vger.kernel.org, Qais Yousef +Message-ID: <20230820152417.518806-6-qyousef@layalina.io> + +From: Dietmar Eggemann + +commit 85989106feb734437e2d598b639991b9185a43a6 upstream. + +While moving a set of tasks between exclusive cpusets, +cpuset_can_attach() -> task_can_attach() calls dl_cpu_busy(..., p) for +DL BW overflow checking and per-task DL BW allocation on the destination +root_domain for the DL tasks in this set. + +This approach has the issue of not freeing already allocated DL BW in +the following error cases: + +(1) The set of tasks includes multiple DL tasks and DL BW overflow + checking fails for one of the subsequent DL tasks. + +(2) Another controller next to the cpuset controller which is attached + to the same cgroup fails in its can_attach(). + +To address this problem rework dl_cpu_busy(): + +(1) Split it into dl_bw_check_overflow() & dl_bw_alloc() and add a + dedicated dl_bw_free(). + +(2) dl_bw_alloc() & dl_bw_free() take a `u64 dl_bw` parameter instead of + a `struct task_struct *p` used in dl_cpu_busy(). This allows to + allocate DL BW for a set of tasks too rather than only for a single + task. 
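The rework funnels all three operations through a single dl_bw_manage() helper that dispatches on a request enum, as the hunks below show. A compact stand-alone analogue of that interface shape (all toy_* names are invented; a plain counter stands in for the per-root_domain struct dl_bw and its locking):

#include <stdio.h>

enum toy_bw_req { TOY_BW_CHECK, TOY_BW_ALLOC, TOY_BW_FREE };

struct toy_bw { unsigned long long cap, used; };

/* One entry point; callers choose the operation via the request enum. */
static int toy_bw_manage(struct toy_bw *b, enum toy_bw_req req,
			 unsigned long long bw)
{
	if (req == TOY_BW_FREE) {
		b->used -= bw;
		return 0;
	}

	/* CHECK and ALLOC share the overflow test. */
	if (b->used + bw > b->cap)
		return -1;
	if (req == TOY_BW_ALLOC)
		b->used += bw;
	return 0;
}

int main(void)
{
	struct toy_bw b = { .cap = 100 };

	printf("alloc 60 -> %d\n", toy_bw_manage(&b, TOY_BW_ALLOC, 60));	/* 0  */
	printf("alloc 60 -> %d\n", toy_bw_manage(&b, TOY_BW_ALLOC, 60));	/* -1 */
	toy_bw_manage(&b, TOY_BW_FREE, 60);
	printf("check 60 -> %d\n", toy_bw_manage(&b, TOY_BW_CHECK, 60));	/* 0  */
	return 0;
}

Keeping one entry point means the overflow test and, in the kernel, the dl_b->lock and capacity handling live in exactly one place, while callers only see dl_bw_check_overflow(), dl_bw_alloc() and dl_bw_free().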
+ +Signed-off-by: Dietmar Eggemann +Signed-off-by: Juri Lelli +Signed-off-by: Tejun Heo +Signed-off-by: Qais Yousef (Google) +Signed-off-by: Greg Kroah-Hartman +--- + include/linux/sched.h | 2 + + kernel/sched/core.c | 4 +-- + kernel/sched/deadline.c | 53 ++++++++++++++++++++++++++++++++++++------------ + kernel/sched/sched.h | 2 - + 4 files changed, 45 insertions(+), 16 deletions(-) + +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -1847,6 +1847,8 @@ current_restore_flags(unsigned long orig + + extern int cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial); + extern int task_can_attach(struct task_struct *p, const struct cpumask *cs_effective_cpus); ++extern int dl_bw_alloc(int cpu, u64 dl_bw); ++extern void dl_bw_free(int cpu, u64 dl_bw); + #ifdef CONFIG_SMP + extern void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask); + extern int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask); +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -9108,7 +9108,7 @@ int task_can_attach(struct task_struct * + + if (unlikely(cpu >= nr_cpu_ids)) + return -EINVAL; +- ret = dl_cpu_busy(cpu, p); ++ ret = dl_bw_alloc(cpu, p->dl.dl_bw); + } + + out: +@@ -9393,7 +9393,7 @@ static void cpuset_cpu_active(void) + static int cpuset_cpu_inactive(unsigned int cpu) + { + if (!cpuhp_tasks_frozen) { +- int ret = dl_cpu_busy(cpu, NULL); ++ int ret = dl_bw_check_overflow(cpu); + + if (ret) + return ret; +--- a/kernel/sched/deadline.c ++++ b/kernel/sched/deadline.c +@@ -3037,26 +3037,38 @@ int dl_cpuset_cpumask_can_shrink(const s + return ret; + } + +-int dl_cpu_busy(int cpu, struct task_struct *p) ++enum dl_bw_request { ++ dl_bw_req_check_overflow = 0, ++ dl_bw_req_alloc, ++ dl_bw_req_free ++}; ++ ++static int dl_bw_manage(enum dl_bw_request req, int cpu, u64 dl_bw) + { +- unsigned long flags, cap; ++ unsigned long flags; + struct dl_bw *dl_b; +- bool overflow; ++ bool overflow = 0; + + rcu_read_lock_sched(); + dl_b = dl_bw_of(cpu); + raw_spin_lock_irqsave(&dl_b->lock, flags); +- cap = dl_bw_capacity(cpu); +- overflow = __dl_overflow(dl_b, cap, 0, p ? p->dl.dl_bw : 0); + +- if (!overflow && p) { +- /* +- * We reserve space for this task in the destination +- * root_domain, as we can't fail after this point. +- * We will free resources in the source root_domain +- * later on (see set_cpus_allowed_dl()). +- */ +- __dl_add(dl_b, p->dl.dl_bw, dl_bw_cpus(cpu)); ++ if (req == dl_bw_req_free) { ++ __dl_sub(dl_b, dl_bw, dl_bw_cpus(cpu)); ++ } else { ++ unsigned long cap = dl_bw_capacity(cpu); ++ ++ overflow = __dl_overflow(dl_b, cap, 0, dl_bw); ++ ++ if (req == dl_bw_req_alloc && !overflow) { ++ /* ++ * We reserve space in the destination ++ * root_domain, as we can't fail after this point. ++ * We will free resources in the source root_domain ++ * later on (see set_cpus_allowed_dl()). ++ */ ++ __dl_add(dl_b, dl_bw, dl_bw_cpus(cpu)); ++ } + } + + raw_spin_unlock_irqrestore(&dl_b->lock, flags); +@@ -3064,6 +3076,21 @@ int dl_cpu_busy(int cpu, struct task_str + + return overflow ? 
-EBUSY : 0; + } ++ ++int dl_bw_check_overflow(int cpu) ++{ ++ return dl_bw_manage(dl_bw_req_check_overflow, cpu, 0); ++} ++ ++int dl_bw_alloc(int cpu, u64 dl_bw) ++{ ++ return dl_bw_manage(dl_bw_req_alloc, cpu, dl_bw); ++} ++ ++void dl_bw_free(int cpu, u64 dl_bw) ++{ ++ dl_bw_manage(dl_bw_req_free, cpu, dl_bw); ++} + #endif + + #ifdef CONFIG_SCHED_DEBUG +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -330,7 +330,7 @@ extern void __getparam_dl(struct task_st + extern bool __checkparam_dl(const struct sched_attr *attr); + extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr); + extern int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial); +-extern int dl_cpu_busy(int cpu, struct task_struct *p); ++extern int dl_bw_check_overflow(int cpu); + + #ifdef CONFIG_CGROUP_SCHED + diff --git a/queue-6.1/series b/queue-6.1/series index 0792de319cb..474e4bc4b69 100644 --- a/queue-6.1/series +++ b/queue-6.1/series @@ -102,3 +102,9 @@ hwmon-aquacomputer_d5next-add-selective-200ms-delay-after-sending-ctrl-report.pa selftests-net-mv-bpf-nat6to4.c-to-net-folder.patch nfs-use-vfs-setgid-helper.patch nfsd-use-vfs-setgid-helper.patch +cgroup-cpuset-rename-functions-dealing-with-deadline-accounting.patch +sched-cpuset-bring-back-cpuset_mutex.patch +sched-cpuset-keep-track-of-sched_deadline-task-in-cpusets.patch +cgroup-cpuset-iterate-only-if-deadline-tasks-are-present.patch +sched-deadline-create-dl-bw-alloc-free-check-overflow-interface.patch +cgroup-cpuset-free-dl-bw-in-case-can_attach-fails.patch -- 2.47.3