From 79505d3d2e46db002f6324eba826ddb2217a93d7 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Sun, 27 Aug 2023 10:34:15 +0200 Subject: [PATCH] 5.15-stable patches added patches: cgroup-cpuset-free-dl-bw-in-case-can_attach-fails.patch cgroup-cpuset-iterate-only-if-deadline-tasks-are-present.patch cgroup-cpuset-rename-functions-dealing-with-deadline-accounting.patch sched-cpuset-bring-back-cpuset_mutex.patch sched-cpuset-keep-track-of-sched_deadline-task-in-cpusets.patch sched-deadline-create-dl-bw-alloc-free-check-overflow-interface.patch --- ...-free-dl-bw-in-case-can_attach-fails.patch | 191 +++++ ...e-only-if-deadline-tasks-are-present.patch | 42 ++ ...ons-dealing-with-deadline-accounting.patch | 67 ++ ...sched-cpuset-bring-back-cpuset_mutex.patch | 672 ++++++++++++++++++ ...ck-of-sched_deadline-task-in-cpusets.patch | 162 +++++ ...-alloc-free-check-overflow-interface.patch | 166 +++++ queue-5.15/series | 6 + 7 files changed, 1306 insertions(+) create mode 100644 queue-5.15/cgroup-cpuset-free-dl-bw-in-case-can_attach-fails.patch create mode 100644 queue-5.15/cgroup-cpuset-iterate-only-if-deadline-tasks-are-present.patch create mode 100644 queue-5.15/cgroup-cpuset-rename-functions-dealing-with-deadline-accounting.patch create mode 100644 queue-5.15/sched-cpuset-bring-back-cpuset_mutex.patch create mode 100644 queue-5.15/sched-cpuset-keep-track-of-sched_deadline-task-in-cpusets.patch create mode 100644 queue-5.15/sched-deadline-create-dl-bw-alloc-free-check-overflow-interface.patch diff --git a/queue-5.15/cgroup-cpuset-free-dl-bw-in-case-can_attach-fails.patch b/queue-5.15/cgroup-cpuset-free-dl-bw-in-case-can_attach-fails.patch new file mode 100644 index 00000000000..e7532fe3b5c --- /dev/null +++ b/queue-5.15/cgroup-cpuset-free-dl-bw-in-case-can_attach-fails.patch @@ -0,0 +1,191 @@ +From stable-owner@vger.kernel.org Sun Aug 20 17:26:25 2023 +From: Qais Yousef +Date: Sun, 20 Aug 2023 16:22:58 +0100 +Subject: cgroup/cpuset: Free DL BW in case can_attach() 
fails +To: stable@vger.kernel.org +Cc: Juri Lelli , Waiman Long , Tejun Heo , Dietmar Eggemann , Peter Zijlstra , Vincent Guittot , Ingo Molnar , Hao Luo , John Stultz , cgroups@vger.kernel.org, linux-kernel@vger.kernel.org, Qais Yousef +Message-ID: <20230820152258.518128-7-qyousef@layalina.io> + +From: Dietmar Eggemann + +commit 2ef269ef1ac006acf974793d975539244d77b28f upstream. + +cpuset_can_attach() can fail. Postpone DL BW allocation until all tasks +have been checked. DL BW is not allocated per-task but as a sum over +all DL tasks migrating. + +If multiple controllers are attached to the cgroup next to the cpuset +controller a non-cpuset can_attach() can fail. In this case free DL BW +in cpuset_cancel_attach(). + +Finally, update cpuset DL task count (nr_deadline_tasks) only in +cpuset_attach(). + +Suggested-by: Waiman Long +Signed-off-by: Dietmar Eggemann +Signed-off-by: Juri Lelli +Reviewed-by: Waiman Long +Signed-off-by: Tejun Heo +[ Conflict in kernel/cgroup/cpuset.c due to pulling extra neighboring + functions that are not applicable on this branch. 
] +Signed-off-by: Qais Yousef (Google) +Signed-off-by: Greg Kroah-Hartman +--- + include/linux/sched.h | 2 - + kernel/cgroup/cpuset.c | 51 +++++++++++++++++++++++++++++++++++++++++++++---- + kernel/sched/core.c | 17 +--------------- + 3 files changed, 50 insertions(+), 20 deletions(-) + +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -1797,7 +1797,7 @@ current_restore_flags(unsigned long orig + } + + extern int cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial); +-extern int task_can_attach(struct task_struct *p, const struct cpumask *cs_effective_cpus); ++extern int task_can_attach(struct task_struct *p); + extern int dl_bw_alloc(int cpu, u64 dl_bw); + extern void dl_bw_free(int cpu, u64 dl_bw); + #ifdef CONFIG_SMP +--- a/kernel/cgroup/cpuset.c ++++ b/kernel/cgroup/cpuset.c +@@ -167,6 +167,8 @@ struct cpuset { + * know when to rebuild associated root domain bandwidth information. + */ + int nr_deadline_tasks; ++ int nr_migrate_dl_tasks; ++ u64 sum_migrate_dl_bw; + + /* Handle for cpuset.cpus.partition */ + struct cgroup_file partition_file; +@@ -2206,16 +2208,23 @@ static int fmeter_getrate(struct fmeter + + static struct cpuset *cpuset_attach_old_cs; + ++static void reset_migrate_dl_data(struct cpuset *cs) ++{ ++ cs->nr_migrate_dl_tasks = 0; ++ cs->sum_migrate_dl_bw = 0; ++} ++ + /* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */ + static int cpuset_can_attach(struct cgroup_taskset *tset) + { + struct cgroup_subsys_state *css; +- struct cpuset *cs; ++ struct cpuset *cs, *oldcs; + struct task_struct *task; + int ret; + + /* used later by cpuset_attach() */ + cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset, &css)); ++ oldcs = cpuset_attach_old_cs; + cs = css_cs(css); + + mutex_lock(&cpuset_mutex); +@@ -2227,7 +2236,7 @@ static int cpuset_can_attach(struct cgro + goto out_unlock; + + cgroup_taskset_for_each(task, css, tset) { +- ret = task_can_attach(task, cs->effective_cpus); ++ ret = 
task_can_attach(task); + if (ret) + goto out_unlock; + ret = security_task_setscheduler(task); +@@ -2235,11 +2244,31 @@ static int cpuset_can_attach(struct cgro + goto out_unlock; + + if (dl_task(task)) { +- cs->nr_deadline_tasks++; +- cpuset_attach_old_cs->nr_deadline_tasks--; ++ cs->nr_migrate_dl_tasks++; ++ cs->sum_migrate_dl_bw += task->dl.dl_bw; + } + } + ++ if (!cs->nr_migrate_dl_tasks) ++ goto out_success; ++ ++ if (!cpumask_intersects(oldcs->effective_cpus, cs->effective_cpus)) { ++ int cpu = cpumask_any_and(cpu_active_mask, cs->effective_cpus); ++ ++ if (unlikely(cpu >= nr_cpu_ids)) { ++ reset_migrate_dl_data(cs); ++ ret = -EINVAL; ++ goto out_unlock; ++ } ++ ++ ret = dl_bw_alloc(cpu, cs->sum_migrate_dl_bw); ++ if (ret) { ++ reset_migrate_dl_data(cs); ++ goto out_unlock; ++ } ++ } ++ ++out_success: + /* + * Mark attach is in progress. This makes validate_change() fail + * changes which zero cpus/mems_allowed. +@@ -2263,6 +2292,14 @@ static void cpuset_cancel_attach(struct + cs->attach_in_progress--; + if (!cs->attach_in_progress) + wake_up(&cpuset_attach_wq); ++ ++ if (cs->nr_migrate_dl_tasks) { ++ int cpu = cpumask_any(cs->effective_cpus); ++ ++ dl_bw_free(cpu, cs->sum_migrate_dl_bw); ++ reset_migrate_dl_data(cs); ++ } ++ + mutex_unlock(&cpuset_mutex); + } + +@@ -2335,6 +2372,12 @@ static void cpuset_attach(struct cgroup_ + + cs->old_mems_allowed = cpuset_attach_nodemask_to; + ++ if (cs->nr_migrate_dl_tasks) { ++ cs->nr_deadline_tasks += cs->nr_migrate_dl_tasks; ++ oldcs->nr_deadline_tasks -= cs->nr_migrate_dl_tasks; ++ reset_migrate_dl_data(cs); ++ } ++ + cs->attach_in_progress--; + if (!cs->attach_in_progress) + wake_up(&cpuset_attach_wq); +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -8789,8 +8789,7 @@ int cpuset_cpumask_can_shrink(const stru + return ret; + } + +-int task_can_attach(struct task_struct *p, +- const struct cpumask *cs_effective_cpus) ++int task_can_attach(struct task_struct *p) + { + int ret = 0; + +@@ -8803,21 +8802,9 @@ 
int task_can_attach(struct task_struct * + * success of set_cpus_allowed_ptr() on all attached tasks + * before cpus_mask may be changed. + */ +- if (p->flags & PF_NO_SETAFFINITY) { ++ if (p->flags & PF_NO_SETAFFINITY) + ret = -EINVAL; +- goto out; +- } + +- if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span, +- cs_effective_cpus)) { +- int cpu = cpumask_any_and(cpu_active_mask, cs_effective_cpus); +- +- if (unlikely(cpu >= nr_cpu_ids)) +- return -EINVAL; +- ret = dl_bw_alloc(cpu, p->dl.dl_bw); +- } +- +-out: + return ret; + } + diff --git a/queue-5.15/cgroup-cpuset-iterate-only-if-deadline-tasks-are-present.patch b/queue-5.15/cgroup-cpuset-iterate-only-if-deadline-tasks-are-present.patch new file mode 100644 index 00000000000..8281c9a0ca2 --- /dev/null +++ b/queue-5.15/cgroup-cpuset-iterate-only-if-deadline-tasks-are-present.patch @@ -0,0 +1,42 @@ +From stable-owner@vger.kernel.org Sun Aug 20 17:26:08 2023 +From: Qais Yousef +Date: Sun, 20 Aug 2023 16:22:56 +0100 +Subject: cgroup/cpuset: Iterate only if DEADLINE tasks are present +To: stable@vger.kernel.org +Cc: Juri Lelli , Waiman Long , Tejun Heo , Dietmar Eggemann , Peter Zijlstra , Vincent Guittot , Ingo Molnar , Hao Luo , John Stultz , cgroups@vger.kernel.org, linux-kernel@vger.kernel.org, Qais Yousef +Message-ID: <20230820152258.518128-5-qyousef@layalina.io> + +From: Juri Lelli + +commit c0f78fd5edcf29b2822ac165f9248a6c165e8554 upstream. + +update_tasks_root_domain currently iterates over all tasks even if no +DEADLINE task is present on the cpuset/root domain for which bandwidth +accounting is being rebuilt. This has been reported to introduce 10+ ms +delays on suspend-resume operations. + +Skip the costly iteration for cpusets that don't contain DEADLINE tasks. 
+ +Reported-by: Qais Yousef (Google) +Link: https://lore.kernel.org/lkml/20230206221428.2125324-1-qyousef@layalina.io/ +Signed-off-by: Juri Lelli +Reviewed-by: Waiman Long +Signed-off-by: Tejun Heo +Signed-off-by: Qais Yousef (Google) +Signed-off-by: Greg Kroah-Hartman +--- + kernel/cgroup/cpuset.c | 3 +++ + 1 file changed, 3 insertions(+) + +--- a/kernel/cgroup/cpuset.c ++++ b/kernel/cgroup/cpuset.c +@@ -966,6 +966,9 @@ static void dl_update_tasks_root_domain( + struct css_task_iter it; + struct task_struct *task; + ++ if (cs->nr_deadline_tasks == 0) ++ return; ++ + css_task_iter_start(&cs->css, 0, &it); + + while ((task = css_task_iter_next(&it))) diff --git a/queue-5.15/cgroup-cpuset-rename-functions-dealing-with-deadline-accounting.patch b/queue-5.15/cgroup-cpuset-rename-functions-dealing-with-deadline-accounting.patch new file mode 100644 index 00000000000..7a1fe3206d7 --- /dev/null +++ b/queue-5.15/cgroup-cpuset-rename-functions-dealing-with-deadline-accounting.patch @@ -0,0 +1,67 @@ +From stable-owner@vger.kernel.org Sun Aug 20 17:26:25 2023 +From: Qais Yousef +Date: Sun, 20 Aug 2023 16:22:53 +0100 +Subject: cgroup/cpuset: Rename functions dealing with DEADLINE accounting +To: stable@vger.kernel.org +Cc: Juri Lelli , Waiman Long , Tejun Heo , Dietmar Eggemann , Peter Zijlstra , Vincent Guittot , Ingo Molnar , Hao Luo , John Stultz , cgroups@vger.kernel.org, linux-kernel@vger.kernel.org, Qais Yousef +Message-ID: <20230820152258.518128-2-qyousef@layalina.io> + +From: Juri Lelli + +commit ad3a557daf6915296a43ef97a3e9c48e076c9dd8 upstream. + +rebuild_root_domains() and update_tasks_root_domain() have neutral +names, but actually deal with DEADLINE bandwidth accounting. + +Rename them to use 'dl_' prefix so that intent is more clear. + +No functional change. 
+ +Suggested-by: Qais Yousef (Google) +Signed-off-by: Juri Lelli +Reviewed-by: Waiman Long +Signed-off-by: Tejun Heo +Signed-off-by: Qais Yousef (Google) +Signed-off-by: Greg Kroah-Hartman +--- + kernel/cgroup/cpuset.c | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +--- a/kernel/cgroup/cpuset.c ++++ b/kernel/cgroup/cpuset.c +@@ -940,7 +940,7 @@ done: + return ndoms; + } + +-static void update_tasks_root_domain(struct cpuset *cs) ++static void dl_update_tasks_root_domain(struct cpuset *cs) + { + struct css_task_iter it; + struct task_struct *task; +@@ -953,7 +953,7 @@ static void update_tasks_root_domain(str + css_task_iter_end(&it); + } + +-static void rebuild_root_domains(void) ++static void dl_rebuild_rd_accounting(void) + { + struct cpuset *cs = NULL; + struct cgroup_subsys_state *pos_css; +@@ -981,7 +981,7 @@ static void rebuild_root_domains(void) + + rcu_read_unlock(); + +- update_tasks_root_domain(cs); ++ dl_update_tasks_root_domain(cs); + + rcu_read_lock(); + css_put(&cs->css); +@@ -995,7 +995,7 @@ partition_and_rebuild_sched_domains(int + { + mutex_lock(&sched_domains_mutex); + partition_sched_domains_locked(ndoms_new, doms_new, dattr_new); +- rebuild_root_domains(); ++ dl_rebuild_rd_accounting(); + mutex_unlock(&sched_domains_mutex); + } + diff --git a/queue-5.15/sched-cpuset-bring-back-cpuset_mutex.patch b/queue-5.15/sched-cpuset-bring-back-cpuset_mutex.patch new file mode 100644 index 00000000000..6b66ce5e012 --- /dev/null +++ b/queue-5.15/sched-cpuset-bring-back-cpuset_mutex.patch @@ -0,0 +1,672 @@ +From stable-owner@vger.kernel.org Sun Aug 20 17:26:09 2023 +From: Qais Yousef +Date: Sun, 20 Aug 2023 16:22:54 +0100 +Subject: sched/cpuset: Bring back cpuset_mutex +To: stable@vger.kernel.org +Cc: Juri Lelli , Waiman Long , Tejun Heo , Dietmar Eggemann , Peter Zijlstra , Vincent Guittot , Ingo Molnar , Hao Luo , John Stultz , cgroups@vger.kernel.org, linux-kernel@vger.kernel.org, Qais Yousef +Message-ID: 
<20230820152258.518128-3-qyousef@layalina.io> + +From: Juri Lelli + +commit 111cd11bbc54850f24191c52ff217da88a5e639b upstream. + +Turns out percpu_cpuset_rwsem - commit 1243dc518c9d ("cgroup/cpuset: +Convert cpuset_mutex to percpu_rwsem") - wasn't such a brilliant idea, +as it has been reported to cause slowdowns in workloads that need to +change cpuset configuration frequently and it is also not implementing +priority inheritance (which causes troubles with realtime workloads). + +Convert percpu_cpuset_rwsem back to regular cpuset_mutex. Also grab it +only for SCHED_DEADLINE tasks (other policies don't care about stable +cpusets anyway). + +Signed-off-by: Juri Lelli +Reviewed-by: Waiman Long +Signed-off-by: Tejun Heo +[ Conflict in kernel/cgroup/cpuset.c due to pulling changes in functions + or comments that don't exist on this branch. Remove a BUG_ON() for rwsem + that doesn't exist on mainline. ] +Signed-off-by: Qais Yousef (Google) +Signed-off-by: Greg Kroah-Hartman +--- + include/linux/cpuset.h | 8 +- + kernel/cgroup/cpuset.c | 149 ++++++++++++++++++++++++------------------------- + kernel/sched/core.c | 22 ++++--- + 3 files changed, 93 insertions(+), 86 deletions(-) + +--- a/include/linux/cpuset.h ++++ b/include/linux/cpuset.h +@@ -56,8 +56,8 @@ extern void cpuset_init_smp(void); + extern void cpuset_force_rebuild(void); + extern void cpuset_update_active_cpus(void); + extern void cpuset_wait_for_hotplug(void); +-extern void cpuset_read_lock(void); +-extern void cpuset_read_unlock(void); ++extern void cpuset_lock(void); ++extern void cpuset_unlock(void); + extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask); + extern bool cpuset_cpus_allowed_fallback(struct task_struct *p); + extern nodemask_t cpuset_mems_allowed(struct task_struct *p); +@@ -179,8 +179,8 @@ static inline void cpuset_update_active_ + + static inline void cpuset_wait_for_hotplug(void) { } + +-static inline void cpuset_read_lock(void) { } +-static inline void 
cpuset_read_unlock(void) { } ++static inline void cpuset_lock(void) { } ++static inline void cpuset_unlock(void) { } + + static inline void cpuset_cpus_allowed(struct task_struct *p, + struct cpumask *mask) +--- a/kernel/cgroup/cpuset.c ++++ b/kernel/cgroup/cpuset.c +@@ -312,22 +312,23 @@ static struct cpuset top_cpuset = { + if (is_cpuset_online(((des_cs) = css_cs((pos_css))))) + + /* +- * There are two global locks guarding cpuset structures - cpuset_rwsem and ++ * There are two global locks guarding cpuset structures - cpuset_mutex and + * callback_lock. We also require taking task_lock() when dereferencing a + * task's cpuset pointer. See "The task_lock() exception", at the end of this +- * comment. The cpuset code uses only cpuset_rwsem write lock. Other +- * kernel subsystems can use cpuset_read_lock()/cpuset_read_unlock() to +- * prevent change to cpuset structures. ++ * comment. The cpuset code uses only cpuset_mutex. Other kernel subsystems ++ * can use cpuset_lock()/cpuset_unlock() to prevent change to cpuset ++ * structures. Note that cpuset_mutex needs to be a mutex as it is used in ++ * paths that rely on priority inheritance (e.g. scheduler - on RT) for ++ * correctness. + * + * A task must hold both locks to modify cpusets. If a task holds +- * cpuset_rwsem, it blocks others wanting that rwsem, ensuring that it +- * is the only task able to also acquire callback_lock and be able to +- * modify cpusets. It can perform various checks on the cpuset structure +- * first, knowing nothing will change. It can also allocate memory while +- * just holding cpuset_rwsem. While it is performing these checks, various +- * callback routines can briefly acquire callback_lock to query cpusets. +- * Once it is ready to make the changes, it takes callback_lock, blocking +- * everyone else. ++ * cpuset_mutex, it blocks others, ensuring that it is the only task able to ++ * also acquire callback_lock and be able to modify cpusets. 
It can perform ++ * various checks on the cpuset structure first, knowing nothing will change. ++ * It can also allocate memory while just holding cpuset_mutex. While it is ++ * performing these checks, various callback routines can briefly acquire ++ * callback_lock to query cpusets. Once it is ready to make the changes, it ++ * takes callback_lock, blocking everyone else. + * + * Calls to the kernel memory allocator can not be made while holding + * callback_lock, as that would risk double tripping on callback_lock +@@ -349,16 +350,16 @@ static struct cpuset top_cpuset = { + * guidelines for accessing subsystem state in kernel/cgroup.c + */ + +-DEFINE_STATIC_PERCPU_RWSEM(cpuset_rwsem); ++static DEFINE_MUTEX(cpuset_mutex); + +-void cpuset_read_lock(void) ++void cpuset_lock(void) + { +- percpu_down_read(&cpuset_rwsem); ++ mutex_lock(&cpuset_mutex); + } + +-void cpuset_read_unlock(void) ++void cpuset_unlock(void) + { +- percpu_up_read(&cpuset_rwsem); ++ mutex_unlock(&cpuset_mutex); + } + + static DEFINE_SPINLOCK(callback_lock); +@@ -396,7 +397,7 @@ static inline bool is_in_v2_mode(void) + * One way or another, we guarantee to return some non-empty subset + * of cpu_online_mask. + * +- * Call with callback_lock or cpuset_rwsem held. ++ * Call with callback_lock or cpuset_mutex held. + */ + static void guarantee_online_cpus(struct task_struct *tsk, + struct cpumask *pmask) +@@ -438,7 +439,7 @@ out_unlock: + * One way or another, we guarantee to return some non-empty subset + * of node_states[N_MEMORY]. + * +- * Call with callback_lock or cpuset_rwsem held. ++ * Call with callback_lock or cpuset_mutex held. + */ + static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask) + { +@@ -450,7 +451,7 @@ static void guarantee_online_mems(struct + /* + * update task's spread flag if cpuset's page/slab spread flag is set + * +- * Call with callback_lock or cpuset_rwsem held. ++ * Call with callback_lock or cpuset_mutex held. 
+ */ + static void cpuset_update_task_spread_flag(struct cpuset *cs, + struct task_struct *tsk) +@@ -471,7 +472,7 @@ static void cpuset_update_task_spread_fl + * + * One cpuset is a subset of another if all its allowed CPUs and + * Memory Nodes are a subset of the other, and its exclusive flags +- * are only set if the other's are set. Call holding cpuset_rwsem. ++ * are only set if the other's are set. Call holding cpuset_mutex. + */ + + static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) +@@ -580,7 +581,7 @@ static inline void free_cpuset(struct cp + * If we replaced the flag and mask values of the current cpuset + * (cur) with those values in the trial cpuset (trial), would + * our various subset and exclusive rules still be valid? Presumes +- * cpuset_rwsem held. ++ * cpuset_mutex held. + * + * 'cur' is the address of an actual, in-use cpuset. Operations + * such as list traversal that depend on the actual address of the +@@ -703,7 +704,7 @@ static void update_domain_attr_tree(stru + rcu_read_unlock(); + } + +-/* Must be called with cpuset_rwsem held. */ ++/* Must be called with cpuset_mutex held. */ + static inline int nr_cpusets(void) + { + /* jump label reference count + the top-level cpuset */ +@@ -729,7 +730,7 @@ static inline int nr_cpusets(void) + * domains when operating in the severe memory shortage situations + * that could cause allocation failures below. + * +- * Must be called with cpuset_rwsem held. ++ * Must be called with cpuset_mutex held. 
+ * + * The three key local variables below are: + * cp - cpuset pointer, used (together with pos_css) to perform a +@@ -958,7 +959,7 @@ static void dl_rebuild_rd_accounting(voi + struct cpuset *cs = NULL; + struct cgroup_subsys_state *pos_css; + +- percpu_rwsem_assert_held(&cpuset_rwsem); ++ lockdep_assert_held(&cpuset_mutex); + lockdep_assert_cpus_held(); + lockdep_assert_held(&sched_domains_mutex); + +@@ -1008,7 +1009,7 @@ partition_and_rebuild_sched_domains(int + * 'cpus' is removed, then call this routine to rebuild the + * scheduler's dynamic sched domains. + * +- * Call with cpuset_rwsem held. Takes cpus_read_lock(). ++ * Call with cpuset_mutex held. Takes cpus_read_lock(). + */ + static void rebuild_sched_domains_locked(void) + { +@@ -1019,7 +1020,7 @@ static void rebuild_sched_domains_locked + int ndoms; + + lockdep_assert_cpus_held(); +- percpu_rwsem_assert_held(&cpuset_rwsem); ++ lockdep_assert_held(&cpuset_mutex); + + /* + * If we have raced with CPU hotplug, return early to avoid +@@ -1070,9 +1071,9 @@ static void rebuild_sched_domains_locked + void rebuild_sched_domains(void) + { + cpus_read_lock(); +- percpu_down_write(&cpuset_rwsem); ++ mutex_lock(&cpuset_mutex); + rebuild_sched_domains_locked(); +- percpu_up_write(&cpuset_rwsem); ++ mutex_unlock(&cpuset_mutex); + cpus_read_unlock(); + } + +@@ -1081,7 +1082,7 @@ void rebuild_sched_domains(void) + * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed + * + * Iterate through each task of @cs updating its cpus_allowed to the +- * effective cpuset's. As this function is called with cpuset_rwsem held, ++ * effective cpuset's. As this function is called with cpuset_mutex held, + * cpuset membership stays stable. + */ + static void update_tasks_cpumask(struct cpuset *cs) +@@ -1188,7 +1189,7 @@ static int update_parent_subparts_cpumas + int old_prs, new_prs; + bool part_error = false; /* Partition error? 
*/ + +- percpu_rwsem_assert_held(&cpuset_rwsem); ++ lockdep_assert_held(&cpuset_mutex); + + /* + * The parent must be a partition root. +@@ -1358,7 +1359,7 @@ static int update_parent_subparts_cpumas + * + * On legacy hierarchy, effective_cpus will be the same with cpu_allowed. + * +- * Called with cpuset_rwsem held ++ * Called with cpuset_mutex held + */ + static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp) + { +@@ -1521,7 +1522,7 @@ static void update_sibling_cpumasks(stru + struct cpuset *sibling; + struct cgroup_subsys_state *pos_css; + +- percpu_rwsem_assert_held(&cpuset_rwsem); ++ lockdep_assert_held(&cpuset_mutex); + + /* + * Check all its siblings and call update_cpumasks_hier() +@@ -1724,12 +1725,12 @@ static void *cpuset_being_rebound; + * @cs: the cpuset in which each task's mems_allowed mask needs to be changed + * + * Iterate through each task of @cs updating its mems_allowed to the +- * effective cpuset's. As this function is called with cpuset_rwsem held, ++ * effective cpuset's. As this function is called with cpuset_mutex held, + * cpuset membership stays stable. + */ + static void update_tasks_nodemask(struct cpuset *cs) + { +- static nodemask_t newmems; /* protected by cpuset_rwsem */ ++ static nodemask_t newmems; /* protected by cpuset_mutex */ + struct css_task_iter it; + struct task_struct *task; + +@@ -1742,7 +1743,7 @@ static void update_tasks_nodemask(struct + * take while holding tasklist_lock. Forks can happen - the + * mpol_dup() cpuset_being_rebound check will catch such forks, + * and rebind their vma mempolicies too. Because we still hold +- * the global cpuset_rwsem, we know that no other rebind effort ++ * the global cpuset_mutex, we know that no other rebind effort + * will be contending for the global variable cpuset_being_rebound. + * It's ok if we rebind the same mm twice; mpol_rebind_mm() + * is idempotent. Also migrate pages in each mm to new nodes. 
+@@ -1788,7 +1789,7 @@ static void update_tasks_nodemask(struct + * + * On legacy hierarchy, effective_mems will be the same with mems_allowed. + * +- * Called with cpuset_rwsem held ++ * Called with cpuset_mutex held + */ + static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems) + { +@@ -1841,7 +1842,7 @@ static void update_nodemasks_hier(struct + * mempolicies and if the cpuset is marked 'memory_migrate', + * migrate the tasks pages to the new memory. + * +- * Call with cpuset_rwsem held. May take callback_lock during call. ++ * Call with cpuset_mutex held. May take callback_lock during call. + * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, + * lock each such tasks mm->mmap_lock, scan its vma's and rebind + * their mempolicies to the cpusets new mems_allowed. +@@ -1931,7 +1932,7 @@ static int update_relax_domain_level(str + * @cs: the cpuset in which each task's spread flags needs to be changed + * + * Iterate through each task of @cs updating its spread flags. As this +- * function is called with cpuset_rwsem held, cpuset membership stays ++ * function is called with cpuset_mutex held, cpuset membership stays + * stable. + */ + static void update_tasks_flags(struct cpuset *cs) +@@ -1951,7 +1952,7 @@ static void update_tasks_flags(struct cp + * cs: the cpuset to update + * turning_on: whether the flag is being set or cleared + * +- * Call with cpuset_rwsem held. ++ * Call with cpuset_mutex held. + */ + + static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, +@@ -2000,7 +2001,7 @@ out: + * cs: the cpuset to update + * new_prs: new partition root state + * +- * Call with cpuset_rwsem held. ++ * Call with cpuset_mutex held. 
+ */ + static int update_prstate(struct cpuset *cs, int new_prs) + { +@@ -2182,7 +2183,7 @@ static int fmeter_getrate(struct fmeter + + static struct cpuset *cpuset_attach_old_cs; + +-/* Called by cgroups to determine if a cpuset is usable; cpuset_rwsem held */ ++/* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */ + static int cpuset_can_attach(struct cgroup_taskset *tset) + { + struct cgroup_subsys_state *css; +@@ -2194,7 +2195,7 @@ static int cpuset_can_attach(struct cgro + cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset, &css)); + cs = css_cs(css); + +- percpu_down_write(&cpuset_rwsem); ++ mutex_lock(&cpuset_mutex); + + /* allow moving tasks into an empty cpuset if on default hierarchy */ + ret = -ENOSPC; +@@ -2218,7 +2219,7 @@ static int cpuset_can_attach(struct cgro + cs->attach_in_progress++; + ret = 0; + out_unlock: +- percpu_up_write(&cpuset_rwsem); ++ mutex_unlock(&cpuset_mutex); + return ret; + } + +@@ -2230,15 +2231,15 @@ static void cpuset_cancel_attach(struct + cgroup_taskset_first(tset, &css); + cs = css_cs(css); + +- percpu_down_write(&cpuset_rwsem); ++ mutex_lock(&cpuset_mutex); + cs->attach_in_progress--; + if (!cs->attach_in_progress) + wake_up(&cpuset_attach_wq); +- percpu_up_write(&cpuset_rwsem); ++ mutex_unlock(&cpuset_mutex); + } + + /* +- * Protected by cpuset_rwsem. cpus_attach is used only by cpuset_attach() ++ * Protected by cpuset_mutex. cpus_attach is used only by cpuset_attach() + * but we can't allocate it dynamically there. Define it global and + * allocate from cpuset_init(). 
+ */ +@@ -2246,7 +2247,7 @@ static cpumask_var_t cpus_attach; + + static void cpuset_attach(struct cgroup_taskset *tset) + { +- /* static buf protected by cpuset_rwsem */ ++ /* static buf protected by cpuset_mutex */ + static nodemask_t cpuset_attach_nodemask_to; + struct task_struct *task; + struct task_struct *leader; +@@ -2258,7 +2259,7 @@ static void cpuset_attach(struct cgroup_ + cs = css_cs(css); + + lockdep_assert_cpus_held(); /* see cgroup_attach_lock() */ +- percpu_down_write(&cpuset_rwsem); ++ mutex_lock(&cpuset_mutex); + + guarantee_online_mems(cs, &cpuset_attach_nodemask_to); + +@@ -2310,7 +2311,7 @@ static void cpuset_attach(struct cgroup_ + if (!cs->attach_in_progress) + wake_up(&cpuset_attach_wq); + +- percpu_up_write(&cpuset_rwsem); ++ mutex_unlock(&cpuset_mutex); + } + + /* The various types of files and directories in a cpuset file system */ +@@ -2342,7 +2343,7 @@ static int cpuset_write_u64(struct cgrou + int retval = 0; + + cpus_read_lock(); +- percpu_down_write(&cpuset_rwsem); ++ mutex_lock(&cpuset_mutex); + if (!is_cpuset_online(cs)) { + retval = -ENODEV; + goto out_unlock; +@@ -2378,7 +2379,7 @@ static int cpuset_write_u64(struct cgrou + break; + } + out_unlock: +- percpu_up_write(&cpuset_rwsem); ++ mutex_unlock(&cpuset_mutex); + cpus_read_unlock(); + return retval; + } +@@ -2391,7 +2392,7 @@ static int cpuset_write_s64(struct cgrou + int retval = -ENODEV; + + cpus_read_lock(); +- percpu_down_write(&cpuset_rwsem); ++ mutex_lock(&cpuset_mutex); + if (!is_cpuset_online(cs)) + goto out_unlock; + +@@ -2404,7 +2405,7 @@ static int cpuset_write_s64(struct cgrou + break; + } + out_unlock: +- percpu_up_write(&cpuset_rwsem); ++ mutex_unlock(&cpuset_mutex); + cpus_read_unlock(); + return retval; + } +@@ -2437,7 +2438,7 @@ static ssize_t cpuset_write_resmask(stru + * operation like this one can lead to a deadlock through kernfs + * active_ref protection. Let's break the protection. 
Losing the + * protection is okay as we check whether @cs is online after +- * grabbing cpuset_rwsem anyway. This only happens on the legacy ++ * grabbing cpuset_mutex anyway. This only happens on the legacy + * hierarchies. + */ + css_get(&cs->css); +@@ -2445,7 +2446,7 @@ static ssize_t cpuset_write_resmask(stru + flush_work(&cpuset_hotplug_work); + + cpus_read_lock(); +- percpu_down_write(&cpuset_rwsem); ++ mutex_lock(&cpuset_mutex); + if (!is_cpuset_online(cs)) + goto out_unlock; + +@@ -2469,7 +2470,7 @@ static ssize_t cpuset_write_resmask(stru + + free_cpuset(trialcs); + out_unlock: +- percpu_up_write(&cpuset_rwsem); ++ mutex_unlock(&cpuset_mutex); + cpus_read_unlock(); + kernfs_unbreak_active_protection(of->kn); + css_put(&cs->css); +@@ -2602,13 +2603,13 @@ static ssize_t sched_partition_write(str + + css_get(&cs->css); + cpus_read_lock(); +- percpu_down_write(&cpuset_rwsem); ++ mutex_lock(&cpuset_mutex); + if (!is_cpuset_online(cs)) + goto out_unlock; + + retval = update_prstate(cs, val); + out_unlock: +- percpu_up_write(&cpuset_rwsem); ++ mutex_unlock(&cpuset_mutex); + cpus_read_unlock(); + css_put(&cs->css); + return retval ?: nbytes; +@@ -2821,7 +2822,7 @@ static int cpuset_css_online(struct cgro + return 0; + + cpus_read_lock(); +- percpu_down_write(&cpuset_rwsem); ++ mutex_lock(&cpuset_mutex); + + set_bit(CS_ONLINE, &cs->flags); + if (is_spread_page(parent)) +@@ -2872,7 +2873,7 @@ static int cpuset_css_online(struct cgro + cpumask_copy(cs->effective_cpus, parent->cpus_allowed); + spin_unlock_irq(&callback_lock); + out_unlock: +- percpu_up_write(&cpuset_rwsem); ++ mutex_unlock(&cpuset_mutex); + cpus_read_unlock(); + return 0; + } +@@ -2893,7 +2894,7 @@ static void cpuset_css_offline(struct cg + struct cpuset *cs = css_cs(css); + + cpus_read_lock(); +- percpu_down_write(&cpuset_rwsem); ++ mutex_lock(&cpuset_mutex); + + if (is_partition_root(cs)) + update_prstate(cs, 0); +@@ -2912,7 +2913,7 @@ static void cpuset_css_offline(struct cg + cpuset_dec(); + 
clear_bit(CS_ONLINE, &cs->flags); + +- percpu_up_write(&cpuset_rwsem); ++ mutex_unlock(&cpuset_mutex); + cpus_read_unlock(); + } + +@@ -2925,7 +2926,7 @@ static void cpuset_css_free(struct cgrou + + static void cpuset_bind(struct cgroup_subsys_state *root_css) + { +- percpu_down_write(&cpuset_rwsem); ++ mutex_lock(&cpuset_mutex); + spin_lock_irq(&callback_lock); + + if (is_in_v2_mode()) { +@@ -2938,7 +2939,7 @@ static void cpuset_bind(struct cgroup_su + } + + spin_unlock_irq(&callback_lock); +- percpu_up_write(&cpuset_rwsem); ++ mutex_unlock(&cpuset_mutex); + } + + /* +@@ -2980,8 +2981,6 @@ struct cgroup_subsys cpuset_cgrp_subsys + + int __init cpuset_init(void) + { +- BUG_ON(percpu_init_rwsem(&cpuset_rwsem)); +- + BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL)); + BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL)); + BUG_ON(!zalloc_cpumask_var(&top_cpuset.subparts_cpus, GFP_KERNEL)); +@@ -3053,7 +3052,7 @@ hotplug_update_tasks_legacy(struct cpuse + is_empty = cpumask_empty(cs->cpus_allowed) || + nodes_empty(cs->mems_allowed); + +- percpu_up_write(&cpuset_rwsem); ++ mutex_unlock(&cpuset_mutex); + + /* + * Move tasks to the nearest ancestor with execution resources, +@@ -3063,7 +3062,7 @@ hotplug_update_tasks_legacy(struct cpuse + if (is_empty) + remove_tasks_in_empty_cpuset(cs); + +- percpu_down_write(&cpuset_rwsem); ++ mutex_lock(&cpuset_mutex); + } + + static void +@@ -3113,14 +3112,14 @@ static void cpuset_hotplug_update_tasks( + retry: + wait_event(cpuset_attach_wq, cs->attach_in_progress == 0); + +- percpu_down_write(&cpuset_rwsem); ++ mutex_lock(&cpuset_mutex); + + /* + * We have raced with task attaching. We wait until attaching + * is finished, so we won't attach a task to an empty cpuset. 
+ */ + if (cs->attach_in_progress) { +- percpu_up_write(&cpuset_rwsem); ++ mutex_unlock(&cpuset_mutex); + goto retry; + } + +@@ -3198,7 +3197,7 @@ update_tasks: + hotplug_update_tasks_legacy(cs, &new_cpus, &new_mems, + cpus_updated, mems_updated); + +- percpu_up_write(&cpuset_rwsem); ++ mutex_unlock(&cpuset_mutex); + } + + /** +@@ -3228,7 +3227,7 @@ static void cpuset_hotplug_workfn(struct + if (on_dfl && !alloc_cpumasks(NULL, &tmp)) + ptmp = &tmp; + +- percpu_down_write(&cpuset_rwsem); ++ mutex_lock(&cpuset_mutex); + + /* fetch the available cpus/mems and find out which changed how */ + cpumask_copy(&new_cpus, cpu_active_mask); +@@ -3285,7 +3284,7 @@ static void cpuset_hotplug_workfn(struct + update_tasks_nodemask(&top_cpuset); + } + +- percpu_up_write(&cpuset_rwsem); ++ mutex_unlock(&cpuset_mutex); + + /* if cpus or mems changed, we need to propagate to descendants */ + if (cpus_updated || mems_updated) { +@@ -3695,7 +3694,7 @@ void __cpuset_memory_pressure_bump(void) + * - Used for /proc/<pid>/cpuset. + * - No need to task_lock(tsk) on this tsk->cpuset reference, as it + * doesn't really matter if tsk->cpuset changes after we read it, +- * and we take cpuset_rwsem, keeping cpuset_attach() from changing it ++ * and we take cpuset_mutex, keeping cpuset_attach() from changing it + * anyway. + */ + int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns, +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -7309,6 +7309,7 @@ static int __sched_setscheduler(struct t + int reset_on_fork; + int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; + struct rq *rq; ++ bool cpuset_locked = false; + + /* The pi code expects interrupts enabled */ + BUG_ON(pi && in_interrupt()); +@@ -7405,8 +7406,14 @@ recheck: + return retval; + } + +- if (pi) +- cpuset_read_lock(); ++ /* ++ * SCHED_DEADLINE bandwidth accounting relies on stable cpusets ++ * information.
++ */ ++ if (dl_policy(policy) || dl_policy(p->policy)) { ++ cpuset_locked = true; ++ cpuset_lock(); ++ } + + /* + * Make sure no PI-waiters arrive (or leave) while we are +@@ -7482,8 +7489,8 @@ change: + if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { + policy = oldpolicy = -1; + task_rq_unlock(rq, p, &rf); +- if (pi) +- cpuset_read_unlock(); ++ if (cpuset_locked) ++ cpuset_unlock(); + goto recheck; + } + +@@ -7550,7 +7557,8 @@ change: + task_rq_unlock(rq, p, &rf); + + if (pi) { +- cpuset_read_unlock(); ++ if (cpuset_locked) ++ cpuset_unlock(); + rt_mutex_adjust_pi(p); + } + +@@ -7562,8 +7570,8 @@ change: + + unlock: + task_rq_unlock(rq, p, &rf); +- if (pi) +- cpuset_read_unlock(); ++ if (cpuset_locked) ++ cpuset_unlock(); + return retval; + } + diff --git a/queue-5.15/sched-cpuset-keep-track-of-sched_deadline-task-in-cpusets.patch b/queue-5.15/sched-cpuset-keep-track-of-sched_deadline-task-in-cpusets.patch new file mode 100644 index 00000000000..2f763c6de02 --- /dev/null +++ b/queue-5.15/sched-cpuset-keep-track-of-sched_deadline-task-in-cpusets.patch @@ -0,0 +1,162 @@ +From stable-owner@vger.kernel.org Sun Aug 20 17:26:09 2023 +From: Qais Yousef +Date: Sun, 20 Aug 2023 16:22:55 +0100 +Subject: sched/cpuset: Keep track of SCHED_DEADLINE task in cpusets +To: stable@vger.kernel.org +Cc: Juri Lelli , Waiman Long , Tejun Heo , Dietmar Eggemann , Peter Zijlstra , Vincent Guittot , Ingo Molnar , Hao Luo , John Stultz , cgroups@vger.kernel.org, linux-kernel@vger.kernel.org, Qais Yousef +Message-ID: <20230820152258.518128-4-qyousef@layalina.io> + +From: Juri Lelli + +commit 6c24849f5515e4966d94fa5279bdff4acf2e9489 upstream. + +Qais reported that iterating over all tasks when rebuilding root domains +for finding out which ones are DEADLINE and need their bandwidth +correctly restored on such root domains can be a costly operation (10+ +ms delays on suspend-resume). 
+ +To fix the problem keep track of the number of DEADLINE tasks belonging +to each cpuset and then use this information (followup patch) to only +perform the above iteration if DEADLINE tasks are actually present in +the cpuset for which a corresponding root domain is being rebuilt. + +Reported-by: Qais Yousef (Google) +Link: https://lore.kernel.org/lkml/20230206221428.2125324-1-qyousef@layalina.io/ +Signed-off-by: Juri Lelli +Reviewed-by: Waiman Long +Signed-off-by: Tejun Heo +[ Conflict in kernel/cgroup/cpuset.c and kernel/sched/deadline.c due to + pulling new code. Reject new code/fields. ] +Signed-off-by: Qais Yousef (Google) +Signed-off-by: Greg Kroah-Hartman +--- + include/linux/cpuset.h | 4 ++++ + kernel/cgroup/cgroup.c | 4 ++++ + kernel/cgroup/cpuset.c | 25 +++++++++++++++++++++++++ + kernel/sched/deadline.c | 13 +++++++++++++ + 4 files changed, 46 insertions(+) + +--- a/include/linux/cpuset.h ++++ b/include/linux/cpuset.h +@@ -56,6 +56,8 @@ extern void cpuset_init_smp(void); + extern void cpuset_force_rebuild(void); + extern void cpuset_update_active_cpus(void); + extern void cpuset_wait_for_hotplug(void); ++extern void inc_dl_tasks_cs(struct task_struct *task); ++extern void dec_dl_tasks_cs(struct task_struct *task); + extern void cpuset_lock(void); + extern void cpuset_unlock(void); + extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask); +@@ -179,6 +181,8 @@ static inline void cpuset_update_active_ + + static inline void cpuset_wait_for_hotplug(void) { } + ++static inline void inc_dl_tasks_cs(struct task_struct *task) { } ++static inline void dec_dl_tasks_cs(struct task_struct *task) { } + static inline void cpuset_lock(void) { } + static inline void cpuset_unlock(void) { } + +--- a/kernel/cgroup/cgroup.c ++++ b/kernel/cgroup/cgroup.c +@@ -56,6 +56,7 @@ + #include + #include + #include ++#include + #include + #include + +@@ -6467,6 +6468,9 @@ void cgroup_exit(struct task_struct *tsk + list_add_tail(&tsk->cg_list, 
&cset->dying_tasks); + cset->nr_tasks--; + ++ if (dl_task(tsk)) ++ dec_dl_tasks_cs(tsk); ++ + WARN_ON_ONCE(cgroup_task_frozen(tsk)); + if (unlikely(!(tsk->flags & PF_KTHREAD) && + test_bit(CGRP_FREEZE, &task_dfl_cgroup(tsk)->flags))) +--- a/kernel/cgroup/cpuset.c ++++ b/kernel/cgroup/cpuset.c +@@ -162,6 +162,12 @@ struct cpuset { + int use_parent_ecpus; + int child_ecpus_count; + ++ /* ++ * number of SCHED_DEADLINE tasks attached to this cpuset, so that we ++ * know when to rebuild associated root domain bandwidth information. ++ */ ++ int nr_deadline_tasks; ++ + /* Handle for cpuset.cpus.partition */ + struct cgroup_file partition_file; + }; +@@ -209,6 +215,20 @@ static inline struct cpuset *parent_cs(s + return css_cs(cs->css.parent); + } + ++void inc_dl_tasks_cs(struct task_struct *p) ++{ ++ struct cpuset *cs = task_cs(p); ++ ++ cs->nr_deadline_tasks++; ++} ++ ++void dec_dl_tasks_cs(struct task_struct *p) ++{ ++ struct cpuset *cs = task_cs(p); ++ ++ cs->nr_deadline_tasks--; ++} ++ + /* bits in struct cpuset flags field */ + typedef enum { + CS_ONLINE, +@@ -2210,6 +2230,11 @@ static int cpuset_can_attach(struct cgro + ret = security_task_setscheduler(task); + if (ret) + goto out_unlock; ++ ++ if (dl_task(task)) { ++ cs->nr_deadline_tasks++; ++ cpuset_attach_old_cs->nr_deadline_tasks--; ++ } + } + + /* +--- a/kernel/sched/deadline.c ++++ b/kernel/sched/deadline.c +@@ -17,6 +17,7 @@ + */ + #include "sched.h" + #include "pelt.h" ++#include <linux/cpuset.h> + + struct dl_bandwidth def_dl_bandwidth; + +@@ -2446,6 +2447,12 @@ static void switched_from_dl(struct rq * + if (task_on_rq_queued(p) && p->dl.dl_runtime) + task_non_contending(p); + ++ /* ++ * In case a task is setscheduled out from SCHED_DEADLINE we need to ++ * keep track of that on its cpuset (for correct bandwidth tracking). ++ */ ++ dec_dl_tasks_cs(p); ++ + if (!task_on_rq_queued(p)) { + /* + * Inactive timer is armed.
However, p is leaving DEADLINE and +@@ -2486,6 +2493,12 @@ static void switched_to_dl(struct rq *rq + if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1) + put_task_struct(p); + ++ /* ++ * In case a task is setscheduled to SCHED_DEADLINE we need to keep ++ * track of that on its cpuset (for correct bandwidth tracking). ++ */ ++ inc_dl_tasks_cs(p); ++ + /* If p is not queued we will update its parameters at next wakeup. */ + if (!task_on_rq_queued(p)) { + add_rq_bw(&p->dl, &rq->dl); diff --git a/queue-5.15/sched-deadline-create-dl-bw-alloc-free-check-overflow-interface.patch b/queue-5.15/sched-deadline-create-dl-bw-alloc-free-check-overflow-interface.patch new file mode 100644 index 00000000000..af4742df30d --- /dev/null +++ b/queue-5.15/sched-deadline-create-dl-bw-alloc-free-check-overflow-interface.patch @@ -0,0 +1,166 @@ +From stable-owner@vger.kernel.org Sun Aug 20 17:26:09 2023 +From: Qais Yousef +Date: Sun, 20 Aug 2023 16:22:57 +0100 +Subject: sched/deadline: Create DL BW alloc, free & check overflow interface +To: stable@vger.kernel.org +Cc: Juri Lelli , Waiman Long , Tejun Heo , Dietmar Eggemann , Peter Zijlstra , Vincent Guittot , Ingo Molnar , Hao Luo , John Stultz , cgroups@vger.kernel.org, linux-kernel@vger.kernel.org, Qais Yousef +Message-ID: <20230820152258.518128-6-qyousef@layalina.io> + +From: Dietmar Eggemann + +commit 85989106feb734437e2d598b639991b9185a43a6 upstream. + +While moving a set of tasks between exclusive cpusets, +cpuset_can_attach() -> task_can_attach() calls dl_cpu_busy(..., p) for +DL BW overflow checking and per-task DL BW allocation on the destination +root_domain for the DL tasks in this set. + +This approach has the issue of not freeing already allocated DL BW in +the following error cases: + +(1) The set of tasks includes multiple DL tasks and DL BW overflow + checking fails for one of the subsequent DL tasks. 
+ +(2) Another controller next to the cpuset controller which is attached + to the same cgroup fails in its can_attach(). + +To address this problem rework dl_cpu_busy(): + +(1) Split it into dl_bw_check_overflow() & dl_bw_alloc() and add a + dedicated dl_bw_free(). + +(2) dl_bw_alloc() & dl_bw_free() take a `u64 dl_bw` parameter instead of + a `struct task_struct *p` used in dl_cpu_busy(). This allows to + allocate DL BW for a set of tasks too rather than only for a single + task. + +Signed-off-by: Dietmar Eggemann +Signed-off-by: Juri Lelli +Signed-off-by: Tejun Heo +Signed-off-by: Qais Yousef (Google) +Signed-off-by: Greg Kroah-Hartman +--- + include/linux/sched.h | 2 + + kernel/sched/core.c | 4 +-- + kernel/sched/deadline.c | 53 ++++++++++++++++++++++++++++++++++++------------ + kernel/sched/sched.h | 2 - + 4 files changed, 45 insertions(+), 16 deletions(-) + +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -1798,6 +1798,8 @@ current_restore_flags(unsigned long orig + + extern int cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial); + extern int task_can_attach(struct task_struct *p, const struct cpumask *cs_effective_cpus); ++extern int dl_bw_alloc(int cpu, u64 dl_bw); ++extern void dl_bw_free(int cpu, u64 dl_bw); + #ifdef CONFIG_SMP + extern void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask); + extern int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask); +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -8814,7 +8814,7 @@ int task_can_attach(struct task_struct * + + if (unlikely(cpu >= nr_cpu_ids)) + return -EINVAL; +- ret = dl_cpu_busy(cpu, p); ++ ret = dl_bw_alloc(cpu, p->dl.dl_bw); + } + + out: +@@ -9099,7 +9099,7 @@ static void cpuset_cpu_active(void) + static int cpuset_cpu_inactive(unsigned int cpu) + { + if (!cpuhp_tasks_frozen) { +- int ret = dl_cpu_busy(cpu, NULL); ++ int ret = dl_bw_check_overflow(cpu); + + if (ret) + return ret; +--- 
a/kernel/sched/deadline.c ++++ b/kernel/sched/deadline.c +@@ -2898,26 +2898,38 @@ int dl_cpuset_cpumask_can_shrink(const s + return ret; + } + +-int dl_cpu_busy(int cpu, struct task_struct *p) ++enum dl_bw_request { ++ dl_bw_req_check_overflow = 0, ++ dl_bw_req_alloc, ++ dl_bw_req_free ++}; ++ ++static int dl_bw_manage(enum dl_bw_request req, int cpu, u64 dl_bw) + { +- unsigned long flags, cap; ++ unsigned long flags; + struct dl_bw *dl_b; +- bool overflow; ++ bool overflow = 0; + + rcu_read_lock_sched(); + dl_b = dl_bw_of(cpu); + raw_spin_lock_irqsave(&dl_b->lock, flags); +- cap = dl_bw_capacity(cpu); +- overflow = __dl_overflow(dl_b, cap, 0, p ? p->dl.dl_bw : 0); + +- if (!overflow && p) { +- /* +- * We reserve space for this task in the destination +- * root_domain, as we can't fail after this point. +- * We will free resources in the source root_domain +- * later on (see set_cpus_allowed_dl()). +- */ +- __dl_add(dl_b, p->dl.dl_bw, dl_bw_cpus(cpu)); ++ if (req == dl_bw_req_free) { ++ __dl_sub(dl_b, dl_bw, dl_bw_cpus(cpu)); ++ } else { ++ unsigned long cap = dl_bw_capacity(cpu); ++ ++ overflow = __dl_overflow(dl_b, cap, 0, dl_bw); ++ ++ if (req == dl_bw_req_alloc && !overflow) { ++ /* ++ * We reserve space in the destination ++ * root_domain, as we can't fail after this point. ++ * We will free resources in the source root_domain ++ * later on (see set_cpus_allowed_dl()). ++ */ ++ __dl_add(dl_b, dl_bw, dl_bw_cpus(cpu)); ++ } + } + + raw_spin_unlock_irqrestore(&dl_b->lock, flags); +@@ -2925,6 +2937,21 @@ int dl_cpu_busy(int cpu, struct task_str + + return overflow ? 
-EBUSY : 0; + } ++ ++int dl_bw_check_overflow(int cpu) ++{ ++ return dl_bw_manage(dl_bw_req_check_overflow, cpu, 0); ++} ++ ++int dl_bw_alloc(int cpu, u64 dl_bw) ++{ ++ return dl_bw_manage(dl_bw_req_alloc, cpu, dl_bw); ++} ++ ++void dl_bw_free(int cpu, u64 dl_bw) ++{ ++ dl_bw_manage(dl_bw_req_free, cpu, dl_bw); ++} + #endif + + #ifdef CONFIG_SCHED_DEBUG +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -349,7 +349,7 @@ extern void __getparam_dl(struct task_st + extern bool __checkparam_dl(const struct sched_attr *attr); + extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr); + extern int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial); +-extern int dl_cpu_busy(int cpu, struct task_struct *p); ++extern int dl_bw_check_overflow(int cpu); + + #ifdef CONFIG_CGROUP_SCHED + diff --git a/queue-5.15/series b/queue-5.15/series index 460946fbd4b..eda7a2fd802 100644 --- a/queue-5.15/series +++ b/queue-5.15/series @@ -73,3 +73,9 @@ x86-fpu-set-x86_feature_osxsave-feature-after-enabling-osxsave-in-cr4.patch nfs-use-vfs-setgid-helper.patch nfsd-use-vfs-setgid-helper.patch torture-fix-hang-during-kthread-shutdown-phase.patch +cgroup-cpuset-rename-functions-dealing-with-deadline-accounting.patch +sched-cpuset-bring-back-cpuset_mutex.patch +sched-cpuset-keep-track-of-sched_deadline-task-in-cpusets.patch +cgroup-cpuset-iterate-only-if-deadline-tasks-are-present.patch +sched-deadline-create-dl-bw-alloc-free-check-overflow-interface.patch +cgroup-cpuset-free-dl-bw-in-case-can_attach-fails.patch -- 2.47.3