From fefe072f9390ddc6edd280fd5895df1692176482 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Sun, 27 Aug 2023 10:33:36 +0200 Subject: [PATCH] 5.10-stable patches added patches: cgroup-cpuset-free-dl-bw-in-case-can_attach-fails.patch cgroup-cpuset-iterate-only-if-deadline-tasks-are-present.patch cgroup-cpuset-rename-functions-dealing-with-deadline-accounting.patch sched-cpuset-bring-back-cpuset_mutex.patch sched-cpuset-keep-track-of-sched_deadline-task-in-cpusets.patch sched-deadline-create-dl-bw-alloc-free-check-overflow-interface.patch --- ...-free-dl-bw-in-case-can_attach-fails.patch | 191 ++++++++ ...e-only-if-deadline-tasks-are-present.patch | 42 ++ ...ons-dealing-with-deadline-accounting.patch | 67 +++ ...sched-cpuset-bring-back-cpuset_mutex.patch | 437 ++++++++++++++++++ ...ck-of-sched_deadline-task-in-cpusets.patch | 163 +++++++ ...-alloc-free-check-overflow-interface.patch | 166 +++++++ queue-5.10/series | 6 + 7 files changed, 1072 insertions(+) create mode 100644 queue-5.10/cgroup-cpuset-free-dl-bw-in-case-can_attach-fails.patch create mode 100644 queue-5.10/cgroup-cpuset-iterate-only-if-deadline-tasks-are-present.patch create mode 100644 queue-5.10/cgroup-cpuset-rename-functions-dealing-with-deadline-accounting.patch create mode 100644 queue-5.10/sched-cpuset-bring-back-cpuset_mutex.patch create mode 100644 queue-5.10/sched-cpuset-keep-track-of-sched_deadline-task-in-cpusets.patch create mode 100644 queue-5.10/sched-deadline-create-dl-bw-alloc-free-check-overflow-interface.patch diff --git a/queue-5.10/cgroup-cpuset-free-dl-bw-in-case-can_attach-fails.patch b/queue-5.10/cgroup-cpuset-free-dl-bw-in-case-can_attach-fails.patch new file mode 100644 index 00000000000..5e8093ce4d1 --- /dev/null +++ b/queue-5.10/cgroup-cpuset-free-dl-bw-in-case-can_attach-fails.patch @@ -0,0 +1,191 @@ +From stable-owner@vger.kernel.org Sun Aug 20 17:25:35 2023 +From: Qais Yousef +Date: Sun, 20 Aug 2023 16:21:44 +0100 +Subject: cgroup/cpuset: Free DL BW in case can_attach() fails +To: stable@vger.kernel.org +Cc: Juri Lelli , Waiman Long , Tejun Heo , Dietmar Eggemann , Peter Zijlstra , Vincent Guittot , Ingo Molnar , Hao Luo , John Stultz , cgroups@vger.kernel.org, linux-kernel@vger.kernel.org, Qais Yousef +Message-ID: <20230820152144.517461-7-qyousef@layalina.io> + +From: Dietmar Eggemann + +commit 2ef269ef1ac006acf974793d975539244d77b28f upstream. + +cpuset_can_attach() can fail. Postpone DL BW allocation until all tasks +have been checked. DL BW is not allocated per-task but as a sum over +all DL tasks migrating. + +If multiple controllers are attached to the cgroup next to the cpuset +controller a non-cpuset can_attach() can fail. In this case free DL BW +in cpuset_cancel_attach(). + +Finally, update cpuset DL task count (nr_deadline_tasks) only in +cpuset_attach(). + +Suggested-by: Waiman Long +Signed-off-by: Dietmar Eggemann +Signed-off-by: Juri Lelli +Reviewed-by: Waiman Long +Signed-off-by: Tejun Heo +[ Fix conflicts in kernel/cgroup/cpuset.c due to new code being applied + that is not applicable on this branch. Reject new code. ] +Signed-off-by: Qais Yousef (Google) +Signed-off-by: Greg Kroah-Hartman +--- + include/linux/sched.h | 2 - + kernel/cgroup/cpuset.c | 51 +++++++++++++++++++++++++++++++++++++++++++++---- + kernel/sched/core.c | 17 +--------------- + 3 files changed, 50 insertions(+), 20 deletions(-) + +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -1657,7 +1657,7 @@ current_restore_flags(unsigned long orig + } + + extern int cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial); +-extern int task_can_attach(struct task_struct *p, const struct cpumask *cs_effective_cpus); ++extern int task_can_attach(struct task_struct *p); + extern int dl_bw_alloc(int cpu, u64 dl_bw); + extern void dl_bw_free(int cpu, u64 dl_bw); + #ifdef CONFIG_SMP +--- a/kernel/cgroup/cpuset.c ++++ b/kernel/cgroup/cpuset.c +@@ -167,6 +167,8 @@ struct cpuset { + * know when to rebuild associated root domain bandwidth information. + */ + int nr_deadline_tasks; ++ int nr_migrate_dl_tasks; ++ u64 sum_migrate_dl_bw; + }; + + /* +@@ -2168,16 +2170,23 @@ static int fmeter_getrate(struct fmeter + + static struct cpuset *cpuset_attach_old_cs; + ++static void reset_migrate_dl_data(struct cpuset *cs) ++{ ++ cs->nr_migrate_dl_tasks = 0; ++ cs->sum_migrate_dl_bw = 0; ++} ++ + /* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */ + static int cpuset_can_attach(struct cgroup_taskset *tset) + { + struct cgroup_subsys_state *css; +- struct cpuset *cs; ++ struct cpuset *cs, *oldcs; + struct task_struct *task; + int ret; + + /* used later by cpuset_attach() */ + cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset, &css)); ++ oldcs = cpuset_attach_old_cs; + cs = css_cs(css); + + mutex_lock(&cpuset_mutex); +@@ -2189,7 +2198,7 @@ static int cpuset_can_attach(struct cgro + goto out_unlock; + + cgroup_taskset_for_each(task, css, tset) { +- ret = task_can_attach(task, cs->effective_cpus); ++ ret = task_can_attach(task); + if (ret) + goto out_unlock; + ret = security_task_setscheduler(task); +@@ -2197,11 +2206,31 @@ static int cpuset_can_attach(struct cgro + goto out_unlock; + + if (dl_task(task)) { +- cs->nr_deadline_tasks++; +- cpuset_attach_old_cs->nr_deadline_tasks--; ++ cs->nr_migrate_dl_tasks++; ++ cs->sum_migrate_dl_bw += task->dl.dl_bw; + } + } + ++ if (!cs->nr_migrate_dl_tasks) ++ goto out_success; ++ ++ if (!cpumask_intersects(oldcs->effective_cpus, cs->effective_cpus)) { ++ int cpu = cpumask_any_and(cpu_active_mask, cs->effective_cpus); ++ ++ if (unlikely(cpu >= nr_cpu_ids)) { ++ reset_migrate_dl_data(cs); ++ ret = -EINVAL; ++ goto out_unlock; ++ } ++ ++ ret = dl_bw_alloc(cpu, cs->sum_migrate_dl_bw); ++ if (ret) { ++ reset_migrate_dl_data(cs); ++ goto out_unlock; ++ } ++ } ++ ++out_success: + /* + * Mark attach is in progress. This makes validate_change() fail + * changes which zero cpus/mems_allowed. +@@ -2225,6 +2254,14 @@ static void cpuset_cancel_attach(struct + cs->attach_in_progress--; + if (!cs->attach_in_progress) + wake_up(&cpuset_attach_wq); ++ ++ if (cs->nr_migrate_dl_tasks) { ++ int cpu = cpumask_any(cs->effective_cpus); ++ ++ dl_bw_free(cpu, cs->sum_migrate_dl_bw); ++ reset_migrate_dl_data(cs); ++ } ++ + mutex_unlock(&cpuset_mutex); + } + +@@ -2299,6 +2336,12 @@ static void cpuset_attach(struct cgroup_ + + cs->old_mems_allowed = cpuset_attach_nodemask_to; + ++ if (cs->nr_migrate_dl_tasks) { ++ cs->nr_deadline_tasks += cs->nr_migrate_dl_tasks; ++ oldcs->nr_deadline_tasks -= cs->nr_migrate_dl_tasks; ++ reset_migrate_dl_data(cs); ++ } ++ + cs->attach_in_progress--; + if (!cs->attach_in_progress) + wake_up(&cpuset_attach_wq); +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -6600,8 +6600,7 @@ int cpuset_cpumask_can_shrink(const stru + return ret; + } + +-int task_can_attach(struct task_struct *p, +- const struct cpumask *cs_effective_cpus) ++int task_can_attach(struct task_struct *p) + { + int ret = 0; + +@@ -6614,21 +6613,9 @@ int task_can_attach(struct task_struct * + * success of set_cpus_allowed_ptr() on all attached tasks + * before cpus_mask may be changed. + */ +- if (p->flags & PF_NO_SETAFFINITY) { ++ if (p->flags & PF_NO_SETAFFINITY) + ret = -EINVAL; +- goto out; +- } + +- if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span, +- cs_effective_cpus)) { +- int cpu = cpumask_any_and(cpu_active_mask, cs_effective_cpus); +- +- if (unlikely(cpu >= nr_cpu_ids)) +- return -EINVAL; +- ret = dl_bw_alloc(cpu, p->dl.dl_bw); +- } +- +-out: + return ret; + } + diff --git a/queue-5.10/cgroup-cpuset-iterate-only-if-deadline-tasks-are-present.patch b/queue-5.10/cgroup-cpuset-iterate-only-if-deadline-tasks-are-present.patch new file mode 100644 index 00000000000..f234961ff04 --- /dev/null +++ b/queue-5.10/cgroup-cpuset-iterate-only-if-deadline-tasks-are-present.patch @@ -0,0 +1,42 @@ +From stable-owner@vger.kernel.org Sun Aug 20 17:25:34 2023 +From: Qais Yousef +Date: Sun, 20 Aug 2023 16:21:42 +0100 +Subject: cgroup/cpuset: Iterate only if DEADLINE tasks are present +To: stable@vger.kernel.org +Cc: Juri Lelli , Waiman Long , Tejun Heo , Dietmar Eggemann , Peter Zijlstra , Vincent Guittot , Ingo Molnar , Hao Luo , John Stultz , cgroups@vger.kernel.org, linux-kernel@vger.kernel.org, Qais Yousef +Message-ID: <20230820152144.517461-5-qyousef@layalina.io> + +From: Juri Lelli + +commit c0f78fd5edcf29b2822ac165f9248a6c165e8554 upstream. + +update_tasks_root_domain currently iterates over all tasks even if no +DEADLINE task is present on the cpuset/root domain for which bandwidth +accounting is being rebuilt. This has been reported to introduce 10+ ms +delays on suspend-resume operations. + +Skip the costly iteration for cpusets that don't contain DEADLINE tasks. + +Reported-by: Qais Yousef (Google) +Link: https://lore.kernel.org/lkml/20230206221428.2125324-1-qyousef@layalina.io/ +Signed-off-by: Juri Lelli +Reviewed-by: Waiman Long +Signed-off-by: Tejun Heo +Signed-off-by: Qais Yousef (Google) +Signed-off-by: Greg Kroah-Hartman +--- + kernel/cgroup/cpuset.c | 3 +++ + 1 file changed, 3 insertions(+) + +--- a/kernel/cgroup/cpuset.c ++++ b/kernel/cgroup/cpuset.c +@@ -937,6 +937,9 @@ static void dl_update_tasks_root_domain( + struct css_task_iter it; + struct task_struct *task; + ++ if (cs->nr_deadline_tasks == 0) ++ return; ++ + css_task_iter_start(&cs->css, 0, &it); + + while ((task = css_task_iter_next(&it))) diff --git a/queue-5.10/cgroup-cpuset-rename-functions-dealing-with-deadline-accounting.patch b/queue-5.10/cgroup-cpuset-rename-functions-dealing-with-deadline-accounting.patch new file mode 100644 index 00000000000..2aac3f17e0a --- /dev/null +++ b/queue-5.10/cgroup-cpuset-rename-functions-dealing-with-deadline-accounting.patch @@ -0,0 +1,67 @@ +From stable-owner@vger.kernel.org Sun Aug 20 17:25:56 2023 +From: Qais Yousef +Date: Sun, 20 Aug 2023 16:21:39 +0100 +Subject: cgroup/cpuset: Rename functions dealing with DEADLINE accounting +To: stable@vger.kernel.org +Cc: Juri Lelli , Waiman Long , Tejun Heo , Dietmar Eggemann , Peter Zijlstra , Vincent Guittot , Ingo Molnar , Hao Luo , John Stultz , cgroups@vger.kernel.org, linux-kernel@vger.kernel.org, Qais Yousef +Message-ID: <20230820152144.517461-2-qyousef@layalina.io> + +From: Juri Lelli + +commit ad3a557daf6915296a43ef97a3e9c48e076c9dd8 upstream. + +rebuild_root_domains() and update_tasks_root_domain() have neutral +names, but actually deal with DEADLINE bandwidth accounting. + +Rename them to use 'dl_' prefix so that intent is more clear. + +No functional change. + +Suggested-by: Qais Yousef (Google) +Signed-off-by: Juri Lelli +Reviewed-by: Waiman Long +Signed-off-by: Tejun Heo +Signed-off-by: Qais Yousef (Google) +Signed-off-by: Greg Kroah-Hartman +--- + kernel/cgroup/cpuset.c | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +--- a/kernel/cgroup/cpuset.c ++++ b/kernel/cgroup/cpuset.c +@@ -912,7 +912,7 @@ done: + return ndoms; + } + +-static void update_tasks_root_domain(struct cpuset *cs) ++static void dl_update_tasks_root_domain(struct cpuset *cs) + { + struct css_task_iter it; + struct task_struct *task; +@@ -925,7 +925,7 @@ static void update_tasks_root_domain(str + css_task_iter_end(&it); + } + +-static void rebuild_root_domains(void) ++static void dl_rebuild_rd_accounting(void) + { + struct cpuset *cs = NULL; + struct cgroup_subsys_state *pos_css; +@@ -953,7 +953,7 @@ static void rebuild_root_domains(void) + + rcu_read_unlock(); + +- update_tasks_root_domain(cs); ++ dl_update_tasks_root_domain(cs); + + rcu_read_lock(); + css_put(&cs->css); +@@ -967,7 +967,7 @@ partition_and_rebuild_sched_domains(int + { + mutex_lock(&sched_domains_mutex); + partition_sched_domains_locked(ndoms_new, doms_new, dattr_new); +- rebuild_root_domains(); ++ dl_rebuild_rd_accounting(); + mutex_unlock(&sched_domains_mutex); + } + diff --git a/queue-5.10/sched-cpuset-bring-back-cpuset_mutex.patch b/queue-5.10/sched-cpuset-bring-back-cpuset_mutex.patch new file mode 100644 index 00000000000..292b9851e4c --- /dev/null +++ b/queue-5.10/sched-cpuset-bring-back-cpuset_mutex.patch @@ -0,0 +1,437 @@ +From stable-owner@vger.kernel.org Sun Aug 20 17:25:55 2023 +From: Qais Yousef +Date: Sun, 20 Aug 2023 16:21:40 +0100 +Subject: sched/cpuset: Bring back cpuset_mutex +To: stable@vger.kernel.org +Cc: Juri Lelli , Waiman Long , Tejun Heo , Dietmar Eggemann , Peter Zijlstra , Vincent Guittot , Ingo Molnar , Hao Luo , John Stultz , cgroups@vger.kernel.org, linux-kernel@vger.kernel.org, Qais Yousef +Message-ID: <20230820152144.517461-3-qyousef@layalina.io> + +From: Juri Lelli + +commit 111cd11bbc54850f24191c52ff217da88a5e639b upstream. + +Turns out percpu_cpuset_rwsem - commit 1243dc518c9d ("cgroup/cpuset: +Convert cpuset_mutex to percpu_rwsem") - wasn't such a brilliant idea, +as it has been reported to cause slowdowns in workloads that need to +change cpuset configuration frequently and it is also not implementing +priority inheritance (which causes troubles with realtime workloads). + +Convert percpu_cpuset_rwsem back to regular cpuset_mutex. Also grab it +only for SCHED_DEADLINE tasks (other policies don't care about stable +cpusets anyway). + +Signed-off-by: Juri Lelli +Reviewed-by: Waiman Long +Signed-off-by: Tejun Heo +[ Fix conflict in kernel/cgroup/cpuset.c due to pulling new functions or + comment that don't exist on 5.10 or the usage of different cpu hotplug + lock whenever replacing the rwsem with mutex. Remove BUG_ON() for + rwsem that doesn't exist on mainline. ] +Signed-off-by: Qais Yousef (Google) +Signed-off-by: Greg Kroah-Hartman +--- + include/linux/cpuset.h | 8 ++--- + kernel/cgroup/cpuset.c | 78 +++++++++++++++++++++++-------------------------- + kernel/sched/core.c | 22 +++++++++---- + 3 files changed, 57 insertions(+), 51 deletions(-) + +--- a/include/linux/cpuset.h ++++ b/include/linux/cpuset.h +@@ -55,8 +55,8 @@ extern void cpuset_init_smp(void); + extern void cpuset_force_rebuild(void); + extern void cpuset_update_active_cpus(void); + extern void cpuset_wait_for_hotplug(void); +-extern void cpuset_read_lock(void); +-extern void cpuset_read_unlock(void); ++extern void cpuset_lock(void); ++extern void cpuset_unlock(void); + extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask); + extern void cpuset_cpus_allowed_fallback(struct task_struct *p); + extern nodemask_t cpuset_mems_allowed(struct task_struct *p); +@@ -178,8 +178,8 @@ static inline void cpuset_update_active_ + + static inline void cpuset_wait_for_hotplug(void) { } + +-static inline void cpuset_read_lock(void) { } +-static inline void cpuset_read_unlock(void) { } ++static inline void cpuset_lock(void) { } ++static inline void cpuset_unlock(void) { } + + static inline void cpuset_cpus_allowed(struct task_struct *p, + struct cpumask *mask) +--- a/kernel/cgroup/cpuset.c ++++ b/kernel/cgroup/cpuset.c +@@ -334,16 +334,16 @@ static struct cpuset top_cpuset = { + * guidelines for accessing subsystem state in kernel/cgroup.c + */ + +-DEFINE_STATIC_PERCPU_RWSEM(cpuset_rwsem); ++static DEFINE_MUTEX(cpuset_mutex); + +-void cpuset_read_lock(void) ++void cpuset_lock(void) + { +- percpu_down_read(&cpuset_rwsem); ++ mutex_lock(&cpuset_mutex); + } + +-void cpuset_read_unlock(void) ++void cpuset_unlock(void) + { +- percpu_up_read(&cpuset_rwsem); ++ mutex_unlock(&cpuset_mutex); + } + + static DEFINE_SPINLOCK(callback_lock); +@@ -930,7 +930,7 @@ static void dl_rebuild_rd_accounting(voi + struct cpuset *cs = NULL; + struct cgroup_subsys_state *pos_css; + +- percpu_rwsem_assert_held(&cpuset_rwsem); ++ lockdep_assert_held(&cpuset_mutex); + lockdep_assert_cpus_held(); + lockdep_assert_held(&sched_domains_mutex); + +@@ -991,7 +991,7 @@ static void rebuild_sched_domains_locked + int ndoms; + + lockdep_assert_cpus_held(); +- percpu_rwsem_assert_held(&cpuset_rwsem); ++ lockdep_assert_held(&cpuset_mutex); + + /* + * If we have raced with CPU hotplug, return early to avoid +@@ -1042,9 +1042,9 @@ static void rebuild_sched_domains_locked + void rebuild_sched_domains(void) + { + get_online_cpus(); +- percpu_down_write(&cpuset_rwsem); ++ mutex_lock(&cpuset_mutex); + rebuild_sched_domains_locked(); +- percpu_up_write(&cpuset_rwsem); ++ mutex_unlock(&cpuset_mutex); + put_online_cpus(); + } + +@@ -1160,7 +1160,7 @@ static int update_parent_subparts_cpumas + int new_prs; + bool part_error = false; /* Partition error? */ + +- percpu_rwsem_assert_held(&cpuset_rwsem); ++ lockdep_assert_held(&cpuset_mutex); + + /* + * The parent must be a partition root. +@@ -1490,7 +1490,7 @@ static void update_sibling_cpumasks(stru + struct cpuset *sibling; + struct cgroup_subsys_state *pos_css; + +- percpu_rwsem_assert_held(&cpuset_rwsem); ++ lockdep_assert_held(&cpuset_mutex); + + /* + * Check all its siblings and call update_cpumasks_hier() +@@ -2157,7 +2157,7 @@ static int cpuset_can_attach(struct cgro + cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset, &css)); + cs = css_cs(css); + +- percpu_down_write(&cpuset_rwsem); ++ mutex_lock(&cpuset_mutex); + + /* allow moving tasks into an empty cpuset if on default hierarchy */ + ret = -ENOSPC; +@@ -2181,7 +2181,7 @@ static int cpuset_can_attach(struct cgro + cs->attach_in_progress++; + ret = 0; + out_unlock: +- percpu_up_write(&cpuset_rwsem); ++ mutex_unlock(&cpuset_mutex); + return ret; + } + +@@ -2193,11 +2193,11 @@ static void cpuset_cancel_attach(struct + cgroup_taskset_first(tset, &css); + cs = css_cs(css); + +- percpu_down_write(&cpuset_rwsem); ++ mutex_lock(&cpuset_mutex); + cs->attach_in_progress--; + if (!cs->attach_in_progress) + wake_up(&cpuset_attach_wq); +- percpu_up_write(&cpuset_rwsem); ++ mutex_unlock(&cpuset_mutex); + } + + /* +@@ -2221,7 +2221,7 @@ static void cpuset_attach(struct cgroup_ + cs = css_cs(css); + + lockdep_assert_cpus_held(); /* see cgroup_attach_lock() */ +- percpu_down_write(&cpuset_rwsem); ++ mutex_lock(&cpuset_mutex); + + /* prepare for attach */ + if (cs == &top_cpuset) +@@ -2275,7 +2275,7 @@ static void cpuset_attach(struct cgroup_ + if (!cs->attach_in_progress) + wake_up(&cpuset_attach_wq); + +- percpu_up_write(&cpuset_rwsem); ++ mutex_unlock(&cpuset_mutex); + } + + /* The various types of files and directories in a cpuset file system */ +@@ -2307,7 +2307,7 @@ static int cpuset_write_u64(struct cgrou + int retval = 0; + + get_online_cpus(); +- percpu_down_write(&cpuset_rwsem); ++ mutex_lock(&cpuset_mutex); + if (!is_cpuset_online(cs)) { + retval = -ENODEV; + goto out_unlock; +@@ -2343,7 +2343,7 @@ static int cpuset_write_u64(struct cgrou + break; + } + out_unlock: +- percpu_up_write(&cpuset_rwsem); ++ mutex_unlock(&cpuset_mutex); + put_online_cpus(); + return retval; + } +@@ -2356,7 +2356,7 @@ static int cpuset_write_s64(struct cgrou + int retval = -ENODEV; + + get_online_cpus(); +- percpu_down_write(&cpuset_rwsem); ++ mutex_lock(&cpuset_mutex); + if (!is_cpuset_online(cs)) + goto out_unlock; + +@@ -2369,7 +2369,7 @@ static int cpuset_write_s64(struct cgrou + break; + } + out_unlock: +- percpu_up_write(&cpuset_rwsem); ++ mutex_unlock(&cpuset_mutex); + put_online_cpus(); + return retval; + } +@@ -2410,7 +2410,7 @@ static ssize_t cpuset_write_resmask(stru + flush_work(&cpuset_hotplug_work); + + get_online_cpus(); +- percpu_down_write(&cpuset_rwsem); ++ mutex_lock(&cpuset_mutex); + if (!is_cpuset_online(cs)) + goto out_unlock; + +@@ -2434,7 +2434,7 @@ static ssize_t cpuset_write_resmask(stru + + free_cpuset(trialcs); + out_unlock: +- percpu_up_write(&cpuset_rwsem); ++ mutex_unlock(&cpuset_mutex); + put_online_cpus(); + kernfs_unbreak_active_protection(of->kn); + css_put(&cs->css); +@@ -2567,13 +2567,13 @@ static ssize_t sched_partition_write(str + + css_get(&cs->css); + get_online_cpus(); +- percpu_down_write(&cpuset_rwsem); ++ mutex_lock(&cpuset_mutex); + if (!is_cpuset_online(cs)) + goto out_unlock; + + retval = update_prstate(cs, val); + out_unlock: +- percpu_up_write(&cpuset_rwsem); ++ mutex_unlock(&cpuset_mutex); + put_online_cpus(); + css_put(&cs->css); + return retval ?: nbytes; +@@ -2781,7 +2781,7 @@ static int cpuset_css_online(struct cgro + return 0; + + get_online_cpus(); +- percpu_down_write(&cpuset_rwsem); ++ mutex_lock(&cpuset_mutex); + + set_bit(CS_ONLINE, &cs->flags); + if (is_spread_page(parent)) +@@ -2832,7 +2832,7 @@ static int cpuset_css_online(struct cgro + cpumask_copy(cs->effective_cpus, parent->cpus_allowed); + spin_unlock_irq(&callback_lock); + out_unlock: +- percpu_up_write(&cpuset_rwsem); ++ mutex_unlock(&cpuset_mutex); + put_online_cpus(); + return 0; + } +@@ -2853,7 +2853,7 @@ static void cpuset_css_offline(struct cg + struct cpuset *cs = css_cs(css); + + get_online_cpus(); +- percpu_down_write(&cpuset_rwsem); ++ mutex_lock(&cpuset_mutex); + + if (is_partition_root(cs)) + update_prstate(cs, 0); +@@ -2872,7 +2872,7 @@ static void cpuset_css_offline(struct cg + cpuset_dec(); + clear_bit(CS_ONLINE, &cs->flags); + +- percpu_up_write(&cpuset_rwsem); ++ mutex_unlock(&cpuset_mutex); + put_online_cpus(); + } + +@@ -2885,7 +2885,7 @@ static void cpuset_css_free(struct cgrou + + static void cpuset_bind(struct cgroup_subsys_state *root_css) + { +- percpu_down_write(&cpuset_rwsem); ++ mutex_lock(&cpuset_mutex); + spin_lock_irq(&callback_lock); + + if (is_in_v2_mode()) { +@@ -2898,7 +2898,7 @@ static void cpuset_bind(struct cgroup_su + } + + spin_unlock_irq(&callback_lock); +- percpu_up_write(&cpuset_rwsem); ++ mutex_unlock(&cpuset_mutex); + } + + /* +@@ -2940,8 +2940,6 @@ struct cgroup_subsys cpuset_cgrp_subsys + + int __init cpuset_init(void) + { +- BUG_ON(percpu_init_rwsem(&cpuset_rwsem)); +- + BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL)); + BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL)); + BUG_ON(!zalloc_cpumask_var(&top_cpuset.subparts_cpus, GFP_KERNEL)); +@@ -3013,7 +3011,7 @@ hotplug_update_tasks_legacy(struct cpuse + is_empty = cpumask_empty(cs->cpus_allowed) || + nodes_empty(cs->mems_allowed); + +- percpu_up_write(&cpuset_rwsem); ++ mutex_unlock(&cpuset_mutex); + + /* + * Move tasks to the nearest ancestor with execution resources, +@@ -3023,7 +3021,7 @@ hotplug_update_tasks_legacy(struct cpuse + if (is_empty) + remove_tasks_in_empty_cpuset(cs); + +- percpu_down_write(&cpuset_rwsem); ++ mutex_lock(&cpuset_mutex); + } + + static void +@@ -3073,14 +3071,14 @@ static void cpuset_hotplug_update_tasks( + retry: + wait_event(cpuset_attach_wq, cs->attach_in_progress == 0); + +- percpu_down_write(&cpuset_rwsem); ++ mutex_lock(&cpuset_mutex); + + /* + * We have raced with task attaching. We wait until attaching + * is finished, so we won't attach a task to an empty cpuset. + */ + if (cs->attach_in_progress) { +- percpu_up_write(&cpuset_rwsem); ++ mutex_unlock(&cpuset_mutex); + goto retry; + } + +@@ -3152,7 +3150,7 @@ update_tasks: + hotplug_update_tasks_legacy(cs, &new_cpus, &new_mems, + cpus_updated, mems_updated); + +- percpu_up_write(&cpuset_rwsem); ++ mutex_unlock(&cpuset_mutex); + } + + /** +@@ -3182,7 +3180,7 @@ static void cpuset_hotplug_workfn(struct + if (on_dfl && !alloc_cpumasks(NULL, &tmp)) + ptmp = &tmp; + +- percpu_down_write(&cpuset_rwsem); ++ mutex_lock(&cpuset_mutex); + + /* fetch the available cpus/mems and find out which changed how */ + cpumask_copy(&new_cpus, cpu_active_mask); +@@ -3239,7 +3237,7 @@ static void cpuset_hotplug_workfn(struct + update_tasks_nodemask(&top_cpuset); + } + +- percpu_up_write(&cpuset_rwsem); ++ mutex_unlock(&cpuset_mutex); + + /* if cpus or mems changed, we need to propagate to descendants */ + if (cpus_updated || mems_updated) { +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -5221,6 +5221,7 @@ static int __sched_setscheduler(struct t + int reset_on_fork; + int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; + struct rq *rq; ++ bool cpuset_locked = false; + + /* The pi code expects interrupts enabled */ + BUG_ON(pi && in_interrupt()); +@@ -5318,8 +5319,14 @@ recheck: + return retval; + } + +- if (pi) +- cpuset_read_lock(); ++ /* ++ * SCHED_DEADLINE bandwidth accounting relies on stable cpusets ++ * information. ++ */ ++ if (dl_policy(policy) || dl_policy(p->policy)) { ++ cpuset_locked = true; ++ cpuset_lock(); ++ } + + /* + * Make sure no PI-waiters arrive (or leave) while we are +@@ -5395,8 +5402,8 @@ change: + if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { + policy = oldpolicy = -1; + task_rq_unlock(rq, p, &rf); +- if (pi) +- cpuset_read_unlock(); ++ if (cpuset_locked) ++ cpuset_unlock(); + goto recheck; + } + +@@ -5462,7 +5469,8 @@ change: + task_rq_unlock(rq, p, &rf); + + if (pi) { +- cpuset_read_unlock(); ++ if (cpuset_locked) ++ cpuset_unlock(); + rt_mutex_adjust_pi(p); + } + +@@ -5474,8 +5482,8 @@ change: + + unlock: + task_rq_unlock(rq, p, &rf); +- if (pi) +- cpuset_read_unlock(); ++ if (cpuset_locked) ++ cpuset_unlock(); + return retval; + } + diff --git a/queue-5.10/sched-cpuset-keep-track-of-sched_deadline-task-in-cpusets.patch b/queue-5.10/sched-cpuset-keep-track-of-sched_deadline-task-in-cpusets.patch new file mode 100644 index 00000000000..3b118213469 --- /dev/null +++ b/queue-5.10/sched-cpuset-keep-track-of-sched_deadline-task-in-cpusets.patch @@ -0,0 +1,163 @@ +From stable-owner@vger.kernel.org Sun Aug 20 17:25:33 2023 +From: Qais Yousef +Date: Sun, 20 Aug 2023 16:21:41 +0100 +Subject: sched/cpuset: Keep track of SCHED_DEADLINE task in cpusets +To: stable@vger.kernel.org +Cc: Juri Lelli , Waiman Long , Tejun Heo , Dietmar Eggemann , Peter Zijlstra , Vincent Guittot , Ingo Molnar , Hao Luo , John Stultz , cgroups@vger.kernel.org, linux-kernel@vger.kernel.org, Qais Yousef +Message-ID: <20230820152144.517461-4-qyousef@layalina.io> + +From: Juri Lelli + +commit 6c24849f5515e4966d94fa5279bdff4acf2e9489 upstream. + +Qais reported that iterating over all tasks when rebuilding root domains +for finding out which ones are DEADLINE and need their bandwidth +correctly restored on such root domains can be a costly operation (10+ +ms delays on suspend-resume). + +To fix the problem keep track of the number of DEADLINE tasks belonging +to each cpuset and then use this information (followup patch) to only +perform the above iteration if DEADLINE tasks are actually present in +the cpuset for which a corresponding root domain is being rebuilt. + +Reported-by: Qais Yousef (Google) +Link: https://lore.kernel.org/lkml/20230206221428.2125324-1-qyousef@layalina.io/ +Signed-off-by: Juri Lelli +Reviewed-by: Waiman Long +Signed-off-by: Tejun Heo +[ Fix conflicts in kernel/cgroup/cpuset.c and kernel/sched/deadline.c + due to pulling new fields and functions. Remove new code and match the + patch diff. ] +Signed-off-by: Qais Yousef (Google) +Signed-off-by: Greg Kroah-Hartman +--- + include/linux/cpuset.h | 4 ++++ + kernel/cgroup/cgroup.c | 4 ++++ + kernel/cgroup/cpuset.c | 25 +++++++++++++++++++++++++ + kernel/sched/deadline.c | 13 +++++++++++++ + 4 files changed, 46 insertions(+) + +--- a/include/linux/cpuset.h ++++ b/include/linux/cpuset.h +@@ -55,6 +55,8 @@ extern void cpuset_init_smp(void); + extern void cpuset_force_rebuild(void); + extern void cpuset_update_active_cpus(void); + extern void cpuset_wait_for_hotplug(void); ++extern void inc_dl_tasks_cs(struct task_struct *task); ++extern void dec_dl_tasks_cs(struct task_struct *task); + extern void cpuset_lock(void); + extern void cpuset_unlock(void); + extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask); +@@ -178,6 +180,8 @@ static inline void cpuset_update_active_ + + static inline void cpuset_wait_for_hotplug(void) { } + ++static inline void inc_dl_tasks_cs(struct task_struct *task) { } ++static inline void dec_dl_tasks_cs(struct task_struct *task) { } + static inline void cpuset_lock(void) { } + static inline void cpuset_unlock(void) { } + +--- a/kernel/cgroup/cgroup.c ++++ b/kernel/cgroup/cgroup.c +@@ -56,6 +56,7 @@ + #include + #include + #include ++#include + #include + #include + +@@ -6326,6 +6327,9 @@ void cgroup_exit(struct task_struct *tsk + list_add_tail(&tsk->cg_list, &cset->dying_tasks); + cset->nr_tasks--; + ++ if (dl_task(tsk)) ++ dec_dl_tasks_cs(tsk); ++ + WARN_ON_ONCE(cgroup_task_frozen(tsk)); + if (unlikely(cgroup_task_freeze(tsk))) + cgroup_update_frozen(task_dfl_cgroup(tsk)); +--- a/kernel/cgroup/cpuset.c ++++ b/kernel/cgroup/cpuset.c +@@ -161,6 +161,12 @@ struct cpuset { + */ + int use_parent_ecpus; + int child_ecpus_count; ++ ++ /* ++ * number of SCHED_DEADLINE tasks attached to this cpuset, so that we ++ * know when to rebuild associated root domain bandwidth information. ++ */ ++ int nr_deadline_tasks; + }; + + /* +@@ -206,6 +212,20 @@ static inline struct cpuset *parent_cs(s + return css_cs(cs->css.parent); + } + ++void inc_dl_tasks_cs(struct task_struct *p) ++{ ++ struct cpuset *cs = task_cs(p); ++ ++ cs->nr_deadline_tasks++; ++} ++ ++void dec_dl_tasks_cs(struct task_struct *p) ++{ ++ struct cpuset *cs = task_cs(p); ++ ++ cs->nr_deadline_tasks--; ++} ++ + /* bits in struct cpuset flags field */ + typedef enum { + CS_ONLINE, +@@ -2172,6 +2192,11 @@ static int cpuset_can_attach(struct cgro + ret = security_task_setscheduler(task); + if (ret) + goto out_unlock; ++ ++ if (dl_task(task)) { ++ cs->nr_deadline_tasks++; ++ cpuset_attach_old_cs->nr_deadline_tasks--; ++ } + } + + /* +--- a/kernel/sched/deadline.c ++++ b/kernel/sched/deadline.c +@@ -17,6 +17,7 @@ + */ + #include "sched.h" + #include "pelt.h" ++#include + + struct dl_bandwidth def_dl_bandwidth; + +@@ -2417,6 +2418,12 @@ static void switched_from_dl(struct rq * + if (task_on_rq_queued(p) && p->dl.dl_runtime) + task_non_contending(p); + ++ /* ++ * In case a task is setscheduled out from SCHED_DEADLINE we need to ++ * keep track of that on its cpuset (for correct bandwidth tracking). ++ */ ++ dec_dl_tasks_cs(p); ++ + if (!task_on_rq_queued(p)) { + /* + * Inactive timer is armed. However, p is leaving DEADLINE and +@@ -2457,6 +2464,12 @@ static void switched_to_dl(struct rq *rq + if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1) + put_task_struct(p); + ++ /* ++ * In case a task is setscheduled to SCHED_DEADLINE we need to keep ++ * track of that on its cpuset (for correct bandwidth tracking). ++ */ ++ inc_dl_tasks_cs(p); ++ + /* If p is not queued we will update its parameters at next wakeup. */ + if (!task_on_rq_queued(p)) { + add_rq_bw(&p->dl, &rq->dl); diff --git a/queue-5.10/sched-deadline-create-dl-bw-alloc-free-check-overflow-interface.patch b/queue-5.10/sched-deadline-create-dl-bw-alloc-free-check-overflow-interface.patch new file mode 100644 index 00000000000..882b0ea0a05 --- /dev/null +++ b/queue-5.10/sched-deadline-create-dl-bw-alloc-free-check-overflow-interface.patch @@ -0,0 +1,166 @@ +From stable-owner@vger.kernel.org Sun Aug 20 17:25:35 2023 +From: Qais Yousef +Date: Sun, 20 Aug 2023 16:21:43 +0100 +Subject: sched/deadline: Create DL BW alloc, free & check overflow interface +To: stable@vger.kernel.org +Cc: Juri Lelli , Waiman Long , Tejun Heo , Dietmar Eggemann , Peter Zijlstra , Vincent Guittot , Ingo Molnar , Hao Luo , John Stultz , cgroups@vger.kernel.org, linux-kernel@vger.kernel.org, Qais Yousef +Message-ID: <20230820152144.517461-6-qyousef@layalina.io> + +From: Dietmar Eggemann + +commit 85989106feb734437e2d598b639991b9185a43a6 upstream. + +While moving a set of tasks between exclusive cpusets, +cpuset_can_attach() -> task_can_attach() calls dl_cpu_busy(..., p) for +DL BW overflow checking and per-task DL BW allocation on the destination +root_domain for the DL tasks in this set. + +This approach has the issue of not freeing already allocated DL BW in +the following error cases: + +(1) The set of tasks includes multiple DL tasks and DL BW overflow + checking fails for one of the subsequent DL tasks. + +(2) Another controller next to the cpuset controller which is attached + to the same cgroup fails in its can_attach(). + +To address this problem rework dl_cpu_busy(): + +(1) Split it into dl_bw_check_overflow() & dl_bw_alloc() and add a + dedicated dl_bw_free(). + +(2) dl_bw_alloc() & dl_bw_free() take a `u64 dl_bw` parameter instead of + a `struct task_struct *p` used in dl_cpu_busy(). This allows to + allocate DL BW for a set of tasks too rather than only for a single + task. + +Signed-off-by: Dietmar Eggemann +Signed-off-by: Juri Lelli +Signed-off-by: Tejun Heo +Signed-off-by: Qais Yousef (Google) +Signed-off-by: Greg Kroah-Hartman +--- + include/linux/sched.h | 2 + + kernel/sched/core.c | 4 +-- + kernel/sched/deadline.c | 53 ++++++++++++++++++++++++++++++++++++------------ + kernel/sched/sched.h | 2 - + 4 files changed, 45 insertions(+), 16 deletions(-) + +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -1658,6 +1658,8 @@ current_restore_flags(unsigned long orig + + extern int cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial); + extern int task_can_attach(struct task_struct *p, const struct cpumask *cs_effective_cpus); ++extern int dl_bw_alloc(int cpu, u64 dl_bw); ++extern void dl_bw_free(int cpu, u64 dl_bw); + #ifdef CONFIG_SMP + extern void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask); + extern int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask); +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -6625,7 +6625,7 @@ int task_can_attach(struct task_struct * + + if (unlikely(cpu >= nr_cpu_ids)) + return -EINVAL; +- ret = dl_cpu_busy(cpu, p); ++ ret = dl_bw_alloc(cpu, p->dl.dl_bw); + } + + out: +@@ -6885,7 +6885,7 @@ static void cpuset_cpu_active(void) + static int cpuset_cpu_inactive(unsigned int cpu) + { + if (!cpuhp_tasks_frozen) { +- int ret = dl_cpu_busy(cpu, NULL); ++ int ret = dl_bw_check_overflow(cpu); + + if (ret) + return ret; +--- a/kernel/sched/deadline.c ++++ b/kernel/sched/deadline.c +@@ -2858,26 +2858,38 @@ int dl_cpuset_cpumask_can_shrink(const s + return ret; + } + +-int dl_cpu_busy(int cpu, struct task_struct *p) ++enum dl_bw_request { ++ dl_bw_req_check_overflow = 0, ++ dl_bw_req_alloc, ++ dl_bw_req_free ++}; ++ ++static int dl_bw_manage(enum dl_bw_request req, int cpu, u64 dl_bw) + { +- unsigned long flags, cap; ++ unsigned long flags; + struct dl_bw *dl_b; +- bool overflow; ++ bool overflow = 0; + + rcu_read_lock_sched(); + dl_b = dl_bw_of(cpu); + raw_spin_lock_irqsave(&dl_b->lock, flags); +- cap = dl_bw_capacity(cpu); +- overflow = __dl_overflow(dl_b, cap, 0, p ? p->dl.dl_bw : 0); + +- if (!overflow && p) { +- /* +- * We reserve space for this task in the destination +- * root_domain, as we can't fail after this point. +- * We will free resources in the source root_domain +- * later on (see set_cpus_allowed_dl()). +- */ +- __dl_add(dl_b, p->dl.dl_bw, dl_bw_cpus(cpu)); ++ if (req == dl_bw_req_free) { ++ __dl_sub(dl_b, dl_bw, dl_bw_cpus(cpu)); ++ } else { ++ unsigned long cap = dl_bw_capacity(cpu); ++ ++ overflow = __dl_overflow(dl_b, cap, 0, dl_bw); ++ ++ if (req == dl_bw_req_alloc && !overflow) { ++ /* ++ * We reserve space in the destination ++ * root_domain, as we can't fail after this point. ++ * We will free resources in the source root_domain ++ * later on (see set_cpus_allowed_dl()). ++ */ ++ __dl_add(dl_b, dl_bw, dl_bw_cpus(cpu)); ++ } + } + + raw_spin_unlock_irqrestore(&dl_b->lock, flags); +@@ -2885,6 +2897,21 @@ int dl_cpu_busy(int cpu, struct task_str + + return overflow ? -EBUSY : 0; + } ++ ++int dl_bw_check_overflow(int cpu) ++{ ++ return dl_bw_manage(dl_bw_req_check_overflow, cpu, 0); ++} ++ ++int dl_bw_alloc(int cpu, u64 dl_bw) ++{ ++ return dl_bw_manage(dl_bw_req_alloc, cpu, dl_bw); ++} ++ ++void dl_bw_free(int cpu, u64 dl_bw) ++{ ++ dl_bw_manage(dl_bw_req_free, cpu, dl_bw); ++} + #endif + + #ifdef CONFIG_SCHED_DEBUG +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -348,7 +348,7 @@ extern void __getparam_dl(struct task_st + extern bool __checkparam_dl(const struct sched_attr *attr); + extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr); + extern int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial); +-extern int dl_cpu_busy(int cpu, struct task_struct *p); ++extern int dl_bw_check_overflow(int cpu); + + #ifdef CONFIG_CGROUP_SCHED + diff --git a/queue-5.10/series b/queue-5.10/series index 84bf2e95ae9..5dde9a8f37c 100644 --- a/queue-5.10/series +++ b/queue-5.10/series @@ -63,3 +63,9 @@ x86-fpu-set-x86_feature_osxsave-feature-after-enabling-osxsave-in-cr4.patch torture-fix-hang-during-kthread-shutdown-phase.patch tick-detect-and-fix-jiffies-update-stall.patch timers-nohz-switch-to-oneshot_stopped-in-the-low-res-handler-when-the-tick-is-stopped.patch +cgroup-cpuset-rename-functions-dealing-with-deadline-accounting.patch +sched-cpuset-bring-back-cpuset_mutex.patch +sched-cpuset-keep-track-of-sched_deadline-task-in-cpusets.patch +cgroup-cpuset-iterate-only-if-deadline-tasks-are-present.patch +sched-deadline-create-dl-bw-alloc-free-check-overflow-interface.patch +cgroup-cpuset-free-dl-bw-in-case-can_attach-fails.patch -- 2.47.3