From db2d40e5fff71df6ce9d3654b66bd93bf4ae5ca6 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman
Date: Sat, 5 Mar 2022 23:26:11 +0100
Subject: [PATCH] 5.15-stable patches

added patches:
	sched-fix-yet-more-sched_fork-races.patch
---
 .../sched-fix-yet-more-sched_fork-races.patch | 173 ++++++++++++++++++
 queue-5.15/series                             |   1 +
 2 files changed, 174 insertions(+)
 create mode 100644 queue-5.15/sched-fix-yet-more-sched_fork-races.patch

diff --git a/queue-5.15/sched-fix-yet-more-sched_fork-races.patch b/queue-5.15/sched-fix-yet-more-sched_fork-races.patch
new file mode 100644
index 00000000000..277648532bc
--- /dev/null
+++ b/queue-5.15/sched-fix-yet-more-sched_fork-races.patch
@@ -0,0 +1,173 @@
+From b1e8206582f9d680cff7d04828708c8b6ab32957 Mon Sep 17 00:00:00 2001
+From: Peter Zijlstra
+Date: Mon, 14 Feb 2022 10:16:57 +0100
+Subject: sched: Fix yet more sched_fork() races
+
+From: Peter Zijlstra
+
+commit b1e8206582f9d680cff7d04828708c8b6ab32957 upstream.
+
+Where commit 4ef0c5c6b5ba ("kernel/sched: Fix sched_fork() access an
+invalid sched_task_group") fixed a fork race vs cgroup, it opened up a
+race vs syscalls by not placing the task on the runqueue before it
+gets exposed through the pidhash.
+
+Commit 13765de8148f ("sched/fair: Fix fault in reweight_entity") is
+trying to fix a single instance of this, instead fix the whole class
+of issues, effectively reverting this commit.
+
+Fixes: 4ef0c5c6b5ba ("kernel/sched: Fix sched_fork() access an invalid sched_task_group")
+Reported-by: Linus Torvalds
+Signed-off-by: Peter Zijlstra (Intel)
+Tested-by: Tadeusz Struk
+Tested-by: Zhang Qiao
+Tested-by: Dietmar Eggemann
+Link: https://lkml.kernel.org/r/YgoeCbwj5mbCR0qA@hirez.programming.kicks-ass.net
+Signed-off-by: Greg Kroah-Hartman
+---
+ include/linux/sched/task.h |    4 ++--
+ kernel/fork.c              |   13 ++++++++++++-
+ kernel/sched/core.c        |   34 +++++++++++++++++++++-------------
+ 3 files changed, 35 insertions(+), 16 deletions(-)
+
+--- a/include/linux/sched/task.h
++++ b/include/linux/sched/task.h
+@@ -54,8 +54,8 @@ extern asmlinkage void schedule_tail(str
+ extern void init_idle(struct task_struct *idle, int cpu);
+ 
+ extern int sched_fork(unsigned long clone_flags, struct task_struct *p);
+-extern void sched_post_fork(struct task_struct *p,
+-			    struct kernel_clone_args *kargs);
++extern void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs);
++extern void sched_post_fork(struct task_struct *p);
+ extern void sched_dead(struct task_struct *p);
+ 
+ void __noreturn do_task_dead(void);
+--- a/kernel/fork.c
++++ b/kernel/fork.c
+@@ -2297,6 +2297,17 @@ static __latent_entropy struct task_stru
+ 		goto bad_fork_put_pidfd;
+ 
+ 	/*
++	 * Now that the cgroups are pinned, re-clone the parent cgroup and put
++	 * the new task on the correct runqueue. All this *before* the task
++	 * becomes visible.
++	 *
++	 * This isn't part of ->can_fork() because while the re-cloning is
++	 * cgroup specific, it unconditionally needs to place the task on a
++	 * runqueue.
++	 */
++	sched_cgroup_fork(p, args);
++
++	/*
+ 	 * From this point on we must avoid any synchronous user-space
+ 	 * communication until we take the tasklist-lock. In particular, we do
+ 	 * not want user-space to be able to predict the process start-time by
+@@ -2405,7 +2416,7 @@ static __latent_entropy struct task_stru
+ 		fd_install(pidfd, pidfile);
+ 
+ 	proc_fork_connector(p);
+-	sched_post_fork(p, args);
++	sched_post_fork(p);
+ 	cgroup_post_fork(p, args);
+ 	perf_event_fork(p);
+ 
+--- a/kernel/sched/core.c
++++ b/kernel/sched/core.c
+@@ -1199,9 +1199,8 @@ int tg_nop(struct task_group *tg, void *
+ }
+ #endif
+ 
+-static void set_load_weight(struct task_struct *p)
++static void set_load_weight(struct task_struct *p, bool update_load)
+ {
+-	bool update_load = !(READ_ONCE(p->__state) & TASK_NEW);
+ 	int prio = p->static_prio - MAX_RT_PRIO;
+ 	struct load_weight *load = &p->se.load;
+ 
+@@ -4359,7 +4358,7 @@ int sched_fork(unsigned long clone_flags
+ 			p->static_prio = NICE_TO_PRIO(0);
+ 
+ 		p->prio = p->normal_prio = p->static_prio;
+-		set_load_weight(p);
++		set_load_weight(p, false);
+ 
+ 		/*
+ 		 * We don't need the reset flag anymore after the fork. It has
+@@ -4377,6 +4376,7 @@ int sched_fork(unsigned long clone_flags
+ 
+ 	init_entity_runnable_average(&p->se);
+ 
++
+ #ifdef CONFIG_SCHED_INFO
+ 	if (likely(sched_info_on()))
+ 		memset(&p->sched_info, 0, sizeof(p->sched_info));
+@@ -4392,18 +4392,23 @@ int sched_fork(unsigned long clone_flags
+ 	return 0;
+ }
+ 
+-void sched_post_fork(struct task_struct *p, struct kernel_clone_args *kargs)
++void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs)
+ {
+ 	unsigned long flags;
+-#ifdef CONFIG_CGROUP_SCHED
+-	struct task_group *tg;
+-#endif
+ 
++	/*
++	 * Because we're not yet on the pid-hash, p->pi_lock isn't strictly
++	 * required yet, but lockdep gets upset if rules are violated.
++	 */
+ 	raw_spin_lock_irqsave(&p->pi_lock, flags);
+ #ifdef CONFIG_CGROUP_SCHED
+-	tg = container_of(kargs->cset->subsys[cpu_cgrp_id],
+-			  struct task_group, css);
+-	p->sched_task_group = autogroup_task_group(p, tg);
++	if (1) {
++		struct task_group *tg;
++		tg = container_of(kargs->cset->subsys[cpu_cgrp_id],
++				  struct task_group, css);
++		tg = autogroup_task_group(p, tg);
++		p->sched_task_group = tg;
++	}
+ #endif
+ 	rseq_migrate(p);
+ 	/*
+@@ -4414,7 +4419,10 @@ void sched_post_fork(struct task_struct
+ 	if (p->sched_class->task_fork)
+ 		p->sched_class->task_fork(p);
+ 	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
++}
+ 
++void sched_post_fork(struct task_struct *p)
++{
+ 	uclamp_post_fork(p);
+ }
+ 
+@@ -6903,7 +6911,7 @@ void set_user_nice(struct task_struct *p
+ 		put_prev_task(rq, p);
+ 
+ 	p->static_prio = NICE_TO_PRIO(nice);
+-	set_load_weight(p);
++	set_load_weight(p, true);
+ 	old_prio = p->prio;
+ 	p->prio = effective_prio(p);
+ 
+@@ -7194,7 +7202,7 @@ static void __setscheduler_params(struct
+ 	 */
+ 	p->rt_priority = attr->sched_priority;
+ 	p->normal_prio = normal_prio(p);
+-	set_load_weight(p);
++	set_load_weight(p, true);
+ }
+ 
+ /*
+@@ -9432,7 +9440,7 @@ void __init sched_init(void)
+ #endif
+ 	}
+ 
+-	set_load_weight(&init_task);
++	set_load_weight(&init_task, false);
+ 
+ 	/*
+ 	 * The boot idle thread does lazy MMU switching as well:
diff --git a/queue-5.15/series b/queue-5.15/series
index 832c658a26d..d5007e75bfe 100644
--- a/queue-5.15/series
+++ b/queue-5.15/series
@@ -199,3 +199,4 @@ igc-igc_write_phy_reg_gpy-drop-premature-return.patch
 ibmvnic-free-reset-work-item-when-flushing.patch
 memfd-fix-f_seal_write-after-shmem-huge-page-allocated.patch
 s390-extable-fix-exception-table-sorting.patch
+sched-fix-yet-more-sched_fork-races.patch
-- 
2.47.2