From db2d40e5fff71df6ce9d3654b66bd93bf4ae5ca6 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman
Date: Sat, 5 Mar 2022 23:26:11 +0100
Subject: [PATCH] 5.15-stable patches

added patches:
	sched-fix-yet-more-sched_fork-races.patch
---
 .../sched-fix-yet-more-sched_fork-races.patch | 173 ++++++++++++++++++
 queue-5.15/series                             |   1 +
 2 files changed, 174 insertions(+)
 create mode 100644 queue-5.15/sched-fix-yet-more-sched_fork-races.patch

diff --git a/queue-5.15/sched-fix-yet-more-sched_fork-races.patch b/queue-5.15/sched-fix-yet-more-sched_fork-races.patch
new file mode 100644
index 00000000000..277648532bc
--- /dev/null
+++ b/queue-5.15/sched-fix-yet-more-sched_fork-races.patch
@@ -0,0 +1,173 @@
+From b1e8206582f9d680cff7d04828708c8b6ab32957 Mon Sep 17 00:00:00 2001
+From: Peter Zijlstra
+Date: Mon, 14 Feb 2022 10:16:57 +0100
+Subject: sched: Fix yet more sched_fork() races
+
+From: Peter Zijlstra
+
+commit b1e8206582f9d680cff7d04828708c8b6ab32957 upstream.
+
+Where commit 4ef0c5c6b5ba ("kernel/sched: Fix sched_fork() access an
+invalid sched_task_group") fixed a fork race vs cgroup, it opened up a
+race vs syscalls by not placing the task on the runqueue before it
+gets exposed through the pidhash.
+
+Commit 13765de8148f ("sched/fair: Fix fault in reweight_entity") is
+trying to fix a single instance of this, instead fix the whole class
+of issues, effectively reverting this commit.
+
+Fixes: 4ef0c5c6b5ba ("kernel/sched: Fix sched_fork() access an invalid sched_task_group")
+Reported-by: Linus Torvalds
+Signed-off-by: Peter Zijlstra (Intel)
+Tested-by: Tadeusz Struk
+Tested-by: Zhang Qiao
+Tested-by: Dietmar Eggemann
+Link: https://lkml.kernel.org/r/YgoeCbwj5mbCR0qA@hirez.programming.kicks-ass.net
+Signed-off-by: Greg Kroah-Hartman
+---
+ include/linux/sched/task.h |    4 ++--
+ kernel/fork.c              |   13 ++++++++++++-
+ kernel/sched/core.c        |   34 +++++++++++++++++++++-------------
+ 3 files changed, 35 insertions(+), 16 deletions(-)
+
+--- a/include/linux/sched/task.h
++++ b/include/linux/sched/task.h
+@@ -54,8 +54,8 @@ extern asmlinkage void schedule_tail(str
+ extern void init_idle(struct task_struct *idle, int cpu);
+ 
+ extern int sched_fork(unsigned long clone_flags, struct task_struct *p);
+-extern void sched_post_fork(struct task_struct *p,
+-			    struct kernel_clone_args *kargs);
++extern void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs);
++extern void sched_post_fork(struct task_struct *p);
+ extern void sched_dead(struct task_struct *p);
+ 
+ void __noreturn do_task_dead(void);
+--- a/kernel/fork.c
++++ b/kernel/fork.c
+@@ -2297,6 +2297,17 @@ static __latent_entropy struct task_stru
+ 		goto bad_fork_put_pidfd;
+ 
+ 	/*
++	 * Now that the cgroups are pinned, re-clone the parent cgroup and put
++	 * the new task on the correct runqueue. All this *before* the task
++	 * becomes visible.
++	 *
++	 * This isn't part of ->can_fork() because while the re-cloning is
++	 * cgroup specific, it unconditionally needs to place the task on a
++	 * runqueue.
++	 */
++	sched_cgroup_fork(p, args);
++
++	/*
+ 	 * From this point on we must avoid any synchronous user-space
+ 	 * communication until we take the tasklist-lock. In particular, we do
+ 	 * not want user-space to be able to predict the process start-time by
+@@ -2405,7 +2416,7 @@ static __latent_entropy struct task_stru
+ 		fd_install(pidfd, pidfile);
+ 
+ 	proc_fork_connector(p);
+-	sched_post_fork(p, args);
++	sched_post_fork(p);
+ 	cgroup_post_fork(p, args);
+ 	perf_event_fork(p);
+ 
+--- a/kernel/sched/core.c
++++ b/kernel/sched/core.c
+@@ -1199,9 +1199,8 @@ int tg_nop(struct task_group *tg, void *
+ }
+ #endif
+ 
+-static void set_load_weight(struct task_struct *p)
++static void set_load_weight(struct task_struct *p, bool update_load)
+ {
+-	bool update_load = !(READ_ONCE(p->__state) & TASK_NEW);
+ 	int prio = p->static_prio - MAX_RT_PRIO;
+ 	struct load_weight *load = &p->se.load;
+ 
+@@ -4359,7 +4358,7 @@ int sched_fork(unsigned long clone_flags
+ 			p->static_prio = NICE_TO_PRIO(0);
+ 
+ 		p->prio = p->normal_prio = p->static_prio;
+-		set_load_weight(p);
++		set_load_weight(p, false);
+ 
+ 		/*
+ 		 * We don't need the reset flag anymore after the fork. It has
+@@ -4377,6 +4376,7 @@ int sched_fork(unsigned long clone_flags
+ 
+ 	init_entity_runnable_average(&p->se);
+ 
++
+ #ifdef CONFIG_SCHED_INFO
+ 	if (likely(sched_info_on()))
+ 		memset(&p->sched_info, 0, sizeof(p->sched_info));
+@@ -4392,18 +4392,23 @@ int sched_fork(unsigned long clone_flags
+ 	return 0;
+ }
+ 
+-void sched_post_fork(struct task_struct *p, struct kernel_clone_args *kargs)
++void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs)
+ {
+ 	unsigned long flags;
+-#ifdef CONFIG_CGROUP_SCHED
+-	struct task_group *tg;
+-#endif
+ 
++	/*
++	 * Because we're not yet on the pid-hash, p->pi_lock isn't strictly
++	 * required yet, but lockdep gets upset if rules are violated.
++	 */
+ 	raw_spin_lock_irqsave(&p->pi_lock, flags);
+ #ifdef CONFIG_CGROUP_SCHED
+-	tg = container_of(kargs->cset->subsys[cpu_cgrp_id],
+-			  struct task_group, css);
+-	p->sched_task_group = autogroup_task_group(p, tg);
++	if (1) {
++		struct task_group *tg;
++		tg = container_of(kargs->cset->subsys[cpu_cgrp_id],
++				  struct task_group, css);
++		tg = autogroup_task_group(p, tg);
++		p->sched_task_group = tg;
++	}
+ #endif
+ 	rseq_migrate(p);
+ 	/*
+@@ -4414,7 +4419,10 @@ void sched_post_fork(struct task_struct
+ 	if (p->sched_class->task_fork)
+ 		p->sched_class->task_fork(p);
+ 	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
++}
+ 
++void sched_post_fork(struct task_struct *p)
++{
+ 	uclamp_post_fork(p);
+ }
+ 
+@@ -6903,7 +6911,7 @@ void set_user_nice(struct task_struct *p
+ 		put_prev_task(rq, p);
+ 
+ 	p->static_prio = NICE_TO_PRIO(nice);
+-	set_load_weight(p);
++	set_load_weight(p, true);
+ 	old_prio = p->prio;
+ 	p->prio = effective_prio(p);
+ 
+@@ -7194,7 +7202,7 @@ static void __setscheduler_params(struct
+ 	 */
+ 	p->rt_priority = attr->sched_priority;
+ 	p->normal_prio = normal_prio(p);
+-	set_load_weight(p);
++	set_load_weight(p, true);
+ }
+ 
+ /*
+@@ -9432,7 +9440,7 @@ void __init sched_init(void)
+ #endif
+ 	}
+ 
+-	set_load_weight(&init_task);
++	set_load_weight(&init_task, false);
+ 
+ 	/*
+ 	 * The boot idle thread does lazy MMU switching as well:
diff --git a/queue-5.15/series b/queue-5.15/series
index 832c658a26d..d5007e75bfe 100644
--- a/queue-5.15/series
+++ b/queue-5.15/series
@@ -199,3 +199,4 @@ igc-igc_write_phy_reg_gpy-drop-premature-return.patch
 ibmvnic-free-reset-work-item-when-flushing.patch
 memfd-fix-f_seal_write-after-shmem-huge-page-allocated.patch
 s390-extable-fix-exception-table-sorting.patch
+sched-fix-yet-more-sched_fork-races.patch
-- 
2.47.2