--- /dev/null
+From 1612160b91272f5b1596f499584d6064bf5be794 Mon Sep 17 00:00:00 2001
+From: "Paul E. McKenney" <paulmck@kernel.org>
+Date: Fri, 2 Feb 2024 11:49:06 -0800
+Subject: rcu-tasks: Eliminate deadlocks involving do_exit() and RCU tasks
+
+From: Paul E. McKenney <paulmck@kernel.org>
+
+commit 1612160b91272f5b1596f499584d6064bf5be794 upstream.
+
+Holding a mutex across synchronize_rcu_tasks() and acquiring
+that same mutex in code called from do_exit() after its call to
+exit_tasks_rcu_start() but before its call to exit_tasks_rcu_stop()
+results in deadlock. This is by design, because tasks that are far
+enough into do_exit() are no longer present on the tasks list, making
+it a bit difficult for RCU Tasks to find them, let alone wait on them
+to do a voluntary context switch. However, such deadlocks are becoming
+more frequent. In addition, lockdep currently does not detect such
+deadlocks and they can be difficult to reproduce.
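+
+For illustration only, a minimal sketch of the deadlocking pattern (the
+mymod_* names are hypothetical, not taken from any real caller):
+
+	static DEFINE_MUTEX(mymod_mutex);
+
+	static void mymod_update(void)
+	{
+		mutex_lock(&mymod_mutex);
+		/* Waits for an RCU Tasks grace period, which in turn waits
+		 * on every task still tracked as exiting. */
+		synchronize_rcu_tasks();
+		mutex_unlock(&mymod_mutex);
+	}
+
+	/* Hypothetically reached from do_exit() after exit_tasks_rcu_start()
+	 * but before exit_tasks_rcu_stop(). */
+	static void mymod_exit_work(void)
+	{
+		mutex_lock(&mymod_mutex);	/* Blocks on mymod_update()... */
+		mutex_unlock(&mymod_mutex);	/* ...which blocks on this task. */
+	}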
+
+In addition, if a task voluntarily context switches during that time
+(for example, if it blocks acquiring a mutex), then this task is in an
+RCU Tasks quiescent state. And with some adjustments, RCU Tasks could
+just as well take advantage of that fact.
+
+This commit therefore eliminates these deadlocks by replacing the
+SRCU-based wait for do_exit() completion with per-CPU lists of tasks
+currently exiting. A given task will be on one of these per-CPU lists for
+the same period of time that it would previously have spent in the
+SRCU read-side critical section. These lists enable RCU Tasks
+to find the tasks that have already been removed from the tasks list,
+but that must nevertheless be waited upon.
+
+The RCU Tasks grace period gathers any of these do_exit() tasks that it
+must wait on, and adds them to the list of holdouts. Per-CPU locking
+and get_task_struct() are used to synchronize addition to and removal
+from these lists.
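+
+As a rough sketch of the data structures involved (simplified to the
+fields relevant here; the real definitions live in the kernel sources):
+
+	struct rcu_tasks_percpu {
+		/* ... other per-CPU RCU Tasks state ... */
+		struct list_head rtp_exit_list;	/* Tasks exiting on this CPU. */
+	};
+
+	struct task_struct {
+		/* ... other fields ... */
+		struct list_head rcu_tasks_exit_list;	/* Entry in some CPU's rtp_exit_list. */
+		int rcu_tasks_exit_cpu;			/* CPU whose list this task is on. */
+	};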
+
+Link: https://lore.kernel.org/all/20240118021842.290665-1-chenzhongjin@huawei.com/
+
+Reported-by: Chen Zhongjin <chenzhongjin@huawei.com>
+Reported-by: Yang Jihong <yangjihong1@huawei.com>
+Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
+Tested-by: Yang Jihong <yangjihong1@huawei.com>
+Tested-by: Chen Zhongjin <chenzhongjin@huawei.com>
+Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
+Signed-off-by: Boqun Feng <boqun.feng@gmail.com>
+Cc: Tahera Fahimi <taherafahimi@linux.microsoft.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/rcu/tasks.h | 44 ++++++++++++++++++++++++++++----------------
+ 1 file changed, 28 insertions(+), 16 deletions(-)
+
+--- a/kernel/rcu/tasks.h
++++ b/kernel/rcu/tasks.h
+@@ -150,8 +150,6 @@ static struct rcu_tasks rt_name =
+ }
+
+ #ifdef CONFIG_TASKS_RCU
+-/* Track exiting tasks in order to allow them to be waited for. */
+-DEFINE_STATIC_SRCU(tasks_rcu_exit_srcu);
+
+ /* Report delay in synchronize_srcu() completion in rcu_tasks_postscan(). */
+ static void tasks_rcu_exit_srcu_stall(struct timer_list *unused);
+@@ -879,10 +877,12 @@ static void rcu_tasks_wait_gp(struct rcu
+ // number of voluntary context switches, and add that task to the
+ // holdout list.
+ // rcu_tasks_postscan():
+-// Invoke synchronize_srcu() to ensure that all tasks that were
+-// in the process of exiting (and which thus might not know to
+-// synchronize with this RCU Tasks grace period) have completed
+-// exiting.
++// Gather per-CPU lists of tasks in do_exit() to ensure that all
++// tasks that were in the process of exiting (and which thus might
++// not know to synchronize with this RCU Tasks grace period) have
++// completed exiting. The synchronize_rcu() in rcu_tasks_postgp()
++// will take care of any tasks stuck in the non-preemptible region
++// of do_exit() following its call to exit_tasks_rcu_stop().
+ // check_all_holdout_tasks(), repeatedly until holdout list is empty:
+ // Scans the holdout list, attempting to identify a quiescent state
+ // for each task on the list. If there is a quiescent state, the
+@@ -895,8 +895,10 @@ static void rcu_tasks_wait_gp(struct rcu
+ // with interrupts disabled.
+ //
+ // For each exiting task, the exit_tasks_rcu_start() and
+-// exit_tasks_rcu_finish() functions begin and end, respectively, the SRCU
+-// read-side critical sections waited for by rcu_tasks_postscan().
++// exit_tasks_rcu_finish() functions add and remove, respectively, the
++// current task to a per-CPU list of tasks that rcu_tasks_postscan() must
++// wait on. This is necessary because rcu_tasks_postscan() must wait on
++// tasks that have already been removed from the global list of tasks.
+ //
+ // Pre-grace-period update-side code is ordered before the grace
+ // period via the raw_spin_lock.*rcu_node(). Pre-grace-period read-side code
+@@ -960,9 +962,13 @@ static void rcu_tasks_pertask(struct tas
+ }
+ }
+
++void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func);
++DEFINE_RCU_TASKS(rcu_tasks, rcu_tasks_wait_gp, call_rcu_tasks, "RCU Tasks");
++
+ /* Processing between scanning taskslist and draining the holdout list. */
+ static void rcu_tasks_postscan(struct list_head *hop)
+ {
++ int cpu;
+ int rtsi = READ_ONCE(rcu_task_stall_info);
+
+ if (!IS_ENABLED(CONFIG_TINY_RCU)) {
+@@ -976,9 +982,9 @@ static void rcu_tasks_postscan(struct li
+ * this, divide the fragile exit path part in two intersecting
+ * read side critical sections:
+ *
+- * 1) An _SRCU_ read side starting before calling exit_notify(),
+- * which may remove the task from the tasklist, and ending after
+- * the final preempt_disable() call in do_exit().
++ * 1) A task_struct list addition before calling exit_notify(),
++ * which may remove the task from the tasklist, with the
++ * removal after the final preempt_disable() call in do_exit().
+ *
+ * 2) An _RCU_ read side starting with the final preempt_disable()
+ * call in do_exit() and ending with the final call to schedule()
+@@ -987,7 +993,17 @@ static void rcu_tasks_postscan(struct li
+ * This handles the part 1). And postgp will handle part 2) with a
+ * call to synchronize_rcu().
+ */
+- synchronize_srcu(&tasks_rcu_exit_srcu);
++
++ for_each_possible_cpu(cpu) {
++ struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rcu_tasks.rtpcpu, cpu);
++ struct task_struct *t;
++
++ raw_spin_lock_irq_rcu_node(rtpcp);
++ list_for_each_entry(t, &rtpcp->rtp_exit_list, rcu_tasks_exit_list)
++ if (list_empty(&t->rcu_tasks_holdout_list))
++ rcu_tasks_pertask(t, hop);
++ raw_spin_unlock_irq_rcu_node(rtpcp);
++ }
+
+ if (!IS_ENABLED(CONFIG_TINY_RCU))
+ del_timer_sync(&tasks_rcu_exit_srcu_stall_timer);
+@@ -1055,7 +1071,6 @@ static void rcu_tasks_postgp(struct rcu_
+ *
+ * In addition, this synchronize_rcu() waits for exiting tasks
+ * to complete their final preempt_disable() region of execution,
+- * cleaning up after synchronize_srcu(&tasks_rcu_exit_srcu),
+ * enforcing the whole region before tasklist removal until
+ * the final schedule() with TASK_DEAD state to be an RCU TASKS
+ * read side critical section.
+@@ -1063,9 +1078,6 @@ static void rcu_tasks_postgp(struct rcu_
+ synchronize_rcu();
+ }
+
+-void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func);
+-DEFINE_RCU_TASKS(rcu_tasks, rcu_tasks_wait_gp, call_rcu_tasks, "RCU Tasks");
+-
+ static void tasks_rcu_exit_srcu_stall(struct timer_list *unused)
+ {
+ #ifndef CONFIG_TINY_RCU
--- /dev/null
+From 6b70399f9ef3809f6e308fd99dd78b072c1bd05c Mon Sep 17 00:00:00 2001
+From: "Paul E. McKenney" <paulmck@kernel.org>
+Date: Fri, 2 Feb 2024 11:28:45 -0800
+Subject: rcu-tasks: Maintain lists to eliminate RCU-tasks/do_exit() deadlocks
+
+From: Paul E. McKenney <paulmck@kernel.org>
+
+commit 6b70399f9ef3809f6e308fd99dd78b072c1bd05c upstream.
+
+This commit continues the elimination of deadlocks involving do_exit()
+and RCU tasks by causing exit_tasks_rcu_start() to add the current
+task to a per-CPU list and causing exit_tasks_rcu_stop() to remove the
+current task from whatever list it is on. These lists will be used to
+track tasks that are exiting, while still accounting for any RCU-tasks
+quiescent states that these tasks pass through.
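+
+For orientation, the resulting exit-path usage looks roughly like the
+following (heavily simplified; this is not the actual kernel/exit.c code,
+which reaches these functions through additional steps and helpers):
+
+	void do_exit(long code)
+	{
+		/* ... */
+		exit_tasks_rcu_start();		/* Add current to a per-CPU exit list. */
+		exit_notify(tsk, group_dead);	/* May remove current from the tasklist. */
+		/* ... */
+		preempt_disable();		/* Final preempt_disable() in do_exit(). */
+		exit_tasks_rcu_stop();		/* Remove current from the per-CPU list. */
+		do_task_dead();			/* Final schedule() with TASK_DEAD. */
+	}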
+
+[ paulmck: Apply Frederic Weisbecker feedback. ]
+
+Link: https://lore.kernel.org/all/20240118021842.290665-1-chenzhongjin@huawei.com/
+
+Reported-by: Chen Zhongjin <chenzhongjin@huawei.com>
+Reported-by: Yang Jihong <yangjihong1@huawei.com>
+Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
+Tested-by: Yang Jihong <yangjihong1@huawei.com>
+Tested-by: Chen Zhongjin <chenzhongjin@huawei.com>
+Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
+Signed-off-by: Boqun Feng <boqun.feng@gmail.com>
+Cc: Tahera Fahimi <taherafahimi@linux.microsoft.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/rcu/tasks.h | 43 +++++++++++++++++++++++++++++++++----------
+ 1 file changed, 33 insertions(+), 10 deletions(-)
+
+--- a/kernel/rcu/tasks.h
++++ b/kernel/rcu/tasks.h
+@@ -1175,25 +1175,48 @@ struct task_struct *get_rcu_tasks_gp_kth
+ EXPORT_SYMBOL_GPL(get_rcu_tasks_gp_kthread);
+
+ /*
+- * Contribute to protect against tasklist scan blind spot while the
+- * task is exiting and may be removed from the tasklist. See
+- * corresponding synchronize_srcu() for further details.
++ * Protect against tasklist scan blind spot while the task is exiting and
++ * may be removed from the tasklist. Do this by adding the task to yet
++ * another list.
++ *
++ * Note that the task will remove itself from this list, so there is no
++ * need for get_task_struct(), except in the case where rcu_tasks_pertask()
++ * adds it to the holdout list, in which case rcu_tasks_pertask() supplies
++ * the needed get_task_struct().
+ */
+-void exit_tasks_rcu_start(void) __acquires(&tasks_rcu_exit_srcu)
++void exit_tasks_rcu_start(void)
+ {
+- current->rcu_tasks_idx = __srcu_read_lock(&tasks_rcu_exit_srcu);
++ unsigned long flags;
++ struct rcu_tasks_percpu *rtpcp;
++ struct task_struct *t = current;
++
++ WARN_ON_ONCE(!list_empty(&t->rcu_tasks_exit_list));
++ preempt_disable();
++ rtpcp = this_cpu_ptr(rcu_tasks.rtpcpu);
++ t->rcu_tasks_exit_cpu = smp_processor_id();
++ raw_spin_lock_irqsave_rcu_node(rtpcp, flags);
++ if (!rtpcp->rtp_exit_list.next)
++ INIT_LIST_HEAD(&rtpcp->rtp_exit_list);
++ list_add(&t->rcu_tasks_exit_list, &rtpcp->rtp_exit_list);
++ raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
++ preempt_enable();
+ }
+
+ /*
+- * Contribute to protect against tasklist scan blind spot while the
+- * task is exiting and may be removed from the tasklist. See
+- * corresponding synchronize_srcu() for further details.
++ * Remove the task from the "yet another list" because do_exit() is now
++ * non-preemptible, allowing synchronize_rcu() to wait beyond this point.
+ */
+-void exit_tasks_rcu_stop(void) __releases(&tasks_rcu_exit_srcu)
++void exit_tasks_rcu_stop(void)
+ {
++ unsigned long flags;
++ struct rcu_tasks_percpu *rtpcp;
+ struct task_struct *t = current;
+
+- __srcu_read_unlock(&tasks_rcu_exit_srcu, t->rcu_tasks_idx);
++ WARN_ON_ONCE(list_empty(&t->rcu_tasks_exit_list));
++ rtpcp = per_cpu_ptr(rcu_tasks.rtpcpu, t->rcu_tasks_exit_cpu);
++ raw_spin_lock_irqsave_rcu_node(rtpcp, flags);
++ list_del_init(&t->rcu_tasks_exit_list);
++ raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
+ }
+
+ /*
--- /dev/null
+From 0bb11a372fc8d7006b4d0f42a2882939747bdbff Mon Sep 17 00:00:00 2001
+From: "Paul E. McKenney" <paulmck@kernel.org>
+Date: Thu, 1 Feb 2024 06:10:26 -0800
+Subject: rcu-tasks: Maintain real-time response in rcu_tasks_postscan()
+
+From: Paul E. McKenney <paulmck@kernel.org>
+
+commit 0bb11a372fc8d7006b4d0f42a2882939747bdbff upstream.
+
+The current code will scan the entirety of each per-CPU list of exiting
+tasks in ->rtp_exit_list with interrupts disabled. This is normally just
+fine, because each CPU typically won't have very many tasks in this state.
+However, if a large number of tasks block late in do_exit(), these lists
+could be arbitrarily long. Low probability, perhaps, but it really
+could happen.
+
+This commit therefore occasionally re-enables interrupts while traversing
+these lists, inserting a dummy element to hold the current place in the
+list. In kernels built with CONFIG_PREEMPT_RT=y, this re-enabling happens
+after each list element is processed, otherwise every one-to-two jiffies.
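+
+As a generic illustration of the place-holder technique (hypothetical
+names and locking, not the RCU Tasks code itself), a bare list_head can
+serve as a cursor across a lock drop, provided nothing else walks the
+list expecting to find only real elements:
+
+	static LIST_HEAD(my_list);		/* Hypothetical list. */
+	static DEFINE_SPINLOCK(my_lock);	/* Protects my_list. */
+
+	struct my_elem {
+		struct list_head entry;
+	};
+
+	static void my_scan(void)
+	{
+		struct my_elem *e, *n;
+		struct list_head cursor;
+
+		spin_lock_irq(&my_lock);
+		list_for_each_entry_safe(e, n, &my_list, entry) {
+			/* ... process e with the lock held ... */
+
+			/* Park the cursor after e, drop the lock to let
+			 * interrupts (and other CPUs) in, then resume from
+			 * wherever the cursor now sits. */
+			list_add(&cursor, &e->entry);
+			spin_unlock_irq(&my_lock);
+			cond_resched();
+			spin_lock_irq(&my_lock);
+			n = list_entry(cursor.next, struct my_elem, entry);
+			list_del(&cursor);
+		}
+		spin_unlock_irq(&my_lock);
+	}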
+
+[ paulmck: Apply Frederic Weisbecker feedback. ]
+
+Link: https://lore.kernel.org/all/ZdeI_-RfdLR8jlsm@localhost.localdomain/
+
+Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Cc: Sebastian Siewior <bigeasy@linutronix.de>
+Cc: Anna-Maria Behnsen <anna-maria@linutronix.de>
+Cc: Steven Rostedt <rostedt@goodmis.org>
+Signed-off-by: Boqun Feng <boqun.feng@gmail.com>
+Cc: Tahera Fahimi <taherafahimi@linux.microsoft.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/rcu/tasks.h | 22 +++++++++++++++++++++-
+ 1 file changed, 21 insertions(+), 1 deletion(-)
+
+--- a/kernel/rcu/tasks.h
++++ b/kernel/rcu/tasks.h
+@@ -995,13 +995,33 @@ static void rcu_tasks_postscan(struct li
+ */
+
+ for_each_possible_cpu(cpu) {
++ unsigned long j = jiffies + 1;
+ struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rcu_tasks.rtpcpu, cpu);
+ struct task_struct *t;
++ struct task_struct *t1;
++ struct list_head tmp;
+
+ raw_spin_lock_irq_rcu_node(rtpcp);
+- list_for_each_entry(t, &rtpcp->rtp_exit_list, rcu_tasks_exit_list)
++ list_for_each_entry_safe(t, t1, &rtpcp->rtp_exit_list, rcu_tasks_exit_list) {
+ if (list_empty(&t->rcu_tasks_holdout_list))
+ rcu_tasks_pertask(t, hop);
++
++ // RT kernels need frequent pauses, otherwise
++ // pause at least once per pair of jiffies.
++ if (!IS_ENABLED(CONFIG_PREEMPT_RT) && time_before(jiffies, j))
++ continue;
++
++ // Keep our place in the list while pausing.
++ // Nothing else traverses this list, so adding a
++ // bare list_head is OK.
++ list_add(&tmp, &t->rcu_tasks_exit_list);
++ raw_spin_unlock_irq_rcu_node(rtpcp);
++ cond_resched(); // For CONFIG_PREEMPT=n kernels
++ raw_spin_lock_irq_rcu_node(rtpcp);
++ t1 = list_entry(tmp.next, struct task_struct, rcu_tasks_exit_list);
++ list_del(&tmp);
++ j = jiffies + 1;
++ }
+ raw_spin_unlock_irq_rcu_node(rtpcp);
+ }
+