+From 52d7e48b86fc108e45a656d8e53e4237993c481d Mon Sep 17 00:00:00 2001
+From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
+Date: Tue, 10 Jan 2017 02:28:26 -0800
+Subject: rcu: Narrow early boot window of illegal synchronous grace periods
+
+From: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
+
+commit 52d7e48b86fc108e45a656d8e53e4237993c481d upstream.
+
+The current preemptible RCU implementation goes through three phases
+during bootup. In the first phase, there is only one CPU, running
+with preemption disabled, so a synchronous grace period is a no-op.
+In the second mid-boot phase, the scheduler is running, but RCU has
+not yet gotten its kthreads spawned (and, for expedited grace periods,
+workqueues are not yet running). During this time, any attempt to do
+a synchronous grace period will hang the system (or complain bitterly,
+depending). In the third and final phase, RCU is fully operational and
+everything works normally.
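+
+In terms of the RCU_SCHEDULER_* values that this commit introduces,
+the three phases look roughly like this (an illustrative sketch, with
+a hypothetical synchronize_rcu_sketch() standing in for the real code):
+
+	void synchronize_rcu_sketch(void)
+	{
+		switch (rcu_scheduler_active) {
+		case RCU_SCHEDULER_INACTIVE:
+			/* Phase 1: one task, preemption disabled. */
+			return;	/* A synchronous grace period is a no-op. */
+		case RCU_SCHEDULER_INIT:
+			/* Phase 2: scheduler running, but RCU kthreads and
+			   workqueues are not.  This commit makes the
+			   requesting task drive an expedited grace period. */
+			break;
+		case RCU_SCHEDULER_RUNNING:
+			/* Phase 3: the full grace-period machinery. */
+			break;
+		}
+	}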
+
+This has been OK for some time, but synchronous grace periods have
+recently been showing up during the second mid-boot phase. This code
+worked "by accident" for a while, but started failing as soon
+as expedited RCU grace periods switched over to workqueues in commit
+8b355e3bc140 ("rcu: Drive expedited grace periods from workqueue").
+Note that the code was buggy even before this commit, as it was subject
+to failure on real-time systems that forced all expedited grace periods
+to run as normal grace periods (for example, using the rcu_normal ksysfs
+parameter). The callchain from the failure case is as follows:
+
+early_amd_iommu_init()
+|-> acpi_put_table(ivrs_base);
+|-> acpi_tb_put_table(table_desc);
+|-> acpi_tb_invalidate_table(table_desc);
+|-> acpi_tb_release_table(...)
+|-> acpi_os_unmap_memory
+|-> acpi_os_unmap_iomem
+|-> acpi_os_map_cleanup
+|-> synchronize_rcu_expedited
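+
+(For reference, the rcu_normal fallback mentioned above amounts to the
+following check near the top of _synchronize_rcu_expedited(); this is
+a simplified rendering, not part of this patch's changes:
+
+	if (rcu_gp_is_normal()) {
+		wait_rcu_gp(rsp->call);	/* Run as a normal grace period... */
+		return;			/* ...which also hangs mid-boot. */
+	}
+
+so forcing all grace periods to be normal did not avoid the problem.)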
+
+The kernel showing this callchain was built with CONFIG_PREEMPT_RCU=y,
+which caused the code to try using workqueues before they were
+initialized, which did not go well.
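+
+In outline (simplified, reusing the rew.rew_work naming from the
+kernel/rcu/tree_exp.h hunk below), the failure mode was:
+
+	schedule_work(&rew.rew_work);	/* No kworkers yet, so the work */
+					/* item never runs... */
+	/* ...and the caller then blocks waiting for the expedited grace
+	   period to complete, a wakeup that never arrives. */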
+
+This commit therefore reworks RCU to permit synchronous grace periods
+to proceed during this mid-boot phase. It fixes a regression
+introduced in v4.9, and is therefore being put forward
+post-merge-window in v4.10.
+
+This commit sets a flag from the existing rcu_scheduler_starting()
+function, which causes all synchronous grace periods to take the expedited
+path. The expedited path now checks this flag, using the requesting task
+to drive the expedited grace period forward during the mid-boot phase.
+Finally, this flag is updated by a core_initcall() function named
+rcu_exp_runtime_mode(), which causes the runtime codepaths to be used.
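+
+Condensed from the hunks below (WARN_ON()s omitted), the flag's
+lifecycle is:
+
+	void rcu_scheduler_starting(void)	/* end of scheduler init */
+	{
+		rcu_test_sync_prims();
+		rcu_scheduler_active = RCU_SCHEDULER_INIT;
+		rcu_test_sync_prims();	/* now tests the mid-boot path */
+	}
+
+	static int __init rcu_exp_runtime_mode(void)
+	{
+		rcu_test_sync_prims();
+		rcu_scheduler_active = RCU_SCHEDULER_RUNNING;
+		rcu_test_sync_prims();	/* now tests the runtime paths */
+		return 0;
+	}
+	core_initcall(rcu_exp_runtime_mode);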
+
+Note that this arrangement assumes that tasks are not sent POSIX signals
+(or anything similar) from the time that the first task is spawned
+through core_initcall() time.
+
+Fixes: 8b355e3bc140 ("rcu: Drive expedited grace periods from workqueue")
+Reported-by: "Zheng, Lv" <lv.zheng@intel.com>
+Reported-by: Borislav Petkov <bp@alien8.de>
+Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
+Tested-by: Stan Kain <stan.kain@gmail.com>
+Tested-by: Ivan <waffolz@hotmail.com>
+Tested-by: Emanuel Castelo <emanuel.castelo@gmail.com>
+Tested-by: Bruno Pesavento <bpesavento@infinito.it>
+Tested-by: Borislav Petkov <bp@suse.de>
+Tested-by: Frederic Bezies <fredbezies@gmail.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ include/linux/rcupdate.h | 4 +++
+ kernel/rcu/rcu.h | 1
+ kernel/rcu/tiny_plugin.h | 9 ++++++--
+ kernel/rcu/tree.c | 33 ++++++++++++++++++-----------
+ kernel/rcu/tree_exp.h | 52 +++++++++++++++++++++++++++++++++++++----------
+ kernel/rcu/tree_plugin.h | 2 -
+ kernel/rcu/update.c | 38 +++++++++++++++++++++++++++-------
+ 7 files changed, 104 insertions(+), 35 deletions(-)
+
+--- a/include/linux/rcupdate.h
++++ b/include/linux/rcupdate.h
+@@ -444,6 +444,10 @@ bool __rcu_is_watching(void);
+ #error "Unknown RCU implementation specified to kernel configuration"
+ #endif
+
++#define RCU_SCHEDULER_INACTIVE 0
++#define RCU_SCHEDULER_INIT 1
++#define RCU_SCHEDULER_RUNNING 2
++
+ /*
+ * init_rcu_head_on_stack()/destroy_rcu_head_on_stack() are needed for dynamic
+ * initialization and destruction of rcu_head on the stack. rcu_head structures
+--- a/kernel/rcu/rcu.h
++++ b/kernel/rcu/rcu.h
+@@ -136,6 +136,7 @@ int rcu_jiffies_till_stall_check(void);
+ #define TPS(x) tracepoint_string(x)
+
+ void rcu_early_boot_tests(void);
++void rcu_test_sync_prims(void);
+
+ /*
+ * This function really isn't for public consumption, but RCU is special in
+--- a/kernel/rcu/tiny_plugin.h
++++ b/kernel/rcu/tiny_plugin.h
+@@ -60,12 +60,17 @@ EXPORT_SYMBOL_GPL(rcu_scheduler_active);
+
+ /*
+ * During boot, we forgive RCU lockdep issues. After this function is
+- * invoked, we start taking RCU lockdep issues seriously.
++ * invoked, we start taking RCU lockdep issues seriously. Note that unlike
++ * Tree RCU, Tiny RCU transitions directly from RCU_SCHEDULER_INACTIVE
++ * to RCU_SCHEDULER_RUNNING, skipping the RCU_SCHEDULER_INIT stage.
++ * The reason for this is that Tiny RCU does not need kthreads, so does
++ * not have to care about the fact that the scheduler is half-initialized
++ * at a certain phase of the boot process.
+ */
+ void __init rcu_scheduler_starting(void)
+ {
+ WARN_ON(nr_context_switches() > 0);
+- rcu_scheduler_active = 1;
++ rcu_scheduler_active = RCU_SCHEDULER_RUNNING;
+ }
+
+ #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
+--- a/kernel/rcu/tree.c
++++ b/kernel/rcu/tree.c
+@@ -127,13 +127,16 @@ int rcu_num_nodes __read_mostly = NUM_RC
+ int sysctl_panic_on_rcu_stall __read_mostly;
+
+ /*
+- * The rcu_scheduler_active variable transitions from zero to one just
+- * before the first task is spawned. So when this variable is zero, RCU
+- * can assume that there is but one task, allowing RCU to (for example)
++ * The rcu_scheduler_active variable is initialized to the value
++ * RCU_SCHEDULER_INACTIVE and transitions to RCU_SCHEDULER_INIT just before the
++ * first task is spawned. So when this variable is RCU_SCHEDULER_INACTIVE,
++ * RCU can assume that there is but one task, allowing RCU to (for example)
+ * optimize synchronize_rcu() to a simple barrier(). When this variable
+- * is one, RCU must actually do all the hard work required to detect real
+- * grace periods. This variable is also used to suppress boot-time false
+- * positives from lockdep-RCU error checking.
++ * is RCU_SCHEDULER_INIT, RCU must actually do all the hard work required
++ * to detect real grace periods. This variable is also used to suppress
++ * boot-time false positives from lockdep-RCU error checking. Finally, it
++ * transitions from RCU_SCHEDULER_INIT to RCU_SCHEDULER_RUNNING after RCU
++ * is fully initialized, including all of its kthreads having been spawned.
+ */
+ int rcu_scheduler_active __read_mostly;
+ EXPORT_SYMBOL_GPL(rcu_scheduler_active);
+@@ -3985,18 +3988,22 @@ static int __init rcu_spawn_gp_kthread(v
+ early_initcall(rcu_spawn_gp_kthread);
+
+ /*
+- * This function is invoked towards the end of the scheduler's initialization
+- * process. Before this is called, the idle task might contain
+- * RCU read-side critical sections (during which time, this idle
+- * task is booting the system). After this function is called, the
+- * idle tasks are prohibited from containing RCU read-side critical
+- * sections. This function also enables RCU lockdep checking.
++ * This function is invoked towards the end of the scheduler's
++ * initialization process. Before this is called, the idle task might
++ * contain synchronous grace-period primitives (during which time, this idle
++ * task is booting the system, and such primitives are no-ops). After this
++ * function is called, any synchronous grace-period primitives are run as
++ * expedited, with the requesting task driving the grace period forward.
++ * A later core_initcall() rcu_exp_runtime_mode() will switch to full
++ * runtime RCU functionality.
+ */
+ void rcu_scheduler_starting(void)
+ {
+ WARN_ON(num_online_cpus() != 1);
+ WARN_ON(nr_context_switches() > 0);
+- rcu_scheduler_active = 1;
++ rcu_test_sync_prims();
++ rcu_scheduler_active = RCU_SCHEDULER_INIT;
++ rcu_test_sync_prims();
+ }
+
+ /*
+--- a/kernel/rcu/tree_exp.h
++++ b/kernel/rcu/tree_exp.h
+@@ -522,18 +522,28 @@ struct rcu_exp_work {
+ };
+
+ /*
++ * Common code to drive an expedited grace period forward, used by
++ * workqueues and mid-boot-time tasks.
++ */
++static void rcu_exp_sel_wait_wake(struct rcu_state *rsp,
++ smp_call_func_t func, unsigned long s)
++{
++ /* Initialize the rcu_node tree in preparation for the wait. */
++ sync_rcu_exp_select_cpus(rsp, func);
++
++ /* Wait and clean up, including waking everyone. */
++ rcu_exp_wait_wake(rsp, s);
++}
++
++/*
+ * Work-queue handler to drive an expedited grace period forward.
+ */
+ static void wait_rcu_exp_gp(struct work_struct *wp)
+ {
+ struct rcu_exp_work *rewp;
+
+- /* Initialize the rcu_node tree in preparation for the wait. */
+ rewp = container_of(wp, struct rcu_exp_work, rew_work);
+- sync_rcu_exp_select_cpus(rewp->rew_rsp, rewp->rew_func);
+-
+- /* Wait and clean up, including waking everyone. */
+- rcu_exp_wait_wake(rewp->rew_rsp, rewp->rew_s);
++ rcu_exp_sel_wait_wake(rewp->rew_rsp, rewp->rew_func, rewp->rew_s);
+ }
+
+ /*
+@@ -559,12 +569,18 @@ static void _synchronize_rcu_expedited(s
+ if (exp_funnel_lock(rsp, s))
+ return; /* Someone else did our work for us. */
+
+- /* Marshall arguments and schedule the expedited grace period. */
+- rew.rew_func = func;
+- rew.rew_rsp = rsp;
+- rew.rew_s = s;
+- INIT_WORK_ONSTACK(&rew.rew_work, wait_rcu_exp_gp);
+- schedule_work(&rew.rew_work);
++ /* Ensure that load happens before action based on it. */
++ if (unlikely(rcu_scheduler_active == RCU_SCHEDULER_INIT)) {
++ /* Direct call during scheduler init and early_initcalls(). */
++ rcu_exp_sel_wait_wake(rsp, func, s);
++ } else {
++ /* Marshall arguments & schedule the expedited grace period. */
++ rew.rew_func = func;
++ rew.rew_rsp = rsp;
++ rew.rew_s = s;
++ INIT_WORK_ONSTACK(&rew.rew_work, wait_rcu_exp_gp);
++ schedule_work(&rew.rew_work);
++ }
+
+ /* Wait for expedited grace period to complete. */
+ rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id());
+@@ -666,6 +682,8 @@ void synchronize_rcu_expedited(void)
+ {
+ struct rcu_state *rsp = rcu_state_p;
+
++ if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE)
++ return;
+ _synchronize_rcu_expedited(rsp, sync_rcu_exp_handler);
+ }
+ EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
+@@ -683,3 +701,15 @@ void synchronize_rcu_expedited(void)
+ EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
+
+ #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
++
++/*
++ * Switch to run-time mode once Tree RCU has fully initialized.
++ */
++static int __init rcu_exp_runtime_mode(void)
++{
++ rcu_test_sync_prims();
++ rcu_scheduler_active = RCU_SCHEDULER_RUNNING;
++ rcu_test_sync_prims();
++ return 0;
++}
++core_initcall(rcu_exp_runtime_mode);
+--- a/kernel/rcu/tree_plugin.h
++++ b/kernel/rcu/tree_plugin.h
+@@ -670,7 +670,7 @@ void synchronize_rcu(void)
+ lock_is_held(&rcu_lock_map) ||
+ lock_is_held(&rcu_sched_lock_map),
+ "Illegal synchronize_rcu() in RCU read-side critical section");
+- if (!rcu_scheduler_active)
++ if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE)
+ return;
+ if (rcu_gp_is_expedited())
+ synchronize_rcu_expedited();
+--- a/kernel/rcu/update.c
++++ b/kernel/rcu/update.c
+@@ -121,11 +121,14 @@ EXPORT_SYMBOL(rcu_read_lock_sched_held);
+ * Should expedited grace-period primitives always fall back to their
+ * non-expedited counterparts? Intended for use within RCU. Note
+ * that if the user specifies both rcu_expedited and rcu_normal, then
+- * rcu_normal wins.
++ * rcu_normal wins. (Except during the window of boot from when the
++ * first task is spawned until the rcu_exp_runtime_mode()
++ * core_initcall() is invoked, during which everything is expedited.)
+ */
+ bool rcu_gp_is_normal(void)
+ {
+- return READ_ONCE(rcu_normal);
++ return READ_ONCE(rcu_normal) &&
++ rcu_scheduler_active != RCU_SCHEDULER_INIT;
+ }
+ EXPORT_SYMBOL_GPL(rcu_gp_is_normal);
+
+@@ -135,13 +138,14 @@ static atomic_t rcu_expedited_nesting =
+ /*
+ * Should normal grace-period primitives be expedited? Intended for
+ * use within RCU. Note that this function takes the rcu_expedited
+- * sysfs/boot variable into account as well as the rcu_expedite_gp()
+- * nesting. So looping on rcu_unexpedite_gp() until rcu_gp_is_expedited()
+- * returns false is a -really- bad idea.
++ * sysfs/boot variable and rcu_scheduler_active into account as well
++ * as the rcu_expedite_gp() nesting. So looping on rcu_unexpedite_gp()
++ * until rcu_gp_is_expedited() returns false is a -really- bad idea.
+ */
+ bool rcu_gp_is_expedited(void)
+ {
+- return rcu_expedited || atomic_read(&rcu_expedited_nesting);
++ return rcu_expedited || atomic_read(&rcu_expedited_nesting) ||
++ rcu_scheduler_active == RCU_SCHEDULER_INIT;
+ }
+ EXPORT_SYMBOL_GPL(rcu_gp_is_expedited);
+
+@@ -257,7 +261,7 @@ EXPORT_SYMBOL_GPL(rcu_callback_map);
+
+ int notrace debug_lockdep_rcu_enabled(void)
+ {
+- return rcu_scheduler_active && debug_locks &&
++ return rcu_scheduler_active != RCU_SCHEDULER_INACTIVE && debug_locks &&
+ current->lockdep_recursion == 0;
+ }
+ EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled);
+@@ -591,7 +595,7 @@ EXPORT_SYMBOL_GPL(call_rcu_tasks);
+ void synchronize_rcu_tasks(void)
+ {
+ /* Complain if the scheduler has not started. */
+- RCU_LOCKDEP_WARN(!rcu_scheduler_active,
++ RCU_LOCKDEP_WARN(rcu_scheduler_active == RCU_SCHEDULER_INACTIVE,
+ "synchronize_rcu_tasks called too soon");
+
+ /* Wait for the grace period. */
+@@ -813,6 +817,23 @@ static void rcu_spawn_tasks_kthread(void
+
+ #endif /* #ifdef CONFIG_TASKS_RCU */
+
++/*
++ * Test each non-SRCU synchronous grace-period wait API. This is
++ * useful just after a change in mode for these primitives, and
++ * during early boot.
++ */
++void rcu_test_sync_prims(void)
++{
++ if (!IS_ENABLED(CONFIG_PROVE_RCU))
++ return;
++ synchronize_rcu();
++ synchronize_rcu_bh();
++ synchronize_sched();
++ synchronize_rcu_expedited();
++ synchronize_rcu_bh_expedited();
++ synchronize_sched_expedited();
++}
++
+ #ifdef CONFIG_PROVE_RCU
+
+ /*
+@@ -865,6 +886,7 @@ void rcu_early_boot_tests(void)
+ early_boot_test_call_rcu_bh();
+ if (rcu_self_test_sched)
+ early_boot_test_call_rcu_sched();
++ rcu_test_sync_prims();
+ }
+
+ static int rcu_verify_early_boot_tests(void)