rcu: Document concurrent quiescent state reporting for offline CPUs
author    Joel Fernandes <joelagnelf@nvidia.com>
          Tue, 15 Jul 2025 20:01:55 +0000 (16:01 -0400)
committer Neeraj Upadhyay <neeraj.iitr10@gmail.com>
          Tue, 22 Jul 2025 11:40:50 +0000 (17:10 +0530)
The synchronization of CPU offlining with GP initialization is confusing,
to put it mildly (rightfully so, as the issue it deals with is complex).
Recent discussions brought up a question -- what prevents
rcu_implicit_dyntick_qs() from warning about QS reports for offline CPUs
(missing QS reports for offline CPUs would cause indefinite GP hangs)?

QS reporting for now-offline CPUs should only happen from:
- rcu_gp_init()
- rcutree_report_cpu_dead()

Add documentation on this, and refer to it from comments in the code
explaining how QS reporting is not missed when these functions run
concurrently.

I referred heavily to this post [1] about the need for the ofl_lock.
[1] https://lore.kernel.org/all/20180924164443.GF4222@linux.ibm.com/

[ Applied paulmck feedback on moving documentation to Requirements.rst ]

Link: https://lore.kernel.org/all/01b4d228-9416-43f8-a62e-124b92e8741a@paulmck-laptop/
Co-developed-by: "Paul E. McKenney" <paulmck@kernel.org>
Signed-off-by: "Paul E. McKenney" <paulmck@kernel.org>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Joel Fernandes <joelagnelf@nvidia.com>
Signed-off-by: Neeraj Upadhyay (AMD) <neeraj.upadhyay@kernel.org>
Documentation/RCU/Design/Requirements/Requirements.rst
kernel/rcu/tree.c

index 771a1ce4c84d476774ba1fb4b593761f241dc0c0..b0395540296b0075784d5f7659b13bf33c225c04 100644
@@ -2011,6 +2011,93 @@ after the CPU hotplug scanning.
 By incrementing gp_seq first, CPU1's RCU read-side critical section
 is guaranteed to not be missed by CPU2.
 
+**Concurrent Quiescent State Reporting for Offline CPUs**
+
+RCU must ensure that CPUs going offline report quiescent states to avoid
+blocking grace periods. This requires careful synchronization to handle
+race conditions.
+
+**Race condition causing an offline CPU to hang the GP**
+
+A race between CPU offlining and new GP initialization (rcu_gp_init()) may
+occur because `rcu_report_qs_rnp()` in `rcutree_report_cpu_dead()` must
+temporarily release the `rcu_node` lock to wake the RCU grace-period kthread:
+
+.. code-block:: none
+
+   CPU1 (going offline)                 CPU0 (GP kthread)
+   --------------------                 -----------------
+   rcutree_report_cpu_dead()
+     rcu_report_qs_rnp()
+       // Must release rnp->lock to wake GP kthread
+       raw_spin_unlock_irqrestore_rcu_node()
+                                        // Wakes up and starts new GP
+                                        rcu_gp_init()
+                                          // First loop:
+                                          copies qsmaskinitnext->qsmaskinit
+                                          // CPU1 still in qsmaskinitnext!
+
+                                          // Second loop:
+                                          rnp->qsmask = rnp->qsmaskinit
+                                          mask = rnp->qsmask & ~rnp->qsmaskinitnext
+                                          // mask is 0! CPU1 still in both masks
+       // Reacquire lock (but too late)
+     rnp->qsmaskinitnext &= ~mask       // Finally clears bit
+
+Without `ofl_lock`, the new grace period includes the offline CPU and waits
+forever for its quiescent state, causing a GP hang.
+
+**A solution with ofl_lock**
+
+The `ofl_lock` (offline lock) prevents `rcu_gp_init()` from running during
+the vulnerable window when `rcu_report_qs_rnp()` has released `rnp->lock`:
+
+.. code-block:: none
+
+   CPU0 (rcu_gp_init)                   CPU1 (rcutree_report_cpu_dead)
+   ------------------                   ------------------------------
+   rcu_for_each_leaf_node(rnp) {
+       arch_spin_lock(&ofl_lock) -----> arch_spin_lock(&ofl_lock) [BLOCKED]
+
+       // Safe: CPU1 can't interfere
+       rnp->qsmaskinit = rnp->qsmaskinitnext
+
+       arch_spin_unlock(&ofl_lock) ---> // Now CPU1 can proceed
+   }                                    // But snapshot already taken
+
+**Another race causing GP hangs in rcu_gp_init(): Reporting QS for Now-offline CPUs**
+
+After the first loop takes an atomic snapshot of online CPUs, as shown above,
+the second loop in `rcu_gp_init()` detects CPUs that went offline between
+releasing `ofl_lock` and acquiring the per-node `rnp->lock`. This detection is
+crucial because:
+
+1. The CPU might have gone offline after the snapshot but before the second loop.
+2. The offline CPU cannot report its own QS if it's already dead.
+3. Without this detection, the grace period would wait forever for CPUs that
+   are now offline.
+
+The second loop performs this detection safely:
+
+.. code-block:: none
+
+   rcu_for_each_node_breadth_first(rnp) {
+       raw_spin_lock_irqsave_rcu_node(rnp, flags);
+       rnp->qsmask = rnp->qsmaskinit;  // Apply the snapshot
+
+       // Detect CPUs offline after snapshot
+       mask = rnp->qsmask & ~rnp->qsmaskinitnext;
+
+       if (mask && rcu_is_leaf_node(rnp))
+           rcu_report_qs_rnp(mask, ...)  // Report QS for offline CPUs
+   }
+
+This approach ensures atomicity: quiescent state reporting for offline CPUs
+happens either in `rcu_gp_init()` (second loop) or in
+`rcutree_report_cpu_dead()`, never both and never neither. Holding `rnp->lock`
+throughout the sequence prevents races: `rcutree_report_cpu_dead()` also
+acquires this lock when clearing `qsmaskinitnext`, ensuring mutual exclusion.
+
 Scheduler and RCU
 ~~~~~~~~~~~~~~~~~
 
index b206db119546ef46351e21ddc051f5490692573f..dd282a984fec0c95677563b5e23d389352f53a57 100644
@@ -1886,6 +1886,10 @@ static noinline_for_stack bool rcu_gp_init(void)
        /* Exclude CPU hotplug operations. */
        rcu_for_each_leaf_node(rnp) {
                local_irq_disable();
+               /*
+                * Serialize with CPU offline. See Requirements.rst > Hotplug CPU >
+                * Concurrent Quiescent State Reporting for Offline CPUs.
+                */
                arch_spin_lock(&rcu_state.ofl_lock);
                raw_spin_lock_rcu_node(rnp);
                if (rnp->qsmaskinit == rnp->qsmaskinitnext &&
@@ -1960,7 +1964,12 @@ static noinline_for_stack bool rcu_gp_init(void)
                trace_rcu_grace_period_init(rcu_state.name, rnp->gp_seq,
                                            rnp->level, rnp->grplo,
                                            rnp->grphi, rnp->qsmask);
-               /* Quiescent states for tasks on any now-offline CPUs. */
+               /*
+                * Quiescent states for tasks on any now-offline CPUs. Since we
+                * released the ofl and rnp lock before this loop, CPUs might
+                * have gone offline and we have to report QS on their behalf.
+                * See Requirements.rst > Hotplug CPU > Concurrent QS Reporting.
+                */
                mask = rnp->qsmask & ~rnp->qsmaskinitnext;
                rnp->rcu_gp_init_mask = mask;
                if ((mask || rnp->wait_blkd_tasks) && rcu_is_leaf_node(rnp))
@@ -4386,6 +4395,13 @@ void rcutree_report_cpu_dead(void)
 
        /* Remove outgoing CPU from mask in the leaf rcu_node structure. */
        mask = rdp->grpmask;
+
+       /*
+        * Hold the ofl_lock and rnp lock to avoid races between the CPU
+        * going offline (and reporting a QS, as below) and rcu_gp_init().
+        * See Requirements.rst > Hotplug CPU > Concurrent QS Reporting section
+        * for more details.
+        */
        arch_spin_lock(&rcu_state.ofl_lock);
        raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */
        rdp->rcu_ofl_gp_seq = READ_ONCE(rcu_state.gp_seq);
@@ -4396,6 +4412,7 @@ void rcutree_report_cpu_dead(void)
                rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags);
                raw_spin_lock_irqsave_rcu_node(rnp, flags);
        }
+       /* Clear from ->qsmaskinitnext to mark offline. */
        WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext & ~mask);
        raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
        arch_spin_unlock(&rcu_state.ofl_lock);
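
To see the documented snapshot-and-detect pattern outside the kernel, the
following is a minimal userspace sketch using POSIX threads. It is
illustrative only and not part of the patch: the names ofl_lock, node_lock,
qsmaskinit, qsmaskinitnext and qsmask mirror the RCU fields by analogy, and
a single "node" stands in for the whole rcu_node tree.

.. code-block:: c

   /* Build: cc -pthread sketch.c */
   #include <pthread.h>
   #include <stdint.h>
   #include <stdio.h>

   #define NCPUS 4

   static pthread_mutex_t ofl_lock  = PTHREAD_MUTEX_INITIALIZER; /* ~ rcu_state.ofl_lock */
   static pthread_mutex_t node_lock = PTHREAD_MUTEX_INITIALIZER; /* ~ rnp->lock */

   static uint32_t qsmaskinitnext = (1u << NCPUS) - 1; /* currently-online CPUs */
   static uint32_t qsmaskinit;                         /* snapshot taken at GP start */
   static uint32_t qsmask;                             /* CPUs this GP still waits on */

   /* GP "kthread": snapshot online CPUs, then detect any that left in between. */
   static void *gp_init(void *arg)
   {
       uint32_t mask;

       /* First loop: take the snapshot while excluding concurrent offlining. */
       pthread_mutex_lock(&ofl_lock);
       pthread_mutex_lock(&node_lock);
       qsmaskinit = qsmaskinitnext;
       pthread_mutex_unlock(&node_lock);
       pthread_mutex_unlock(&ofl_lock);

       /* Second loop: apply the snapshot and report QS for now-offline CPUs. */
       pthread_mutex_lock(&node_lock);
       qsmask = qsmaskinit;
       mask = qsmask & ~qsmaskinitnext; /* went offline after the snapshot */
       qsmask &= ~mask;                 /* QS reported on their behalf */
       pthread_mutex_unlock(&node_lock);
       return NULL;
   }

   /* Offline path: report a QS for the dying CPU, then mark it offline. */
   static void *report_cpu_dead(void *arg)
   {
       uint32_t mask = 1u << (uintptr_t)arg;

       pthread_mutex_lock(&ofl_lock);   /* excludes gp_init()'s snapshot */
       pthread_mutex_lock(&node_lock);
       qsmask &= ~mask;                 /* QS report, in case a GP is waiting */
       qsmaskinitnext &= ~mask;         /* CPU is now officially offline */
       pthread_mutex_unlock(&node_lock);
       pthread_mutex_unlock(&ofl_lock);
       return NULL;
   }

   int main(void)
   {
       pthread_t gp, dead;

       pthread_create(&dead, NULL, report_cpu_dead, (void *)3UL); /* CPU 3 dies */
       pthread_create(&gp, NULL, gp_init, NULL);
       pthread_join(dead, NULL);
       pthread_join(gp, NULL);

       /* In every interleaving, no grace period is left waiting on an offline CPU. */
       printf("qsmask=%#x qsmaskinitnext=%#x -> %s\n",
              (unsigned)qsmask, (unsigned)qsmaskinitnext,
              (qsmask & ~qsmaskinitnext) ? "GP would hang" : "no offline CPU pending");
       return 0;
   }

Note that the sketch keeps the QS report and the qsmaskinitnext update in one
critical section, so it never reproduces the hang; the kernel cannot do the
same because rcu_report_qs_rnp() must drop rnp->lock to wake the GP kthread,
and that window is exactly what ofl_lock covers.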