rcu: Document concurrent quiescent state reporting for offline CPUs
author    Joel Fernandes <joelagnelf@nvidia.com>
          Tue, 15 Jul 2025 20:01:55 +0000 (16:01 -0400)
committer Neeraj Upadhyay <neeraj.iitr10@gmail.com>
          Tue, 22 Jul 2025 11:40:50 +0000 (17:10 +0530)
The synchronization of CPU offlining with GP initialization is confusing,
to put it mildly (rightfully so, as the issue it deals with is complex).
Recent discussions brought up a question -- what prevents
rcu_implicit_dyntick_qs() from warning about QS reports for offline CPUs
(missing QS reports for offline CPUs would cause indefinite GP hangs)?

QS reporting for now-offline CPUs should only happen from:
- rcu_gp_init()
- rcutree_report_cpu_dead()

Add documentation on this, and refer to it from comments in the code
explaining how QS reporting is not missed when these functions run
concurrently.

I referred heavily to this post [1] about the need for the ofl_lock.
[1] https://lore.kernel.org/all/20180924164443.GF4222@linux.ibm.com/

[ Applied paulmck feedback on moving documentation to Requirements.rst ]

Link: https://lore.kernel.org/all/01b4d228-9416-43f8-a62e-124b92e8741a@paulmck-laptop/
Co-developed-by: "Paul E. McKenney" <paulmck@kernel.org>
Signed-off-by: "Paul E. McKenney" <paulmck@kernel.org>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Joel Fernandes <joelagnelf@nvidia.com>
Signed-off-by: Neeraj Upadhyay (AMD) <neeraj.upadhyay@kernel.org>
Documentation/RCU/Design/Requirements/Requirements.rst
kernel/rcu/tree.c

index 771a1ce4c84d476774ba1fb4b593761f241dc0c0..b0395540296b0075784d5f7659b13bf33c225c04 100644
@@ -2011,6 +2011,93 @@ after the CPU hotplug scanning.
 By incrementing gp_seq first, CPU1's RCU read-side critical section
 is guaranteed to not be missed by CPU2.
 
+**Concurrent Quiescent State Reporting for Offline CPUs**
+
+RCU must ensure that CPUs going offline report quiescent states to avoid
+blocking grace periods. This requires careful synchronization to handle
+race conditions.
+
+**Race condition causing an offline CPU to hang the GP**
+
+A race between CPU offlining and new GP initialization (rcu_gp_init()) may
+occur because `rcu_report_qs_rnp()` in `rcutree_report_cpu_dead()` must
+temporarily release the `rcu_node` lock to wake the RCU grace-period kthread:
+
+.. code-block:: none
+
+   CPU1 (going offline)                 CPU0 (GP kthread)
+   --------------------                 -----------------
+   rcutree_report_cpu_dead()
+     rcu_report_qs_rnp()
+       // Must release rnp->lock to wake GP kthread
+       raw_spin_unlock_irqrestore_rcu_node()
+                                        // Wakes up and starts new GP
+                                        rcu_gp_init()
+                                          // First loop:
+                                          copies qsmaskinitnext->qsmaskinit
+                                          // CPU1 still in qsmaskinitnext!
+
+                                          // Second loop:
+                                          rnp->qsmask = rnp->qsmaskinit
+                                          mask = rnp->qsmask & ~rnp->qsmaskinitnext
+                                          // mask is 0! CPU1 still in both masks
+       // Reacquire lock (but too late)
+     rnp->qsmaskinitnext &= ~mask       // Finally clears bit
+
+Without `ofl_lock`, the new grace period includes the offline CPU and waits
+forever for its quiescent state, causing a GP hang.
+
+**A solution with ofl_lock**
+
+The `ofl_lock` (offline lock) prevents `rcu_gp_init()` from running during
+the vulnerable window when `rcu_report_qs_rnp()` has released `rnp->lock`:
+
+.. code-block:: none
+
+   CPU0 (rcu_gp_init)                   CPU1 (rcutree_report_cpu_dead)
+   ------------------                   ------------------------------
+   rcu_for_each_leaf_node(rnp) {
+       arch_spin_lock(&ofl_lock) -----> arch_spin_lock(&ofl_lock) [BLOCKED]
+
+       // Safe: CPU1 can't interfere
+       rnp->qsmaskinit = rnp->qsmaskinitnext
+
+       arch_spin_unlock(&ofl_lock) ---> // Now CPU1 can proceed
+   }                                    // But snapshot already taken
+
+**Another race causing GP hangs in rcu_gp_init(): Reporting QS for Now-offline CPUs**
+
+After the first loop takes an atomic snapshot of online CPUs, as shown above,
+the second loop in `rcu_gp_init()` detects CPUs that went offline between
+releasing `ofl_lock` and acquiring the per-node `rnp->lock`. This detection is
+crucial because:
+
+1. The CPU might have gone offline after the snapshot but before the second loop.
+2. The offline CPU cannot report its own QS if it's already dead.
+3. Without this detection, the grace period would wait forever for CPUs that
+   are now offline.
+
+The second loop performs this detection safely:
+
+.. code-block:: none
+
+   rcu_for_each_node_breadth_first(rnp) {
+       raw_spin_lock_irqsave_rcu_node(rnp, flags);
+       rnp->qsmask = rnp->qsmaskinit;  // Apply the snapshot
+
+       // Detect CPUs offline after snapshot
+       mask = rnp->qsmask & ~rnp->qsmaskinitnext;
+
+       if (mask && rcu_is_leaf_node(rnp))
+           rcu_report_qs_rnp(mask, ...)  // Report QS for offline CPUs
+   }
+
+This approach ensures atomicity: quiescent state reporting for offline CPUs
+happens either in `rcu_gp_init()` (second loop) or in
+`rcutree_report_cpu_dead()`, never both and never neither. Holding `rnp->lock`
+throughout the sequence prevents races: `rcutree_report_cpu_dead()` also
+acquires this lock when clearing `qsmaskinitnext`, ensuring mutual exclusion.
+
 Scheduler and RCU
 ~~~~~~~~~~~~~~~~~
 
index b206db119546ef46351e21ddc051f5490692573f..dd282a984fec0c95677563b5e23d389352f53a57 100644
@@ -1886,6 +1886,10 @@ static noinline_for_stack bool rcu_gp_init(void)
        /* Exclude CPU hotplug operations. */
        rcu_for_each_leaf_node(rnp) {
                local_irq_disable();
+               /*
+                * Serialize with CPU offline. See Requirements.rst > Hotplug CPU >
+                * Concurrent Quiescent State Reporting for Offline CPUs.
+                */
                arch_spin_lock(&rcu_state.ofl_lock);
                raw_spin_lock_rcu_node(rnp);
                if (rnp->qsmaskinit == rnp->qsmaskinitnext &&
@@ -1960,7 +1964,12 @@ static noinline_for_stack bool rcu_gp_init(void)
                trace_rcu_grace_period_init(rcu_state.name, rnp->gp_seq,
                                            rnp->level, rnp->grplo,
                                            rnp->grphi, rnp->qsmask);
-               /* Quiescent states for tasks on any now-offline CPUs. */
+               /*
+                * Quiescent states for tasks on any now-offline CPUs. Since we
+                * released the ofl and rnp lock before this loop, CPUs might
+                * have gone offline and we have to report QS on their behalf.
+                * See Requirements.rst > Hotplug CPU > Concurrent QS Reporting.
+                */
                mask = rnp->qsmask & ~rnp->qsmaskinitnext;
                rnp->rcu_gp_init_mask = mask;
                if ((mask || rnp->wait_blkd_tasks) && rcu_is_leaf_node(rnp))
@@ -4386,6 +4395,13 @@ void rcutree_report_cpu_dead(void)
 
        /* Remove outgoing CPU from mask in the leaf rcu_node structure. */
        mask = rdp->grpmask;
+
+       /*
+        * Hold the ofl_lock and rnp lock to avoid races between the CPU
+        * going offline (and reporting a QS, as below) and rcu_gp_init().
+        * See Requirements.rst > Hotplug CPU > Concurrent QS Reporting section
+        * for more details.
+        */
        arch_spin_lock(&rcu_state.ofl_lock);
        raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */
        rdp->rcu_ofl_gp_seq = READ_ONCE(rcu_state.gp_seq);
@@ -4396,6 +4412,7 @@ void rcutree_report_cpu_dead(void)
                rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags);
                raw_spin_lock_irqsave_rcu_node(rnp, flags);
        }
+       /* Clear from ->qsmaskinitnext to mark offline. */
        WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext & ~mask);
        raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
        arch_spin_unlock(&rcu_state.ofl_lock);
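
To see the documented snapshot-and-detect pattern outside the kernel, the
following is a minimal userspace sketch using POSIX threads. It is
illustrative only and not part of the patch: the names ofl_lock, node_lock,
qsmaskinit, qsmaskinitnext and qsmask mirror the RCU fields by analogy, and
a single "node" stands in for the whole rcu_node tree.

.. code-block:: c

   /* Build: cc -pthread sketch.c */
   #include <pthread.h>
   #include <stdint.h>
   #include <stdio.h>

   #define NCPUS 4

   static pthread_mutex_t ofl_lock  = PTHREAD_MUTEX_INITIALIZER; /* ~ rcu_state.ofl_lock */
   static pthread_mutex_t node_lock = PTHREAD_MUTEX_INITIALIZER; /* ~ rnp->lock */

   static uint32_t qsmaskinitnext = (1u << NCPUS) - 1; /* currently-online CPUs */
   static uint32_t qsmaskinit;                         /* snapshot taken at GP start */
   static uint32_t qsmask;                             /* CPUs this GP still waits on */

   /* GP "kthread": snapshot online CPUs, then detect any that left in between. */
   static void *gp_init(void *arg)
   {
       uint32_t mask;

       /* First loop: take the snapshot while excluding concurrent offlining. */
       pthread_mutex_lock(&ofl_lock);
       pthread_mutex_lock(&node_lock);
       qsmaskinit = qsmaskinitnext;
       pthread_mutex_unlock(&node_lock);
       pthread_mutex_unlock(&ofl_lock);

       /* Second loop: apply the snapshot and report QS for now-offline CPUs. */
       pthread_mutex_lock(&node_lock);
       qsmask = qsmaskinit;
       mask = qsmask & ~qsmaskinitnext; /* went offline after the snapshot */
       qsmask &= ~mask;                 /* QS reported on their behalf */
       pthread_mutex_unlock(&node_lock);
       return NULL;
   }

   /* Offline path: report a QS for the dying CPU, then mark it offline. */
   static void *report_cpu_dead(void *arg)
   {
       uint32_t mask = 1u << (uintptr_t)arg;

       pthread_mutex_lock(&ofl_lock);   /* excludes gp_init()'s snapshot */
       pthread_mutex_lock(&node_lock);
       qsmask &= ~mask;                 /* QS report, in case a GP is waiting */
       qsmaskinitnext &= ~mask;         /* CPU is now officially offline */
       pthread_mutex_unlock(&node_lock);
       pthread_mutex_unlock(&ofl_lock);
       return NULL;
   }

   int main(void)
   {
       pthread_t gp, dead;

       pthread_create(&dead, NULL, report_cpu_dead, (void *)3UL); /* CPU 3 dies */
       pthread_create(&gp, NULL, gp_init, NULL);
       pthread_join(dead, NULL);
       pthread_join(gp, NULL);

       /* In every interleaving, no grace period is left waiting on an offline CPU. */
       printf("qsmask=%#x qsmaskinitnext=%#x -> %s\n",
              (unsigned)qsmask, (unsigned)qsmaskinitnext,
              (qsmask & ~qsmaskinitnext) ? "GP would hang" : "no offline CPU pending");
       return 0;
   }

Note that the sketch keeps the QS report and the qsmaskinitnext update in one
critical section, so it never reproduces the hang; the kernel cannot do the
same because rcu_report_qs_rnp() must drop rnp->lock to wake the GP kthread,
and that window is exactly what ofl_lock covers.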