git.ipfire.org Git - thirdparty/kernel/linux.git/commitdiff
rcu: Use an intermediate irq_work to start process_srcu()
author: Boqun Feng <boqun@kernel.org>
Thu, 19 Mar 2026 00:56:21 +0000 (17:56 -0700)
committer: Boqun Feng <boqun@kernel.org>
Wed, 25 Mar 2026 15:59:59 +0000 (08:59 -0700)
Since commit c27cea4416a3 ("rcu: Re-implement RCU Tasks Trace in terms
of SRCU-fast") we switched to SRCU in BPF. However as BPF instrumentation can
happen basically everywhere (including where a scheduler lock is held),
call_srcu() now needs to avoid acquiring the scheduler lock because
otherwise it could cause deadlock [1]. Fix this by following what the
previous RCU Tasks Trace did: using an irq_work to delay the queuing of
the work to start process_srcu().

[boqun: Apply Joel's feedback]
[boqun: Apply Andrea's test feedback]

Reported-by: Andrea Righi <arighi@nvidia.com>
Closes: https://lore.kernel.org/all/abjzvz_tL_siV17s@gpd4/
Fixes: c27cea4416a3 ("rcu: Re-implement RCU Tasks Trace in terms of SRCU-fast")
Link: https://lore.kernel.org/rcu/3c4c5a29-24ea-492d-aeee-e0d9605b4183@nvidia.com/
Suggested-by: Zqiang <qiang.zhang@linux.dev>
Tested-by: Andrea Righi <arighi@nvidia.com>
Tested-by: Paul E. McKenney <paulmck@kernel.org>
Tested-by: Joel Fernandes <joelagnelf@nvidia.com>
Signed-off-by: Boqun Feng <boqun@kernel.org>
include/linux/srcutree.h
kernel/rcu/srcutree.c

index dfb31d11ff05f06eca6858294bc64ef00fc2b688..be76fa4fc1700c51bd625de7703b96f38459a39f 100644 (file)
@@ -95,6 +95,7 @@ struct srcu_usage {
        unsigned long reschedule_jiffies;
        unsigned long reschedule_count;
        struct delayed_work work;
+       struct irq_work irq_work;
        struct srcu_struct *srcu_ssp;
 };
 
index 678bd9a73875b20b2ef820ea49c94140ffd1e1f3..0d01cd8c4b4a7bb9da0a39d4305e8bfef61898d6 100644 (file)
@@ -19,6 +19,7 @@
 #include <linux/mutex.h>
 #include <linux/percpu.h>
 #include <linux/preempt.h>
+#include <linux/irq_work.h>
 #include <linux/rcupdate_wait.h>
 #include <linux/sched.h>
 #include <linux/smp.h>
@@ -75,6 +76,7 @@ static bool __read_mostly srcu_init_done;
 static void srcu_invoke_callbacks(struct work_struct *work);
 static void srcu_reschedule(struct srcu_struct *ssp, unsigned long delay);
 static void process_srcu(struct work_struct *work);
+static void srcu_irq_work(struct irq_work *work);
 static void srcu_delay_timer(struct timer_list *t);
 
 /*
@@ -216,6 +218,7 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static)
        mutex_init(&ssp->srcu_sup->srcu_barrier_mutex);
        atomic_set(&ssp->srcu_sup->srcu_barrier_cpu_cnt, 0);
        INIT_DELAYED_WORK(&ssp->srcu_sup->work, process_srcu);
+       init_irq_work(&ssp->srcu_sup->irq_work, srcu_irq_work);
        ssp->srcu_sup->sda_is_static = is_static;
        if (!is_static) {
                ssp->sda = alloc_percpu(struct srcu_data);
@@ -716,6 +719,8 @@ void cleanup_srcu_struct(struct srcu_struct *ssp)
                return; /* Just leak it! */
        if (WARN_ON(srcu_readers_active(ssp)))
                return; /* Just leak it! */
+       /* Wait for irq_work to finish first as it may queue a new work. */
+       irq_work_sync(&sup->irq_work);
        flush_delayed_work(&sup->work);
        for_each_possible_cpu(cpu) {
                struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu);
@@ -1121,9 +1126,13 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp,
                // it isn't.  And it does not have to be.  After all, it
                // can only be executed during early boot when there is only
                // the one boot CPU running with interrupts still disabled.
+               //
+               // Use an irq_work here to avoid acquiring runqueue lock with
+               // srcu rcu_node::lock held. BPF instrument could introduce the
+               // opposite dependency, hence we need to break the possible
+               // locking dependency here.
                if (likely(srcu_init_done))
-                       queue_delayed_work(rcu_gp_wq, &sup->work,
-                                          !!srcu_get_delay(ssp));
+                       irq_work_queue(&sup->irq_work);
                else if (list_empty(&sup->work.work.entry))
                        list_add(&sup->work.work.entry, &srcu_boot_list);
        }
@@ -1982,6 +1991,23 @@ static void process_srcu(struct work_struct *work)
        srcu_reschedule(ssp, curdelay);
 }
 
+static void srcu_irq_work(struct irq_work *work)
+{
+       struct srcu_struct *ssp;
+       struct srcu_usage *sup;
+       unsigned long delay;
+       unsigned long flags;
+
+       sup = container_of(work, struct srcu_usage, irq_work);
+       ssp = sup->srcu_ssp;
+
+       raw_spin_lock_irqsave_rcu_node(ssp->srcu_sup, flags);
+       delay = srcu_get_delay(ssp);
+       raw_spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags);
+
+       queue_delayed_work(rcu_gp_wq, &sup->work, !!delay);
+}
+
 void srcutorture_get_gp_data(struct srcu_struct *ssp, int *flags,
                             unsigned long *gp_seq)
 {