]> git.ipfire.org Git - thirdparty/kernel/linux.git/commitdiff
lib/nmi_backtrace: print out the CPUs which fail to respond to NMI
authorFeng Tang <feng.tang@linux.alibaba.com>
Thu, 21 May 2026 03:03:36 +0000 (11:03 +0800)
committerAndrew Morton <akpm@linux-foundation.org>
Fri, 29 May 2026 04:24:59 +0000 (21:24 -0700)
When debugging RCU stall cases, usually all CPUs will respond to the NMI
and print out the backtrace.  But in some nasty or hardware related cases,
some CPUs may fail to respond within 10 seconds, and very likely this is a
sign of severe issues.

Paul McKenney has implemented the NMI backtrace stall check for x86, and
for other architectures, it should also be helpful to at least print out
those CPUs which failed to respond to the NMI, so that users can get an
early heads-up for possible CPU hard stall.

[feng.tang@linux.alibaba.com: avoid hard-coding "10" in two places and in a comment]
Link: https://lore.kernel.org/ag-1ciG0FSomBf7q@U-2FWC9VHC-2323.local
[akpm@linux-foundation.org: use __stringify()]
Link: https://lore.kernel.org/20260521030336.92172-1-feng.tang@linux.alibaba.com
Signed-off-by: Feng Tang <feng.tang@linux.alibaba.com>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Cc: "Paul E . McKenney" <paulmck@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
lib/nmi_backtrace.c

index 33c154264bfe2e9afee85df470574f1a1efe6aa3..a3bfa9360b23df134a96894031c86277e952e344 100644 (file)
@@ -16,6 +16,7 @@
 #include <linux/cpumask.h>
 #include <linux/delay.h>
 #include <linux/kprobes.h>
+#include <linux/stringify.h>
 #include <linux/nmi.h>
 #include <linux/cpu.h>
 #include <linux/sched/debug.h>
@@ -27,6 +28,8 @@ static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly;
 /* "in progress" flag of arch_trigger_cpumask_backtrace */
 static unsigned long backtrace_flag;
 
+#define NMI_BT_TIMEOUT_SEC     10
+
 /*
  * When raise() is called it will be passed a pointer to the
  * backtrace_mask. Architectures that call nmi_cpu_backtrace()
@@ -68,14 +71,20 @@ void nmi_trigger_cpumask_backtrace(const cpumask_t *mask,
                raise(to_cpumask(backtrace_mask));
        }
 
-       /* Wait for up to 10 seconds for all CPUs to do the backtrace */
-       for (i = 0; i < 10 * 1000; i++) {
+       /* Wait for up to NMI_BT_TIMEOUT_SEC seconds for all CPUs to do the backtrace */
+       for (i = 0; i < NMI_BT_TIMEOUT_SEC * 1000; i++) {
                if (cpumask_empty(to_cpumask(backtrace_mask)))
                        break;
                mdelay(1);
                touch_softlockup_watchdog();
        }
-       nmi_backtrace_stall_check(to_cpumask(backtrace_mask));
+
+       if (!cpumask_empty(to_cpumask(backtrace_mask))) {
+               pr_warn("After " __stringify(NMI_BT_TIMEOUT_SEC) " seconds, these CPUS still haven't responded to the NMI: %*pbl\n",
+                       cpumask_pr_args(to_cpumask(backtrace_mask)));
+
+               nmi_backtrace_stall_check(to_cpumask(backtrace_mask));
+       }
 
        /*
         * Force flush any remote buffers that might be stuck in IRQ context