From f84c9dd34e8dce3fb42598344da711573b383626 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Fri, 6 Feb 2026 03:18:01 -0800 Subject: [PATCH] workqueue: add time-based panic for stalls Add a new module parameter 'panic_on_stall_time' that triggers a panic when a workqueue stall persists for longer than the specified duration in seconds. Unlike 'panic_on_stall' which counts accumulated stall events, this parameter triggers based on the duration of a single continuous stall. This is useful for catching truly stuck workqueues rather than accumulating transient stalls. Usage: workqueue.panic_on_stall_time=120 This would panic if any workqueue pool has been stalled for 120 seconds or more. The stall duration is measured from the workqueue's last progress (pool_ts), which accounts for legitimate system stalls. Signed-off-by: Breno Leitao Signed-off-by: Tejun Heo --- .../admin-guide/kernel-parameters.txt | 8 +++++++ kernel/workqueue.c | 22 +++++++++++++++---- 2 files changed, 26 insertions(+), 4 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 5a9df399dd722..3fe4672d1bb92 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -8339,6 +8339,14 @@ Kernel parameters The default is set by CONFIG_BOOTPARAM_WQ_STALL_PANIC, which is 0 (disabled) if not configured. + workqueue.panic_on_stall_time= + Panic when a workqueue stall has been continuous for + the specified number of seconds. Unlike panic_on_stall + which counts accumulated stall events, this triggers + based on the duration of a single continuous stall. + + The default is 0, which disables the time-based panic. 
+ workqueue.cpu_intensive_thresh_us= Per-cpu work items which run for longer than this threshold are automatically considered CPU intensive diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 2e7fd46fce170..68e664d7dbecf 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -7571,6 +7571,10 @@ static DEFINE_PER_CPU(unsigned long, wq_watchdog_touched_cpu) = INITIAL_JIFFIES; static unsigned int wq_panic_on_stall = CONFIG_BOOTPARAM_WQ_STALL_PANIC; module_param_named(panic_on_stall, wq_panic_on_stall, uint, 0644); +static unsigned int wq_panic_on_stall_time; +module_param_named(panic_on_stall_time, wq_panic_on_stall_time, uint, 0644); +MODULE_PARM_DESC(panic_on_stall_time, "Panic if stall exceeds this many seconds (0=disabled)"); + /* * Show workers that might prevent the processing of pending work items. * The only candidates are CPU-bound workers in the running state. @@ -7622,7 +7626,12 @@ static void show_cpu_pools_hogs(void) rcu_read_unlock(); } -static void panic_on_wq_watchdog(void) +/* + * It triggers a panic in two scenarios: when the total number of stalls + * exceeds a threshold, and when a stall lasts longer than + * wq_panic_on_stall_time + */ +static void panic_on_wq_watchdog(unsigned int stall_time_sec) { static unsigned int wq_stall; @@ -7630,6 +7639,8 @@ static void panic_on_wq_watchdog(void) wq_stall++; BUG_ON(wq_stall >= wq_panic_on_stall); } + + BUG_ON(wq_panic_on_stall_time && stall_time_sec >= wq_panic_on_stall_time); } static void wq_watchdog_reset_touched(void) @@ -7644,10 +7655,12 @@ static void wq_watchdog_reset_touched(void) static void wq_watchdog_timer_fn(struct timer_list *unused) { unsigned long thresh = READ_ONCE(wq_watchdog_thresh) * HZ; + unsigned int max_stall_time = 0; bool lockup_detected = false; bool cpu_pool_stall = false; unsigned long now = jiffies; struct worker_pool *pool; + unsigned int stall_time; int pi; if (!thresh) @@ -7681,14 +7694,15 @@ static void wq_watchdog_timer_fn(struct timer_list *unused) /* did we 
stall? */ if (time_after(now, ts + thresh)) { lockup_detected = true; + stall_time = jiffies_to_msecs(now - pool_ts) / 1000; + max_stall_time = max(max_stall_time, stall_time); if (pool->cpu >= 0 && !(pool->flags & POOL_BH)) { pool->cpu_stall = true; cpu_pool_stall = true; } pr_emerg("BUG: workqueue lockup - pool"); pr_cont_pool_info(pool); - pr_cont(" stuck for %us!\n", - jiffies_to_msecs(now - pool_ts) / 1000); + pr_cont(" stuck for %us!\n", stall_time); } @@ -7701,7 +7715,7 @@ static void wq_watchdog_timer_fn(struct timer_list *unused) show_cpu_pools_hogs(); if (lockup_detected) - panic_on_wq_watchdog(); + panic_on_wq_watchdog(max_stall_time); wq_watchdog_reset_touched(); mod_timer(&wq_watchdog_timer, jiffies + thresh); -- 2.47.3