git.ipfire.org Git - thirdparty/linux.git/commitdiff
PM: sleep: Allow configuring the DPM watchdog to warn earlier than panic
author: Douglas Anderson <dianders@chromium.org>
Thu, 9 Jan 2025 20:59:58 +0000 (12:59 -0800)
committer: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Tue, 14 Jan 2025 20:23:57 +0000 (21:23 +0100)
Allow configuring the DPM watchdog to warn about slow suspend/resume
functions without causing a system panic(). This allows you to set the
DPM_WATCHDOG_WARNING_TIMEOUT to something like 5 or 10 seconds to get
warnings about slow suspend/resume functions that eventually succeed.

Signed-off-by: Douglas Anderson <dianders@chromium.org>
Reviewed-by: Tomasz Figa <tfiga@chromium.org>
Link: https://patch.msgid.link/20250109125957.v2.1.I4554f931b8da97948f308ecc651b124338ee9603@changeid
[ rjw: Subject edit ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
drivers/base/power/main.c
kernel/power/Kconfig

index 495ca6d7193a26df388b9f13d1c2aaec60668306..cbc9a7a75def7625b5f1f6c95664d95b92c30037 100644 (file)
@@ -496,6 +496,7 @@ struct dpm_watchdog {
        struct device           *dev;
        struct task_struct      *tsk;
        struct timer_list       timer;
+       bool                    fatal;
 };
 
 #define DECLARE_DPM_WATCHDOG_ON_STACK(wd) \
@@ -512,11 +513,23 @@ struct dpm_watchdog {
 static void dpm_watchdog_handler(struct timer_list *t)
 {
        struct dpm_watchdog *wd = from_timer(wd, t, timer);
+       struct timer_list *timer = &wd->timer;
+       unsigned int time_left;
+
+       if (wd->fatal) {
+               dev_emerg(wd->dev, "**** DPM device timeout ****\n");
+               show_stack(wd->tsk, NULL, KERN_EMERG);
+               panic("%s %s: unrecoverable failure\n",
+                       dev_driver_string(wd->dev), dev_name(wd->dev));
+       }
+
+       time_left = CONFIG_DPM_WATCHDOG_TIMEOUT - CONFIG_DPM_WATCHDOG_WARNING_TIMEOUT;
+       dev_warn(wd->dev, "**** DPM device timeout after %u seconds; %u seconds until panic ****\n",
+                CONFIG_DPM_WATCHDOG_WARNING_TIMEOUT, time_left);
+       show_stack(wd->tsk, NULL, KERN_WARNING);
 
-       dev_emerg(wd->dev, "**** DPM device timeout ****\n");
-       show_stack(wd->tsk, NULL, KERN_EMERG);
-       panic("%s %s: unrecoverable failure\n",
-               dev_driver_string(wd->dev), dev_name(wd->dev));
+       wd->fatal = true;
+       mod_timer(timer, jiffies + HZ * time_left);
 }
 
 /**
@@ -530,10 +543,11 @@ static void dpm_watchdog_set(struct dpm_watchdog *wd, struct device *dev)
 
        wd->dev = dev;
        wd->tsk = current;
+       wd->fatal = CONFIG_DPM_WATCHDOG_TIMEOUT == CONFIG_DPM_WATCHDOG_WARNING_TIMEOUT;
 
        timer_setup_on_stack(timer, dpm_watchdog_handler, 0);
        /* use same timeout value for both suspend and resume */
-       timer->expires = jiffies + HZ * CONFIG_DPM_WATCHDOG_TIMEOUT;
+       timer->expires = jiffies + HZ * CONFIG_DPM_WATCHDOG_WARNING_TIMEOUT;
        add_timer(timer);
 }
 
index afce8130d8b92c71f1db895f4039e5a38bf3722b..ca947ed32e3ddb9b05bbe3cdfbc37109280f64dc 100644 (file)
@@ -257,11 +257,30 @@ config DPM_WATCHDOG
          boot session.
 
 config DPM_WATCHDOG_TIMEOUT
-       int "Watchdog timeout in seconds"
+       int "Watchdog timeout to panic in seconds"
        range 1 120
        default 120
        depends on DPM_WATCHDOG
 
+config DPM_WATCHDOG_WARNING_TIMEOUT
+       int "Watchdog timeout to warn in seconds"
+       range 1 DPM_WATCHDOG_TIMEOUT
+       default DPM_WATCHDOG_TIMEOUT
+       depends on DPM_WATCHDOG
+       help
+         If the DPM watchdog warning timeout and main timeout are
+         different then a non-fatal warning (with a stack trace of
+         the stuck suspend routine) will be printed when the warning
+         timeout expires. If the suspend routine gets un-stuck
+         before the main timeout expires then no other action is
+         taken. If the routine continues to be stuck and the main
+         timeout expires then an emergency-level message and stack
+         trace will be printed and the system will panic.
+
+         If the warning timeout is equal to the main timeout (the
+         default) then the warning will never happen and the system
+         will jump straight to panic when the main timeout expires.
+
 config PM_TRACE
        bool
        help