sched_ext: Enable the ops breather and eject BPF scheduler on softlockup
author     Tejun Heo <tj@kernel.org>
           Tue, 5 Nov 2024 21:49:04 +0000 (11:49 -1000)
committer  Tejun Heo <tj@kernel.org>
           Fri, 8 Nov 2024 20:42:22 +0000 (10:42 -1000)
On a 2 x Intel Sapphire Rapids machine with 224 logical CPUs, a poorly
behaving BPF scheduler can live-lock the system by making multiple CPUs bang
on the same DSQ to the point where soft-lockup detection triggers before
SCX's own watchdog can take action. It also seems possible for the machine
to be live-locked badly enough to prevent scx_ops_helper, which is an RT
task, from running in a timely manner.
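
For illustration only, here is a minimal sketch (not part of this commit,
and not the actual reproducer) of the contention pattern described above: a
BPF scheduler that funnels every CPU through one shared DSQ, so all
dispatch paths serialize on a single DSQ lock. It uses the sched_ext BPF
API of this era (scx_bpf_create_dsq(), scx_bpf_dispatch(),
scx_bpf_consume()); "badsched" and SHARED_DSQ are made-up names.

  /* Hypothetical sketch: one global DSQ shared by all CPUs. */
  #include <scx/common.bpf.h>

  char _license[] SEC("license") = "GPL";

  #define SHARED_DSQ 0 /* arbitrary user DSQ id (assumption) */

  s32 BPF_STRUCT_OPS_SLEEPABLE(badsched_init)
  {
          /* a single global DSQ, not bound to any NUMA node */
          return scx_bpf_create_dsq(SHARED_DSQ, -1);
  }

  void BPF_STRUCT_OPS(badsched_enqueue, struct task_struct *p, u64 enq_flags)
  {
          /* every runnable task on every CPU lands on the same DSQ */
          scx_bpf_dispatch(p, SHARED_DSQ, SCX_SLICE_DFL, enq_flags);
  }

  void BPF_STRUCT_OPS(badsched_dispatch, s32 cpu, struct task_struct *prev)
  {
          /* all CPUs contend on the one DSQ lock here */
          scx_bpf_consume(SHARED_DSQ);
  }

  SCX_OPS_DEFINE(badsched_ops,
                 .enqueue   = (void *)badsched_enqueue,
                 .dispatch  = (void *)badsched_dispatch,
                 .init      = (void *)badsched_init,
                 .name      = "badsched");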

Implement scx_softlockup(), which is called once three quarters of the
soft-lockup threshold has passed. The function immediately enables the ops
breather and triggers an ops error to initiate ejection of the BPF
scheduler.
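
As a worked example (assuming the default watchdog_thresh of 10s):
get_softlockup_thresh() returns 2 * watchdog_thresh = 20s, so the
three-quarter point is reached after a CPU has been stuck for ~15s,
roughly five seconds before the soft-lockup warning itself would fire.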

Combined with the previous patch, this enables the kernel to reliably
recover the system from live-lock conditions that can be triggered by a
poorly behaving BPF scheduler on such dual-socket Intel systems.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Douglas Anderson <dianders@chromium.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
include/linux/sched/ext.h
kernel/sched/ext.c
kernel/watchdog.c
tools/sched_ext/scx_show_state.py

diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h
index 1ddbde64a31b4a233b8cd48fdd4971c18102575b..65bc0a489cd2e07873e826d7e460d9f4c0386357 100644
--- a/include/linux/sched/ext.h
+++ b/include/linux/sched/ext.h
@@ -205,11 +205,13 @@ struct sched_ext_entity {
 
 void sched_ext_free(struct task_struct *p);
 void print_scx_info(const char *log_lvl, struct task_struct *p);
+void scx_softlockup(u32 dur_s);
 
 #else  /* !CONFIG_SCHED_CLASS_EXT */
 
 static inline void sched_ext_free(struct task_struct *p) {}
 static inline void print_scx_info(const char *log_lvl, struct task_struct *p) {}
+static inline void scx_softlockup(u32 dur_s) {}
 
 #endif /* CONFIG_SCHED_CLASS_EXT */
 #endif /* _LINUX_SCHED_EXT_H */
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 2d41f1917464f753a32b589ce33c5227b0ade53e..02f39314ef8ae293dade1568c57712bd8d4e7c70 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -867,6 +867,7 @@ static DEFINE_MUTEX(scx_ops_enable_mutex);
 DEFINE_STATIC_KEY_FALSE(__scx_ops_enabled);
 DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem);
 static atomic_t scx_ops_enable_state_var = ATOMIC_INIT(SCX_OPS_DISABLED);
+static unsigned long scx_in_softlockup;
 static atomic_t scx_ops_breather_depth = ATOMIC_INIT(0);
 static int scx_ops_bypass_depth;
 static bool scx_ops_init_task_enabled;
@@ -4614,6 +4615,49 @@ bool task_should_scx(struct task_struct *p)
        return p->policy == SCHED_EXT;
 }
 
+/**
+ * scx_softlockup - sched_ext softlockup handler
+ *
+ * On some multi-socket setups (e.g. 2x Intel 8480c), the BPF scheduler can
+ * live-lock the system by making many CPUs target the same DSQ to the point
+ * where soft-lockup detection triggers. This function is called from
+ * soft-lockup watchdog when the triggering point is close and tries to unjam
+ * the system by enabling the breather and aborting the BPF scheduler.
+ */
+void scx_softlockup(u32 dur_s)
+{
+       switch (scx_ops_enable_state()) {
+       case SCX_OPS_ENABLING:
+       case SCX_OPS_ENABLED:
+               break;
+       default:
+               return;
+       }
+
+       /* allow only one instance, cleared at the end of scx_ops_bypass() */
+       if (test_and_set_bit(0, &scx_in_softlockup))
+               return;
+
+       printk_deferred(KERN_ERR "sched_ext: Soft lockup - CPU%d stuck for %us, disabling \"%s\"\n",
+                       smp_processor_id(), dur_s, scx_ops.name);
+
+       /*
+        * Some CPUs may be trapped in the dispatch paths. Enable breather
+        * immediately; otherwise, we might not even be able to get to
+        * scx_ops_bypass().
+        */
+       atomic_inc(&scx_ops_breather_depth);
+
+       scx_ops_error("soft lockup - CPU#%d stuck for %us",
+                     smp_processor_id(), dur_s);
+}
+
+static void scx_clear_softlockup(void)
+{
+       if (test_and_clear_bit(0, &scx_in_softlockup))
+               atomic_dec(&scx_ops_breather_depth);
+}
+
 /**
  * scx_ops_bypass - [Un]bypass scx_ops and guarantee forward progress
  *
@@ -4724,6 +4768,7 @@ static void scx_ops_bypass(bool bypass)
        atomic_dec(&scx_ops_breather_depth);
 unlock:
        raw_spin_unlock_irqrestore(&bypass_lock, flags);
+       scx_clear_softlockup();
 }
 
 static void free_exit_info(struct scx_exit_info *ei)
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 262691ba62b7ad6f92612e24b0866864424e02b3..5a93d4c446b81eaad8bf2b059c7055f9d8d5c570 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -644,6 +644,14 @@ static int is_softlockup(unsigned long touch_ts,
                    need_counting_irqs())
                        start_counting_irqs();
 
+               /*
+                * A poorly behaving BPF scheduler can live-lock the system into
+                * soft lockups. Tell sched_ext to try ejecting the BPF
+                * scheduler when close to a soft lockup.
+                */
+               if (time_after_eq(now, period_ts + get_softlockup_thresh() * 3 / 4))
+                       scx_softlockup(now - touch_ts);
+
                /* Warn about unreasonable delays. */
                if (time_after(now, period_ts + get_softlockup_thresh()))
                        return now - touch_ts;
diff --git a/tools/sched_ext/scx_show_state.py b/tools/sched_ext/scx_show_state.py
index c4b3fdda9a0b7cd9cc316d540c9d65fa4979201d..b800d4f5f2e9952baeb8c7ad2a73df9f4ac8e512 100644
--- a/tools/sched_ext/scx_show_state.py
+++ b/tools/sched_ext/scx_show_state.py
@@ -35,6 +35,8 @@ print(f'enabled       : {read_static_key("__scx_ops_enabled")}')
 print(f'switching_all : {read_int("scx_switching_all")}')
 print(f'switched_all  : {read_static_key("__scx_switched_all")}')
 print(f'enable_state  : {ops_state_str(enable_state)} ({enable_state})')
+print(f'in_softlockup : {prog["scx_in_softlockup"].value_()}')
+print(f'breather_depth: {read_atomic("scx_ops_breather_depth")}')
 print(f'bypass_depth  : {prog["scx_ops_bypass_depth"].value_()}')
 print(f'nr_rejected   : {read_atomic("scx_nr_rejected")}')
 print(f'enable_seq    : {read_atomic("scx_enable_seq")}')
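
As a usage sketch: scx_show_state.py is a drgn script, so on a live kernel
with this patch applied it could be run as root roughly like this (output
abbreviated, values illustrative):

  # drgn tools/sched_ext/scx_show_state.py
  ...
  in_softlockup : 0
  breather_depth: 0
  bypass_depth  : 0
  ...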