From 7c1be1cd5001db89f19f1a22041d7844e54a03de Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Sun, 27 Aug 2023 10:03:58 +0200 Subject: [PATCH] 5.10-stable patches added patches: tick-detect-and-fix-jiffies-update-stall.patch timers-nohz-switch-to-oneshot_stopped-in-the-low-res-handler-when-the-tick-is-stopped.patch torture-fix-hang-during-kthread-shutdown-phase.patch --- queue-5.10/series | 3 + ...-detect-and-fix-jiffies-update-stall.patch | 85 +++++++++++++++++++ ...res-handler-when-the-tick-is-stopped.patch | 58 +++++++++++++ ...x-hang-during-kthread-shutdown-phase.patch | 54 ++++++++++++ 4 files changed, 200 insertions(+) create mode 100644 queue-5.10/tick-detect-and-fix-jiffies-update-stall.patch create mode 100644 queue-5.10/timers-nohz-switch-to-oneshot_stopped-in-the-low-res-handler-when-the-tick-is-stopped.patch create mode 100644 queue-5.10/torture-fix-hang-during-kthread-shutdown-phase.patch diff --git a/queue-5.10/series b/queue-5.10/series index b252d585bf9..84bf2e95ae9 100644 --- a/queue-5.10/series +++ b/queue-5.10/series @@ -60,3 +60,6 @@ pci-acpiphp-use-pci_assign_unassigned_bridge_resources-only-for-non-root-bus.pat drm-vmwgfx-fix-shader-stage-validation.patch drm-display-dp-fix-the-dp-dsc-receiver-cap-size.patch x86-fpu-set-x86_feature_osxsave-feature-after-enabling-osxsave-in-cr4.patch +torture-fix-hang-during-kthread-shutdown-phase.patch +tick-detect-and-fix-jiffies-update-stall.patch +timers-nohz-switch-to-oneshot_stopped-in-the-low-res-handler-when-the-tick-is-stopped.patch diff --git a/queue-5.10/tick-detect-and-fix-jiffies-update-stall.patch b/queue-5.10/tick-detect-and-fix-jiffies-update-stall.patch new file mode 100644 index 00000000000..d01195d7432 --- /dev/null +++ b/queue-5.10/tick-detect-and-fix-jiffies-update-stall.patch @@ -0,0 +1,85 @@ +From a1ff03cd6fb9c501fff63a4a2bface9adcfa81cd Mon Sep 17 00:00:00 2001 +From: Frederic Weisbecker +Date: Wed, 2 Feb 2022 01:01:07 +0100 +Subject: tick: Detect and fix jiffies update stall + 
+From: Frederic Weisbecker + +commit a1ff03cd6fb9c501fff63a4a2bface9adcfa81cd upstream. + +In some rare cases, the timekeeper CPU may be delaying its jiffies +update duty for a while. Known causes include: + +* The timekeeper is waiting on stop_machine in a MULTI_STOP_DISABLE_IRQ + or MULTI_STOP_RUN state. Disabled interrupts prevent timekeeping + updates while waiting for the target CPU to complete its + stop_machine() callback. + +* The timekeeper vcpu has VMEXIT'ed for a long while due to some overload + on the host. + +Detect and fix these situations with emergency timekeeping catchups. + +Original-patch-by: Paul E. McKenney +Signed-off-by: Frederic Weisbecker +Cc: Thomas Gleixner +Signed-off-by: Joel Fernandes (Google) +Signed-off-by: Greg Kroah-Hartman +--- + kernel/time/tick-sched.c | 17 +++++++++++++++++ + kernel/time/tick-sched.h | 4 ++++ + 2 files changed, 21 insertions(+) + +--- a/kernel/time/tick-sched.c ++++ b/kernel/time/tick-sched.c +@@ -148,6 +148,8 @@ static ktime_t tick_init_jiffy_update(vo + return period; + } + ++#define MAX_STALLED_JIFFIES 5 ++ + static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now) + { + int cpu = smp_processor_id(); +@@ -175,6 +177,21 @@ static void tick_sched_do_timer(struct t + if (tick_do_timer_cpu == cpu) + tick_do_update_jiffies64(now); + ++ /* ++ * If jiffies update stalled for too long (timekeeper in stop_machine() ++ * or VMEXIT'ed for several msecs), force an update. 
++ */ ++ if (ts->last_tick_jiffies != jiffies) { ++ ts->stalled_jiffies = 0; ++ ts->last_tick_jiffies = READ_ONCE(jiffies); ++ } else { ++ if (++ts->stalled_jiffies == MAX_STALLED_JIFFIES) { ++ tick_do_update_jiffies64(now); ++ ts->stalled_jiffies = 0; ++ ts->last_tick_jiffies = READ_ONCE(jiffies); ++ } ++ } ++ + if (ts->inidle) + ts->got_idle_tick = 1; + } +--- a/kernel/time/tick-sched.h ++++ b/kernel/time/tick-sched.h +@@ -49,6 +49,8 @@ enum tick_nohz_mode { + * @timer_expires_base: Base time clock monotonic for @timer_expires + * @next_timer: Expiry time of next expiring timer for debugging purpose only + * @tick_dep_mask: Tick dependency mask - is set, if someone needs the tick ++ * @last_tick_jiffies: Value of jiffies seen on last tick ++ * @stalled_jiffies: Number of stalled jiffies detected across ticks + */ + struct tick_sched { + struct hrtimer sched_timer; +@@ -77,6 +79,8 @@ struct tick_sched { + u64 next_timer; + ktime_t idle_expires; + atomic_t tick_dep_mask; ++ unsigned long last_tick_jiffies; ++ unsigned int stalled_jiffies; + }; + + extern struct tick_sched *tick_get_tick_sched(int cpu); diff --git a/queue-5.10/timers-nohz-switch-to-oneshot_stopped-in-the-low-res-handler-when-the-tick-is-stopped.patch b/queue-5.10/timers-nohz-switch-to-oneshot_stopped-in-the-low-res-handler-when-the-tick-is-stopped.patch new file mode 100644 index 00000000000..a7ef482b851 --- /dev/null +++ b/queue-5.10/timers-nohz-switch-to-oneshot_stopped-in-the-low-res-handler-when-the-tick-is-stopped.patch @@ -0,0 +1,58 @@ +From 62c1256d544747b38e77ca9b5bfe3a26f9592576 Mon Sep 17 00:00:00 2001 +From: Nicholas Piggin +Date: Sat, 23 Apr 2022 00:14:46 +1000 +Subject: timers/nohz: Switch to ONESHOT_STOPPED in the low-res handler when the tick is stopped + +From: Nicholas Piggin + +commit 62c1256d544747b38e77ca9b5bfe3a26f9592576 upstream. 
+ +When tick_nohz_stop_tick() stops the tick and high resolution timers are +disabled, then the clock event device is not put into ONESHOT_STOPPED +mode. This can lead to spurious timer interrupts with some clock event +device drivers that don't shut down entirely after firing. + +Eliminate these by putting the device into ONESHOT_STOPPED mode at points +where it is not being reprogrammed. When there are no timers active, then +tick_program_event() with KTIME_MAX can be used to stop the device. When +there is a timer active, the device can be stopped at the next tick (any +new timer added by timers will reprogram the tick). + +Signed-off-by: Nicholas Piggin +Signed-off-by: Thomas Gleixner +Link: https://lore.kernel.org/r/20220422141446.915024-1-npiggin@gmail.com +Signed-off-by: Joel Fernandes (Google) +Signed-off-by: Greg Kroah-Hartman +--- + kernel/time/tick-sched.c | 12 ++++++++++-- + 1 file changed, 10 insertions(+), 2 deletions(-) + +--- a/kernel/time/tick-sched.c ++++ b/kernel/time/tick-sched.c +@@ -884,6 +884,8 @@ static void tick_nohz_stop_tick(struct t + if (unlikely(expires == KTIME_MAX)) { + if (ts->nohz_mode == NOHZ_MODE_HIGHRES) + hrtimer_cancel(&ts->sched_timer); ++ else ++ tick_program_event(KTIME_MAX, 1); + return; + } + +@@ -1274,9 +1276,15 @@ static void tick_nohz_handler(struct clo + tick_sched_do_timer(ts, now); + tick_sched_handle(ts, regs); + +- /* No need to reprogram if we are running tickless */ +- if (unlikely(ts->tick_stopped)) ++ if (unlikely(ts->tick_stopped)) { ++ /* ++ * The clockevent device is not reprogrammed, so change the ++ * clock event device to ONESHOT_STOPPED to avoid spurious ++ * interrupts on devices which might not be truly one shot. 
++ */ ++ tick_program_event(KTIME_MAX, 1); + return; ++ } + + hrtimer_forward(&ts->sched_timer, now, TICK_NSEC); + tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1); diff --git a/queue-5.10/torture-fix-hang-during-kthread-shutdown-phase.patch b/queue-5.10/torture-fix-hang-during-kthread-shutdown-phase.patch new file mode 100644 index 00000000000..30c102b1c27 --- /dev/null +++ b/queue-5.10/torture-fix-hang-during-kthread-shutdown-phase.patch @@ -0,0 +1,54 @@ +From d52d3a2bf408ff86f3a79560b5cce80efb340239 Mon Sep 17 00:00:00 2001 +From: "Joel Fernandes (Google)" +Date: Sun, 1 Jan 2023 06:15:55 +0000 +Subject: torture: Fix hang during kthread shutdown phase + +From: Joel Fernandes (Google) + +commit d52d3a2bf408ff86f3a79560b5cce80efb340239 upstream. + +During rcutorture shutdown, the rcu_torture_cleanup() function calls +torture_cleanup_begin(), which sets the fullstop global variable to +FULLSTOP_RMMOD. This causes the rcutorture threads for readers and +fakewriters to exit all of their "while" loops and start shutting down. + +They then call torture_kthread_stopping(), which in turn waits for +kthread_stop() to be called. However, rcu_torture_cleanup() has +not yet called kthread_stop() on those threads, and before it gets a +chance to do so, multiple instances of torture_kthread_stopping() invoke +schedule_timeout_interruptible(1) in a tight loop. Tracing confirms that +TIMER_SOFTIRQ can then continuously execute timer callbacks. If that +TIMER_SOFTIRQ preempts the task executing rcu_torture_cleanup(), that +task might never invoke kthread_stop(). + +This commit improves this situation by increasing the timeout passed to +schedule_timeout_interruptible() from one jiffy to 1/20th of a second. +This change prevents TIMER_SOFTIRQ from monopolizing its CPU, thus +allowing rcu_torture_cleanup() to carry out the needed kthread_stop() +invocations. 
Testing has shown 100 runs of TREE07 passing reliably, +as opposed to the tens-of-percent failure rates seen beforehand. + +Cc: Paul McKenney +Cc: Frederic Weisbecker +Cc: Zhouyi Zhou +Cc: # 6.0.x +Signed-off-by: Joel Fernandes (Google) +Tested-by: Zhouyi Zhou +Reviewed-by: Davidlohr Bueso +Signed-off-by: Paul E. McKenney +Signed-off-by: Greg Kroah-Hartman +--- + kernel/torture.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/kernel/torture.c ++++ b/kernel/torture.c +@@ -788,7 +788,7 @@ void torture_kthread_stopping(char *titl + VERBOSE_TOROUT_STRING(buf); + while (!kthread_should_stop()) { + torture_shutdown_absorb(title); +- schedule_timeout_uninterruptible(1); ++ schedule_timeout_uninterruptible(HZ / 20); + } + } + EXPORT_SYMBOL_GPL(torture_kthread_stopping); -- 2.47.3