From: Heiko Carstens
Date: Wed, 13 May 2026 14:01:28 +0000 (+0200)
Subject: s390/idle: Provide arch specific kcpustat_field_idle()/kcpustat_field_iowait()
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=670e057744e0cc8047bf96d15d18c46e16ae2e93;p=thirdparty%2Flinux.git

s390/idle: Provide arch specific kcpustat_field_idle()/kcpustat_field_iowait()

The former s390 specific arch_cpu_idle_time() implementation was removed,
since its implementation was racy and reported idle time could go
backwards [1].

However this removal was not necessary, since independently of the s390
architecture specific races there exists the iowait counter update race,
which can also lead to reported idle time going backwards [2].

With Frederic Weisbecker's recent cpu idle time accounting refactoring
kernel_cpustat got a sequence counter. Use this to implement s390 specific
variants of kcpustat_field_idle() and kcpustat_field_iowait().

This is logically a revert of [1] and moves cpu idle time accounting back
into s390 architecture code, which is also more precise than the dyntick
idle time accounting by nohz/scheduler.

For comparing cross cpu time stamps it is necessary to use the stcke
instead of the stckf instruction in irq entry path. Furthermore this
open-codes a sequence lock in assembler and C code, which is required to
update the irq entry time stamp to the per cpu idle_data structure in a
race free manner.

[1] commit be76ea614460 ("s390/idle: remove arch_cpu_idle_time() and corresponding code") [2] commit ead70b752373 ("timers/nohz: Add a comment about broken iowait counter update race") Signed-off-by: Heiko Carstens Acked-by: Frederic Weisbecker Signed-off-by: Alexander Gordeev --- diff --git a/arch/s390/include/asm/idle.h b/arch/s390/include/asm/idle.h index 6963e92b60a1d..f3502d5621c05 100644 --- a/arch/s390/include/asm/idle.h +++ b/arch/s390/include/asm/idle.h @@ -14,12 +14,15 @@ #include struct s390_idle_data { - bool idle_dyntick; +#ifdef CONFIG_NO_HZ_COMMON + bool in_idle; +#endif unsigned long idle_count; unsigned long idle_time; unsigned long timer_idle_enter; unsigned long mt_cycles_enter[8]; union tod_clock clock_idle_enter; + union tod_clock clock_idle_exit; }; DECLARE_PER_CPU(struct s390_idle_data, s390_idle); diff --git a/arch/s390/kernel/asm-offsets.c b/arch/s390/kernel/asm-offsets.c index fbd26f3e9f96b..f6dd2b67dcee7 100644 --- a/arch/s390/kernel/asm-offsets.c +++ b/arch/s390/kernel/asm-offsets.c @@ -11,9 +11,11 @@ #include #include #include +#include #include #include #include +#include int main(void) { @@ -128,6 +130,7 @@ int main(void) OFFSET(__LC_LAST_UPDATE_CLOCK, lowcore, last_update_clock); OFFSET(__LC_INT_CLOCK, lowcore, int_clock); OFFSET(__LC_CURRENT, lowcore, current_task); + OFFSET(__LC_PERCPU_OFFSET, lowcore, percpu_offset); OFFSET(__LC_KERNEL_STACK, lowcore, kernel_stack); OFFSET(__LC_ASYNC_STACK, lowcore, async_stack); OFFSET(__LC_NODAT_STACK, lowcore, nodat_stack); @@ -180,6 +183,10 @@ int main(void) DEFINE(OLDMEM_SIZE, PARMAREA + offsetof(struct parmarea, oldmem_size)); DEFINE(COMMAND_LINE, PARMAREA + offsetof(struct parmarea, command_line)); DEFINE(MAX_COMMAND_LINE_SIZE, PARMAREA + offsetof(struct parmarea, max_command_line_size)); + OFFSET(__IDLE_CLOCK_EXIT, s390_idle_data, clock_idle_exit); +#ifdef CONFIG_NO_HZ_COMMON + OFFSET(__KCPUSTAT_SEQUENCE, kernel_cpustat, idle_sleeptime_seq); +#endif OFFSET(__FTRACE_REGS_PT_REGS, 
__arch_ftrace_regs, regs); DEFINE(__FTRACE_REGS_SIZE, sizeof(struct __arch_ftrace_regs)); diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S index 7147f3e51acec..79a45efae23d0 100644 --- a/arch/s390/kernel/entry.S +++ b/arch/s390/kernel/entry.S @@ -379,8 +379,19 @@ SYM_CODE_END(pgm_check_handler) SYM_CODE_START(\name) STMG_LC %r8,%r15,__LC_SAVE_AREA GET_LC %r13 +#ifdef CONFIG_NO_HZ_COMMON + larl %r12,kernel_cpustat + ag %r12,__LC_PERCPU_OFFSET(%r13) + asi __KCPUSTAT_SEQUENCE(%r12),1 +#endif stcke __LC_INT_CLOCK(%r13) stpt __LC_SYS_ENTER_TIMER(%r13) + larl %r10,s390_idle + ag %r10,__LC_PERCPU_OFFSET(%r13) + mvc __IDLE_CLOCK_EXIT(16,%r10),__LC_INT_CLOCK(%r13) +#ifdef CONFIG_NO_HZ_COMMON + asi __KCPUSTAT_SEQUENCE(%r12),1 +#endif STBEAR __LC_LAST_BREAK(%r13) BPOFF lmg %r8,%r9,\lc_old_psw(%r13) @@ -407,7 +418,6 @@ SYM_CODE_START(\name) xgr %r5,%r5 xgr %r6,%r6 xgr %r7,%r7 - xgr %r10,%r10 xgr %r12,%r12 xc __PT_FLAGS(8,%r11),__PT_FLAGS(%r11) mvc __PT_R8(64,%r11),__LC_SAVE_AREA(%r13) diff --git a/arch/s390/kernel/idle.c b/arch/s390/kernel/idle.c index 36020dffb86be..b5fae512fc9ca 100644 --- a/arch/s390/kernel/idle.c +++ b/arch/s390/kernel/idle.c @@ -9,6 +9,7 @@ #include #include +#include #include #include #include @@ -21,22 +22,111 @@ DEFINE_PER_CPU(struct s390_idle_data, s390_idle); -void account_idle_time_irq(void) +static __always_inline void __account_idle_time_irq(void) { struct s390_idle_data *idle = this_cpu_ptr(&s390_idle); unsigned long idle_time; - idle_time = get_lowcore()->int_clock.tod - idle->clock_idle_enter.tod; - - /* Account time spent with enabled wait psw loaded as idle time. 
*/ + idle_time = idle->clock_idle_exit.tod - idle->clock_idle_enter.tod; __atomic64_add(idle_time, &idle->idle_time); __atomic64_add_const(1, &idle->idle_count); + account_idle_time(cputime_to_nsecs(idle_time)); +} + +static __always_inline void __account_idle_time_setup(void) +{ + struct s390_idle_data *idle = this_cpu_ptr(&s390_idle); - /* Dyntick idle time accounted by nohz/scheduler */ - if (!idle->idle_dyntick) - account_idle_time(cputime_to_nsecs(idle_time)); + store_tod_clock_ext(&idle->clock_idle_enter); + idle->timer_idle_enter = get_cpu_timer(); + idle->clock_idle_exit = idle->clock_idle_enter; +} + +#ifdef CONFIG_NO_HZ_COMMON + +static u64 arch_cpu_in_idle_time(int cpu) +{ + struct s390_idle_data *idle = &per_cpu(s390_idle, cpu); + union tod_clock now; + u64 idle_time; + + if (!idle->in_idle) + return 0; + store_tod_clock_ext(&now); + if (tod_after(idle->clock_idle_exit.tod, idle->clock_idle_enter.tod)) + idle_time = idle->clock_idle_exit.tod - idle->clock_idle_enter.tod; + else + idle_time = now.tod - idle->clock_idle_enter.tod; + return cputime_to_nsecs(idle_time); +} + +static u64 arch_cpu_idle_time(int cpu, enum cpu_usage_stat idx, bool compute_delta) +{ + struct kernel_cpustat *kc = &kcpustat_cpu(cpu); + u64 *cpustat = kc->cpustat; + unsigned int seq; + u64 idle_time; + + /* + * The open coded seqcount writer in entry.S relies on the + * raw counting mechanism without any writer protection. 
+ */ + typecheck(typeof(kc->idle_sleeptime_seq), seqcount_t); + do { + seq = read_seqcount_begin(&kc->idle_sleeptime_seq); + idle_time = cpustat[idx]; + if (compute_delta) + idle_time += arch_cpu_in_idle_time(cpu); + } while (read_seqcount_retry(&kc->idle_sleeptime_seq, seq)); + return idle_time; +} + +u64 arch_kcpustat_field_idle(int cpu) +{ + return arch_cpu_idle_time(cpu, CPUTIME_IDLE, !nr_iowait_cpu(cpu)); +} + +u64 arch_kcpustat_field_iowait(int cpu) +{ + return arch_cpu_idle_time(cpu, CPUTIME_IOWAIT, nr_iowait_cpu(cpu)); } +void account_idle_time_irq(void) +{ + struct s390_idle_data *idle = this_cpu_ptr(&s390_idle); + struct kernel_cpustat *kc = kcpustat_this_cpu; + + write_seqcount_begin(&kc->idle_sleeptime_seq); + idle->in_idle = false; + __account_idle_time_irq(); + write_seqcount_end(&kc->idle_sleeptime_seq); +} + +static __always_inline void account_idle_time_setup(void) +{ + struct s390_idle_data *idle = this_cpu_ptr(&s390_idle); + struct kernel_cpustat *kc = kcpustat_this_cpu; + + raw_write_seqcount_begin(&kc->idle_sleeptime_seq); + idle->in_idle = true; + __account_idle_time_setup(); + raw_write_seqcount_end(&kc->idle_sleeptime_seq); +} + +#else /* CONFIG_NO_HZ_COMMON */ + +void account_idle_time_irq(void) +{ + __account_idle_time_irq(); +} + +static __always_inline void account_idle_time_setup(void) +{ + __account_idle_time_setup(); +} + +#endif /* CONFIG_NO_HZ_COMMON */ + void noinstr arch_cpu_idle(void) { struct s390_idle_data *idle = this_cpu_ptr(&s390_idle); @@ -49,8 +139,7 @@ void noinstr arch_cpu_idle(void) set_cpu_flag(CIF_ENABLED_WAIT); if (smp_cpu_mtid) stcctm(MT_DIAG, smp_cpu_mtid, (u64 *)&idle->mt_cycles_enter); - store_tod_clock_ext(&idle->clock_idle_enter); - idle->timer_idle_enter = get_cpu_timer(); + account_idle_time_setup(); bpon(); __load_psw_mask(psw_mask); } diff --git a/arch/s390/kernel/vtime.c b/arch/s390/kernel/vtime.c index d1102a6f80bda..d804e1140c2e9 100644 --- a/arch/s390/kernel/vtime.c +++ b/arch/s390/kernel/vtime.c @@ 
-140,8 +140,6 @@ static int do_account_vtime(struct task_struct *tsk) if (hardirq_count()) lc->hardirq_timer += timer; - else if (in_serving_softirq()) - lc->softirq_timer += timer; else lc->system_timer += timer; @@ -241,62 +239,13 @@ EXPORT_SYMBOL_GPL(vtime_account_kernel); void vtime_account_softirq(struct task_struct *tsk) { - if (!__this_cpu_read(s390_idle.idle_dyntick)) - get_lowcore()->softirq_timer += vtime_delta(); - else - vtime_flush(tsk); + get_lowcore()->softirq_timer += vtime_delta(); } void vtime_account_hardirq(struct task_struct *tsk) { - if (!__this_cpu_read(s390_idle.idle_dyntick)) { - get_lowcore()->hardirq_timer += vtime_delta(); - } else { - /* - * In dynticks mode, the idle cputime is accounted by the nohz - * subsystem. Therefore the s390 timer/clocks are reset on IRQ - * entry and steal time must be accounted now. - */ - vtime_flush(tsk); - } -} - -#ifdef CONFIG_NO_HZ_COMMON -/** - * vtime_reset - Fast forward vtime entry clocks - * - * Called from dynticks idle IRQ entry to fast-forward the clocks to current time - * so that the IRQ time is still accounted by vtime while nohz cputime is paused. - */ -void vtime_reset(void) -{ - vtime_reset_last_update(get_lowcore()); -} - -/** - * vtime_dyntick_start - Inform vtime about entry to idle-dynticks - * - * Called when idle enters in dyntick mode. The idle cputime that elapsed so far - * is flushed and the tick subsystem takes over the idle cputime accounting. - */ -void vtime_dyntick_start(void) -{ - __this_cpu_write(s390_idle.idle_dyntick, true); - vtime_flush(current); -} - -/** - * vtime_dyntick_stop - Inform vtime about exit from idle-dynticks - * - * Called when idle exits from dyntick mode. The vtime entry clocks are - * fast-forward to current time and idle accounting resumes. 
- */ -void vtime_dyntick_stop(void) -{ - vtime_reset_last_update(get_lowcore()); - __this_cpu_write(s390_idle.idle_dyntick, false); + get_lowcore()->hardirq_timer += vtime_delta(); } -#endif /* CONFIG_NO_HZ_COMMON */ /* * Sorted add to a list. List is linear searched until first bigger diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h index fce1392e21403..9ca6c2259dfea 100644 --- a/include/linux/kernel_stat.h +++ b/include/linux/kernel_stat.h @@ -107,6 +107,30 @@ static inline unsigned long kstat_cpu_irqs_sum(unsigned int cpu) } #ifdef CONFIG_NO_HZ_COMMON + +#ifdef CONFIG_HAVE_VIRT_CPU_ACCOUNTING_IDLE + +static inline void kcpustat_dyntick_start(u64 now) { } +static inline void kcpustat_dyntick_stop(u64 now) { } +static inline void kcpustat_irq_enter(u64 now) { } +static inline void kcpustat_irq_exit(u64 now) { } +static inline bool kcpustat_idle_dyntick(void) { return false; } + +extern u64 arch_kcpustat_field_idle(int cpu); +extern u64 arch_kcpustat_field_iowait(int cpu); + +static inline u64 kcpustat_field_idle(int cpu) +{ + return arch_kcpustat_field_idle(cpu); +} + +static inline u64 kcpustat_field_iowait(int cpu) +{ + return arch_kcpustat_field_iowait(cpu); +} + +#else /* !CONFIG_HAVE_VIRT_CPU_ACCOUNTING_IDLE */ + extern void kcpustat_dyntick_start(u64 now); extern void kcpustat_dyntick_stop(u64 now); extern void kcpustat_irq_enter(u64 now); @@ -118,6 +142,9 @@ static inline bool kcpustat_idle_dyntick(void) { return __this_cpu_read(kernel_cpustat.idle_dyntick); } + +#endif /* !CONFIG_HAVE_VIRT_CPU_ACCOUNTING_IDLE */ + #else static inline u64 kcpustat_field_idle(int cpu) { diff --git a/include/linux/vtime.h b/include/linux/vtime.h index 9dc25b04a119e..82825e7754993 100644 --- a/include/linux/vtime.h +++ b/include/linux/vtime.h @@ -42,9 +42,15 @@ extern void vtime_account_irq(struct task_struct *tsk, unsigned int offset); extern void vtime_account_softirq(struct task_struct *tsk); extern void vtime_account_hardirq(struct task_struct *tsk); 
extern void vtime_flush(struct task_struct *tsk); +#ifdef CONFIG_HAVE_VIRT_CPU_ACCOUNTING_IDLE +static inline void vtime_reset(void) { } +static inline void vtime_dyntick_start(void) { } +static inline void vtime_dyntick_stop(void) { } +#else extern void vtime_reset(void); extern void vtime_dyntick_start(void); extern void vtime_dyntick_stop(void); +#endif #else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ static inline void vtime_account_irq(struct task_struct *tsk, unsigned int offset) { } static inline void vtime_account_softirq(struct task_struct *tsk) { } diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 244b574172402..ed49a1e23d17f 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -421,7 +421,7 @@ static inline void irqtime_account_process_tick(struct task_struct *p, int user_ int nr_ticks) { } #endif /* !CONFIG_IRQ_TIME_ACCOUNTING */ -#ifdef CONFIG_NO_HZ_COMMON +#if defined(CONFIG_NO_HZ_COMMON) && !defined(CONFIG_HAVE_VIRT_CPU_ACCOUNTING_IDLE) static void kcpustat_idle_stop(struct kernel_cpustat *kc, u64 now) { u64 *cpustat = kc->cpustat; @@ -560,7 +560,7 @@ static u64 kcpustat_field_dyntick(int cpu, enum cpu_usage_stat idx, { return kcpustat_cpu(cpu).cpustat[idx]; } -#endif /* CONFIG_NO_HZ_COMMON */ +#endif /* CONFIG_NO_HZ_COMMON && !CONFIG_HAVE_VIRT_CPU_ACCOUNTING_IDLE */ static u64 get_cpu_sleep_time_us(int cpu, enum cpu_usage_stat idx, bool compute_delta, u64 *last_update_time)