From: Greg Kroah-Hartman
Date: Tue, 16 Jul 2024 14:16:29 +0000 (+0200)
Subject: 6.6-stable patches
X-Git-Tag: v4.19.318~21
X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=55f682639e00816071448312dc3f98a4c2013d66;p=thirdparty%2Fkernel%2Fstable-queue.git

6.6-stable patches

added patches:
      sched-move-psi_account_irqtime-out-of-update_rq_clock_task-hotpath.patch
---

diff --git a/queue-6.6/sched-move-psi_account_irqtime-out-of-update_rq_clock_task-hotpath.patch b/queue-6.6/sched-move-psi_account_irqtime-out-of-update_rq_clock_task-hotpath.patch
new file mode 100644
index 00000000000..288f3e4de95
--- /dev/null
+++ b/queue-6.6/sched-move-psi_account_irqtime-out-of-update_rq_clock_task-hotpath.patch
@@ -0,0 +1,186 @@
+From ddae0ca2a8fe12d0e24ab10ba759c3fbd755ada8 Mon Sep 17 00:00:00 2001
+From: John Stultz
+Date: Tue, 18 Jun 2024 14:58:55 -0700
+Subject: sched: Move psi_account_irqtime() out of update_rq_clock_task() hotpath
+
+From: John Stultz
+
+commit ddae0ca2a8fe12d0e24ab10ba759c3fbd755ada8 upstream.
+
+It was reported that in moving to 6.1, a larger than 10%
+regression was seen in the performance of
+clock_gettime(CLOCK_THREAD_CPUTIME_ID,...).
+
+Using a simple reproducer, I found:
+5.10:
+100000000 calls in 24345994193 ns => 243.460 ns per call
+100000000 calls in 24288172050 ns => 242.882 ns per call
+100000000 calls in 24289135225 ns => 242.891 ns per call
+
+6.1:
+100000000 calls in 28248646742 ns => 282.486 ns per call
+100000000 calls in 28227055067 ns => 282.271 ns per call
+100000000 calls in 28177471287 ns => 281.775 ns per call
+
+The cause of this was finally narrowed down to the addition of
+psi_account_irqtime() in update_rq_clock_task(), in commit
+52b1364ba0b1 ("sched/psi: Add PSI_IRQ to track IRQ/SOFTIRQ
+pressure").
+
+In my initial attempt to resolve this, I leaned towards moving
+all accounting work out of the clock_gettime() call path, but it
+wasn't very pretty, so it will have to wait for a later deeper
+rework. Instead, Peter shared this approach:
+
+Rework psi_account_irqtime() to use its own psi_irq_time base
+for accounting, and move it out of the hotpath, calling it
+instead from sched_tick() and __schedule().
+
+In testing this, we found the importance of ensuring
+psi_account_irqtime() is run under the rq_lock, which Johannes
+Weiner helpfully explained, so also add some lockdep annotations
+to make that requirement clear.
+
+With this change the performance is back in-line with 5.10:
+6.1+fix:
+100000000 calls in 24297324597 ns => 242.973 ns per call
+100000000 calls in 24318869234 ns => 243.189 ns per call
+100000000 calls in 24291564588 ns => 242.916 ns per call
+
+Reported-by: Jimmy Shiu
+Originally-by: Peter Zijlstra
+Signed-off-by: John Stultz
+Signed-off-by: Peter Zijlstra (Intel)
+Reviewed-by: Chengming Zhou
+Reviewed-by: Qais Yousef
+Link: https://lore.kernel.org/r/20240618215909.4099720-1-jstultz@google.com
+Fixes: 52b1364ba0b1 ("sched/psi: Add PSI_IRQ to track IRQ/SOFTIRQ pressure")
+[jstultz: Fixed up minor collisions w/ 6.6-stable]
+Signed-off-by: John Stultz
+Signed-off-by: Greg Kroah-Hartman
+---
+ kernel/sched/core.c | 7 +++++--
+ kernel/sched/psi.c | 21 ++++++++++++++++-----
+ kernel/sched/sched.h | 1 +
+ kernel/sched/stats.h | 11 ++++++++---
+ 4 files changed, 30 insertions(+), 10 deletions(-)
+
+--- a/kernel/sched/core.c
++++ b/kernel/sched/core.c
+@@ -722,7 +722,6 @@ static void update_rq_clock_task(struct
+ 
+ 	rq->prev_irq_time += irq_delta;
+ 	delta -= irq_delta;
+-	psi_account_irqtime(rq->curr, irq_delta);
+ 	delayacct_irq(rq->curr, irq_delta);
+ #endif
+ #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
+@@ -5641,7 +5640,7 @@ void scheduler_tick(void)
+ {
+ 	int cpu = smp_processor_id();
+ 	struct rq *rq = cpu_rq(cpu);
+-	struct task_struct *curr = rq->curr;
++	struct task_struct *curr;
+ 	struct rq_flags rf;
+ 	unsigned long thermal_pressure;
+ 	u64 resched_latency;
+@@ -5653,6 +5652,9 @@ void scheduler_tick(void)
+ 
+ 	rq_lock(rq, &rf);
+ 
++	curr = rq->curr;
++	psi_account_irqtime(rq, curr, NULL);
++
+ 	update_rq_clock(rq);
+ 	thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq));
+ 	update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure);
+@@ -6690,6 +6692,7 @@ static void __sched notrace __schedule(u
+ 		++*switch_count;
+ 
+ 		migrate_disable_switch(rq, prev);
++		psi_account_irqtime(rq, prev, next);
+ 		psi_sched_switch(prev, next, !task_on_rq_queued(prev));
+ 
+ 		trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev, next, prev_state);
+--- a/kernel/sched/psi.c
++++ b/kernel/sched/psi.c
+@@ -784,6 +784,7 @@ static void psi_group_change(struct psi_
+ 	enum psi_states s;
+ 	u32 state_mask;
+ 
++	lockdep_assert_rq_held(cpu_rq(cpu));
+ 	groupc = per_cpu_ptr(group->pcpu, cpu);
+ 
+ 	/*
+@@ -1002,19 +1003,29 @@ void psi_task_switch(struct task_struct
+ }
+ 
+ #ifdef CONFIG_IRQ_TIME_ACCOUNTING
+-void psi_account_irqtime(struct task_struct *task, u32 delta)
++void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_struct *prev)
+ {
+-	int cpu = task_cpu(task);
++	int cpu = task_cpu(curr);
+ 	struct psi_group *group;
+ 	struct psi_group_cpu *groupc;
+-	u64 now;
++	u64 now, irq;
++	s64 delta;
+ 
+-	if (!task->pid)
++	if (!curr->pid)
++		return;
++
++	lockdep_assert_rq_held(rq);
++	group = task_psi_group(curr);
++	if (prev && task_psi_group(prev) == group)
+ 		return;
+ 
+ 	now = cpu_clock(cpu);
++	irq = irq_time_read(cpu);
++	delta = (s64)(irq - rq->psi_irq_time);
++	if (delta < 0)
++		return;
++	rq->psi_irq_time = irq;
+ 
+-	group = task_psi_group(task);
+ 	do {
+ 		if (!group->enabled)
+ 			continue;
+--- a/kernel/sched/sched.h
++++ b/kernel/sched/sched.h
+@@ -1094,6 +1094,7 @@ struct rq {
+ 
+ #ifdef CONFIG_IRQ_TIME_ACCOUNTING
+ 	u64			prev_irq_time;
++	u64			psi_irq_time;
+ #endif
+ #ifdef CONFIG_PARAVIRT
+ 	u64			prev_steal_time;
+--- a/kernel/sched/stats.h
++++ b/kernel/sched/stats.h
+@@ -110,8 +110,12 @@ __schedstats_from_se(struct sched_entity
+ void psi_task_change(struct task_struct *task, int clear, int set);
+ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
+ 		     bool sleep);
+-void psi_account_irqtime(struct task_struct *task, u32 delta);
+-
++#ifdef CONFIG_IRQ_TIME_ACCOUNTING
++void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_struct *prev);
++#else
++static inline void psi_account_irqtime(struct rq *rq, struct task_struct *curr,
++				       struct task_struct *prev) {}
++#endif /*CONFIG_IRQ_TIME_ACCOUNTING */
+ /*
+  * PSI tracks state that persists across sleeps, such as iowaits and
+  * memory stalls. As a result, it has to distinguish between sleeps,
+@@ -192,7 +196,8 @@ static inline void psi_ttwu_dequeue(stru
+ static inline void psi_sched_switch(struct task_struct *prev,
+ 				    struct task_struct *next,
+ 				    bool sleep) {}
+-static inline void psi_account_irqtime(struct task_struct *task, u32 delta) {}
++static inline void psi_account_irqtime(struct rq *rq, struct task_struct *curr,
++				       struct task_struct *prev) {}
+ #endif /* CONFIG_PSI */
+ 
+ #ifdef CONFIG_SCHED_INFO
diff --git a/queue-6.6/series b/queue-6.6/series
index b2f07895514..bbf186e4a32 100644
--- a/queue-6.6/series
+++ b/queue-6.6/series
@@ -105,3 +105,4 @@ mm-damon-core-merge-regions-aggressively-when-max_nr_regions-is-unmet.patch
 selftests-net-fix-gro.c-compilation-failure-due-to-non-existent-opt_ipproto_off.patch
 nilfs2-fix-kernel-bug-on-rename-operation-of-broken-directory.patch
 ext4-avoid-ptr-null-pointer-dereference.patch
+sched-move-psi_account_irqtime-out-of-update_rq_clock_task-hotpath.patch
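
The commit message above mentions a "simple reproducer" for the clock_gettime(CLOCK_THREAD_CPUTIME_ID) slowdown but does not include it. The program below is not that reproducer; it is a minimal userspace sketch in the same spirit, timing a fixed number of CLOCK_THREAD_CPUTIME_ID reads and printing the average cost per call in the same format as the figures quoted above. The file name and the 100000000-call count are illustrative choices, not taken from the patch.

/*
 * Hypothetical benchmark sketch (not the reproducer referenced in the
 * commit message): measure the average cost of
 * clock_gettime(CLOCK_THREAD_CPUTIME_ID, ...) over many iterations.
 * Build with: gcc -O2 -o thread-cputime-bench thread-cputime-bench.c
 */
#include <stdio.h>
#include <stdint.h>
#include <time.h>

#define CALLS 100000000ULL

/* Wall-clock timestamp in nanoseconds, used only to time the loop. */
static uint64_t now_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint64_t)ts.tv_sec * 1000000000ULL + (uint64_t)ts.tv_nsec;
}

int main(void)
{
	struct timespec ts;
	uint64_t start, end, i;

	start = now_ns();
	for (i = 0; i < CALLS; i++)
		clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts);
	end = now_ns();

	printf("%llu calls in %llu ns => %.3f ns per call\n",
	       (unsigned long long)CALLS,
	       (unsigned long long)(end - start),
	       (double)(end - start) / (double)CALLS);
	return 0;
}

Run on the same hardware with and without the patch applied, the per-call cost is expected to drop back toward the 5.10-era figures once psi_account_irqtime() is out of the update_rq_clock_task() hotpath, as described in the commit message.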