--- /dev/null
+From 76f970ce51c80f625eb6ddbb24e9cb51b977b598 Mon Sep 17 00:00:00 2001
+From: Dietmar Eggemann <dietmar.eggemann@arm.com>
+Date: Fri, 14 Mar 2025 16:13:45 +0100
+Subject: Revert "sched/core: Reduce cost of sched_move_task when config autogroup"
+
+From: Dietmar Eggemann <dietmar.eggemann@arm.com>
+
+commit 76f970ce51c80f625eb6ddbb24e9cb51b977b598 upstream.
+
+This reverts commit eff6c8ce8d4d7faef75f66614dd20bb50595d261.
+
+Hazem reported a 30% drop in the UnixBench spawn test with commit
+eff6c8ce8d4d ("sched/core: Reduce cost of sched_move_task when config
+autogroup") on a m6g.xlarge AWS EC2 instance with 4 vCPUs and 16 GiB RAM
+(aarch64) (single level MC sched domain):
+
+ https://lkml.kernel.org/r/20250205151026.13061-1-hagarhem@amazon.com
+
+There is an early bail from sched_move_task() if p->sched_task_group is
+equal to p's 'cpu cgroup' (sched_get_task_group()). E.g. both point to
+taskgroup '/user.slice/user-1000.slice/session-1.scope' (Ubuntu 22.04.5
+LTS).
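+
+The check in question (removed again by this revert, see the diff
+below) is:
+
+    group = sched_get_task_group(tsk);
+    if (group == tsk->sched_task_group)
+            return;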
+
+So in:
+
+ do_exit()
+
+   sched_autogroup_exit_task()
+
+     sched_move_task()
+
+       if sched_get_task_group(p) == p->sched_task_group
+         return
+
+       /* p is enqueued */
+       dequeue_task()                \
+       sched_change_group()          |
+         task_change_group_fair()    |
+           detach_task_cfs_rq()      | (1)
+           set_task_rq()             |
+         attach_task_cfs_rq()        |
+       enqueue_task()                /
+
+(1) isn't called for p anymore.
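+
+For reference, the detach step in (1) is what removes p's PELT
+utilization contribution from its current cfs_rq. A simplified sketch
+(the exact call chain in kernel/sched/fair.c may differ between kernel
+versions):
+
+   detach_task_cfs_rq(p)
+     detach_entity_cfs_rq(&p->se)
+       update_load_avg(cfs_rq, se, 0)
+       detach_entity_load_avg(cfs_rq, se)
+         /* sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg) */
+
+With the early bail, this detach (and the re-attach after
+set_task_rq()) is skipped for the exiting task.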
+
+It turns out that the regression is related to sgs->group_util in
+group_is_overloaded() and group_has_capacity(). If (1) isn't called for
+all the 'spawn' tasks, then sgs->group_util is ~900 while
+sgs->group_capacity = 1024 (single CPU sched domain), which leads to
+group_is_overloaded() returning true (2) and group_has_capacity()
+returning false (3) much more often than when (1) is called.
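+
+As a rough illustration of (2) and (3): the key checks in
+group_is_overloaded() and group_has_capacity() compare
+sgs->group_capacity * 100 against sgs->group_util * imbalance_pct
+(other conditions omitted). Assuming the typical imbalance_pct of 117
+for an MC sched domain, group_util ~900 and group_capacity = 1024 give
+1024 * 100 = 102400 vs 900 * 117 = 105300, i.e.:
+
+   group_is_overloaded(): 102400 < 105300 -> true  (2)
+   group_has_capacity():  102400 > 105300 -> false (3)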
+
+So without (1) there are many more 'group_is_overloaded' and
+'group_fully_busy' cases in the WF_FORK wakeup path of
+sched_balance_find_dst_cpu(), which then much more often returns a
+CPU != smp_processor_id() (5).
+
+This isn't good for these extremely short-running tasks (FORK + EXIT)
+and also involves calling sched_balance_find_dst_group_cpu()
+unnecessarily (single CPU sched domain).
+
+Instead, if (1) is called for tasks with 'p->flags & PF_EXITING', then
+the path (4),(6) is taken much more often.
+
+ select_task_rq_fair(..., wake_flags = WF_FORK)
+
+   cpu = smp_processor_id()
+
+   new_cpu = sched_balance_find_dst_cpu(..., cpu, ...)
+
+     group = sched_balance_find_dst_group(..., cpu)
+
+       do {
+
+         update_sg_wakeup_stats()
+
+           sgs->group_type = group_classify()
+
+             if group_is_overloaded()    (2)
+               return group_overloaded
+
+             if !group_has_capacity()    (3)
+               return group_fully_busy
+
+             return group_has_spare      (4)
+
+       } while group
+
+       if local_sgs.group_type > idlest_sgs.group_type
+         return idlest                   (5)
+
+       case group_has_spare:
+
+         if local_sgs.idle_cpus >= idlest_sgs.idle_cpus
+           return NULL                   (6)
+
+UnixBench tests './Run -c 4 spawn' on:
+
+(a) VM AWS instance (m7gd.16xlarge) with v6.13 ('maxcpus=4 nr_cpus=4')
+ and Ubuntu 22.04.5 LTS (aarch64).
+
+ Shell & test run in '/user.slice/user-1000.slice/session-1.scope'.
+
+          w/o patch   w/ patch
+              21005      27120
+
+(b) i7-13700K with tip/sched/core ('nosmt maxcpus=8 nr_cpus=8') and
+ Ubuntu 22.04.5 LTS (x86_64).
+
+ Shell & test run in '/A'.
+
+          w/o patch   w/ patch
+              67675      88806
+
+CONFIG_SCHED_AUTOGROUP=y & /proc/sys/kernel/sched_autogroup_enabled set
+to either 0 or 1.
+
+Reported-by: Hazem Mohamed Abuelfotoh <abuehaze@amazon.com>
+Signed-off-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Reviewed-by: Vincent Guittot <vincent.guittot@linaro.org>
+Tested-by: Hagar Hemdan <hagarhem@amazon.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Link: https://lore.kernel.org/r/20250314151345.275739-1-dietmar.eggemann@arm.com
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/sched/core.c | 21 +++------------------
+ 1 file changed, 3 insertions(+), 18 deletions(-)
+
+--- a/kernel/sched/core.c
++++ b/kernel/sched/core.c
+@@ -9010,7 +9010,7 @@ void sched_release_group(struct task_gro
+ spin_unlock_irqrestore(&task_group_lock, flags);
+ }
+
+-static struct task_group *sched_get_task_group(struct task_struct *tsk)
++static void sched_change_group(struct task_struct *tsk)
+ {
+ struct task_group *tg;
+
+@@ -9022,13 +9022,7 @@ static struct task_group *sched_get_task
+ tg = container_of(task_css_check(tsk, cpu_cgrp_id, true),
+ struct task_group, css);
+ tg = autogroup_task_group(tsk, tg);
+-
+- return tg;
+-}
+-
+-static void sched_change_group(struct task_struct *tsk, struct task_group *group)
+-{
+- tsk->sched_task_group = group;
++ tsk->sched_task_group = tg;
+
+ #ifdef CONFIG_FAIR_GROUP_SCHED
+ if (tsk->sched_class->task_change_group)
+@@ -9049,20 +9043,11 @@ void sched_move_task(struct task_struct
+ {
+ int queued, running, queue_flags =
+ DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
+- struct task_group *group;
+ struct rq *rq;
+
+ CLASS(task_rq_lock, rq_guard)(tsk);
+ rq = rq_guard.rq;
+
+- /*
+- * Esp. with SCHED_AUTOGROUP enabled it is possible to get superfluous
+- * group changes.
+- */
+- group = sched_get_task_group(tsk);
+- if (group == tsk->sched_task_group)
+- return;
+-
+ update_rq_clock(rq);
+
+ running = task_current_donor(rq, tsk);
+@@ -9073,7 +9058,7 @@ void sched_move_task(struct task_struct
+ if (running)
+ put_prev_task(rq, tsk);
+
+- sched_change_group(tsk, group);
++ sched_change_group(tsk);
+ if (!for_autogroup)
+ scx_cgroup_move_task(tsk);
+