git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
6.6-stable patches
authorGreg Kroah-Hartman <gregkh@linuxfoundation.org>
Tue, 4 Nov 2025 02:48:01 +0000 (11:48 +0900)
committerGreg Kroah-Hartman <gregkh@linuxfoundation.org>
Tue, 4 Nov 2025 02:48:01 +0000 (11:48 +0900)
added patches:
drm-sched-fix-race-in-drm_sched_entity_select_rq.patch
drm-sysfb-do-not-dereference-null-pointer-in-plane-reset.patch
s390-disable-arch_want_optimize_hugetlb_vmemmap.patch
s390-pci-avoid-deadlock-between-pci-error-recovery-and-mlx5-crdump.patch
sched-fair-use-all-little-cpus-for-cpu-bound-workloads.patch
sched-pelt-avoid-underestimation-of-task-utilization.patch

queue-6.6/drm-sched-fix-race-in-drm_sched_entity_select_rq.patch [new file with mode: 0644]
queue-6.6/drm-sysfb-do-not-dereference-null-pointer-in-plane-reset.patch [new file with mode: 0644]
queue-6.6/s390-disable-arch_want_optimize_hugetlb_vmemmap.patch [new file with mode: 0644]
queue-6.6/s390-pci-avoid-deadlock-between-pci-error-recovery-and-mlx5-crdump.patch [new file with mode: 0644]
queue-6.6/sched-fair-use-all-little-cpus-for-cpu-bound-workloads.patch [new file with mode: 0644]
queue-6.6/sched-pelt-avoid-underestimation-of-task-utilization.patch [new file with mode: 0644]
queue-6.6/series

diff --git a/queue-6.6/drm-sched-fix-race-in-drm_sched_entity_select_rq.patch b/queue-6.6/drm-sched-fix-race-in-drm_sched_entity_select_rq.patch
new file mode 100644 (file)
index 0000000..2c68b56
--- /dev/null
@@ -0,0 +1,44 @@
+From stable+bounces-192218-greg=kroah.com@vger.kernel.org Tue Nov  4 00:18:34 2025
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon,  3 Nov 2025 09:59:40 -0500
+Subject: drm/sched: Fix race in drm_sched_entity_select_rq()
+To: stable@vger.kernel.org
+Cc: Philipp Stanner <phasta@kernel.org>, Tvrtko Ursulin <tvrtko.ursulin@igalia.com>, Sasha Levin <sashal@kernel.org>
+Message-ID: <20251103145940.4040983-1-sashal@kernel.org>
+
+From: Philipp Stanner <phasta@kernel.org>
+
+[ Upstream commit d25e3a610bae03bffc5c14b5d944a5d0cd844678 ]
+
+In a past bug fix it was forgotten that entity access must be protected
+by the entity lock. That's a data race and potentially UB.
+
+Move the spin_unlock() to the appropriate position.
+
+Cc: stable@vger.kernel.org # v5.13+
+Fixes: ac4eb83ab255 ("drm/sched: select new rq even if there is only one v3")
+Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@igalia.com>
+Signed-off-by: Philipp Stanner <phasta@kernel.org>
+Link: https://patch.msgid.link/20251022063402.87318-2-phasta@kernel.org
+[ adapted lock field name from entity->lock to entity->rq_lock ]
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/gpu/drm/scheduler/sched_entity.c |    3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+--- a/drivers/gpu/drm/scheduler/sched_entity.c
++++ b/drivers/gpu/drm/scheduler/sched_entity.c
+@@ -531,10 +531,11 @@ void drm_sched_entity_select_rq(struct d
+               drm_sched_rq_remove_entity(entity->rq, entity);
+               entity->rq = rq;
+       }
+-      spin_unlock(&entity->rq_lock);
+       if (entity->num_sched_list == 1)
+               entity->sched_list = NULL;
++
++      spin_unlock(&entity->rq_lock);
+ }
+ /**
diff --git a/queue-6.6/drm-sysfb-do-not-dereference-null-pointer-in-plane-reset.patch b/queue-6.6/drm-sysfb-do-not-dereference-null-pointer-in-plane-reset.patch
new file mode 100644 (file)
index 0000000..be99a3c
--- /dev/null
@@ -0,0 +1,55 @@
+From stable+bounces-192201-greg=kroah.com@vger.kernel.org Mon Nov  3 21:50:23 2025
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon,  3 Nov 2025 07:47:27 -0500
+Subject: drm/sysfb: Do not dereference NULL pointer in plane reset
+To: stable@vger.kernel.org
+Cc: Thomas Zimmermann <tzimmermann@suse.de>, Dan Carpenter <dan.carpenter@linaro.org>, Melissa Wen <melissa.srw@gmail.com>, Maarten Lankhorst <maarten.lankhorst@linux.intel.com>, Maxime Ripard <mripard@kernel.org>, David Airlie <airlied@gmail.com>, Simona Vetter <simona@ffwll.ch>, dri-devel@lists.freedesktop.org, Javier Martinez Canillas <javierm@redhat.com>, Sasha Levin <sashal@kernel.org>
+Message-ID: <20251103124727.4003872-1-sashal@kernel.org>
+
+From: Thomas Zimmermann <tzimmermann@suse.de>
+
+[ Upstream commit 14e02ed3876f4ab0ed6d3f41972175f8b8df3d70 ]
+
+The plane state in __drm_gem_reset_shadow_plane() can be NULL. Do not
+deref that pointer, but forward NULL to the other plane-reset helpers.
+Clears plane->state to NULL.
+
+v2:
+- fix typo in commit description (Javier)
+
+Signed-off-by: Thomas Zimmermann <tzimmermann@suse.de>
+Fixes: b71565022031 ("drm/gem: Export implementation of shadow-plane helpers")
+Reported-by: Dan Carpenter <dan.carpenter@linaro.org>
+Closes: https://lore.kernel.org/dri-devel/aPIDAsHIUHp_qSW4@stanley.mountain/
+Cc: Thomas Zimmermann <tzimmermann@suse.de>
+Cc: Melissa Wen <melissa.srw@gmail.com>
+Cc: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
+Cc: Maxime Ripard <mripard@kernel.org>
+Cc: David Airlie <airlied@gmail.com>
+Cc: Simona Vetter <simona@ffwll.ch>
+Cc: dri-devel@lists.freedesktop.org
+Cc: <stable@vger.kernel.org> # v5.15+
+Reviewed-by: Javier Martinez Canillas <javierm@redhat.com>
+Link: https://patch.msgid.link/20251017091407.58488-1-tzimmermann@suse.de
+[ removed drm_format_conv_state_init() call ]
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/gpu/drm/drm_gem_atomic_helper.c |    6 +++++-
+ 1 file changed, 5 insertions(+), 1 deletion(-)
+
+--- a/drivers/gpu/drm/drm_gem_atomic_helper.c
++++ b/drivers/gpu/drm/drm_gem_atomic_helper.c
+@@ -301,7 +301,11 @@ EXPORT_SYMBOL(drm_gem_destroy_shadow_pla
+ void __drm_gem_reset_shadow_plane(struct drm_plane *plane,
+                                 struct drm_shadow_plane_state *shadow_plane_state)
+ {
+-      __drm_atomic_helper_plane_reset(plane, &shadow_plane_state->base);
++      if (shadow_plane_state) {
++              __drm_atomic_helper_plane_reset(plane, &shadow_plane_state->base);
++      } else {
++              __drm_atomic_helper_plane_reset(plane, NULL);
++      }
+ }
+ EXPORT_SYMBOL(__drm_gem_reset_shadow_plane);
diff --git a/queue-6.6/s390-disable-arch_want_optimize_hugetlb_vmemmap.patch b/queue-6.6/s390-disable-arch_want_optimize_hugetlb_vmemmap.patch
new file mode 100644 (file)
index 0000000..f9132ba
--- /dev/null
@@ -0,0 +1,56 @@
+From stable+bounces-192184-greg=kroah.com@vger.kernel.org Mon Nov  3 20:07:38 2025
+From: Heiko Carstens <hca@linux.ibm.com>
+Date: Mon,  3 Nov 2025 12:05:39 +0100
+Subject: s390: Disable ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP
+To: stable@vger.kernel.org
+Cc: Heiko Carstens <hca@linux.ibm.com>, Luiz Capitulino <luizcap@redhat.com>, Gerald Schaefer <gerald.schaefer@linux.ibm.com>, David Hildenbrand <david@redhat.com>
+Message-ID: <20251103110539.3428888-1-hca@linux.ibm.com>
+
+From: Heiko Carstens <hca@linux.ibm.com>
+
+[ Upstream commit 64e2f60f355e556337fcffe80b9bcff1b22c9c42 ]
+
+As reported by Luiz Capitulino enabling HVO on s390 leads to reproducible
+crashes. The problem is that kernel page tables are modified without
+flushing corresponding TLB entries.
+
+Even if it looks like the empty flush_tlb_all() implementation on s390 is
+the problem, it is actually a different problem: on s390 it is not allowed
+to replace an active/valid page table entry with another valid page table
+entry without the detour over an invalid entry. A direct replacement may
+lead to random crashes and/or data corruption.
+
+In order to invalidate an entry special instructions have to be used
+(e.g. ipte or idte). Alternatively there are also special instructions
+available which allow to replace a valid entry with a different valid
+entry (e.g. crdte or cspg).
+
+Given that the HVO code currently does not provide the hooks to allow for
+an implementation which is compliant with the s390 architecture
+requirements, disable ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP again, which is
+basically a revert of the original patch which enabled it.
+
+Reported-by: Luiz Capitulino <luizcap@redhat.com>
+Closes: https://lore.kernel.org/all/20251028153930.37107-1-luizcap@redhat.com/
+Fixes: 00a34d5a99c0 ("s390: select ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP")
+Cc: stable@vger.kernel.org
+Tested-by: Luiz Capitulino <luizcap@redhat.com>
+Reviewed-by: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
+Reviewed-by: David Hildenbrand <david@redhat.com>
+[ Adjust context ]
+Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/s390/Kconfig |    1 -
+ 1 file changed, 1 deletion(-)
+
+--- a/arch/s390/Kconfig
++++ b/arch/s390/Kconfig
+@@ -128,7 +128,6 @@ config S390
+       select ARCH_WANT_DEFAULT_BPF_JIT
+       select ARCH_WANT_IPC_PARSE_VERSION
+       select ARCH_WANT_KERNEL_PMD_MKWRITE
+-      select ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP
+       select BUILDTIME_TABLE_SORT
+       select CLONE_BACKWARDS2
+       select DMA_OPS if PCI
diff --git a/queue-6.6/s390-pci-avoid-deadlock-between-pci-error-recovery-and-mlx5-crdump.patch b/queue-6.6/s390-pci-avoid-deadlock-between-pci-error-recovery-and-mlx5-crdump.patch
new file mode 100644 (file)
index 0000000..fd5098f
--- /dev/null
@@ -0,0 +1,120 @@
+From stable+bounces-192143-greg=kroah.com@vger.kernel.org Mon Nov  3 11:50:17 2025
+From: Sasha Levin <sashal@kernel.org>
+Date: Sun,  2 Nov 2025 21:50:09 -0500
+Subject: s390/pci: Avoid deadlock between PCI error recovery and mlx5 crdump
+To: stable@vger.kernel.org
+Cc: Gerd Bayer <gbayer@linux.ibm.com>, Niklas Schnelle <schnelle@linux.ibm.com>, Heiko Carstens <hca@linux.ibm.com>, Sasha Levin <sashal@kernel.org>
+Message-ID: <20251103025009.3819241-1-sashal@kernel.org>
+
+From: Gerd Bayer <gbayer@linux.ibm.com>
+
+[ Upstream commit 0fd20f65df6aa430454a0deed8f43efa91c54835 ]
+
+Do not block PCI config accesses through pci_cfg_access_lock() when
+executing the s390 variant of PCI error recovery: Acquire just
+device_lock() instead of pci_dev_lock() as powerpc's EEH and
+generic PCI AER processing do.
+
+During error recovery testing a pair of tasks was reported to be hung:
+
+mlx5_core 0000:00:00.1: mlx5_health_try_recover:338:(pid 5553): health recovery flow aborted, PCI reads still not working
+INFO: task kmcheck:72 blocked for more than 122 seconds.
+      Not tainted 5.14.0-570.12.1.bringup7.el9.s390x #1
+"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
+task:kmcheck         state:D stack:0     pid:72    tgid:72    ppid:2      flags:0x00000000
+Call Trace:
+ [<000000065256f030>] __schedule+0x2a0/0x590
+ [<000000065256f356>] schedule+0x36/0xe0
+ [<000000065256f572>] schedule_preempt_disabled+0x22/0x30
+ [<0000000652570a94>] __mutex_lock.constprop.0+0x484/0x8a8
+ [<000003ff800673a4>] mlx5_unload_one+0x34/0x58 [mlx5_core]
+ [<000003ff8006745c>] mlx5_pci_err_detected+0x94/0x140 [mlx5_core]
+ [<0000000652556c5a>] zpci_event_attempt_error_recovery+0xf2/0x398
+ [<0000000651b9184a>] __zpci_event_error+0x23a/0x2c0
+INFO: task kworker/u1664:6:1514 blocked for more than 122 seconds.
+      Not tainted 5.14.0-570.12.1.bringup7.el9.s390x #1
+"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
+task:kworker/u1664:6 state:D stack:0     pid:1514  tgid:1514  ppid:2      flags:0x00000000
+Workqueue: mlx5_health0000:00:00.0 mlx5_fw_fatal_reporter_err_work [mlx5_core]
+Call Trace:
+ [<000000065256f030>] __schedule+0x2a0/0x590
+ [<000000065256f356>] schedule+0x36/0xe0
+ [<0000000652172e28>] pci_wait_cfg+0x80/0xe8
+ [<0000000652172f94>] pci_cfg_access_lock+0x74/0x88
+ [<000003ff800916b6>] mlx5_vsc_gw_lock+0x36/0x178 [mlx5_core]
+ [<000003ff80098824>] mlx5_crdump_collect+0x34/0x1c8 [mlx5_core]
+ [<000003ff80074b62>] mlx5_fw_fatal_reporter_dump+0x6a/0xe8 [mlx5_core]
+ [<0000000652512242>] devlink_health_do_dump.part.0+0x82/0x168
+ [<0000000652513212>] devlink_health_report+0x19a/0x230
+ [<000003ff80075a12>] mlx5_fw_fatal_reporter_err_work+0xba/0x1b0 [mlx5_core]
+
+No kernel log of the exact same error with an upstream kernel is
+available - but the very same deadlock situation can be constructed there,
+too:
+
+- task: kmcheck
+  mlx5_unload_one() tries to acquire devlink lock while the PCI error
+  recovery code has set pdev->block_cfg_access by way of
+  pci_cfg_access_lock()
+- task: kworker
+  mlx5_crdump_collect() tries to set block_cfg_access through
+  pci_cfg_access_lock() while devlink_health_report() had acquired
+  the devlink lock.
+
+A similar deadlock situation can be reproduced by requesting a
+crdump with
+  > devlink health dump show pci/<BDF> reporter fw_fatal
+
+while PCI error recovery is executed on the same <BDF> physical function
+by mlx5_core's pci_error_handlers. On s390 this can be injected with
+  > zpcictl --reset-fw <BDF>
+
+Tests with this patch failed to reproduce that second deadlock situation,
+the devlink command is rejected with "kernel answers: Permission denied" -
+and we get a kernel log message of:
+
+mlx5_core 1ed0:00:00.1: mlx5_crdump_collect:50:(pid 254382): crdump: failed to lock vsc gw err -5
+
+because the config read of VSC_SEMAPHORE is rejected by the underlying
+hardware.
+
+Two prior attempts to address this issue have been discussed and
+ultimately rejected [see link], with the primary argument that s390's
+implementation of PCI error recovery is imposing restrictions that
+neither powerpc's EEH nor PCI AER handling need. Tests show that PCI
+error recovery on s390 is running to completion even without blocking
+access to PCI config space.
+
+Link: https://lore.kernel.org/all/20251007144826.2825134-1-gbayer@linux.ibm.com/
+Cc: stable@vger.kernel.org
+Fixes: 4cdf2f4e24ff ("s390/pci: implement minimal PCI error recovery")
+Reviewed-by: Niklas Schnelle <schnelle@linux.ibm.com>
+Signed-off-by: Gerd Bayer <gbayer@linux.ibm.com>
+Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
+[ Adjust context ]
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/s390/pci/pci_event.c |    4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/arch/s390/pci/pci_event.c
++++ b/arch/s390/pci/pci_event.c
+@@ -173,7 +173,7 @@ static pci_ers_result_t zpci_event_attem
+        * is unbound or probed and that userspace can't access its
+        * configuration space while we perform recovery.
+        */
+-      pci_dev_lock(pdev);
++      device_lock(&pdev->dev);
+       if (pdev->error_state == pci_channel_io_perm_failure) {
+               ers_res = PCI_ERS_RESULT_DISCONNECT;
+               goto out_unlock;
+@@ -221,7 +221,7 @@ static pci_ers_result_t zpci_event_attem
+       if (driver->err_handler->resume)
+               driver->err_handler->resume(pdev);
+ out_unlock:
+-      pci_dev_unlock(pdev);
++      device_unlock(&pdev->dev);
+       return ers_res;
+ }
diff --git a/queue-6.6/sched-fair-use-all-little-cpus-for-cpu-bound-workloads.patch b/queue-6.6/sched-fair-use-all-little-cpus-for-cpu-bound-workloads.patch
new file mode 100644 (file)
index 0000000..d4753a5
--- /dev/null
@@ -0,0 +1,114 @@
+From 3af7524b14198f5159a86692d57a9f28ec9375ce Mon Sep 17 00:00:00 2001
+From: Pierre Gondois <pierre.gondois@arm.com>
+Date: Wed, 6 Dec 2023 10:00:43 +0100
+Subject: sched/fair: Use all little CPUs for CPU-bound workloads
+
+From: Pierre Gondois <pierre.gondois@arm.com>
+
+commit 3af7524b14198f5159a86692d57a9f28ec9375ce upstream.
+
+Running N CPU-bound tasks on an N CPUs platform:
+
+- with asymmetric CPU capacity
+
+- not being a DynamIq system (i.e. having a PKG level sched domain
+  without the SD_SHARE_PKG_RESOURCES flag set)
+
+.. might result in a task placement where two tasks run on a big CPU
+and none on a little CPU. This placement could be more optimal by
+using all CPUs.
+
+Testing platform:
+
+  Juno-r2:
+    - 2 big CPUs (1-2), maximum capacity of 1024
+    - 4 little CPUs (0,3-5), maximum capacity of 383
+
+Testing workload ([1]):
+
+  Spawn 6 CPU-bound tasks. During the first 100ms (step 1), each tasks
+  is affine to a CPU, except for:
+
+    - one little CPU which is left idle.
+    - one big CPU which has 2 tasks affine.
+
+  After the 100ms (step 2), remove the cpumask affinity.
+
+Behavior before the patch:
+
+  During step 2, the load balancer running from the idle CPU tags sched
+  domains as:
+
+  - little CPUs: 'group_has_spare'. Cf. group_has_capacity() and
+    group_is_overloaded(), 3 CPU-bound tasks run on a 4 CPUs
+    sched-domain, and the idle CPU provides enough spare capacity
+    regarding the imbalance_pct
+
+  - big CPUs: 'group_overloaded'. Indeed, 3 tasks run on a 2 CPUs
+    sched-domain, so the following path is used:
+
+      group_is_overloaded()
+      \-if (sgs->sum_nr_running <= sgs->group_weight) return true;
+
+    The following path which would change the migration type to
+    'migrate_task' is not taken:
+
+      calculate_imbalance()
+      \-if (env->idle != CPU_NOT_IDLE && env->imbalance == 0)
+
+    as the local group has some spare capacity, so the imbalance
+    is not 0.
+
+  The migration type requested is 'migrate_util' and the busiest
+  runqueue is the big CPU's runqueue having 2 tasks (each having a
+  utilization of 512). The idle little CPU cannot pull one of these
+  task as its capacity is too small for the task. The following path
+  is used:
+
+   detach_tasks()
+   \-case migrate_util:
+     \-if (util > env->imbalance) goto next;
+
+After the patch:
+
+As the number of failed balancing attempts grows (with
+'nr_balance_failed'), progressively make it easier to migrate
+a big task to the idling little CPU. A similar mechanism is
+used for the 'migrate_load' migration type.
+
+Improvement:
+
+Running the testing workload [1] with the step 2 representing
+a ~10s load for a big CPU:
+
+  Before patch: ~19.3s
+  After patch:  ~18s (-6.7%)
+
+Similar issue reported at:
+
+  https://lore.kernel.org/lkml/20230716014125.139577-1-qyousef@layalina.io/
+
+Suggested-by: Vincent Guittot <vincent.guittot@linaro.org>
+Signed-off-by: Pierre Gondois <pierre.gondois@arm.com>
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Reviewed-by: Vincent Guittot <vincent.guittot@linaro.org>
+Reviewed-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
+Acked-by: Qais Yousef <qyousef@layalina.io>
+Link: https://lore.kernel.org/r/20231206090043.634697-1-pierre.gondois@arm.com
+Cc: John Stultz <jstultz@google.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/sched/fair.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/kernel/sched/fair.c
++++ b/kernel/sched/fair.c
+@@ -9165,7 +9165,7 @@ static int detach_tasks(struct lb_env *e
+               case migrate_util:
+                       util = task_util_est(p);
+-                      if (util > env->imbalance)
++                      if (shr_bound(util, env->sd->nr_balance_failed) > env->imbalance)
+                               goto next;
+                       env->imbalance -= util;
diff --git a/queue-6.6/sched-pelt-avoid-underestimation-of-task-utilization.patch b/queue-6.6/sched-pelt-avoid-underestimation-of-task-utilization.patch
new file mode 100644 (file)
index 0000000..10e3ef9
--- /dev/null
@@ -0,0 +1,74 @@
+From 50181c0cff31281b9f1071575ffba8a102375ece Mon Sep 17 00:00:00 2001
+From: Vincent Guittot <vincent.guittot@linaro.org>
+Date: Wed, 22 Nov 2023 15:01:19 +0100
+Subject: sched/pelt: Avoid underestimation of task utilization
+
+From: Vincent Guittot <vincent.guittot@linaro.org>
+
+commit 50181c0cff31281b9f1071575ffba8a102375ece upstream.
+
+Lukasz Luba reported that a thread's util_est can significantly decrease as
+a result of sharing the CPU with other threads.
+
+The use case can be easily reproduced with a periodic task TA that runs 1ms
+and sleeps 100us. When the task is alone on the CPU, its max utilization and
+its util_est is around 888. If another similar task starts to run on the
+same CPU, TA will have to share the CPU runtime and its maximum utilization
+will decrease around half the CPU capacity (512) then TA's util_est will
+follow this new maximum trend which is only the result of sharing the CPU
+with others tasks.
+
+Such a situation can be detected with runnable_avg which is close or
+equal to util_avg when TA is alone, but increases above util_avg when TA
+shares the CPU with other threads and wait on the runqueue.
+
+[ We prefer a util_est that overestimates rather than underestimates
+  because in the 1st case we will not provide enough performance to the
+  task which will remain under-provisioned, whereas in the other case we
+  will create some idle time which will enable to reduce contention and
+  as a result reduces the util_est so the overestimate will be transient
+  whereas the underestimate will remain. ]
+
+[ mingo: Refined the changelog, added comments from the LKML discussion. ]
+
+Reported-by: Lukasz Luba <lukasz.luba@arm.com>
+Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Link: https://lore.kernel.org/lkml/CAKfTPtDd-HhF-YiNTtL9i5k0PfJbF819Yxu4YquzfXgwi7voyw@mail.gmail.com/#t
+Link: https://lore.kernel.org/r/20231122140119.472110-1-vincent.guittot@linaro.org
+Cc: Hongyan Xia <hongyan.xia2@arm.com>
+Cc: John Stultz <jstultz@google.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/sched/fair.c |   13 +++++++++++++
+ 1 file changed, 13 insertions(+)
+
+--- a/kernel/sched/fair.c
++++ b/kernel/sched/fair.c
+@@ -4836,6 +4836,11 @@ static inline unsigned long task_util(st
+       return READ_ONCE(p->se.avg.util_avg);
+ }
++static inline unsigned long task_runnable(struct task_struct *p)
++{
++      return READ_ONCE(p->se.avg.runnable_avg);
++}
++
+ static inline unsigned long _task_util_est(struct task_struct *p)
+ {
+       struct util_est ue = READ_ONCE(p->se.avg.util_est);
+@@ -4955,6 +4960,14 @@ static inline void util_est_update(struc
+               return;
+       /*
++       * To avoid underestimate of task utilization, skip updates of EWMA if
++       * we cannot grant that thread got all CPU time it wanted.
++       */
++      if ((ue.enqueued + UTIL_EST_MARGIN) < task_runnable(p))
++              goto done;
++
++
++      /*
+        * Update Task's estimated utilization
+        *
+        * When *p completes an activation we can consolidate another sample
index 330abcf8d210844d191e7acc8e821f292ca6199f..7478b8bf68ce91ec39dc06291f482c3090e3ca2b 100644 (file)
@@ -48,3 +48,9 @@ s390-pci-restore-irq-unconditionally-for-the-zpci-device.patch
 cpuidle-governors-menu-rearrange-main-loop-in-menu_select.patch
 cpuidle-governors-menu-select-polling-state-in-some-more-cases.patch
 net-phy-dp83867-disable-eee-support-as-not-implemented.patch
+sched-pelt-avoid-underestimation-of-task-utilization.patch
+sched-fair-use-all-little-cpus-for-cpu-bound-workloads.patch
+s390-disable-arch_want_optimize_hugetlb_vmemmap.patch
+drm-sched-fix-race-in-drm_sched_entity_select_rq.patch
+drm-sysfb-do-not-dereference-null-pointer-in-plane-reset.patch
+s390-pci-avoid-deadlock-between-pci-error-recovery-and-mlx5-crdump.patch