From ca80e922b74dbc0693e16c04ca4bfdeba3c7b4c8 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Fri, 2 Jan 2026 13:26:55 +0100 Subject: [PATCH] 6.18-stable patches added patches: drm-displayid-add-quirk-to-ignore-displayid-checksum-errors.patch drm-edid-add-drm_edid_ident_init-to-initialize-struct-drm_edid_ident.patch drm-nova-depend-on-config_64bit.patch kvm-s390-fix-gmap_helper_zap_one_page-again.patch mm-huge_memory-merge-uniform_split_supported-and-non_uniform_split_supported.patch sched-core-add-comment-explaining-force-idle-vruntime-snapshots.patch sched-eevdf-fix-min_vruntime-vs-avg_vruntime.patch sched-proxy-yield-the-donor-task.patch sched_ext-fix-incorrect-sched_class-settings-for-per-cpu-migration-tasks.patch series x86-microcode-amd-select-which-microcode-patch-to-load.patch --- ...-to-ignore-displayid-checksum-errors.patch | 137 +++++++ ...-to-initialize-struct-drm_edid_ident.patch | 42 ++ .../drm-nova-depend-on-config_64bit.patch | 31 ++ ...0-fix-gmap_helper_zap_one_page-again.patch | 57 +++ ...rted-and-non_uniform_split_supported.patch | 186 +++++++++ ...aining-force-idle-vruntime-snapshots.patch | 218 ++++++++++ ...vdf-fix-min_vruntime-vs-avg_vruntime.patch | 387 ++++++++++++++++++ .../sched-proxy-yield-the-donor-task.patch | 111 +++++ ...settings-for-per-cpu-migration-tasks.patch | 105 +++++ queue-6.18/series | 10 + ...select-which-microcode-patch-to-load.patch | 185 +++++++++ 11 files changed, 1469 insertions(+) create mode 100644 queue-6.18/drm-displayid-add-quirk-to-ignore-displayid-checksum-errors.patch create mode 100644 queue-6.18/drm-edid-add-drm_edid_ident_init-to-initialize-struct-drm_edid_ident.patch create mode 100644 queue-6.18/drm-nova-depend-on-config_64bit.patch create mode 100644 queue-6.18/kvm-s390-fix-gmap_helper_zap_one_page-again.patch create mode 100644 queue-6.18/mm-huge_memory-merge-uniform_split_supported-and-non_uniform_split_supported.patch create mode 100644 
queue-6.18/sched-core-add-comment-explaining-force-idle-vruntime-snapshots.patch create mode 100644 queue-6.18/sched-eevdf-fix-min_vruntime-vs-avg_vruntime.patch create mode 100644 queue-6.18/sched-proxy-yield-the-donor-task.patch create mode 100644 queue-6.18/sched_ext-fix-incorrect-sched_class-settings-for-per-cpu-migration-tasks.patch create mode 100644 queue-6.18/series create mode 100644 queue-6.18/x86-microcode-amd-select-which-microcode-patch-to-load.patch diff --git a/queue-6.18/drm-displayid-add-quirk-to-ignore-displayid-checksum-errors.patch b/queue-6.18/drm-displayid-add-quirk-to-ignore-displayid-checksum-errors.patch new file mode 100644 index 0000000000..92f6baf8d1 --- /dev/null +++ b/queue-6.18/drm-displayid-add-quirk-to-ignore-displayid-checksum-errors.patch @@ -0,0 +1,137 @@ +From stable+bounces-204371-greg=kroah.com@vger.kernel.org Wed Dec 31 17:19:08 2025 +From: Sasha Levin +Date: Wed, 31 Dec 2025 11:18:56 -0500 +Subject: drm/displayid: add quirk to ignore DisplayID checksum errors +To: stable@vger.kernel.org +Cc: "Jani Nikula" , "Tiago Martins Araújo" , "Alex Deucher" , "Sasha Levin" +Message-ID: <20251231161856.3237284-3-sashal@kernel.org> + +From: Jani Nikula + +[ Upstream commit 83cbb4d33dc22b0ca1a4e85c6e892c9b729e28d4 ] + +Add a mechanism for DisplayID specific quirks, and add the first quirk +to ignore DisplayID section checksum errors. + +It would be quite inconvenient to pass existing EDID quirks from +drm_edid.c for DisplayID parsing. Not all places doing DisplayID +iteration have the quirks readily available, and would have to pass it +in all places. Simply add a separate array of DisplayID specific EDID +quirks. We do end up checking it every time we iterate DisplayID blocks, +but hopefully the number of quirks remains small. + +There are a few laptop models with DisplayID checksum failures, leading +to higher refresh rates only present in the DisplayID blocks being +ignored. Add a quirk for the panel in the machines. 
+ +Reported-by: Tiago Martins Araújo +Closes: https://lore.kernel.org/r/CACRbrPGvLP5LANXuFi6z0S7XMbAG4X5y2YOLBDxfOVtfGGqiKQ@mail.gmail.com +Closes: https://gitlab.freedesktop.org/drm/i915/kernel/-/issues/14703 +Acked-by: Alex Deucher +Tested-by: Tiago Martins Araújo +Cc: stable@vger.kernel.org +Link: https://patch.msgid.link/c04d81ae648c5f21b3f5b7953f924718051f2798.1761681968.git.jani.nikula@intel.com +Signed-off-by: Jani Nikula +Signed-off-by: Sasha Levin +Signed-off-by: Greg Kroah-Hartman +--- + drivers/gpu/drm/drm_displayid.c | 41 +++++++++++++++++++++++++++---- + drivers/gpu/drm/drm_displayid_internal.h | 2 + + 2 files changed, 39 insertions(+), 4 deletions(-) + +--- a/drivers/gpu/drm/drm_displayid.c ++++ b/drivers/gpu/drm/drm_displayid.c +@@ -9,6 +9,34 @@ + #include "drm_crtc_internal.h" + #include "drm_displayid_internal.h" + ++enum { ++ QUIRK_IGNORE_CHECKSUM, ++}; ++ ++struct displayid_quirk { ++ const struct drm_edid_ident ident; ++ u8 quirks; ++}; ++ ++static const struct displayid_quirk quirks[] = { ++ { ++ .ident = DRM_EDID_IDENT_INIT('C', 'S', 'O', 5142, "MNE007ZA1-5"), ++ .quirks = BIT(QUIRK_IGNORE_CHECKSUM), ++ }, ++}; ++ ++static u8 get_quirks(const struct drm_edid *drm_edid) ++{ ++ int i; ++ ++ for (i = 0; i < ARRAY_SIZE(quirks); i++) { ++ if (drm_edid_match(drm_edid, &quirks[i].ident)) ++ return quirks[i].quirks; ++ } ++ ++ return 0; ++} ++ + static const struct displayid_header * + displayid_get_header(const u8 *displayid, int length, int index) + { +@@ -23,7 +51,7 @@ displayid_get_header(const u8 *displayid + } + + static const struct displayid_header * +-validate_displayid(const u8 *displayid, int length, int idx) ++validate_displayid(const u8 *displayid, int length, int idx, bool ignore_checksum) + { + int i, dispid_length; + u8 csum = 0; +@@ -41,8 +69,11 @@ validate_displayid(const u8 *displayid, + for (i = 0; i < dispid_length; i++) + csum += displayid[idx + i]; + if (csum) { +- DRM_NOTE("DisplayID checksum invalid, remainder is %d\n", csum); 
+- return ERR_PTR(-EINVAL); ++ DRM_NOTE("DisplayID checksum invalid, remainder is %d%s\n", csum, ++ ignore_checksum ? " (ignoring)" : ""); ++ ++ if (!ignore_checksum) ++ return ERR_PTR(-EINVAL); + } + + return base; +@@ -52,6 +83,7 @@ static const u8 *find_next_displayid_ext + { + const struct displayid_header *base; + const u8 *displayid; ++ bool ignore_checksum = iter->quirks & BIT(QUIRK_IGNORE_CHECKSUM); + + displayid = drm_edid_find_extension(iter->drm_edid, DISPLAYID_EXT, &iter->ext_index); + if (!displayid) +@@ -61,7 +93,7 @@ static const u8 *find_next_displayid_ext + iter->length = EDID_LENGTH - 1; + iter->idx = 1; + +- base = validate_displayid(displayid, iter->length, iter->idx); ++ base = validate_displayid(displayid, iter->length, iter->idx, ignore_checksum); + if (IS_ERR(base)) + return NULL; + +@@ -76,6 +108,7 @@ void displayid_iter_edid_begin(const str + memset(iter, 0, sizeof(*iter)); + + iter->drm_edid = drm_edid; ++ iter->quirks = get_quirks(drm_edid); + } + + static const struct displayid_block * +--- a/drivers/gpu/drm/drm_displayid_internal.h ++++ b/drivers/gpu/drm/drm_displayid_internal.h +@@ -167,6 +167,8 @@ struct displayid_iter { + + u8 version; + u8 primary_use; ++ ++ u8 quirks; + }; + + void displayid_iter_edid_begin(const struct drm_edid *drm_edid, diff --git a/queue-6.18/drm-edid-add-drm_edid_ident_init-to-initialize-struct-drm_edid_ident.patch b/queue-6.18/drm-edid-add-drm_edid_ident_init-to-initialize-struct-drm_edid_ident.patch new file mode 100644 index 0000000000..ba900f9e37 --- /dev/null +++ b/queue-6.18/drm-edid-add-drm_edid_ident_init-to-initialize-struct-drm_edid_ident.patch @@ -0,0 +1,42 @@ +From stable+bounces-204370-greg=kroah.com@vger.kernel.org Wed Dec 31 17:19:04 2025 +From: Sasha Levin +Date: Wed, 31 Dec 2025 11:18:55 -0500 +Subject: drm/edid: add DRM_EDID_IDENT_INIT() to initialize struct drm_edid_ident +To: stable@vger.kernel.org +Cc: "Jani Nikula" , "Tiago Martins Araújo" , "Alex Deucher" , "Sasha Levin" +Message-ID: 
<20251231161856.3237284-2-sashal@kernel.org> + +From: Jani Nikula + +[ Upstream commit 8b61583f993589a64c061aa91b44f5bd350d90a5 ] + +Add a convenience helper for initializing struct drm_edid_ident. + +Cc: Tiago Martins Araújo +Acked-by: Alex Deucher +Tested-by: Tiago Martins Araújo +Cc: stable@vger.kernel.org +Link: https://patch.msgid.link/710b2ac6a211606ec1f90afa57b79e8c7375a27e.1761681968.git.jani.nikula@intel.com +Signed-off-by: Jani Nikula +Stable-dep-of: 83cbb4d33dc2 ("drm/displayid: add quirk to ignore DisplayID checksum errors") +Signed-off-by: Sasha Levin +Signed-off-by: Greg Kroah-Hartman +--- + include/drm/drm_edid.h | 6 ++++++ + 1 file changed, 6 insertions(+) + +--- a/include/drm/drm_edid.h ++++ b/include/drm/drm_edid.h +@@ -340,6 +340,12 @@ struct drm_edid_ident { + const char *name; + }; + ++#define DRM_EDID_IDENT_INIT(_vend_chr_0, _vend_chr_1, _vend_chr_2, _product_id, _name) \ ++{ \ ++ .panel_id = drm_edid_encode_panel_id(_vend_chr_0, _vend_chr_1, _vend_chr_2, _product_id), \ ++ .name = _name, \ ++} ++ + #define EDID_PRODUCT_ID(e) ((e)->prod_code[0] | ((e)->prod_code[1] << 8)) + + /* Short Audio Descriptor */ diff --git a/queue-6.18/drm-nova-depend-on-config_64bit.patch b/queue-6.18/drm-nova-depend-on-config_64bit.patch new file mode 100644 index 0000000000..41c6245380 --- /dev/null +++ b/queue-6.18/drm-nova-depend-on-config_64bit.patch @@ -0,0 +1,31 @@ +From ba1b40ed0e34bab597fd90d4c4e9f7397f878c8f Mon Sep 17 00:00:00 2001 +From: Danilo Krummrich +Date: Tue, 28 Oct 2025 12:00:52 +0100 +Subject: drm: nova: depend on CONFIG_64BIT + +From: Danilo Krummrich + +commit ba1b40ed0e34bab597fd90d4c4e9f7397f878c8f upstream. + +nova-core already depends on CONFIG_64BIT, hence also depend on +CONFIG_64BIT for nova-drm. 
+ +Reviewed-by: Alexandre Courbot +Reviewed-by: John Hubbard +Link: https://patch.msgid.link/20251028110058.340320-1-dakr@kernel.org +Signed-off-by: Danilo Krummrich +Cc: Miguel Ojeda +Signed-off-by: Greg Kroah-Hartman +--- + drivers/gpu/drm/nova/Kconfig | 1 + + 1 file changed, 1 insertion(+) + +--- a/drivers/gpu/drm/nova/Kconfig ++++ b/drivers/gpu/drm/nova/Kconfig +@@ -1,5 +1,6 @@ + config DRM_NOVA + tristate "Nova DRM driver" ++ depends on 64BIT + depends on DRM=y + depends on PCI + depends on RUST diff --git a/queue-6.18/kvm-s390-fix-gmap_helper_zap_one_page-again.patch b/queue-6.18/kvm-s390-fix-gmap_helper_zap_one_page-again.patch new file mode 100644 index 0000000000..fa74d87319 --- /dev/null +++ b/queue-6.18/kvm-s390-fix-gmap_helper_zap_one_page-again.patch @@ -0,0 +1,57 @@ +From stable+bounces-204314-greg=kroah.com@vger.kernel.org Wed Dec 31 04:16:32 2025 +From: Sasha Levin +Date: Tue, 30 Dec 2025 22:16:26 -0500 +Subject: KVM: s390: Fix gmap_helper_zap_one_page() again +To: stable@vger.kernel.org +Cc: Claudio Imbrenda , Marc Hartmayer , Christian Borntraeger , Heiko Carstens , Sasha Levin +Message-ID: <20251231031626.2684565-1-sashal@kernel.org> + +From: Claudio Imbrenda + +[ Upstream commit 2f393c228cc519ddf19b8c6c05bf15723241aa96 ] + +A few checks were missing in gmap_helper_zap_one_page(), which can lead +to memory corruption in the guest under specific circumstances. + +Add the missing checks. 
+ +Fixes: 5deafa27d9ae ("KVM: s390: Fix to clear PTE when discarding a swapped page") +Cc: stable@vger.kernel.org +Reported-by: Marc Hartmayer +Tested-by: Marc Hartmayer +Acked-by: Christian Borntraeger +Signed-off-by: Claudio Imbrenda +Signed-off-by: Heiko Carstens +[ adapted ptep_zap_softleaf_entry() and softleaf_from_pte() calls to ptep_zap_swap_entry() and pte_to_swp_entry() ] +Signed-off-by: Sasha Levin +Signed-off-by: Greg Kroah-Hartman +--- + arch/s390/mm/gmap_helpers.c | 9 +++++++-- + 1 file changed, 7 insertions(+), 2 deletions(-) + +--- a/arch/s390/mm/gmap_helpers.c ++++ b/arch/s390/mm/gmap_helpers.c +@@ -47,6 +47,7 @@ static void ptep_zap_swap_entry(struct m + void gmap_helper_zap_one_page(struct mm_struct *mm, unsigned long vmaddr) + { + struct vm_area_struct *vma; ++ unsigned long pgstev; + spinlock_t *ptl; + pgste_t pgste; + pte_t *ptep; +@@ -65,9 +66,13 @@ void gmap_helper_zap_one_page(struct mm_ + if (pte_swap(*ptep)) { + preempt_disable(); + pgste = pgste_get_lock(ptep); ++ pgstev = pgste_val(pgste); + +- ptep_zap_swap_entry(mm, pte_to_swp_entry(*ptep)); +- pte_clear(mm, vmaddr, ptep); ++ if ((pgstev & _PGSTE_GPS_USAGE_MASK) == _PGSTE_GPS_USAGE_UNUSED || ++ (pgstev & _PGSTE_GPS_ZERO)) { ++ ptep_zap_swap_entry(mm, pte_to_swp_entry(*ptep)); ++ pte_clear(mm, vmaddr, ptep); ++ } + + pgste_set_unlock(ptep, pgste); + preempt_enable(); diff --git a/queue-6.18/mm-huge_memory-merge-uniform_split_supported-and-non_uniform_split_supported.patch b/queue-6.18/mm-huge_memory-merge-uniform_split_supported-and-non_uniform_split_supported.patch new file mode 100644 index 0000000000..4eb93f1daa --- /dev/null +++ b/queue-6.18/mm-huge_memory-merge-uniform_split_supported-and-non_uniform_split_supported.patch @@ -0,0 +1,186 @@ +From stable+bounces-204176-greg=kroah.com@vger.kernel.org Tue Dec 30 03:48:38 2025 +From: Sasha Levin +Date: Mon, 29 Dec 2025 21:48:31 -0500 +Subject: mm/huge_memory: merge uniform_split_supported() and non_uniform_split_supported() +To: 
stable@vger.kernel.org +Cc: Wei Yang , Zi Yan , "David Hildenbrand (Red Hat)" , Baolin Wang , Barry Song , Dev Jain , Lance Yang , Liam Howlett , Lorenzo Stoakes , Nico Pache , Ryan Roberts , Andrew Morton , Sasha Levin +Message-ID: <20251230024831.1972219-1-sashal@kernel.org> + +From: Wei Yang + +[ Upstream commit 8a0e4bdddd1c998b894d879a1d22f1e745606215 ] + +uniform_split_supported() and non_uniform_split_supported() share +significantly similar logic. + +The only functional difference is that uniform_split_supported() includes +an additional check on the requested @new_order. + +The reason for this check comes from the following two aspects: + + * some file system or swap cache just supports order-0 folio + * the behavioral difference between uniform/non-uniform split + +The behavioral difference between uniform split and non-uniform: + + * uniform split splits folio directly to @new_order + * non-uniform split creates after-split folios with orders from + folio_order(folio) - 1 to new_order. + +This means for non-uniform split or new_order > 0 split we should check the +file system and swap cache respectively. + +This commit unifies the logic and merges the two functions into a single +combined helper, removing redundant code and simplifying the split +support checking mechanism. 
+ +Link: https://lkml.kernel.org/r/20251106034155.21398-3-richard.weiyang@gmail.com +Fixes: c010d47f107f ("mm: thp: split huge page to any lower order pages") +Signed-off-by: Wei Yang +Reviewed-by: Zi Yan +Cc: Zi Yan +Cc: "David Hildenbrand (Red Hat)" +Cc: Baolin Wang +Cc: Barry Song +Cc: Dev Jain +Cc: Lance Yang +Cc: Liam Howlett +Cc: Lorenzo Stoakes +Cc: Nico Pache +Cc: Ryan Roberts +Cc: +Signed-off-by: Andrew Morton +[ split_type => uniform_split and replaced SPLIT_TYPE_NON_UNIFORM checks ] +Signed-off-by: Sasha Levin +Signed-off-by: Greg Kroah-Hartman +--- + include/linux/huge_mm.h | 8 ++--- + mm/huge_memory.c | 71 ++++++++++++++++++++---------------------------- + 2 files changed, 33 insertions(+), 46 deletions(-) + +--- a/include/linux/huge_mm.h ++++ b/include/linux/huge_mm.h +@@ -369,10 +369,8 @@ int split_huge_page_to_list_to_order(str + unsigned int new_order); + int min_order_for_split(struct folio *folio); + int split_folio_to_list(struct folio *folio, struct list_head *list); +-bool uniform_split_supported(struct folio *folio, unsigned int new_order, +- bool warns); +-bool non_uniform_split_supported(struct folio *folio, unsigned int new_order, +- bool warns); ++bool folio_split_supported(struct folio *folio, unsigned int new_order, ++ bool uniform_split, bool warns); + int folio_split(struct folio *folio, unsigned int new_order, struct page *page, + struct list_head *list); + /* +@@ -392,7 +390,7 @@ int folio_split(struct folio *folio, uns + static inline int try_folio_split_to_order(struct folio *folio, + struct page *page, unsigned int new_order) + { +- if (!non_uniform_split_supported(folio, new_order, /* warns= */ false)) ++ if (!folio_split_supported(folio, new_order, false, /* warns= */ false)) + return split_huge_page_to_list_to_order(&folio->page, NULL, + new_order); + return folio_split(folio, new_order, page, NULL); +--- a/mm/huge_memory.c ++++ b/mm/huge_memory.c +@@ -3515,8 +3515,8 @@ static int __split_unmapped_folio(struct + return ret; + 
} + +-bool non_uniform_split_supported(struct folio *folio, unsigned int new_order, +- bool warns) ++bool folio_split_supported(struct folio *folio, unsigned int new_order, ++ bool uniform_split, bool warns) + { + if (folio_test_anon(folio)) { + /* order-1 is not supported for anonymous THP. */ +@@ -3524,48 +3524,41 @@ bool non_uniform_split_supported(struct + "Cannot split to order-1 folio"); + if (new_order == 1) + return false; +- } else if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && +- !mapping_large_folio_support(folio->mapping)) { +- /* +- * No split if the file system does not support large folio. +- * Note that we might still have THPs in such mappings due to +- * CONFIG_READ_ONLY_THP_FOR_FS. But in that case, the mapping +- * does not actually support large folios properly. +- */ +- VM_WARN_ONCE(warns, +- "Cannot split file folio to non-0 order"); +- return false; +- } +- +- /* Only swapping a whole PMD-mapped folio is supported */ +- if (folio_test_swapcache(folio)) { +- VM_WARN_ONCE(warns, +- "Cannot split swapcache folio to non-0 order"); +- return false; +- } +- +- return true; +-} +- +-/* See comments in non_uniform_split_supported() */ +-bool uniform_split_supported(struct folio *folio, unsigned int new_order, +- bool warns) +-{ +- if (folio_test_anon(folio)) { +- VM_WARN_ONCE(warns && new_order == 1, +- "Cannot split to order-1 folio"); +- if (new_order == 1) +- return false; +- } else if (new_order) { ++ } else if (!uniform_split || new_order) { + if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && + !mapping_large_folio_support(folio->mapping)) { ++ /* ++ * We can always split a folio down to a single page ++ * (new_order == 0) uniformly. ++ * ++ * For any other scenario ++ * a) uniform split targeting a large folio ++ * (new_order > 0) ++ * b) any non-uniform split ++ * we must confirm that the file system supports large ++ * folios. 
++ * ++ * Note that we might still have THPs in such ++ * mappings, which is created from khugepaged when ++ * CONFIG_READ_ONLY_THP_FOR_FS is enabled. But in that ++ * case, the mapping does not actually support large ++ * folios properly. ++ */ + VM_WARN_ONCE(warns, + "Cannot split file folio to non-0 order"); + return false; + } + } + +- if (new_order && folio_test_swapcache(folio)) { ++ /* ++ * swapcache folio could only be split to order 0 ++ * ++ * non-uniform split creates after-split folios with orders from ++ * folio_order(folio) - 1 to new_order, making it not suitable for any ++ * swapcache folio split. Only uniform split to order-0 can be used ++ * here. ++ */ ++ if ((!uniform_split || new_order) && folio_test_swapcache(folio)) { + VM_WARN_ONCE(warns, + "Cannot split swapcache folio to non-0 order"); + return false; +@@ -3632,11 +3625,7 @@ static int __folio_split(struct folio *f + if (new_order >= folio_order(folio)) + return -EINVAL; + +- if (uniform_split && !uniform_split_supported(folio, new_order, true)) +- return -EINVAL; +- +- if (!uniform_split && +- !non_uniform_split_supported(folio, new_order, true)) ++ if (!folio_split_supported(folio, new_order, uniform_split, /* warn = */ true)) + return -EINVAL; + + is_hzp = is_huge_zero_folio(folio); diff --git a/queue-6.18/sched-core-add-comment-explaining-force-idle-vruntime-snapshots.patch b/queue-6.18/sched-core-add-comment-explaining-force-idle-vruntime-snapshots.patch new file mode 100644 index 0000000000..5cbed64b57 --- /dev/null +++ b/queue-6.18/sched-core-add-comment-explaining-force-idle-vruntime-snapshots.patch @@ -0,0 +1,218 @@ +From stable+bounces-204125-greg=kroah.com@vger.kernel.org Mon Dec 29 20:35:44 2025 +From: Sasha Levin +Date: Mon, 29 Dec 2025 14:35:37 -0500 +Subject: sched/core: Add comment explaining force-idle vruntime snapshots +To: stable@vger.kernel.org +Cc: Peter Zijlstra , Sasha Levin +Message-ID: <20251229193539.1640748-1-sashal@kernel.org> + +From: Peter Zijlstra + +[ 
Upstream commit 9359d9785d85bb53f1ff1738a59aeeec4b878906 ] + +I always end up having to re-read these emails every time I look at +this code. And a future patch is going to change this story a little. +This means it is past time to stick them in a comment so it can be +modified and stay current. + +Signed-off-by: Peter Zijlstra (Intel) +Link: https://lkml.kernel.org/r/20200506143506.GH5298@hirez.programming.kicks-ass.net +Link: https://lkml.kernel.org/r/20200515103844.GG2978@hirez.programming.kicks-ass.net +Link: https://patch.msgid.link/20251106111603.GB4068168@noisy.programming.kicks-ass.net +Stable-dep-of: 79f3f9bedd14 ("sched/eevdf: Fix min_vruntime vs avg_vruntime") +Signed-off-by: Sasha Levin +Signed-off-by: Greg Kroah-Hartman +--- + kernel/sched/fair.c | 181 ++++++++++++++++++++++++++++++++++++++++++++++++++++ + 1 file changed, 181 insertions(+) + +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -13014,6 +13014,187 @@ static inline void task_tick_core(struct + } + + /* ++ * Consider any infeasible weight scenario. Take for instance two tasks, ++ * each bound to their respective sibling, one with weight 1 and one with ++ * weight 2. Then the lower weight task will run ahead of the higher weight ++ * task without bound. ++ * ++ * This utterly destroys the concept of a shared time base. ++ * ++ * Remember; all this is about a proportionally fair scheduling, where each ++ * tasks receives: ++ * ++ * w_i ++ * dt_i = ---------- dt (1) ++ * \Sum_j w_j ++ * ++ * which we do by tracking a virtual time, s_i: ++ * ++ * 1 ++ * s_i = --- d[t]_i (2) ++ * w_i ++ * ++ * Where d[t] is a delta of discrete time, while dt is an infinitesimal. 
++ * The immediate corollary is that the ideal schedule S, where (2) to use ++ * an infinitesimal delta, is: ++ * ++ * 1 ++ * S = ---------- dt (3) ++ * \Sum_i w_i ++ * ++ * From which we can define the lag, or deviation from the ideal, as: ++ * ++ * lag(i) = S - s_i (4) ++ * ++ * And since the one and only purpose is to approximate S, we get that: ++ * ++ * \Sum_i w_i lag(i) := 0 (5) ++ * ++ * If this were not so, we no longer converge to S, and we can no longer ++ * claim our scheduler has any of the properties we derive from S. This is ++ * exactly what you did above, you broke it! ++ * ++ * ++ * Let's continue for a while though; to see if there is anything useful to ++ * be learned. We can combine (1)-(3) or (4)-(5) and express S in s_i: ++ * ++ * \Sum_i w_i s_i ++ * S = -------------- (6) ++ * \Sum_i w_i ++ * ++ * Which gives us a way to compute S, given our s_i. Now, if you've read ++ * our code, you know that we do not in fact do this, the reason for this ++ * is two-fold. Firstly, computing S in that way requires a 64bit division ++ * for every time we'd use it (see 12), and secondly, this only describes ++ * the steady-state, it doesn't handle dynamics. ++ * ++ * Anyway, in (6): s_i -> x + (s_i - x), to get: ++ * ++ * \Sum_i w_i (s_i - x) ++ * S - x = -------------------- (7) ++ * \Sum_i w_i ++ * ++ * Which shows that S and s_i transform alike (which makes perfect sense ++ * given that S is basically the (weighted) average of s_i). ++ * ++ * Then: ++ * ++ * x -> s_min := min{s_i} (8) ++ * ++ * to obtain: ++ * ++ * \Sum_i w_i (s_i - s_min) ++ * S = s_min + ------------------------ (9) ++ * \Sum_i w_i ++ * ++ * Which already looks familiar, and is the basis for our current ++ * approximation: ++ * ++ * S ~= s_min (10) ++ * ++ * Now, obviously, (10) is absolute crap :-), but it sorta works. ++ * ++ * So the thing to remember is that the above is strictly UP. 
It is ++ * possible to generalize to multiple runqueues -- however it gets really ++ * yuck when you have to add affinity support, as illustrated by our very ++ * first counter-example. ++ * ++ * Luckily I think we can avoid needing a full multi-queue variant for ++ * core-scheduling (or load-balancing). The crucial observation is that we ++ * only actually need this comparison in the presence of forced-idle; only ++ * then do we need to tell if the stalled rq has higher priority over the ++ * other. ++ * ++ * [XXX assumes SMT2; better consider the more general case, I suspect ++ * it'll work out because our comparison is always between 2 rqs and the ++ * answer is only interesting if one of them is forced-idle] ++ * ++ * And (under assumption of SMT2) when there is forced-idle, there is only ++ * a single queue, so everything works like normal. ++ * ++ * Let, for our runqueue 'k': ++ * ++ * T_k = \Sum_i w_i s_i ++ * W_k = \Sum_i w_i ; for all i of k (11) ++ * ++ * Then we can write (6) like: ++ * ++ * T_k ++ * S_k = --- (12) ++ * W_k ++ * ++ * From which immediately follows that: ++ * ++ * T_k + T_l ++ * S_k+l = --------- (13) ++ * W_k + W_l ++ * ++ * On which we can define a combined lag: ++ * ++ * lag_k+l(i) := S_k+l - s_i (14) ++ * ++ * And that gives us the tools to compare tasks across a combined runqueue. ++ * ++ * ++ * Combined this gives the following: ++ * ++ * a) when a runqueue enters force-idle, sync it against its sibling rq(s) ++ * using (7); this only requires storing single 'time'-stamps. ++ * ++ * b) when comparing tasks between 2 runqueues of which one is forced-idle, ++ * compare the combined lag, per (14). ++ * ++ * Now, of course cgroups (I so hate them) make this more interesting in ++ * that a) seems to suggest we need to iterate all cgroups on a CPU at such ++ * boundaries, but I think we can avoid that. The force-idle is for the ++ * whole CPU, all its rqs. So we can mark it in the root and lazily ++ * propagate downward on demand. 
++ */ ++ ++/* ++ * So this sync is basically a relative reset of S to 0. ++ * ++ * So with 2 queues, when one goes idle, we drop them both to 0 and one ++ * then increases due to not being idle, and the idle one builds up lag to ++ * get re-elected. So far so simple, right? ++ * ++ * When there's 3, we can have the situation where 2 run and one is idle, ++ * we sync to 0 and let the idle one build up lag to get re-election. Now ++ * suppose another one also drops idle. At this point dropping all to 0 ++ * again would destroy the built-up lag from the queue that was already ++ * idle, not good. ++ * ++ * So instead of syncing everything, we can: ++ * ++ * less := !((s64)(s_a - s_b) <= 0) ++ * ++ * (v_a - S_a) - (v_b - S_b) == v_a - v_b - S_a + S_b ++ * == v_a - (v_b - S_a + S_b) ++ * ++ * IOW, we can recast the (lag) comparison to a one-sided difference. ++ * So if then, instead of syncing the whole queue, sync the idle queue ++ * against the active queue with S_a + S_b at the point where we sync. ++ * ++ * (XXX consider the implication of living in a cyclic group: N / 2^n N) ++ * ++ * This gives us means of syncing single queues against the active queue, ++ * and for already idle queues to preserve their build-up lag. ++ * ++ * Of course, then we get the situation where there's 2 active and one ++ * going idle, who do we pick to sync against? Theory would have us sync ++ * against the combined S, but as we've already demonstrated, there is no ++ * such thing in infeasible weight scenarios. ++ * ++ * One thing I've considered; and this is where that core_active rudiment ++ * came from, is having active queues sync up between themselves after ++ * every tick. This limits the observed divergence due to the work ++ * conservancy. ++ * ++ * On top of that, we can improve upon things by moving away from our ++ * horrible (10) hack and moving to (9) and employing (13) here. ++ */ ++ ++/* + * se_fi_update - Update the cfs_rq->min_vruntime_fi in a CFS hierarchy if needed. 
+ */ + static void se_fi_update(const struct sched_entity *se, unsigned int fi_seq, diff --git a/queue-6.18/sched-eevdf-fix-min_vruntime-vs-avg_vruntime.patch b/queue-6.18/sched-eevdf-fix-min_vruntime-vs-avg_vruntime.patch new file mode 100644 index 0000000000..94aad23ce7 --- /dev/null +++ b/queue-6.18/sched-eevdf-fix-min_vruntime-vs-avg_vruntime.patch @@ -0,0 +1,387 @@ +From stable+bounces-204126-greg=kroah.com@vger.kernel.org Mon Dec 29 20:35:47 2025 +From: Sasha Levin +Date: Mon, 29 Dec 2025 14:35:38 -0500 +Subject: sched/eevdf: Fix min_vruntime vs avg_vruntime +To: stable@vger.kernel.org +Cc: Peter Zijlstra , Zicheng Qu , Sasha Levin +Message-ID: <20251229193539.1640748-2-sashal@kernel.org> + +From: Peter Zijlstra + +[ Upstream commit 79f3f9bedd149ea438aaeb0fb6a083637affe205 ] + +Basically, from the constraint that the sum of lag is zero, you can +infer that the 0-lag point is the weighted average of the individual +vruntime, which is what we're trying to compute: + + \Sum w_i * v_i + avg = -------------- + \Sum w_i + +Now, since vruntime takes the whole u64 (worse, it wraps), this +multiplication term in the numerator is not something we can compute; +instead we do the min_vruntime (v0 henceforth) thing like: + + v_i = (v_i - v0) + v0 + +This does two things: + - it keeps the key: (v_i - v0) 'small'; + - it creates a relative 0-point in the modular space. + +If you do that substitution and work it all out, you end up with: + + \Sum w_i * (v_i - v0) + avg = --------------------- + v0 + \Sum w_i + +Since you cannot very well track a ratio like that (and not suffer +terrible numerical problems) we simply track the numerator and +denominator individually and only perform the division when strictly +needed. + +Notably, the numerator lives in cfs_rq->avg_vruntime and the denominator +lives in cfs_rq->avg_load. 
+ +The one extra 'funny' is that these numbers track the entities in the +tree, and current is typically outside of the tree, so avg_vruntime() +adds current when needed before doing the division. + +(vruntime_eligible() elides the division by cross-wise multiplication) + +Anyway, as mentioned above, we currently use the CFS era min_vruntime +for this purpose. However, this thing can only move forward, while the +above avg can in fact move backward (when a non-eligible task leaves, +the average becomes smaller), this can cause trouble when through +happenstance (or construction) these values drift far enough apart to +wreck the game. + +Replace cfs_rq::min_vruntime with cfs_rq::zero_vruntime which is kept +near/at avg_vruntime, following its motion. + +The down-side is that this requires computing the avg more often. + +Fixes: 147f3efaa241 ("sched/fair: Implement an EEVDF-like scheduling policy") +Reported-by: Zicheng Qu +Signed-off-by: Peter Zijlstra (Intel) +Link: https://patch.msgid.link/20251106111741.GC4068168@noisy.programming.kicks-ass.net +Cc: stable@vger.kernel.org +Signed-off-by: Sasha Levin +Signed-off-by: Greg Kroah-Hartman +--- + kernel/sched/debug.c | 8 +-- + kernel/sched/fair.c | 114 +++++++++++---------------------------------------- + kernel/sched/sched.h | 4 - + 3 files changed, 31 insertions(+), 95 deletions(-) + +--- a/kernel/sched/debug.c ++++ b/kernel/sched/debug.c +@@ -796,7 +796,7 @@ static void print_rq(struct seq_file *m, + + void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) + { +- s64 left_vruntime = -1, min_vruntime, right_vruntime = -1, left_deadline = -1, spread; ++ s64 left_vruntime = -1, zero_vruntime, right_vruntime = -1, left_deadline = -1, spread; + struct sched_entity *last, *first, *root; + struct rq *rq = cpu_rq(cpu); + unsigned long flags; +@@ -819,15 +819,15 @@ void print_cfs_rq(struct seq_file *m, in + last = __pick_last_entity(cfs_rq); + if (last) + right_vruntime = last->vruntime; +- min_vruntime = 
cfs_rq->min_vruntime; ++ zero_vruntime = cfs_rq->zero_vruntime; + raw_spin_rq_unlock_irqrestore(rq, flags); + + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "left_deadline", + SPLIT_NS(left_deadline)); + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "left_vruntime", + SPLIT_NS(left_vruntime)); +- SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "min_vruntime", +- SPLIT_NS(min_vruntime)); ++ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "zero_vruntime", ++ SPLIT_NS(zero_vruntime)); + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "avg_vruntime", + SPLIT_NS(avg_vruntime(cfs_rq))); + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "right_vruntime", +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -554,7 +554,7 @@ static inline bool entity_before(const s + + static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) + { +- return (s64)(se->vruntime - cfs_rq->min_vruntime); ++ return (s64)(se->vruntime - cfs_rq->zero_vruntime); + } + + #define __node_2_se(node) \ +@@ -606,13 +606,13 @@ static inline s64 entity_key(struct cfs_ + * + * Which we track using: + * +- * v0 := cfs_rq->min_vruntime ++ * v0 := cfs_rq->zero_vruntime + * \Sum (v_i - v0) * w_i := cfs_rq->avg_vruntime + * \Sum w_i := cfs_rq->avg_load + * +- * Since min_vruntime is a monotonic increasing variable that closely tracks +- * the per-task service, these deltas: (v_i - v), will be in the order of the +- * maximal (virtual) lag induced in the system due to quantisation. ++ * Since zero_vruntime closely tracks the per-task service, these ++ * deltas: (v_i - v), will be in the order of the maximal (virtual) lag ++ * induced in the system due to quantisation. + * + * Also, we use scale_load_down() to reduce the size. 
+ * +@@ -671,7 +671,7 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq) + avg = div_s64(avg, load); + } + +- return cfs_rq->min_vruntime + avg; ++ return cfs_rq->zero_vruntime + avg; + } + + /* +@@ -732,7 +732,7 @@ static int vruntime_eligible(struct cfs_ + load += weight; + } + +- return avg >= (s64)(vruntime - cfs_rq->min_vruntime) * load; ++ return avg >= (s64)(vruntime - cfs_rq->zero_vruntime) * load; + } + + int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se) +@@ -740,42 +740,14 @@ int entity_eligible(struct cfs_rq *cfs_r + return vruntime_eligible(cfs_rq, se->vruntime); + } + +-static u64 __update_min_vruntime(struct cfs_rq *cfs_rq, u64 vruntime) ++static void update_zero_vruntime(struct cfs_rq *cfs_rq) + { +- u64 min_vruntime = cfs_rq->min_vruntime; +- /* +- * open coded max_vruntime() to allow updating avg_vruntime +- */ +- s64 delta = (s64)(vruntime - min_vruntime); +- if (delta > 0) { +- avg_vruntime_update(cfs_rq, delta); +- min_vruntime = vruntime; +- } +- return min_vruntime; +-} +- +-static void update_min_vruntime(struct cfs_rq *cfs_rq) +-{ +- struct sched_entity *se = __pick_root_entity(cfs_rq); +- struct sched_entity *curr = cfs_rq->curr; +- u64 vruntime = cfs_rq->min_vruntime; ++ u64 vruntime = avg_vruntime(cfs_rq); ++ s64 delta = (s64)(vruntime - cfs_rq->zero_vruntime); + +- if (curr) { +- if (curr->on_rq) +- vruntime = curr->vruntime; +- else +- curr = NULL; +- } +- +- if (se) { +- if (!curr) +- vruntime = se->min_vruntime; +- else +- vruntime = min_vruntime(vruntime, se->min_vruntime); +- } ++ avg_vruntime_update(cfs_rq, delta); + +- /* ensure we never gain time by being placed backwards. 
*/ +- cfs_rq->min_vruntime = __update_min_vruntime(cfs_rq, vruntime); ++ cfs_rq->zero_vruntime = vruntime; + } + + static inline u64 cfs_rq_min_slice(struct cfs_rq *cfs_rq) +@@ -848,6 +820,7 @@ RB_DECLARE_CALLBACKS(static, min_vruntim + static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) + { + avg_vruntime_add(cfs_rq, se); ++ update_zero_vruntime(cfs_rq); + se->min_vruntime = se->vruntime; + se->min_slice = se->slice; + rb_add_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline, +@@ -859,6 +832,7 @@ static void __dequeue_entity(struct cfs_ + rb_erase_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline, + &min_vruntime_cb); + avg_vruntime_sub(cfs_rq, se); ++ update_zero_vruntime(cfs_rq); + } + + struct sched_entity *__pick_root_entity(struct cfs_rq *cfs_rq) +@@ -1226,7 +1200,6 @@ static void update_curr(struct cfs_rq *c + + curr->vruntime += calc_delta_fair(delta_exec, curr); + resched = update_deadline(cfs_rq, curr); +- update_min_vruntime(cfs_rq); + + if (entity_is_task(curr)) { + /* +@@ -3808,15 +3781,6 @@ static void reweight_entity(struct cfs_r + if (!curr) + __enqueue_entity(cfs_rq, se); + cfs_rq->nr_queued++; +- +- /* +- * The entity's vruntime has been adjusted, so let's check +- * whether the rq-wide min_vruntime needs updated too. Since +- * the calculations above require stable min_vruntime rather +- * than up-to-date one, we do the update at the end of the +- * reweight process. +- */ +- update_min_vruntime(cfs_rq); + } + } + +@@ -5432,15 +5396,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, st + + update_cfs_group(se); + +- /* +- * Now advance min_vruntime if @se was the entity holding it back, +- * except when: DEQUEUE_SAVE && !DEQUEUE_MOVE, in this case we'll be +- * put back on, and if we advance min_vruntime, we'll be placed back +- * further than we started -- i.e. we'll be penalized. 
+- */ +- if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE) +- update_min_vruntime(cfs_rq); +- + if (flags & DEQUEUE_DELAYED) + finish_delayed_dequeue_entity(se); + +@@ -9028,7 +8983,6 @@ static void yield_task_fair(struct rq *r + if (entity_eligible(cfs_rq, se)) { + se->vruntime = se->deadline; + se->deadline += calc_delta_fair(se->slice, se); +- update_min_vruntime(cfs_rq); + } + } + +@@ -13077,23 +13031,6 @@ static inline void task_tick_core(struct + * Which shows that S and s_i transform alike (which makes perfect sense + * given that S is basically the (weighted) average of s_i). + * +- * Then: +- * +- * x -> s_min := min{s_i} (8) +- * +- * to obtain: +- * +- * \Sum_i w_i (s_i - s_min) +- * S = s_min + ------------------------ (9) +- * \Sum_i w_i +- * +- * Which already looks familiar, and is the basis for our current +- * approximation: +- * +- * S ~= s_min (10) +- * +- * Now, obviously, (10) is absolute crap :-), but it sorta works. +- * + * So the thing to remember is that the above is strictly UP. It is + * possible to generalize to multiple runqueues -- however it gets really + * yuck when you have to add affinity support, as illustrated by our very +@@ -13115,23 +13052,23 @@ static inline void task_tick_core(struct + * Let, for our runqueue 'k': + * + * T_k = \Sum_i w_i s_i +- * W_k = \Sum_i w_i ; for all i of k (11) ++ * W_k = \Sum_i w_i ; for all i of k (8) + * + * Then we can write (6) like: + * + * T_k +- * S_k = --- (12) ++ * S_k = --- (9) + * W_k + * + * From which immediately follows that: + * + * T_k + T_l +- * S_k+l = --------- (13) ++ * S_k+l = --------- (10) + * W_k + W_l + * + * On which we can define a combined lag: + * +- * lag_k+l(i) := S_k+l - s_i (14) ++ * lag_k+l(i) := S_k+l - s_i (11) + * + * And that gives us the tools to compare tasks across a combined runqueue. + * +@@ -13142,7 +13079,7 @@ static inline void task_tick_core(struct + * using (7); this only requires storing single 'time'-stamps. 
+ * + * b) when comparing tasks between 2 runqueues of which one is forced-idle, +- * compare the combined lag, per (14). ++ * compare the combined lag, per (11). + * + * Now, of course cgroups (I so hate them) make this more interesting in + * that a) seems to suggest we need to iterate all cgroup on a CPU at such +@@ -13190,12 +13127,11 @@ static inline void task_tick_core(struct + * every tick. This limits the observed divergence due to the work + * conservancy. + * +- * On top of that, we can improve upon things by moving away from our +- * horrible (10) hack and moving to (9) and employing (13) here. ++ * On top of that, we can improve upon things by employing (10) here. + */ + + /* +- * se_fi_update - Update the cfs_rq->min_vruntime_fi in a CFS hierarchy if needed. ++ * se_fi_update - Update the cfs_rq->zero_vruntime_fi in a CFS hierarchy if needed. + */ + static void se_fi_update(const struct sched_entity *se, unsigned int fi_seq, + bool forceidle) +@@ -13209,7 +13145,7 @@ static void se_fi_update(const struct sc + cfs_rq->forceidle_seq = fi_seq; + } + +- cfs_rq->min_vruntime_fi = cfs_rq->min_vruntime; ++ cfs_rq->zero_vruntime_fi = cfs_rq->zero_vruntime; + } + } + +@@ -13262,11 +13198,11 @@ bool cfs_prio_less(const struct task_str + + /* + * Find delta after normalizing se's vruntime with its cfs_rq's +- * min_vruntime_fi, which would have been updated in prior calls ++ * zero_vruntime_fi, which would have been updated in prior calls + * to se_fi_update(). 
+ */ + delta = (s64)(sea->vruntime - seb->vruntime) + +- (s64)(cfs_rqb->min_vruntime_fi - cfs_rqa->min_vruntime_fi); ++ (s64)(cfs_rqb->zero_vruntime_fi - cfs_rqa->zero_vruntime_fi); + + return delta > 0; + } +@@ -13502,7 +13438,7 @@ static void set_next_task_fair(struct rq + void init_cfs_rq(struct cfs_rq *cfs_rq) + { + cfs_rq->tasks_timeline = RB_ROOT_CACHED; +- cfs_rq->min_vruntime = (u64)(-(1LL << 20)); ++ cfs_rq->zero_vruntime = (u64)(-(1LL << 20)); + raw_spin_lock_init(&cfs_rq->removed.lock); + } + +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -682,10 +682,10 @@ struct cfs_rq { + s64 avg_vruntime; + u64 avg_load; + +- u64 min_vruntime; ++ u64 zero_vruntime; + #ifdef CONFIG_SCHED_CORE + unsigned int forceidle_seq; +- u64 min_vruntime_fi; ++ u64 zero_vruntime_fi; + #endif + + struct rb_root_cached tasks_timeline; diff --git a/queue-6.18/sched-proxy-yield-the-donor-task.patch b/queue-6.18/sched-proxy-yield-the-donor-task.patch new file mode 100644 index 0000000000..cc3328bd90 --- /dev/null +++ b/queue-6.18/sched-proxy-yield-the-donor-task.patch @@ -0,0 +1,111 @@ +From 127b90315ca07ccad2618db7ba950a63e3b32d22 Mon Sep 17 00:00:00 2001 +From: Fernand Sieber +Date: Thu, 6 Nov 2025 12:40:10 +0200 +Subject: sched/proxy: Yield the donor task + +From: Fernand Sieber + +commit 127b90315ca07ccad2618db7ba950a63e3b32d22 upstream. + +When executing a task in proxy context, handle yields as if they were +requested by the donor task. This matches the traditional PI semantics +of yield() as well. + +This avoids scenario like proxy task yielding, pick next task selecting the +same previous blocked donor, running the proxy task again, etc. 
+ +Reported-by: kernel test robot +Closes: https://lore.kernel.org/oe-lkp/202510211205.1e0f5223-lkp@intel.com +Suggested-by: Peter Zijlstra +Signed-off-by: Fernand Sieber +Signed-off-by: Peter Zijlstra (Intel) +Link: https://patch.msgid.link/20251106104022.195157-1-sieberf@amazon.com +Cc: Holger Hoffstätte +Signed-off-by: Greg Kroah-Hartman +--- + kernel/sched/deadline.c | 2 +- + kernel/sched/ext.c | 4 ++-- + kernel/sched/fair.c | 2 +- + kernel/sched/rt.c | 2 +- + kernel/sched/syscalls.c | 5 +++-- + 5 files changed, 8 insertions(+), 7 deletions(-) + +--- a/kernel/sched/deadline.c ++++ b/kernel/sched/deadline.c +@@ -2143,7 +2143,7 @@ static void yield_task_dl(struct rq *rq) + * it and the bandwidth timer will wake it up and will give it + * new scheduling parameters (thanks to dl_yielded=1). + */ +- rq->curr->dl.dl_yielded = 1; ++ rq->donor->dl.dl_yielded = 1; + + update_rq_clock(rq); + update_curr_dl(rq); +--- a/kernel/sched/ext.c ++++ b/kernel/sched/ext.c +@@ -1493,7 +1493,7 @@ static bool dequeue_task_scx(struct rq * + static void yield_task_scx(struct rq *rq) + { + struct scx_sched *sch = scx_root; +- struct task_struct *p = rq->curr; ++ struct task_struct *p = rq->donor; + + if (SCX_HAS_OP(sch, yield)) + SCX_CALL_OP_2TASKS_RET(sch, SCX_KF_REST, yield, rq, p, NULL); +@@ -1504,7 +1504,7 @@ static void yield_task_scx(struct rq *rq + static bool yield_to_task_scx(struct rq *rq, struct task_struct *to) + { + struct scx_sched *sch = scx_root; +- struct task_struct *from = rq->curr; ++ struct task_struct *from = rq->donor; + + if (SCX_HAS_OP(sch, yield)) + return SCX_CALL_OP_2TASKS_RET(sch, SCX_KF_REST, yield, rq, +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -8993,7 +8993,7 @@ static void put_prev_task_fair(struct rq + */ + static void yield_task_fair(struct rq *rq) + { +- struct task_struct *curr = rq->curr; ++ struct task_struct *curr = rq->donor; + struct cfs_rq *cfs_rq = task_cfs_rq(curr); + struct sched_entity *se = &curr->se; + +--- 
a/kernel/sched/rt.c ++++ b/kernel/sched/rt.c +@@ -1490,7 +1490,7 @@ static void requeue_task_rt(struct rq *r + + static void yield_task_rt(struct rq *rq) + { +- requeue_task_rt(rq, rq->curr, 0); ++ requeue_task_rt(rq, rq->donor, 0); + } + + static int find_lowest_rq(struct task_struct *task); +--- a/kernel/sched/syscalls.c ++++ b/kernel/sched/syscalls.c +@@ -1351,7 +1351,7 @@ static void do_sched_yield(void) + rq = this_rq_lock_irq(&rf); + + schedstat_inc(rq->yld_count); +- current->sched_class->yield_task(rq); ++ rq->donor->sched_class->yield_task(rq); + + preempt_disable(); + rq_unlock_irq(rq, &rf); +@@ -1420,12 +1420,13 @@ EXPORT_SYMBOL(yield); + */ + int __sched yield_to(struct task_struct *p, bool preempt) + { +- struct task_struct *curr = current; ++ struct task_struct *curr; + struct rq *rq, *p_rq; + int yielded = 0; + + scoped_guard (raw_spinlock_irqsave, &p->pi_lock) { + rq = this_rq(); ++ curr = rq->donor; + + again: + p_rq = task_rq(p); diff --git a/queue-6.18/sched_ext-fix-incorrect-sched_class-settings-for-per-cpu-migration-tasks.patch b/queue-6.18/sched_ext-fix-incorrect-sched_class-settings-for-per-cpu-migration-tasks.patch new file mode 100644 index 0000000000..64efa45a54 --- /dev/null +++ b/queue-6.18/sched_ext-fix-incorrect-sched_class-settings-for-per-cpu-migration-tasks.patch @@ -0,0 +1,105 @@ +From stable+bounces-204127-greg=kroah.com@vger.kernel.org Mon Dec 29 20:36:45 2025 +From: Sasha Levin +Date: Mon, 29 Dec 2025 14:36:40 -0500 +Subject: sched_ext: Fix incorrect sched_class settings for per-cpu migration tasks +To: stable@vger.kernel.org +Cc: Zqiang , Andrea Righi , Tejun Heo , Sasha Levin +Message-ID: <20251229193640.1641653-1-sashal@kernel.org> + +From: Zqiang + +[ Upstream commit 1dd6c84f1c544e552848a8968599220bd464e338 ] + +When loading the ebpf scheduler, the tasks in the scx_tasks list will +be traversed and invoke __setscheduler_class() to get new sched_class. 
+However, this would also incorrectly set the per-cpu migration +task's->sched_class to rt_sched_class; even after unload, the per-cpu +migration task's->sched_class remains rt_sched_class. + +The log for this issue is as follows: + +./scx_rustland --stats 1 +[ 199.245639][ T630] sched_ext: "rustland" does not implement cgroup cpu.weight +[ 199.269213][ T630] sched_ext: BPF scheduler "rustland" enabled +04:25:09 [INFO] RustLand scheduler attached + +bpftrace -e 'iter:task /strcontains(ctx->task->comm, "migration")/ +{ printf("%s:%d->%pS\n", ctx->task->comm, ctx->task->pid, ctx->task->sched_class); }' +Attaching 1 probe... +migration/0:24->rt_sched_class+0x0/0xe0 +migration/1:27->rt_sched_class+0x0/0xe0 +migration/2:33->rt_sched_class+0x0/0xe0 +migration/3:39->rt_sched_class+0x0/0xe0 +migration/4:45->rt_sched_class+0x0/0xe0 +migration/5:52->rt_sched_class+0x0/0xe0 +migration/6:58->rt_sched_class+0x0/0xe0 +migration/7:64->rt_sched_class+0x0/0xe0 + +sched_ext: BPF scheduler "rustland" disabled (unregistered from user space) +EXIT: unregistered from user space +04:25:21 [INFO] Unregister RustLand scheduler + +bpftrace -e 'iter:task /strcontains(ctx->task->comm, "migration")/ +{ printf("%s:%d->%pS\n", ctx->task->comm, ctx->task->pid, ctx->task->sched_class); }' +Attaching 1 probe... +migration/0:24->rt_sched_class+0x0/0xe0 +migration/1:27->rt_sched_class+0x0/0xe0 +migration/2:33->rt_sched_class+0x0/0xe0 +migration/3:39->rt_sched_class+0x0/0xe0 +migration/4:45->rt_sched_class+0x0/0xe0 +migration/5:52->rt_sched_class+0x0/0xe0 +migration/6:58->rt_sched_class+0x0/0xe0 +migration/7:64->rt_sched_class+0x0/0xe0 + +This commit therefore generates a new scx_setscheduler_class() and +adds a check for stop_sched_class to replace __setscheduler_class(). 
+ +Fixes: f0e1a0643a59 ("sched_ext: Implement BPF extensible scheduler class") +Cc: stable@vger.kernel.org # v6.12+ +Signed-off-by: Zqiang +Reviewed-by: Andrea Righi +Signed-off-by: Tejun Heo +[ Adjust context ] +Signed-off-by: Sasha Levin +Signed-off-by: Greg Kroah-Hartman +--- + kernel/sched/ext.c | 14 ++++++++++---- + 1 file changed, 10 insertions(+), 4 deletions(-) + +--- a/kernel/sched/ext.c ++++ b/kernel/sched/ext.c +@@ -210,6 +210,14 @@ static struct scx_dispatch_q *find_user_ + return rhashtable_lookup_fast(&sch->dsq_hash, &dsq_id, dsq_hash_params); + } + ++static const struct sched_class *scx_setscheduler_class(struct task_struct *p) ++{ ++ if (p->sched_class == &stop_sched_class) ++ return &stop_sched_class; ++ ++ return __setscheduler_class(p->policy, p->prio); ++} ++ + /* + * scx_kf_mask enforcement. Some kfuncs can only be called from specific SCX + * ops. When invoking SCX ops, SCX_CALL_OP[_RET]() should be used to indicate +@@ -3994,8 +4002,7 @@ static void scx_disable_workfn(struct kt + scx_task_iter_start(&sti); + while ((p = scx_task_iter_next_locked(&sti))) { + const struct sched_class *old_class = p->sched_class; +- const struct sched_class *new_class = +- __setscheduler_class(p->policy, p->prio); ++ const struct sched_class *new_class = scx_setscheduler_class(p); + struct sched_enq_and_set_ctx ctx; + + if (old_class != new_class && p->se.sched_delayed) +@@ -4779,8 +4786,7 @@ static int scx_enable(struct sched_ext_o + scx_task_iter_start(&sti); + while ((p = scx_task_iter_next_locked(&sti))) { + const struct sched_class *old_class = p->sched_class; +- const struct sched_class *new_class = +- __setscheduler_class(p->policy, p->prio); ++ const struct sched_class *new_class = scx_setscheduler_class(p); + struct sched_enq_and_set_ctx ctx; + + if (!tryget_task_struct(p)) diff --git a/queue-6.18/series b/queue-6.18/series new file mode 100644 index 0000000000..60388b54c4 --- /dev/null +++ b/queue-6.18/series @@ -0,0 +1,10 @@ 
+sched-proxy-yield-the-donor-task.patch +drm-nova-depend-on-config_64bit.patch +x86-microcode-amd-select-which-microcode-patch-to-load.patch +sched-core-add-comment-explaining-force-idle-vruntime-snapshots.patch +sched-eevdf-fix-min_vruntime-vs-avg_vruntime.patch +sched_ext-fix-incorrect-sched_class-settings-for-per-cpu-migration-tasks.patch +mm-huge_memory-merge-uniform_split_supported-and-non_uniform_split_supported.patch +kvm-s390-fix-gmap_helper_zap_one_page-again.patch +drm-edid-add-drm_edid_ident_init-to-initialize-struct-drm_edid_ident.patch +drm-displayid-add-quirk-to-ignore-displayid-checksum-errors.patch diff --git a/queue-6.18/x86-microcode-amd-select-which-microcode-patch-to-load.patch b/queue-6.18/x86-microcode-amd-select-which-microcode-patch-to-load.patch new file mode 100644 index 0000000000..8b6331ceae --- /dev/null +++ b/queue-6.18/x86-microcode-amd-select-which-microcode-patch-to-load.patch @@ -0,0 +1,185 @@ +From 8d171045069c804e5ffaa18be590c42c6af0cf3f Mon Sep 17 00:00:00 2001 +From: "Borislav Petkov (AMD)" +Date: Thu, 25 Sep 2025 13:46:00 +0200 +Subject: x86/microcode/AMD: Select which microcode patch to load + +From: Borislav Petkov (AMD) + +commit 8d171045069c804e5ffaa18be590c42c6af0cf3f upstream. + +All microcode patches up to the proper BIOS Entrysign fix are loaded +only after the sha256 signature carried in the driver has been verified. + +Microcode patches after the Entrysign fix has been applied, do not need +that signature verification anymore. + +In order to not abandon machines which haven't received the BIOS update +yet, add the capability to select which microcode patch to load. + +The corresponding microcode container supplied through firmware-linux +has been modified to carry two patches per CPU type +(family/model/stepping) so that the proper one gets selected. 
+ +Signed-off-by: Borislav Petkov (AMD) +Tested-by: Waiman Long +Link: https://patch.msgid.link/20251027133818.4363-1-bp@kernel.org +Signed-off-by: Greg Kroah-Hartman +--- + +--- + arch/x86/kernel/cpu/microcode/amd.c | 113 ++++++++++++++++++++++-------------- + 1 file changed, 72 insertions(+), 41 deletions(-) + +--- a/arch/x86/kernel/cpu/microcode/amd.c ++++ b/arch/x86/kernel/cpu/microcode/amd.c +@@ -186,50 +186,61 @@ static u32 cpuid_to_ucode_rev(unsigned i + return p.ucode_rev; + } + ++static u32 get_cutoff_revision(u32 rev) ++{ ++ switch (rev >> 8) { ++ case 0x80012: return 0x8001277; break; ++ case 0x80082: return 0x800820f; break; ++ case 0x83010: return 0x830107c; break; ++ case 0x86001: return 0x860010e; break; ++ case 0x86081: return 0x8608108; break; ++ case 0x87010: return 0x8701034; break; ++ case 0x8a000: return 0x8a0000a; break; ++ case 0xa0010: return 0xa00107a; break; ++ case 0xa0011: return 0xa0011da; break; ++ case 0xa0012: return 0xa001243; break; ++ case 0xa0082: return 0xa00820e; break; ++ case 0xa1011: return 0xa101153; break; ++ case 0xa1012: return 0xa10124e; break; ++ case 0xa1081: return 0xa108109; break; ++ case 0xa2010: return 0xa20102f; break; ++ case 0xa2012: return 0xa201212; break; ++ case 0xa4041: return 0xa404109; break; ++ case 0xa5000: return 0xa500013; break; ++ case 0xa6012: return 0xa60120a; break; ++ case 0xa7041: return 0xa704109; break; ++ case 0xa7052: return 0xa705208; break; ++ case 0xa7080: return 0xa708009; break; ++ case 0xa70c0: return 0xa70C009; break; ++ case 0xaa001: return 0xaa00116; break; ++ case 0xaa002: return 0xaa00218; break; ++ case 0xb0021: return 0xb002146; break; ++ case 0xb0081: return 0xb008111; break; ++ case 0xb1010: return 0xb101046; break; ++ case 0xb2040: return 0xb204031; break; ++ case 0xb4040: return 0xb404031; break; ++ case 0xb4041: return 0xb404101; break; ++ case 0xb6000: return 0xb600031; break; ++ case 0xb6080: return 0xb608031; break; ++ case 0xb7000: return 0xb700031; break; ++ 
default: break; ++ ++ } ++ return 0; ++} ++ + static bool need_sha_check(u32 cur_rev) + { ++ u32 cutoff; ++ + if (!cur_rev) { + cur_rev = cpuid_to_ucode_rev(bsp_cpuid_1_eax); + pr_info_once("No current revision, generating the lowest one: 0x%x\n", cur_rev); + } + +- switch (cur_rev >> 8) { +- case 0x80012: return cur_rev <= 0x8001277; break; +- case 0x80082: return cur_rev <= 0x800820f; break; +- case 0x83010: return cur_rev <= 0x830107c; break; +- case 0x86001: return cur_rev <= 0x860010e; break; +- case 0x86081: return cur_rev <= 0x8608108; break; +- case 0x87010: return cur_rev <= 0x8701034; break; +- case 0x8a000: return cur_rev <= 0x8a0000a; break; +- case 0xa0010: return cur_rev <= 0xa00107a; break; +- case 0xa0011: return cur_rev <= 0xa0011da; break; +- case 0xa0012: return cur_rev <= 0xa001243; break; +- case 0xa0082: return cur_rev <= 0xa00820e; break; +- case 0xa1011: return cur_rev <= 0xa101153; break; +- case 0xa1012: return cur_rev <= 0xa10124e; break; +- case 0xa1081: return cur_rev <= 0xa108109; break; +- case 0xa2010: return cur_rev <= 0xa20102f; break; +- case 0xa2012: return cur_rev <= 0xa201212; break; +- case 0xa4041: return cur_rev <= 0xa404109; break; +- case 0xa5000: return cur_rev <= 0xa500013; break; +- case 0xa6012: return cur_rev <= 0xa60120a; break; +- case 0xa7041: return cur_rev <= 0xa704109; break; +- case 0xa7052: return cur_rev <= 0xa705208; break; +- case 0xa7080: return cur_rev <= 0xa708009; break; +- case 0xa70c0: return cur_rev <= 0xa70C009; break; +- case 0xaa001: return cur_rev <= 0xaa00116; break; +- case 0xaa002: return cur_rev <= 0xaa00218; break; +- case 0xb0021: return cur_rev <= 0xb002146; break; +- case 0xb0081: return cur_rev <= 0xb008111; break; +- case 0xb1010: return cur_rev <= 0xb101046; break; +- case 0xb2040: return cur_rev <= 0xb204031; break; +- case 0xb4040: return cur_rev <= 0xb404031; break; +- case 0xb4041: return cur_rev <= 0xb404101; break; +- case 0xb6000: return cur_rev <= 0xb600031; break; +- case 
0xb6080: return cur_rev <= 0xb608031; break; +- case 0xb7000: return cur_rev <= 0xb700031; break; +- default: break; +- } ++ cutoff = get_cutoff_revision(cur_rev); ++ if (cutoff) ++ return cur_rev <= cutoff; + + pr_info("You should not be seeing this. Please send the following couple of lines to x86--kernel.org\n"); + pr_info("CPUID(1).EAX: 0x%x, current revision: 0x%x\n", bsp_cpuid_1_eax, cur_rev); +@@ -494,6 +505,7 @@ static int verify_patch(const u8 *buf, s + { + u8 family = x86_family(bsp_cpuid_1_eax); + struct microcode_header_amd *mc_hdr; ++ u32 cur_rev, cutoff, patch_rev; + u32 sh_psize; + u16 proc_id; + u8 patch_fam; +@@ -533,11 +545,32 @@ static int verify_patch(const u8 *buf, s + proc_id = mc_hdr->processor_rev_id; + patch_fam = 0xf + (proc_id >> 12); + +- ucode_dbg("Patch-ID 0x%08x: family: 0x%x\n", mc_hdr->patch_id, patch_fam); +- + if (patch_fam != family) + return 1; + ++ cur_rev = get_patch_level(); ++ ++ /* No cutoff revision means old/unaffected by signing algorithm weakness => matches */ ++ cutoff = get_cutoff_revision(cur_rev); ++ if (!cutoff) ++ goto ok; ++ ++ patch_rev = mc_hdr->patch_id; ++ ++ ucode_dbg("cur_rev: 0x%x, cutoff: 0x%x, patch_rev: 0x%x\n", ++ cur_rev, cutoff, patch_rev); ++ ++ if (cur_rev <= cutoff && patch_rev <= cutoff) ++ goto ok; ++ ++ if (cur_rev > cutoff && patch_rev > cutoff) ++ goto ok; ++ ++ return 1; ++ ++ok: ++ ucode_dbg("Patch-ID 0x%08x: family: 0x%x\n", mc_hdr->patch_id, patch_fam); ++ + return 0; + } + +@@ -606,8 +639,6 @@ static size_t parse_container(u8 *ucode, + + mc = (struct microcode_amd *)(buf + SECTION_HDR_SIZE); + +- ucode_dbg("patch_id: 0x%x\n", mc->hdr.patch_id); +- + if (mc_patch_matches(mc, eq_id)) { + desc->psize = patch_size; + desc->mc = mc; -- 2.47.3